Merge pull request #223 from mailgun/maxim/develop

PIP-1509: Optimise sender name check [python3]
This commit is contained in:
Maxim Vladimirskiy
2021-11-19 13:11:29 +03:00
committed by GitHub
3 changed files with 32 additions and 23 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.4.9', version='1.4.10',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
from __future__ import absolute_import from __future__ import absolute_import
import os import os
from . import extraction from talon.signature import extraction
from . extraction import extract #noqa from talon.signature.extraction import extract
from . learning import classifier from talon.signature.learning import classifier
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
def initialize(): def initialize():
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, data_dir = os.path.join(os.path.dirname(__file__), 'data')
EXTRACTOR_DATA) extractor_filename = os.path.join(data_dir, 'classifier')
extractor_data_filename = os.path.join(data_dir, 'train.data')
extraction.EXTRACTOR = classifier.load(extractor_filename,
extractor_data_filename)

View File

@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):
def contains_sender_names(sender): def contains_sender_names(sender):
'''Returns a function to search for the sender\'s name or a part of it. """Returns a function to search for the sender\'s name or a part of it.
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>") >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
>>> feature("Sergey Obukhov") >>> feature("Sergey Obukhov")
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
1 1
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic") >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
1 1
''' """
names = '( |$)|'.join(flatten_list([[e, e.capitalize()] names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
for e in extract_names(sender)])) for e in extract_names(sender)]))
names = names or sender names = names or sender
@@ -140,10 +140,16 @@ def extract_names(sender):
sender = "".join([char if char.isalpha() else ' ' for char in sender]) sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e. # Remove too short words and words from "black" list i.e.
# words like `ru`, `gmail`, `com`, `org`, etc. # words like `ru`, `gmail`, `com`, `org`, etc.
sender = [word for word in sender.split() if len(word) > 1 and names = list()
not word in BAD_SENDER_NAMES] for word in sender.split():
# Remove duplicates if len(word) < 2:
names = list(set(sender)) continue
if word in BAD_SENDER_NAMES:
continue
if word in names:
continue
names.append(word)
return names return names
@@ -208,20 +214,26 @@ def many_capitalized_words(s):
def has_signature(body, sender): def has_signature(body, sender):
'''Checks if the body has signature. Returns True or False.''' """Checks if the body has signature. Returns True or False."""
non_empty = [line for line in body.splitlines() if line.strip()] non_empty = [line for line in body.splitlines() if line.strip()]
candidate = non_empty[-SIGNATURE_MAX_LINES:] candidate = non_empty[-SIGNATURE_MAX_LINES:]
upvotes = 0 upvotes = 0
sender_check = contains_sender_names(sender)
for line in candidate: for line in candidate:
# we check lines for sender's name, phone, email and url, # we check lines for sender's name, phone, email and url,
# those signature lines don't take more than 27 characters # those signature lines don't take more than 27 characters
if len(line.strip()) > 27: if len(line.strip()) > 27:
continue continue
elif contains_sender_names(sender)(line):
if sender_check(line):
return True return True
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
binary_regex_search(RE_EMAIL)(line) + if (binary_regex_search(RE_RELAX_PHONE)(line) +
binary_regex_search(RE_URL)(line) == 1): binary_regex_search(RE_EMAIL)(line) +
binary_regex_search(RE_URL)(line) == 1):
upvotes += 1 upvotes += 1
if upvotes > 1: if upvotes > 1:
return True return True
return False