diff --git a/setup.py b/setup.py index 847d736..c458d98 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.4.9', + version='1.4.10', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py index fc60e1d..7700cba 100644 --- a/talon/signature/__init__.py +++ b/talon/signature/__init__.py @@ -23,17 +23,14 @@ trained against, don't forget to regenerate: from __future__ import absolute_import import os -from . import extraction -from . extraction import extract #noqa -from . learning import classifier - - -DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') - -EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier') -EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') +from talon.signature import extraction +from talon.signature.extraction import extract +from talon.signature.learning import classifier def initialize(): - extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, - EXTRACTOR_DATA) + data_dir = os.path.join(os.path.dirname(__file__), 'data') + extractor_filename = os.path.join(data_dir, 'classifier') + extractor_data_filename = os.path.join(data_dir, 'train.data') + extraction.EXTRACTOR = classifier.load(extractor_filename, + extractor_data_filename) diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 4fa78d9..6814f81 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -102,7 +102,7 @@ def flatten_list(list_to_flatten): def contains_sender_names(sender): - '''Returns a functions to search sender\'s name or it\'s part. + """Returns a functions to search sender\'s name or it\'s part. >>> feature = contains_sender_names("Sergey N. Obukhov ") >>> feature("Sergey Obukhov") @@ -115,7 +115,7 @@ def contains_sender_names(sender): 1 >>> contains_sender_names("")("serobnic") 1 - ''' + """ names = '( |$)|'.join(flatten_list([[e, e.capitalize()] for e in extract_names(sender)])) names = names or sender @@ -140,10 +140,16 @@ def extract_names(sender): sender = "".join([char if char.isalpha() else ' ' for char in sender]) # Remove too short words and words from "black" list i.e. # words like `ru`, `gmail`, `com`, `org`, etc. - sender = [word for word in sender.split() if len(word) > 1 and - not word in BAD_SENDER_NAMES] - # Remove duplicates - names = list(set(sender)) + names = list() + for word in sender.split(): + if len(word) < 2: + continue + if word in BAD_SENDER_NAMES: + continue + if word in names: + continue + names.append(word) + return names @@ -208,20 +214,26 @@ def many_capitalized_words(s): def has_signature(body, sender): - '''Checks if the body has signature. Returns True or False.''' + """Checks if the body has signature. Returns True or False.""" non_empty = [line for line in body.splitlines() if line.strip()] candidate = non_empty[-SIGNATURE_MAX_LINES:] upvotes = 0 + sender_check = contains_sender_names(sender) for line in candidate: # we check lines for sender's name, phone, email and url, # those signature lines don't take more then 27 lines if len(line.strip()) > 27: continue - elif contains_sender_names(sender)(line): + + if sender_check(line): return True - elif (binary_regex_search(RE_RELAX_PHONE)(line) + - binary_regex_search(RE_EMAIL)(line) + - binary_regex_search(RE_URL)(line) == 1): + + if (binary_regex_search(RE_RELAX_PHONE)(line) + + binary_regex_search(RE_EMAIL)(line) + + binary_regex_search(RE_URL)(line) == 1): upvotes += 1 + if upvotes > 1: return True + + return False