Merge pull request #223 from mailgun/maxim/develop
PIP-1509: Optimise sender name check [python3]
This commit is contained in:
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.4.9',
|
version='1.4.10',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from . import extraction
|
from talon.signature import extraction
|
||||||
from . extraction import extract #noqa
|
from talon.signature.extraction import extract
|
||||||
from . learning import classifier
|
from talon.signature.learning import classifier
|
||||||
|
|
||||||
|
|
||||||
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
|
||||||
|
|
||||||
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
|
|
||||||
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
|
|
||||||
|
|
||||||
|
|
||||||
def initialize():
|
def initialize():
|
||||||
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
|
data_dir = os.path.join(os.path.dirname(__file__), 'data')
|
||||||
EXTRACTOR_DATA)
|
extractor_filename = os.path.join(data_dir, 'classifier')
|
||||||
|
extractor_data_filename = os.path.join(data_dir, 'train.data')
|
||||||
|
extraction.EXTRACTOR = classifier.load(extractor_filename,
|
||||||
|
extractor_data_filename)
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):
|
|||||||
|
|
||||||
|
|
||||||
def contains_sender_names(sender):
|
def contains_sender_names(sender):
|
||||||
'''Returns a functions to search sender\'s name or it\'s part.
|
"""Returns a functions to search sender\'s name or it\'s part.
|
||||||
|
|
||||||
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
||||||
>>> feature("Sergey Obukhov")
|
>>> feature("Sergey Obukhov")
|
||||||
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
|
|||||||
1
|
1
|
||||||
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
||||||
1
|
1
|
||||||
'''
|
"""
|
||||||
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
||||||
for e in extract_names(sender)]))
|
for e in extract_names(sender)]))
|
||||||
names = names or sender
|
names = names or sender
|
||||||
@@ -140,10 +140,16 @@ def extract_names(sender):
|
|||||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||||
# Remove too short words and words from "black" list i.e.
|
# Remove too short words and words from "black" list i.e.
|
||||||
# words like `ru`, `gmail`, `com`, `org`, etc.
|
# words like `ru`, `gmail`, `com`, `org`, etc.
|
||||||
sender = [word for word in sender.split() if len(word) > 1 and
|
names = list()
|
||||||
not word in BAD_SENDER_NAMES]
|
for word in sender.split():
|
||||||
# Remove duplicates
|
if len(word) < 2:
|
||||||
names = list(set(sender))
|
continue
|
||||||
|
if word in BAD_SENDER_NAMES:
|
||||||
|
continue
|
||||||
|
if word in names:
|
||||||
|
continue
|
||||||
|
names.append(word)
|
||||||
|
|
||||||
return names
|
return names
|
||||||
|
|
||||||
|
|
||||||
@@ -208,20 +214,26 @@ def many_capitalized_words(s):
|
|||||||
|
|
||||||
|
|
||||||
def has_signature(body, sender):
|
def has_signature(body, sender):
|
||||||
'''Checks if the body has signature. Returns True or False.'''
|
"""Checks if the body has signature. Returns True or False."""
|
||||||
non_empty = [line for line in body.splitlines() if line.strip()]
|
non_empty = [line for line in body.splitlines() if line.strip()]
|
||||||
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
||||||
upvotes = 0
|
upvotes = 0
|
||||||
|
sender_check = contains_sender_names(sender)
|
||||||
for line in candidate:
|
for line in candidate:
|
||||||
# we check lines for sender's name, phone, email and url,
|
# we check lines for sender's name, phone, email and url,
|
||||||
# those signature lines don't take more then 27 lines
|
# those signature lines don't take more then 27 lines
|
||||||
if len(line.strip()) > 27:
|
if len(line.strip()) > 27:
|
||||||
continue
|
continue
|
||||||
elif contains_sender_names(sender)(line):
|
|
||||||
|
if sender_check(line):
|
||||||
return True
|
return True
|
||||||
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
|
|
||||||
binary_regex_search(RE_EMAIL)(line) +
|
if (binary_regex_search(RE_RELAX_PHONE)(line) +
|
||||||
binary_regex_search(RE_URL)(line) == 1):
|
binary_regex_search(RE_EMAIL)(line) +
|
||||||
|
binary_regex_search(RE_URL)(line) == 1):
|
||||||
upvotes += 1
|
upvotes += 1
|
||||||
|
|
||||||
if upvotes > 1:
|
if upvotes > 1:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user