Compare commits
8 Commits
dietz/REP-
...
v1.5.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8c7e6a972 | ||
|
|
b30c375c5b | ||
|
|
cec5acf58f | ||
|
|
24d0f2d00a | ||
|
|
94007b0b92 | ||
|
|
1a5548f171 | ||
|
|
53c49b9121 | ||
|
|
bd50872043 |
@@ -6,6 +6,6 @@ joblib
|
|||||||
lxml>=2.3.3
|
lxml>=2.3.3
|
||||||
numpy
|
numpy
|
||||||
regex>=1
|
regex>=1
|
||||||
scikit-learn==0.24.1 # pickled versions of classifier, else rebuild
|
scikit-learn>=1.0.0
|
||||||
scipy
|
scipy
|
||||||
six>=1.10.0
|
six>=1.10.0
|
||||||
|
|||||||
16
setup.py
16
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.4.8',
|
version='1.5.0',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
@@ -44,21 +44,21 @@ setup(name='talon',
|
|||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"lxml>=2.3.3",
|
"lxml",
|
||||||
"regex>=1",
|
"regex",
|
||||||
"numpy",
|
"numpy",
|
||||||
"scipy",
|
"scipy",
|
||||||
"scikit-learn==0.24.1", # pickled versions of classifier, else rebuild
|
"scikit-learn>=1.0.0",
|
||||||
"chardet>=1.0.1",
|
"chardet",
|
||||||
"cchardet>=0.3.5",
|
"cchardet",
|
||||||
"cssselect",
|
"cssselect",
|
||||||
"six>=1.10.0",
|
"six",
|
||||||
"html5lib",
|
"html5lib",
|
||||||
"joblib",
|
"joblib",
|
||||||
],
|
],
|
||||||
tests_require=[
|
tests_require=[
|
||||||
"mock",
|
"mock",
|
||||||
"nose>=1.2.1",
|
"nose",
|
||||||
"coverage"
|
"coverage"
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
|||||||
|
|
||||||
SPLITTER_MAX_LINES = 6
|
SPLITTER_MAX_LINES = 6
|
||||||
MAX_LINES_COUNT = 1000
|
MAX_LINES_COUNT = 1000
|
||||||
# an extensive research shows that exceeding this limit
|
|
||||||
# leads to excessive processing time
|
|
||||||
MAX_HTML_LEN = 2794202
|
|
||||||
|
|
||||||
QUOT_PATTERN = re.compile('^>+ ?')
|
QUOT_PATTERN = re.compile('^>+ ?')
|
||||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||||
@@ -421,25 +418,31 @@ def extract_from_html(msg_body):
|
|||||||
|
|
||||||
Returns a unicode string.
|
Returns a unicode string.
|
||||||
"""
|
"""
|
||||||
|
msg_body_bytes = msg_body
|
||||||
if isinstance(msg_body, six.text_type):
|
if isinstance(msg_body, six.text_type):
|
||||||
msg_body = msg_body.encode('utf8')
|
msg_body_bytes = msg_body.encode('utf8')
|
||||||
elif not isinstance(msg_body, bytes):
|
|
||||||
msg_body = msg_body.encode('ascii')
|
|
||||||
|
|
||||||
result = _extract_from_html(msg_body)
|
if msg_body_bytes.strip() == b'':
|
||||||
if isinstance(result, bytes):
|
return msg_body
|
||||||
result = result.decode('utf8')
|
|
||||||
|
msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
|
||||||
|
# Cut out xml and doctype tags to avoid conflict with unicode decoding.
|
||||||
|
msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
|
||||||
|
html_tree = html_document_fromstring(msg_body_bytes)
|
||||||
|
if html_tree is None:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
result = extract_from_html_tree(html_tree)
|
||||||
|
if not result:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _extract_from_html(msg_body):
|
def extract_from_html_tree(html_tree):
|
||||||
"""
|
"""
|
||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided parsed html tree using tags and
|
||||||
using tags and plain text algorithm.
|
plain text algorithm.
|
||||||
|
|
||||||
Cut out first some encoding html tags such as xml and doctype
|
|
||||||
for avoiding conflict with unicode decoding
|
|
||||||
|
|
||||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||||
Cut Microsoft quotations.
|
Cut Microsoft quotations.
|
||||||
@@ -452,18 +455,6 @@ def _extract_from_html(msg_body):
|
|||||||
then checking deleted checkpoints,
|
then checking deleted checkpoints,
|
||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
if msg_body.strip() == b'':
|
|
||||||
return msg_body
|
|
||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
|
||||||
|
|
||||||
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
|
||||||
|
|
||||||
if html_tree is None:
|
|
||||||
return msg_body
|
|
||||||
|
|
||||||
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
||||||
html_quotations.cut_zimbra_quote(html_tree) or
|
html_quotations.cut_zimbra_quote(html_tree) or
|
||||||
html_quotations.cut_blockquote(html_tree) or
|
html_quotations.cut_blockquote(html_tree) or
|
||||||
@@ -481,7 +472,7 @@ def _extract_from_html(msg_body):
|
|||||||
|
|
||||||
# Don't process too long messages
|
# Don't process too long messages
|
||||||
if len(lines) > MAX_LINES_COUNT:
|
if len(lines) > MAX_LINES_COUNT:
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
# Collect checkpoints on each line
|
# Collect checkpoints on each line
|
||||||
line_checkpoints = [
|
line_checkpoints = [
|
||||||
@@ -500,7 +491,7 @@ def _extract_from_html(msg_body):
|
|||||||
lines_were_deleted, first_deleted, last_deleted = return_flags
|
lines_were_deleted, first_deleted, last_deleted = return_flags
|
||||||
|
|
||||||
if not lines_were_deleted and not cut_quotations:
|
if not lines_were_deleted and not cut_quotations:
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
if lines_were_deleted:
|
if lines_were_deleted:
|
||||||
#collect checkpoints from deleted lines
|
#collect checkpoints from deleted lines
|
||||||
@@ -514,7 +505,7 @@ def _extract_from_html(msg_body):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if _readable_text_empty(html_tree_copy):
|
if _readable_text_empty(html_tree_copy):
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
|
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
|
||||||
# parsers do not recognize namespaces in HTML tags. As such the rendered
|
# parsers do not recognize namespaces in HTML tags. As such the rendered
|
||||||
@@ -540,7 +531,11 @@ def _extract_from_html(msg_body):
|
|||||||
# of replacing data outside the <tag> which might be essential to
|
# of replacing data outside the <tag> which might be essential to
|
||||||
# the customer.
|
# the customer.
|
||||||
remove_namespaces(html_tree_copy)
|
remove_namespaces(html_tree_copy)
|
||||||
return html.tostring(html_tree_copy)
|
s = html.tostring(html_tree_copy)
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return s.decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
def remove_namespaces(root):
|
def remove_namespaces(root):
|
||||||
|
|||||||
@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from . import extraction
|
from talon.signature import extraction
|
||||||
from . extraction import extract #noqa
|
from talon.signature.extraction import extract
|
||||||
from . learning import classifier
|
from talon.signature.learning import classifier
|
||||||
|
|
||||||
|
|
||||||
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
|
||||||
|
|
||||||
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
|
|
||||||
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
|
|
||||||
|
|
||||||
|
|
||||||
def initialize():
|
def initialize():
|
||||||
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
|
data_dir = os.path.join(os.path.dirname(__file__), 'data')
|
||||||
EXTRACTOR_DATA)
|
extractor_filename = os.path.join(data_dir, 'classifier')
|
||||||
|
extractor_data_filename = os.path.join(data_dir, 'train.data')
|
||||||
|
extraction.EXTRACTOR = classifier.load(extractor_filename,
|
||||||
|
extractor_data_filename)
|
||||||
|
|||||||
Binary file not shown.
@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):
|
|||||||
|
|
||||||
|
|
||||||
def contains_sender_names(sender):
|
def contains_sender_names(sender):
|
||||||
'''Returns a functions to search sender\'s name or it\'s part.
|
"""Returns a functions to search sender\'s name or it\'s part.
|
||||||
|
|
||||||
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
||||||
>>> feature("Sergey Obukhov")
|
>>> feature("Sergey Obukhov")
|
||||||
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
|
|||||||
1
|
1
|
||||||
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
||||||
1
|
1
|
||||||
'''
|
"""
|
||||||
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
||||||
for e in extract_names(sender)]))
|
for e in extract_names(sender)]))
|
||||||
names = names or sender
|
names = names or sender
|
||||||
@@ -140,10 +140,16 @@ def extract_names(sender):
|
|||||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||||
# Remove too short words and words from "black" list i.e.
|
# Remove too short words and words from "black" list i.e.
|
||||||
# words like `ru`, `gmail`, `com`, `org`, etc.
|
# words like `ru`, `gmail`, `com`, `org`, etc.
|
||||||
sender = [word for word in sender.split() if len(word) > 1 and
|
names = list()
|
||||||
not word in BAD_SENDER_NAMES]
|
for word in sender.split():
|
||||||
# Remove duplicates
|
if len(word) < 2:
|
||||||
names = list(set(sender))
|
continue
|
||||||
|
if word in BAD_SENDER_NAMES:
|
||||||
|
continue
|
||||||
|
if word in names:
|
||||||
|
continue
|
||||||
|
names.append(word)
|
||||||
|
|
||||||
return names
|
return names
|
||||||
|
|
||||||
|
|
||||||
@@ -208,20 +214,26 @@ def many_capitalized_words(s):
|
|||||||
|
|
||||||
|
|
||||||
def has_signature(body, sender):
|
def has_signature(body, sender):
|
||||||
'''Checks if the body has signature. Returns True or False.'''
|
"""Checks if the body has signature. Returns True or False."""
|
||||||
non_empty = [line for line in body.splitlines() if line.strip()]
|
non_empty = [line for line in body.splitlines() if line.strip()]
|
||||||
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
||||||
upvotes = 0
|
upvotes = 0
|
||||||
|
sender_check = contains_sender_names(sender)
|
||||||
for line in candidate:
|
for line in candidate:
|
||||||
# we check lines for sender's name, phone, email and url,
|
# we check lines for sender's name, phone, email and url,
|
||||||
# those signature lines don't take more then 27 lines
|
# those signature lines don't take more then 27 lines
|
||||||
if len(line.strip()) > 27:
|
if len(line.strip()) > 27:
|
||||||
continue
|
continue
|
||||||
elif contains_sender_names(sender)(line):
|
|
||||||
|
if sender_check(line):
|
||||||
return True
|
return True
|
||||||
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
|
|
||||||
|
if (binary_regex_search(RE_RELAX_PHONE)(line) +
|
||||||
binary_regex_search(RE_EMAIL)(line) +
|
binary_regex_search(RE_EMAIL)(line) +
|
||||||
binary_regex_search(RE_URL)(line) == 1):
|
binary_regex_search(RE_URL)(line) == 1):
|
||||||
upvotes += 1
|
upvotes += 1
|
||||||
|
|
||||||
if upvotes > 1:
|
if upvotes > 1:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|||||||
@@ -180,9 +180,6 @@ def html_fromstring(s):
|
|||||||
if isinstance(s, six.text_type):
|
if isinstance(s, six.text_type):
|
||||||
s = s.encode('utf8')
|
s = s.encode('utf8')
|
||||||
try:
|
try:
|
||||||
if html_too_big(s):
|
|
||||||
return None
|
|
||||||
|
|
||||||
return html5parser.fromstring(s, parser=_html5lib_parser())
|
return html5parser.fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@@ -194,9 +191,6 @@ def html_document_fromstring(s):
|
|||||||
if isinstance(s, six.text_type):
|
if isinstance(s, six.text_type):
|
||||||
s = s.encode('utf8')
|
s = s.encode('utf8')
|
||||||
try:
|
try:
|
||||||
if html_too_big(s):
|
|
||||||
return None
|
|
||||||
|
|
||||||
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@@ -206,12 +200,6 @@ def cssselect(expr, tree):
|
|||||||
return CSSSelector(expr)(tree)
|
return CSSSelector(expr)(tree)
|
||||||
|
|
||||||
|
|
||||||
def html_too_big(s):
|
|
||||||
if isinstance(s, six.text_type):
|
|
||||||
s = s.encode('utf8')
|
|
||||||
return s.count(b'<') > _MAX_TAGS_COUNT
|
|
||||||
|
|
||||||
|
|
||||||
def _contains_charset_spec(s):
|
def _contains_charset_spec(s):
|
||||||
"""Return True if the first 4KB contain charset spec
|
"""Return True if the first 4KB contain charset spec
|
||||||
"""
|
"""
|
||||||
@@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
|
|||||||
_HARDBREAKS = ['br', 'hr', 'tr']
|
_HARDBREAKS = ['br', 'hr', 'tr']
|
||||||
|
|
||||||
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
||||||
|
|
||||||
# an extensive research shows that exceeding this limit
|
|
||||||
# might lead to excessive processing time
|
|
||||||
_MAX_TAGS_COUNT = 419
|
|
||||||
|
|||||||
@@ -391,18 +391,6 @@ def test_gmail_forwarded_msg():
|
|||||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, '_MAX_TAGS_COUNT', 4)
|
|
||||||
def test_too_large_html():
|
|
||||||
msg_body = 'Reply' \
|
|
||||||
'<div class="gmail_quote">' \
|
|
||||||
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \
|
|
||||||
'<div>Test</div>' \
|
|
||||||
'</div>' \
|
|
||||||
'</div>'
|
|
||||||
eq_(RE_WHITESPACE.sub('', msg_body),
|
|
||||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
|
||||||
|
|
||||||
|
|
||||||
def test_readable_html_empty():
|
def test_readable_html_empty():
|
||||||
msg_body = """
|
msg_body = """
|
||||||
<blockquote>
|
<blockquote>
|
||||||
|
|||||||
@@ -125,39 +125,13 @@ def test_html_fromstring_exception():
|
|||||||
eq_(None, u.html_fromstring("<html></html>"))
|
eq_(None, u.html_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, 'html_too_big', Mock())
|
|
||||||
@patch.object(u.html5parser, 'fromstring')
|
|
||||||
def test_html_fromstring_too_big(fromstring):
|
|
||||||
eq_(None, u.html_fromstring("<html></html>"))
|
|
||||||
assert_false(fromstring.called)
|
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u.html5parser, 'document_fromstring')
|
@patch.object(u.html5parser, 'document_fromstring')
|
||||||
def test_html_document_fromstring_exception(document_fromstring):
|
def test_html_document_fromstring_exception(document_fromstring):
|
||||||
document_fromstring.side_effect = Exception()
|
document_fromstring.side_effect = Exception()
|
||||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, 'html_too_big', Mock())
|
|
||||||
@patch.object(u.html5parser, 'document_fromstring')
|
|
||||||
def test_html_document_fromstring_too_big(document_fromstring):
|
|
||||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
|
||||||
assert_false(document_fromstring.called)
|
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||||
def test_bad_html_to_text():
|
def test_bad_html_to_text():
|
||||||
bad_html = "one<br>two<br>three"
|
bad_html = "one<br>two<br>three"
|
||||||
eq_(None, u.html_to_text(bad_html))
|
eq_(None, u.html_to_text(bad_html))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
|
||||||
def test_html_too_big():
|
|
||||||
eq_(False, u.html_too_big("<div></div>"))
|
|
||||||
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
|
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
|
||||||
def test_html_to_text():
|
|
||||||
eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
|
|
||||||
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
|
|
||||||
|
|||||||
Reference in New Issue
Block a user