Compare commits
37 Commits
| SHA1 |
|---|
| 0b55e8fa77 |
| 6f159e8959 |
| 85a4c1d855 |
| 0f5e72623b |
| 061e549ad7 |
| 49d1a5d248 |
| 03d6b00db8 |
| a2eb0f7201 |
| 5c71a0ca07 |
| 489d16fad9 |
| a458707777 |
| a1d0a86305 |
| 29f1d21be7 |
| 34c5b526c3 |
| 3edb6578ba |
| 984c036b6e |
| a403ecb5c9 |
| a44713409c |
| 567467b8ed |
| 139edd6104 |
| e756d55abf |
| 015c8d2a78 |
| 5af846c13d |
| e69a9c7a54 |
| 23cb2a9a53 |
| b5e3397b88 |
| 5685a4055a |
| 97b72ef767 |
| 31489848be |
| e5988d447b |
| adfed748ce |
| 2444ba87c0 |
| 534457e713 |
| ea82a9730e |
| f04b872e14 |
| e61894e425 |
| 35fbdaadac |
setup.py (4 changed lines)

@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.3.0',
+      version='1.3.6',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
          "regex>=1",
          "numpy",
          "scipy",
-         "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+         "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
          'cssselect',
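
A hedged note on the relaxed pin above: with `>=` the requirement no longer forces exactly scikit-learn 0.16.1. The check below is a hypothetical sketch (not from the repo) of how an environment could verify it still meets the minimum the pickled classifier expects.

```python
# Hypothetical compatibility check reflecting the relaxed scikit-learn pin above.
import sklearn

# Compare numeric version components only (e.g. "0.24.2" -> (0, 24, 2)).
version = tuple(int(p) for p in sklearn.__version__.split(".")[:3] if p.isdigit())
assert version >= (0, 16, 1), "talon's pickled classifier expects scikit-learn >= 0.16.1"
print("scikit-learn", sklearn.__version__, "satisfies the requirement")
```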
talon/quotations.py

@@ -139,6 +139,13 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
     ))), re.I)
 
+# ---- John Smith wrote ----
+RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
+    u'|'.join((
+        # English
+        'wrote'
+    ))), re.I)
+
 SPLITTER_PATTERNS = [
     RE_ORIGINAL_MESSAGE,
     RE_ON_DATE_SMB_WROTE,
@@ -154,10 +161,10 @@ SPLITTER_PATTERNS = [
     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
                '( \S+){3,6}@\S+:'),
     # Sent from Samsung MobileName <address@example.com> wrote:
-    re.compile('Sent from Samsung .*@.*> wrote')
+    re.compile('Sent from Samsung .*@.*> wrote'),
+    RE_ANDROID_WROTE
 ]
 
 
 RE_LINK = re.compile('<(http://[^>]*)>')
 RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
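
As an aside (not part of the diff), a minimal sketch of what the new Android splitter recognizes. The pattern below is an illustrative copy of the RE_ANDROID_WROTE regex added above, written with a raw string.

```python
import re

# Illustrative copy of the RE_ANDROID_WROTE pattern added in the hunk above.
RE_ANDROID_WROTE = re.compile(r'[\s]*[-]+.*({})[ ]*[-]+'.format('|'.join(('wrote',))), re.I)

print(bool(RE_ANDROID_WROTE.match('---- John Smith wrote ----')))  # True
print(bool(RE_ANDROID_WROTE.match('On Mon, Bob wrote:')))          # False: no leading dashes
```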
@@ -172,6 +179,9 @@ MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 
+# Regular expression to identify if a line is a header.
+RE_HEADER = re.compile(": ")
+
 
 def extract_from(msg_body, content_type='text/plain'):
     try:
@@ -185,6 +195,19 @@ def extract_from(msg_body, content_type='text/plain'):
     return msg_body
 
 
+def remove_initial_spaces_and_mark_message_lines(lines):
+    """
+    Removes the initial spaces in each line before marking message lines.
+
+    This ensures headers can be identified if they are indented with spaces.
+    """
+    i = 0
+    while i < len(lines):
+        lines[i] = lines[i].lstrip(' ')
+        i += 1
+    return mark_message_lines(lines)
+
+
 def mark_message_lines(lines):
     """Mark message lines with markers to distinguish quotation lines.
 
@@ -287,9 +310,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
 
     Converts msg_body into a unicode.
     """
-    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
-    # so that '>' closing the link couldn't be mistakenly taken for quotation
-    # marker.
+    msg_body = _replace_link_brackets(msg_body)
+
+    msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
+
+    return msg_body
+
+
+def _replace_link_brackets(msg_body):
+    """
+    Normalize links i.e. replace '<', '>' wrapping the link with some symbols
+    so that '>' closing the link couldn't be mistakenly taken for quotation
+    marker.
+
+    Converts msg_body into a unicode
+    """
     if isinstance(msg_body, bytes):
         msg_body = msg_body.decode('utf8')
 
@@ -301,7 +336,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
             return "@@%s@@" % link.group(1)
 
     msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
+    return msg_body
+
 
+def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
+    """
+    Splits line in two if splitter pattern preceded by some text on the same
+    line (done only for 'On <date> <person> wrote:' pattern.
+    """
     def splitter_wrapper(splitter):
         """Wraps splitter with new line"""
         if splitter.start() and msg_body[splitter.start() - 1] != '\n':
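
For orientation (not part of the diff), a tiny self-contained sketch of the link normalization that _replace_link_brackets performs, using illustrative copies of the RE_LINK and RE_NORMALIZED_LINK patterns shown above. It is deliberately simplified; the function in the diff also decodes bytes input first.

```python
import re

# Illustrative copies of the patterns from the hunks above.
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

def replace_link_brackets(msg_body):
    # Wrap links in '@@' so a closing '>' is not mistaken for a quotation marker.
    return re.sub(RE_LINK, lambda link: "@@%s@@" % link.group(1), msg_body)

text = "see <http://example.com/a> for details"
normalized = replace_link_brackets(text)
print(normalized)                                       # see @@http://example.com/a@@ for details
print(RE_NORMALIZED_LINK.search(normalized).group(1))   # http://example.com/a
```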
@@ -386,9 +428,6 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if len(msg_body) > MAX_HTML_LEN:
-        return msg_body
-
     if msg_body.strip() == b'':
         return msg_body
 
@@ -453,6 +492,82 @@ def _extract_from_html(msg_body):
     return html.tostring(html_tree_copy)
 
 
+def split_emails(msg):
+    """
+    Given a message (which may consist of an email conversation thread with
+    multiple emails), mark the lines to identify split lines, content lines and
+    empty lines.
+
+    Correct the split line markers inside header blocks. Header blocks are
+    identified by the regular expression RE_HEADER.
+
+    Return the corrected markers
+    """
+    msg_body = _replace_link_brackets(msg)
+
+    # don't process too long messages
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    markers = remove_initial_spaces_and_mark_message_lines(lines)
+
+    markers = _mark_quoted_email_splitlines(markers, lines)
+
+    # we don't want splitlines in header blocks
+    markers = _correct_splitlines_in_headers(markers, lines)
+
+    return markers
+
+
+def _mark_quoted_email_splitlines(markers, lines):
+    """
+    When there are headers indented with '>' characters, this method will
+    attempt to identify if the header is a splitline header. If it is, then we
+    mark it with 's' instead of leaving it as 'm' and return the new markers.
+    """
+    # Create a list of markers to easily alter specific characters
+    markerlist = list(markers)
+    for i, line in enumerate(lines):
+        if markerlist[i] != 'm':
+            continue
+        for pattern in SPLITTER_PATTERNS:
+            matcher = re.search(pattern, line)
+            if matcher:
+                markerlist[i] = 's'
+                break
+
+    return "".join(markerlist)
+
+
+def _correct_splitlines_in_headers(markers, lines):
+    """
+    Corrects markers by removing splitlines deemed to be inside header blocks.
+    """
+    updated_markers = ""
+    i = 0
+    in_header_block = False
+
+    for m in markers:
+        # Only set in_header_block flag when we hit an 's' and line is a header
+        if m == 's':
+            if not in_header_block:
+                if bool(re.search(RE_HEADER, lines[i])):
+                    in_header_block = True
+            else:
+                if QUOT_PATTERN.match(lines[i]):
+                    m = 'm'
+                else:
+                    m = 't'
+
+        # If the line is not a header line, set in_header_block false.
+        if not bool(re.search(RE_HEADER, lines[i])):
+            in_header_block = False
+
+        # Add the marker to the new updated markers string.
+        updated_markers += m
+        i += 1
+
+    return updated_markers
+
+
 def _readable_text_empty(html_tree):
     return not bool(html_tree_to_text(html_tree).strip())
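
A brief usage sketch of the new API (not part of the diff, assuming a talon build that includes this branch): split_emails returns one marker character per input line, as exercised by the new test further down.

```python
from talon import quotations

msg = """From: Mr. X
Date: 24 February 2016
Subject: Hi

Hello.

On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
> Mohan,
> We have not yet migrated the systems.
"""

# One marker per line: 's' for splitter lines, 't' for text, 'e' for empty
# lines, 'm' for quoted lines (exact output depends on the patterns above).
markers = quotations.split_emails(msg)
print(markers)
assert len(markers) == len(msg.splitlines())
```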
talon/utils.py

@@ -178,7 +178,10 @@ def html_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
     try:
-        return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+        if html_too_big(s):
+            return None
+
+        return html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
 
@@ -187,7 +190,10 @@ def html_document_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
     try:
-        return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+        if html_too_big(s):
+            return None
+
+        return html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
 
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
     return CSSSelector(expr)(tree)
 
 
+def html_too_big(s):
+    return s.count('<') > _MAX_TAGS_COUNT
+
+
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
@@ -220,6 +230,21 @@ def _encode_utf8(s):
     return s.encode('utf-8') if isinstance(s, six.text_type) else s
 
 
+def _html5lib_parser():
+    """
+    html5lib is a pure-python library that conforms to the WHATWG HTML spec
+    and is not vulnarable to certain attacks common for XML libraries
+    """
+    return html5lib.HTMLParser(
+        # build lxml tree
+        html5lib.treebuilders.getTreeBuilder("lxml"),
+        # remove namespace value from inside lxml.html.html5paser element tag
+        # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
+        # instead of "div", throwing the algo off
+        namespaceHTMLElements=False
+    )
+
+
 _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
                      b'charset=utf-8">')
@@ -229,13 +254,6 @@ _HARDBREAKS = ['br', 'hr', 'tr']
 
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
 
-# html5lib is a pure-python library that conforms to the WHATWG HTML spec
-# and is not vulnarable to certain attacks common for XML libraries
-_HTML5LIB_PARSER = html5lib.HTMLParser(
-    # build lxml tree
-    html5lib.treebuilders.getTreeBuilder("lxml"),
-    # remove namespace value from inside lxml.html.html5paser element tag
-    # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
-    # instead of "div", throwing the algo off
-    namespaceHTMLElements=False
-)
+# an extensive research shows that exceeding this limit
+# might lead to excessive processing time
+_MAX_TAGS_COUNT = 419
 
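
As a quick illustration (not part of the diff, assuming a talon build with these utils changes), the new guard makes the HTML helpers bail out instead of parsing oversized input; the expected values below mirror the behaviour asserted by the new tests further down.

```python
from talon import utils as u

small = "<div>Hello</div>"
huge = "<br>" * 1000  # far more than the 419-tag limit defined above

print(u.html_too_big(small))    # False: only two '<' characters
print(u.html_too_big(huge))     # True: tag count exceeds _MAX_TAGS_COUNT
print(u.html_to_text(small))    # 'Hello' (parsed normally)
print(u.html_fromstring(huge))  # None: parsing is skipped entirely
```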
tests/html_quotations_test.py

@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
 
 
-@patch.object(quotations, 'MAX_HTML_LEN', 1)
+@patch.object(u, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
     msg_body = 'Reply' \
                '<div class="gmail_quote">' \
tests/text_quotations_test.py

@@ -142,7 +142,8 @@ def _check_pattern_original_message(original_message_indicator):
 -----{}-----
 
 Test"""
-    eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
+    eq_('Test reply', quotations.extract_from_plain(
+        msg_body.format(six.text_type(original_message_indicator))))
 
 def test_english_original_message():
     _check_pattern_original_message('Original Message')
@@ -165,6 +166,17 @@ Test reply"""
     eq_("Test reply", quotations.extract_from_plain(msg_body))
 
 
+def test_android_wrote():
+    msg_body = """Test reply
+
+---- John Smith wrote ----
+
+> quoted
+> text
+"""
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_reply_wraps_quotations():
     msg_body = """Test reply
 
@@ -696,3 +708,52 @@ def test_standard_replies():
             "'%(reply)s' != %(stripped)s for %(fn)s" % \
             {'reply': reply_text, 'stripped': stripped_text,
              'fn': filename}
+
+
+def test_split_email():
+    msg = """From: Mr. X
+Date: 24 February 2016
+To: Mr. Y
+Subject: Hi
+Attachments: none
+Goodbye.
+From: Mr. Y
+To: Mr. X
+Date: 24 February 2016
+Subject: Hi
+Attachments: none
+
+Hello.
+
+On 24th February 2016 at 09.32am, Conal wrote:
+
+Hey!
+
+On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
+> Mohan,
+>
+> We have not yet migrated the systems.
+>
+> Dan
+>
+> > -----Original Message-----
+> > Date: Mon, 2 Apr 2012 17:44:22 +0400
+> > Subject: Test
+> > From: bob@xxx.mailgun.org
+> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+> >
+> > Hi
+> >
+> > > From: bob@xxx.mailgun.org
+> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
+> > > Subject: Test
+> > > Hi
+> > >
+> >
+>
+>
+"""
+    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
+    markers = quotations.split_emails(msg)
+    eq_(markers, expected_markers)
tests/utils_test.py

@@ -29,7 +29,9 @@ def test_unicode():
 
 def test_detect_encoding():
     eq_ ('ascii', u.detect_encoding(b'qwe').lower())
-    eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
+    ok_ (u.detect_encoding(
+        u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
+            'iso-8859-1', 'iso-8859-2'])
     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
     # fallback to utf-8
     with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():
 
 def test_quick_detect_encoding():
     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
-    eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
+    ok_ (u.quick_detect_encoding(
+        u'Versi\xf3n'.encode('windows-1252')).lower() in [
+            'windows-1252', 'windows-1250'])
     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
 
@@ -120,6 +124,12 @@ def test_comment_no_parent():
 def test_html_fromstring_exception():
     eq_(None, u.html_fromstring("<html></html>"))
 
 
+@patch.object(u, 'html_too_big', Mock())
+@patch.object(u.html5parser, 'fromstring')
+def test_html_fromstring_too_big(fromstring):
+    eq_(None, u.html_fromstring("<html></html>"))
+    assert_false(fromstring.called)
+
 
 @patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +137,26 @@ def test_html_document_fromstring_exception(document_fromstring):
     eq_(None, u.html_document_fromstring("<html></html>"))
 
 
+@patch.object(u, 'html_too_big', Mock())
+@patch.object(u.html5parser, 'document_fromstring')
+def test_html_document_fromstring_too_big(document_fromstring):
+    eq_(None, u.html_document_fromstring("<html></html>"))
+    assert_false(document_fromstring.called)
+
+
 @patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
     bad_html = "one<br>two<br>three"
     eq_(None, u.html_to_text(bad_html))
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_too_big():
+    eq_(False, u.html_too_big("<div></div>"))
+    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_to_text():
+    eq_("Hello", u.html_to_text("<div>Hello</div>"))
+    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))