Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f16ae5110b | ||
|
|
ab5cbe5ec3 | ||
|
|
be5da92f16 | ||
|
|
95954a65a0 | ||
|
|
0b55e8fa77 | ||
|
|
6f159e8959 | ||
|
|
5c413b4b00 | ||
|
|
cca64d3ed1 | ||
|
|
e11eaf6ff8 | ||
|
|
85a4c1d855 | ||
|
|
0f5e72623b | ||
|
|
061e549ad7 | ||
|
|
49d1a5d248 | ||
|
|
03d6b00db8 | ||
|
|
a2eb0f7201 | ||
|
|
5c71a0ca07 | ||
|
|
489d16fad9 | ||
|
|
a458707777 | ||
|
|
a1d0a86305 | ||
|
|
29f1d21be7 | ||
|
|
34c5b526c3 | ||
|
|
3edb6578ba | ||
|
|
984c036b6e | ||
|
|
a403ecb5c9 | ||
|
|
a44713409c | ||
|
|
567467b8ed | ||
|
|
139edd6104 | ||
|
|
e756d55abf | ||
|
|
015c8d2a78 | ||
|
|
5af846c13d | ||
|
|
e69a9c7a54 | ||
|
|
23cb2a9a53 | ||
|
|
b5e3397b88 | ||
|
|
5685a4055a | ||
|
|
97b72ef767 | ||
|
|
31489848be | ||
|
|
e5988d447b | ||
|
|
adfed748ce | ||
|
|
2444ba87c0 | ||
|
|
534457e713 | ||
|
|
ea82a9730e |
4
setup.py
4
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.3.1',
|
||||
version='1.3.7',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
@@ -48,7 +48,7 @@ setup(name='talon',
|
||||
"regex>=1",
|
||||
"numpy",
|
||||
"scipy",
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
'cssselect',
|
||||
|
||||
@@ -131,7 +131,7 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
|
||||
'Oprindelig meddelelse',
|
||||
))), re.I)
|
||||
|
||||
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
||||
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
|
||||
u'|'.join((
|
||||
# "From" in different languages.
|
||||
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
|
||||
@@ -139,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
|
||||
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
|
||||
))), re.I)
|
||||
|
||||
# Attribution line produced by some Android mail clients, e.g.:
# ---- John Smith wrote ----
RE_ANDROID_WROTE = re.compile(u'[\\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        # NOTE: the trailing comma is required. Without it the parentheses
        # hold a plain string and join() would produce 'w|r|o|t|e'.
        'wrote',
    ))), re.I)
|
||||
|
||||
# Support polymail.io reply format, where the attribution is spread over
# several lines:
# On Tue, Apr 11, 2017 at 10:07 PM John Smith
#
# <
# mailto:John Smith <johnsmith@gmail.com>
# > wrote:
# A raw string is used so that `\s` reaches the regex engine as written
# instead of relying on Python passing unknown string escapes through.
RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I)
|
||||
|
||||
SPLITTER_PATTERNS = [
|
||||
RE_ORIGINAL_MESSAGE,
|
||||
RE_ON_DATE_SMB_WROTE,
|
||||
@@ -154,16 +169,17 @@ SPLITTER_PATTERNS = [
|
||||
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
||||
'( \S+){3,6}@\S+:'),
|
||||
# Sent from Samsung MobileName <address@example.com> wrote:
|
||||
re.compile('Sent from Samsung .*@.*> wrote')
|
||||
re.compile('Sent from Samsung .*@.*> wrote'),
|
||||
RE_ANDROID_WROTE,
|
||||
RE_POLYMAIL
|
||||
]
|
||||
|
||||
|
||||
RE_LINK = re.compile('<(http://[^>]*)>')
|
||||
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
|
||||
|
||||
RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
||||
|
||||
SPLITTER_MAX_LINES = 4
|
||||
SPLITTER_MAX_LINES = 6
|
||||
MAX_LINES_COUNT = 1000
|
||||
# an extensive research shows that exceeding this limit
|
||||
# leads to excessive processing time
|
||||
@@ -172,6 +188,9 @@ MAX_HTML_LEN = 2794202
|
||||
QUOT_PATTERN = re.compile('^>+ ?')
|
||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||
|
||||
# Regular expression to identify if a line is a header.
|
||||
RE_HEADER = re.compile(": ")
|
||||
|
||||
|
||||
def extract_from(msg_body, content_type='text/plain'):
|
||||
try:
|
||||
@@ -185,6 +204,19 @@ def extract_from(msg_body, content_type='text/plain'):
|
||||
return msg_body
|
||||
|
||||
|
||||
def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Removes the initial spaces in each line before marking message lines.

    This ensures headers can be identified if they are indented with spaces.

    :param lines: list of message lines; it is modified in place (leading
        space characters are stripped from every element).
    :return: the result of `mark_message_lines` for the stripped lines.
    """
    # Slice assignment keeps the in-place update of the caller's list while
    # avoiding a hand-maintained index counter.
    lines[:] = [line.lstrip(' ') for line in lines]
    return mark_message_lines(lines)
|
||||
|
||||
|
||||
def mark_message_lines(lines):
|
||||
"""Mark message lines with markers to distinguish quotation lines.
|
||||
|
||||
@@ -287,9 +319,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
||||
|
||||
Converts msg_body into a unicode.
|
||||
"""
|
||||
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
||||
# so that '>' closing the link couldn't be mistakenly taken for quotation
|
||||
# marker.
|
||||
msg_body = _replace_link_brackets(msg_body)
|
||||
|
||||
msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
|
||||
|
||||
return msg_body
|
||||
|
||||
|
||||
def _replace_link_brackets(msg_body):
|
||||
"""
|
||||
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
||||
so that '>' closing the link couldn't be mistakenly taken for quotation
|
||||
marker.
|
||||
|
||||
Converts msg_body into a unicode
|
||||
"""
|
||||
if isinstance(msg_body, bytes):
|
||||
msg_body = msg_body.decode('utf8')
|
||||
|
||||
@@ -301,7 +345,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
||||
return "@@%s@@" % link.group(1)
|
||||
|
||||
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
|
||||
return msg_body
|
||||
|
||||
|
||||
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
|
||||
"""
|
||||
Splits line in two if splitter pattern preceded by some text on the same
|
||||
line (done only for 'On <date> <person> wrote:' pattern.
|
||||
"""
|
||||
def splitter_wrapper(splitter):
|
||||
"""Wraps splitter with new line"""
|
||||
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
|
||||
@@ -386,9 +437,6 @@ def _extract_from_html(msg_body):
|
||||
then checking deleted checkpoints,
|
||||
then deleting necessary tags.
|
||||
"""
|
||||
if len(msg_body) > MAX_HTML_LEN:
|
||||
return msg_body
|
||||
|
||||
if msg_body.strip() == b'':
|
||||
return msg_body
|
||||
|
||||
@@ -453,6 +501,82 @@ def _extract_from_html(msg_body):
|
||||
return html.tostring(html_tree_copy)
|
||||
|
||||
|
||||
def split_emails(msg):
    """
    Compute line markers for a message that may hold a whole email thread.

    The message gets its link brackets normalized and is cut to at most
    MAX_LINES_COUNT lines. Each line is then marked (split lines, content
    lines and empty lines), quoted split lines are marked, and split markers
    that fall inside header blocks (identified by RE_HEADER) are corrected.

    Return the corrected markers.
    """
    normalized = _replace_link_brackets(msg)

    # don't process too long messages
    body_lines = normalized.splitlines()[:MAX_LINES_COUNT]

    marked = remove_initial_spaces_and_mark_message_lines(body_lines)
    marked = _mark_quoted_email_splitlines(marked, body_lines)

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(marked, body_lines)
|
||||
|
||||
|
||||
def _mark_quoted_email_splitlines(markers, lines):
    """
    Promote quoted lines that look like splitter lines from 'm' to 's'.

    Each line whose marker is 'm' is searched against every pattern in
    SPLITTER_PATTERNS; if any of them matches, its marker becomes 's'.
    All other markers are left untouched. Returns the new markers string.
    """
    def _looks_like_splitter(text):
        return any(re.search(pattern, text) for pattern in SPLITTER_PATTERNS)

    # Strings are immutable, so edit a list copy and join it at the end.
    updated = list(markers)
    for idx, text in enumerate(lines):
        if updated[idx] == 'm' and _looks_like_splitter(text):
            updated[idx] = 's'

    return "".join(updated)
|
||||
|
||||
|
||||
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.

    A header block is opened by an 's' line that matches RE_HEADER and lasts
    until the first line that does not match RE_HEADER. Any further 's'
    marker met while inside the block is replaced: by 'm' if the line matches
    QUOT_PATTERN, otherwise by 't'.

    :param markers: string of per-line markers.
    :param lines: the lines the markers refer to, in the same order.
    :return: the corrected markers string.
    """
    corrected = []
    in_header_block = False

    for i, m in enumerate(markers):
        # Evaluate the header test once per line; it is needed both to open
        # a header block and to close it.
        is_header = re.search(RE_HEADER, lines[i]) is not None

        if m == 's':
            if in_header_block:
                # A splitline inside a header block is not a real splitline.
                m = 'm' if QUOT_PATTERN.match(lines[i]) else 't'
            elif is_header:
                # Only an 's' line that is a header opens a header block.
                in_header_block = True

        # If the line is not a header line, the header block is over.
        if not is_header:
            in_header_block = False

        corrected.append(m)

    # Join once instead of growing a string inside the loop.
    return "".join(corrected)
|
||||
|
||||
|
||||
def _readable_text_empty(html_tree):
    """Return True if the text of html_tree is empty or only whitespace."""
    readable_text = html_tree_to_text(html_tree)
    return not readable_text.strip()
|
||||
|
||||
|
||||
@@ -178,6 +178,9 @@ def html_fromstring(s):
|
||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||
"""
|
||||
try:
|
||||
if html_too_big(s):
|
||||
return None
|
||||
|
||||
return html5parser.fromstring(s, parser=_html5lib_parser())
|
||||
except Exception:
|
||||
pass
|
||||
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
|
||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||
"""
|
||||
try:
|
||||
if html_too_big(s):
|
||||
return None
|
||||
|
||||
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
||||
except Exception:
|
||||
pass
|
||||
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
|
||||
return CSSSelector(expr)(tree)
|
||||
|
||||
|
||||
def html_too_big(s):
    """Return True if `s` contains more than _MAX_TAGS_COUNT '<' characters.

    The number of '<' characters is used as a cheap estimate of the number
    of tags. Both text and byte strings are accepted.
    """
    # On Python 3 bytes.count() rejects a str argument, so pick a needle of
    # the same type as the input.
    needle = b'<' if isinstance(s, bytes) else '<'
    return s.count(needle) > _MAX_TAGS_COUNT
|
||||
|
||||
|
||||
def _contains_charset_spec(s):
|
||||
"""Return True if the first 4KB contain charset spec
|
||||
"""
|
||||
@@ -243,3 +253,7 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
|
||||
_HARDBREAKS = ['br', 'hr', 'tr']
|
||||
|
||||
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
||||
|
||||
# an extensive research shows that exceeding this limit
|
||||
# might lead to excessive processing time
|
||||
_MAX_TAGS_COUNT = 419
|
||||
|
||||
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
|
||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||
|
||||
|
||||
@patch.object(quotations, 'MAX_HTML_LEN', 1)
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 4)
|
||||
def test_too_large_html():
|
||||
msg_body = 'Reply' \
|
||||
'<div class="gmail_quote">' \
|
||||
|
||||
@@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
|
||||
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
|
||||
def test_pattern_on_date_polymail():
|
||||
msg_body = """Test reply
|
||||
|
||||
On Tue, Apr 11, 2017 at 10:07 PM John Smith
|
||||
|
||||
<
|
||||
mailto:John Smith <johnsmith@gmail.com>
|
||||
> wrote:
|
||||
Test quoted data
|
||||
"""
|
||||
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_pattern_sent_from_samsung_smb_wrote():
|
||||
msg_body = """Test reply
|
||||
@@ -54,7 +67,7 @@ def test_pattern_on_date_wrote_somebody():
|
||||
"""Lorem
|
||||
|
||||
Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
|
||||
|
||||
|
||||
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
||||
"""))
|
||||
|
||||
@@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator):
|
||||
-----{}-----
|
||||
|
||||
Test"""
|
||||
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
|
||||
eq_('Test reply', quotations.extract_from_plain(
|
||||
msg_body.format(six.text_type(original_message_indicator))))
|
||||
|
||||
def test_english_original_message():
|
||||
_check_pattern_original_message('Original Message')
|
||||
@@ -165,6 +179,17 @@ Test reply"""
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_android_wrote():
|
||||
msg_body = """Test reply
|
||||
|
||||
---- John Smith wrote ----
|
||||
|
||||
> quoted
|
||||
> text
|
||||
"""
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_reply_wraps_quotations():
|
||||
msg_body = """Test reply
|
||||
|
||||
@@ -244,7 +269,7 @@ def test_with_indent():
|
||||
|
||||
------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
|
||||
|
||||
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
|
||||
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
|
||||
"""
|
||||
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
|
||||
|
||||
@@ -369,11 +394,11 @@ Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny p
|
||||
|
||||
def test_dutch_from_block():
|
||||
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
|
||||
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
||||
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
||||
|
||||
Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
|
||||
|
||||
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
|
||||
|
||||
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
|
||||
"""))
|
||||
|
||||
|
||||
@@ -696,3 +721,52 @@ def test_standard_replies():
|
||||
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
||||
{'reply': reply_text, 'stripped': stripped_text,
|
||||
'fn': filename}
|
||||
|
||||
|
||||
def test_split_email():
|
||||
msg = """From: Mr. X
|
||||
Date: 24 February 2016
|
||||
To: Mr. Y
|
||||
Subject: Hi
|
||||
Attachments: none
|
||||
Goodbye.
|
||||
From: Mr. Y
|
||||
To: Mr. X
|
||||
Date: 24 February 2016
|
||||
Subject: Hi
|
||||
Attachments: none
|
||||
|
||||
Hello.
|
||||
|
||||
On 24th February 2016 at 09.32am, Conal wrote:
|
||||
|
||||
Hey!
|
||||
|
||||
On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
|
||||
> Mohan,
|
||||
>
|
||||
> We have not yet migrated the systems.
|
||||
>
|
||||
> Dan
|
||||
>
|
||||
> > -----Original Message-----
|
||||
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||
> > Subject: Test
|
||||
> > From: bob@xxx.mailgun.org
|
||||
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||
> >
|
||||
> > Hi
|
||||
> >
|
||||
> > > From: bob@xxx.mailgun.org
|
||||
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||
> > > Subject: Test
|
||||
> > > Hi
|
||||
> > >
|
||||
> >
|
||||
>
|
||||
>
|
||||
"""
|
||||
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
|
||||
markers = quotations.split_emails(msg)
|
||||
eq_(markers, expected_markers)
|
||||
|
||||
@@ -29,7 +29,9 @@ def test_unicode():
|
||||
|
||||
def test_detect_encoding():
|
||||
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
|
||||
eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
|
||||
ok_ (u.detect_encoding(
|
||||
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
|
||||
'iso-8859-1', 'iso-8859-2'])
|
||||
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
|
||||
# fallback to utf-8
|
||||
with patch.object(u.chardet, 'detect') as detect:
|
||||
@@ -39,7 +41,9 @@ def test_detect_encoding():
|
||||
|
||||
def test_quick_detect_encoding():
|
||||
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
|
||||
eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
|
||||
ok_ (u.quick_detect_encoding(
|
||||
u'Versi\xf3n'.encode('windows-1252')).lower() in [
|
||||
'windows-1252', 'windows-1250'])
|
||||
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
|
||||
|
||||
|
||||
@@ -120,6 +124,12 @@ def test_comment_no_parent():
|
||||
def test_html_fromstring_exception():
|
||||
eq_(None, u.html_fromstring("<html></html>"))
|
||||
|
||||
@patch.object(u, 'html_too_big', Mock())
|
||||
@patch.object(u.html5parser, 'fromstring')
|
||||
def test_html_fromstring_too_big(fromstring):
|
||||
eq_(None, u.html_fromstring("<html></html>"))
|
||||
assert_false(fromstring.called)
|
||||
|
||||
|
||||
@patch.object(u.html5parser, 'document_fromstring')
|
||||
def test_html_document_fromstring_exception(document_fromstring):
|
||||
@@ -127,7 +137,26 @@ def test_html_document_fromstring_exception(document_fromstring):
|
||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||
|
||||
|
||||
@patch.object(u, 'html_too_big', Mock())
|
||||
@patch.object(u.html5parser, 'document_fromstring')
|
||||
def test_html_document_fromstring_too_big(document_fromstring):
|
||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||
assert_false(document_fromstring.called)
|
||||
|
||||
|
||||
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||
def test_bad_html_to_text():
|
||||
bad_html = "one<br>two<br>three"
|
||||
eq_(None, u.html_to_text(bad_html))
|
||||
|
||||
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||
def test_html_too_big():
|
||||
eq_(False, u.html_too_big("<div></div>"))
|
||||
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
|
||||
|
||||
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||
def test_html_to_text():
|
||||
eq_("Hello", u.html_to_text("<div>Hello</div>"))
|
||||
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
|
||||
|
||||
Reference in New Issue
Block a user