Compare commits
38 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f16ae5110b | ||
|
|
ab5cbe5ec3 | ||
|
|
be5da92f16 | ||
|
|
95954a65a0 | ||
|
|
0b55e8fa77 | ||
|
|
6f159e8959 | ||
|
|
5c413b4b00 | ||
|
|
cca64d3ed1 | ||
|
|
e11eaf6ff8 | ||
|
|
85a4c1d855 | ||
|
|
0f5e72623b | ||
|
|
061e549ad7 | ||
|
|
49d1a5d248 | ||
|
|
03d6b00db8 | ||
|
|
a2eb0f7201 | ||
|
|
5c71a0ca07 | ||
|
|
489d16fad9 | ||
|
|
a458707777 | ||
|
|
a1d0a86305 | ||
|
|
29f1d21be7 | ||
|
|
34c5b526c3 | ||
|
|
3edb6578ba | ||
|
|
984c036b6e | ||
|
|
a403ecb5c9 | ||
|
|
a44713409c | ||
|
|
567467b8ed | ||
|
|
139edd6104 | ||
|
|
e756d55abf | ||
|
|
015c8d2a78 | ||
|
|
5af846c13d | ||
|
|
e69a9c7a54 | ||
|
|
23cb2a9a53 | ||
|
|
b5e3397b88 | ||
|
|
5685a4055a | ||
|
|
97b72ef767 | ||
|
|
31489848be | ||
|
|
e5988d447b | ||
|
|
adfed748ce |
4
setup.py
4
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.3.2',
|
version='1.3.7',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
@@ -48,7 +48,7 @@ setup(name='talon',
|
|||||||
"regex>=1",
|
"regex>=1",
|
||||||
"numpy",
|
"numpy",
|
||||||
"scipy",
|
"scipy",
|
||||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
|
||||||
'chardet>=1.0.1',
|
'chardet>=1.0.1',
|
||||||
'cchardet>=0.3.5',
|
'cchardet>=0.3.5',
|
||||||
'cssselect',
|
'cssselect',
|
||||||
|
|||||||
@@ -131,7 +131,7 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
|
|||||||
'Oprindelig meddelelse',
|
'Oprindelig meddelelse',
|
||||||
))), re.I)
|
))), re.I)
|
||||||
|
|
||||||
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# "From" in different languages.
|
# "From" in different languages.
|
||||||
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
|
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
|
||||||
@@ -139,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
|
|||||||
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
|
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
|
||||||
))), re.I)
|
))), re.I)
|
||||||
|
|
||||||
|
# ---- John Smith wrote ----
# The alternation is built from a tuple of localized "wrote" words.
# NOTE: the trailing comma after 'wrote' is required. Without it the
# parentheses are plain grouping, the "tuple" is really the string 'wrote',
# and u'|'.join() would produce 'w|r|o|t|e', making the pattern match nearly
# any line wrapped in dashes.
RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        'wrote',
    ))), re.I)
|
||||||
|
|
||||||
|
# Support polymail.io reply format
# On Tue, Apr 11, 2017 at 10:07 PM John Smith
#
# <
# mailto:John Smith <johnsmith@gmail.com>
# > wrote:
#
# The attribution spans several lines: '.' does not cross newlines here
# (no DOTALL flag), the line breaks are consumed by the explicit '\s' parts.
# A raw string is used so that '\s' is not treated as an (invalid) string
# escape sequence; the compiled pattern is unchanged.
RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I)
|
||||||
|
|
||||||
SPLITTER_PATTERNS = [
|
SPLITTER_PATTERNS = [
|
||||||
RE_ORIGINAL_MESSAGE,
|
RE_ORIGINAL_MESSAGE,
|
||||||
RE_ON_DATE_SMB_WROTE,
|
RE_ON_DATE_SMB_WROTE,
|
||||||
@@ -154,16 +169,17 @@ SPLITTER_PATTERNS = [
|
|||||||
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
||||||
'( \S+){3,6}@\S+:'),
|
'( \S+){3,6}@\S+:'),
|
||||||
# Sent from Samsung MobileName <address@example.com> wrote:
|
# Sent from Samsung MobileName <address@example.com> wrote:
|
||||||
re.compile('Sent from Samsung .*@.*> wrote')
|
re.compile('Sent from Samsung .*@.*> wrote'),
|
||||||
|
RE_ANDROID_WROTE,
|
||||||
|
RE_POLYMAIL
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
RE_LINK = re.compile('<(http://[^>]*)>')
|
RE_LINK = re.compile('<(http://[^>]*)>')
|
||||||
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
|
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
|
||||||
|
|
||||||
RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
||||||
|
|
||||||
SPLITTER_MAX_LINES = 4
|
SPLITTER_MAX_LINES = 6
|
||||||
MAX_LINES_COUNT = 1000
|
MAX_LINES_COUNT = 1000
|
||||||
# an extensive research shows that exceeding this limit
|
# an extensive research shows that exceeding this limit
|
||||||
# leads to excessive processing time
|
# leads to excessive processing time
|
||||||
@@ -172,6 +188,9 @@ MAX_HTML_LEN = 2794202
|
|||||||
QUOT_PATTERN = re.compile('^>+ ?')
|
QUOT_PATTERN = re.compile('^>+ ?')
|
||||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||||
|
|
||||||
|
# Regular expression to identify if a line is a header.
|
||||||
|
RE_HEADER = re.compile(": ")
|
||||||
|
|
||||||
|
|
||||||
def extract_from(msg_body, content_type='text/plain'):
|
def extract_from(msg_body, content_type='text/plain'):
|
||||||
try:
|
try:
|
||||||
@@ -185,6 +204,19 @@ def extract_from(msg_body, content_type='text/plain'):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Removes the initial spaces in each line before marking message lines.

    This ensures headers can be identified if they are indented with spaces.

    Only the space character is stripped; tabs and other leading whitespace
    are left untouched.

    NOTE: `lines` is modified in place, so the caller's list holds the
    stripped lines after this call (split_emails relies on that when it
    passes the same list to the later marking steps).

    Returns the result of mark_message_lines() for the stripped lines.
    """
    # Slice assignment keeps the same list object while replacing its items.
    lines[:] = [line.lstrip(' ') for line in lines]
    return mark_message_lines(lines)
|
||||||
|
|
||||||
|
|
||||||
def mark_message_lines(lines):
|
def mark_message_lines(lines):
|
||||||
"""Mark message lines with markers to distinguish quotation lines.
|
"""Mark message lines with markers to distinguish quotation lines.
|
||||||
|
|
||||||
@@ -287,9 +319,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
|||||||
|
|
||||||
Converts msg_body into a unicode.
|
Converts msg_body into a unicode.
|
||||||
"""
|
"""
|
||||||
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
msg_body = _replace_link_brackets(msg_body)
|
||||||
# so that '>' closing the link couldn't be mistakenly taken for quotation
|
|
||||||
# marker.
|
msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
|
||||||
|
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_link_brackets(msg_body):
|
||||||
|
"""
|
||||||
|
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
||||||
|
so that '>' closing the link couldn't be mistakenly taken for quotation
|
||||||
|
marker.
|
||||||
|
|
||||||
|
Converts msg_body into a unicode
|
||||||
|
"""
|
||||||
if isinstance(msg_body, bytes):
|
if isinstance(msg_body, bytes):
|
||||||
msg_body = msg_body.decode('utf8')
|
msg_body = msg_body.decode('utf8')
|
||||||
|
|
||||||
@@ -301,7 +345,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
|||||||
return "@@%s@@" % link.group(1)
|
return "@@%s@@" % link.group(1)
|
||||||
|
|
||||||
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
|
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
|
||||||
|
"""
|
||||||
|
Splits line in two if splitter pattern preceded by some text on the same
|
||||||
|
line (done only for 'On <date> <person> wrote:' pattern.
|
||||||
|
"""
|
||||||
def splitter_wrapper(splitter):
|
def splitter_wrapper(splitter):
|
||||||
"""Wraps splitter with new line"""
|
"""Wraps splitter with new line"""
|
||||||
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
|
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
|
||||||
@@ -450,6 +501,82 @@ def _extract_from_html(msg_body):
|
|||||||
return html.tostring(html_tree_copy)
|
return html.tostring(html_tree_copy)
|
||||||
|
|
||||||
|
|
||||||
|
def split_emails(msg):
    """
    Compute line markers for a message that may hold a whole email thread.

    The message has its link brackets normalized, is cut into lines (at most
    MAX_LINES_COUNT of them) and every line is marked as a split line,
    content line or empty line.

    Split line markers that fall inside header blocks (as recognised by
    RE_HEADER) are then corrected.

    Return the corrected markers
    """
    normalized = _replace_link_brackets(msg)

    # don't process too long messages
    lines = normalized.splitlines()[:MAX_LINES_COUNT]

    # The marking step strips leading spaces from `lines` in place, so the
    # steps below see the stripped lines.
    initial_markers = remove_initial_spaces_and_mark_message_lines(lines)
    with_quoted_splits = _mark_quoted_email_splitlines(initial_markers, lines)

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(with_quoted_splits, lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _mark_quoted_email_splitlines(markers, lines):
    """
    Promote quoted lines that look like splitters from 'm' to 's'.

    Headers indented with '>' characters are initially marked 'm'. Every
    line carrying that marker is checked against SPLITTER_PATTERNS; if any
    pattern is found in the line its marker becomes 's'. All other markers
    are copied unchanged.

    Returns the updated markers as a new string.
    """
    # Work on a list so single markers can be replaced by position.
    updated = list(markers)
    for index, text in enumerate(lines):
        if updated[index] == 'm' and any(
                re.search(pattern, text) for pattern in SPLITTER_PATTERNS):
            updated[index] = 's'

    return "".join(updated)
|
||||||
|
|
||||||
|
|
||||||
|
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.

    A header block starts at a line marked 's' that also matches RE_HEADER
    and lasts until the first line that does not match RE_HEADER. The first
    's' of a block is kept; any further 's' inside the same block is
    replaced by 'm' when the line matches QUOT_PATTERN and by 't' otherwise.

    `markers` holds one marker character per entry of `lines`.

    Returns the corrected markers as a string.
    """
    updated_markers = []
    in_header_block = False

    for i, marker in enumerate(markers):
        line = lines[i]
        # Evaluated once per line; used both to open and to close a block.
        is_header = bool(re.search(RE_HEADER, line))

        if marker == 's':
            if not in_header_block:
                # Only open a header block when we hit an 's' on a header
                # line; that first splitline keeps its marker.
                if is_header:
                    in_header_block = True
            elif QUOT_PATTERN.match(line):
                # Splitline inside a header block, on a quoted line.
                marker = 'm'
            else:
                # Splitline inside a header block, on an unquoted line.
                marker = 't'

        # Any non-header line terminates the current header block.
        if not is_header:
            in_header_block = False

        updated_markers.append(marker)

    # Join once instead of growing a string marker by marker.
    return "".join(updated_markers)
|
||||||
|
|
||||||
|
|
||||||
def _readable_text_empty(html_tree):
|
def _readable_text_empty(html_tree):
|
||||||
return not bool(html_tree_to_text(html_tree).strip())
|
return not bool(html_tree_to_text(html_tree).strip())
|
||||||
|
|
||||||
|
|||||||
@@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
|
|||||||
|
|
||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
def test_pattern_on_date_polymail():
|
||||||
|
msg_body = """Test reply
|
||||||
|
|
||||||
|
On Tue, Apr 11, 2017 at 10:07 PM John Smith
|
||||||
|
|
||||||
|
<
|
||||||
|
mailto:John Smith <johnsmith@gmail.com>
|
||||||
|
> wrote:
|
||||||
|
Test quoted data
|
||||||
|
"""
|
||||||
|
|
||||||
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_sent_from_samsung_smb_wrote():
|
def test_pattern_sent_from_samsung_smb_wrote():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
@@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator):
|
|||||||
-----{}-----
|
-----{}-----
|
||||||
|
|
||||||
Test"""
|
Test"""
|
||||||
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
|
eq_('Test reply', quotations.extract_from_plain(
|
||||||
|
msg_body.format(six.text_type(original_message_indicator))))
|
||||||
|
|
||||||
def test_english_original_message():
|
def test_english_original_message():
|
||||||
_check_pattern_original_message('Original Message')
|
_check_pattern_original_message('Original Message')
|
||||||
@@ -165,6 +179,17 @@ Test reply"""
|
|||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
|
def test_android_wrote():
|
||||||
|
msg_body = """Test reply
|
||||||
|
|
||||||
|
---- John Smith wrote ----
|
||||||
|
|
||||||
|
> quoted
|
||||||
|
> text
|
||||||
|
"""
|
||||||
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_reply_wraps_quotations():
|
def test_reply_wraps_quotations():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
|
|
||||||
@@ -696,3 +721,52 @@ def test_standard_replies():
|
|||||||
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
||||||
{'reply': reply_text, 'stripped': stripped_text,
|
{'reply': reply_text, 'stripped': stripped_text,
|
||||||
'fn': filename}
|
'fn': filename}
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_email():
|
||||||
|
msg = """From: Mr. X
|
||||||
|
Date: 24 February 2016
|
||||||
|
To: Mr. Y
|
||||||
|
Subject: Hi
|
||||||
|
Attachments: none
|
||||||
|
Goodbye.
|
||||||
|
From: Mr. Y
|
||||||
|
To: Mr. X
|
||||||
|
Date: 24 February 2016
|
||||||
|
Subject: Hi
|
||||||
|
Attachments: none
|
||||||
|
|
||||||
|
Hello.
|
||||||
|
|
||||||
|
On 24th February 2016 at 09.32am, Conal wrote:
|
||||||
|
|
||||||
|
Hey!
|
||||||
|
|
||||||
|
On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
|
||||||
|
> Mohan,
|
||||||
|
>
|
||||||
|
> We have not yet migrated the systems.
|
||||||
|
>
|
||||||
|
> Dan
|
||||||
|
>
|
||||||
|
> > -----Original Message-----
|
||||||
|
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||||
|
> > Subject: Test
|
||||||
|
> > From: bob@xxx.mailgun.org
|
||||||
|
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||||
|
> >
|
||||||
|
> > Hi
|
||||||
|
> >
|
||||||
|
> > > From: bob@xxx.mailgun.org
|
||||||
|
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||||
|
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||||
|
> > > Subject: Test
|
||||||
|
> > > Hi
|
||||||
|
> > >
|
||||||
|
> >
|
||||||
|
>
|
||||||
|
>
|
||||||
|
"""
|
||||||
|
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
|
||||||
|
markers = quotations.split_emails(msg)
|
||||||
|
eq_(markers, expected_markers)
|
||||||
|
|||||||
@@ -29,7 +29,9 @@ def test_unicode():
|
|||||||
|
|
||||||
def test_detect_encoding():
|
def test_detect_encoding():
|
||||||
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
|
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
|
||||||
eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
|
ok_ (u.detect_encoding(
|
||||||
|
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
|
||||||
|
'iso-8859-1', 'iso-8859-2'])
|
||||||
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
|
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
|
||||||
# fallback to utf-8
|
# fallback to utf-8
|
||||||
with patch.object(u.chardet, 'detect') as detect:
|
with patch.object(u.chardet, 'detect') as detect:
|
||||||
@@ -39,7 +41,9 @@ def test_detect_encoding():
|
|||||||
|
|
||||||
def test_quick_detect_encoding():
|
def test_quick_detect_encoding():
|
||||||
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
|
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
|
||||||
eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
|
ok_ (u.quick_detect_encoding(
|
||||||
|
u'Versi\xf3n'.encode('windows-1252')).lower() in [
|
||||||
|
'windows-1252', 'windows-1250'])
|
||||||
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
|
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user