31 Commits

Author SHA1 Message Date
Sergey Obukhov
0b55e8fa77 Merge pull request #137 from mailgun/sergey/chardet
loosen the encoding requirement for detect_encoding
2017-04-25 11:29:06 -07:00
Sergey Obukhov
6f159e8959 loosen the encoding requirement for detect_encoding 2017-04-25 11:19:01 -07:00
Sergey Obukhov
85a4c1d855 Merge pull request #133 from mailgun/sergey/android
add android quotation pattern
2017-04-10 16:37:17 -07:00
Sergey Obukhov
0f5e72623b add android quotation pattern 2017-04-10 16:33:21 -07:00
Sergey Obukhov
061e549ad7 Merge pull request #128 from mailgun/sergey/1.3.4
bump version
2017-02-14 11:17:35 -08:00
Sergey Obukhov
49d1a5d248 bump version 2017-02-14 11:05:50 -08:00
Sergey Obukhov
03d6b00db8 Merge pull request #127 from conalsmith49/mark-splitlines-in-email-quotation-indents
Split_Email(): Mark splitlines for headers indented with spaces or email quotation indents (">")
2017-02-14 11:03:51 -08:00
smitcona
a2eb0f7201 Creating new method which removes initial spaces and marks the message lines. Removing ambiguity introduced to mark_message_lines 2017-02-14 18:19:45 +00:00
smitcona
5c71a0ca07 Split the comment lines so that they are not over 80 characters 2017-02-13 16:45:26 +00:00
Sergey Obukhov
489d16fad9 Merge branch 'master' into mark-splitlines-in-email-quotation-indents 2017-02-09 21:10:16 -08:00
Sergey Obukhov
a458707777 Merge pull request #124 from phanindra-ramesh/issue_123
Fixes issue #123
2017-02-09 20:55:36 -08:00
smitcona
a1d0a86305 Pass ignore_initial_spaces=True as this has better clarity than separate boolean variable 2017-02-07 12:47:33 +00:00
smitcona
29f1d21be7 fixed expected markers and incorrect condensed header not matching regex 2017-02-06 15:03:22 +00:00
smitcona
34c5b526c3 Remove the whitespace before the line if the flag is set 2017-02-03 12:57:26 +00:00
smitcona
3edb6578ba Dividing preprocess method into two methods, split_emails() now calls one without email content being altered. 2017-02-03 11:49:23 +00:00
smitcona
984c036b6e Set the marker back to 'm' rather than 't' if it matches the QUOT_PATTERN. Updated test case. 2017-02-01 18:28:19 +00:00
smitcona
a403ecb5c9 Adding two level indentation test 2017-02-01 18:09:35 +00:00
smitcona
a44713409c Added additional case for testing new functionality of split_emails() 2017-02-01 17:40:59 +00:00
smitcona
567467b8ed Update comment 2017-02-01 17:29:05 +00:00
smitcona
139edd6104 Add new method which marks as splitlines, lines which are splitlines but start with email quotation indents ("> ") 2017-02-01 17:16:30 +00:00
Phanindra Ramesh Challa
e756d55abf Fixes issue #123 2016-12-27 13:53:40 +05:30
Sergey Obukhov
015c8d2a78 Merge pull request #120 from mailgun/sergey/talon-1.3.3
bump talon version
2016-11-30 18:28:39 -08:00
Sergey Obukhov
5af846c13d bump talon version 2016-11-30 12:56:06 -08:00
Sergey Obukhov
e69a9c7a54 Merge pull request #119 from conapart3/master
Addition of new split_email method for issue:115
2016-11-30 12:51:32 -08:00
conapart3
23cb2a9a53 Merge pull request #1 from conapart3/issue-115-date-split-in-headers
split_emails function added, test added
2016-11-22 20:02:54 +00:00
smitcona
b5e3397b88 Updating test to account for --original message-- case 2016-11-22 20:00:31 +00:00
smitcona
5685a4055a Improved algorithm 2016-11-22 19:56:57 +00:00
smitcona
97b72ef767 Adding in_header_block variable for reliability 2016-11-22 19:06:34 +00:00
smitcona
31489848be Remove print lines 2016-11-21 17:36:06 +00:00
smitcona
e5988d447b Add space 2016-11-21 12:48:29 +00:00
smitcona
adfed748ce split_emails function added, test added 2016-11-21 12:35:36 +00:00
4 changed files with 194 additions and 11 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.3.2', version='1.3.6',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
"regex>=1", "regex>=1",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1', 'chardet>=1.0.1',
'cchardet>=0.3.5', 'cchardet>=0.3.5',
'cssselect', 'cssselect',

View File

@@ -131,7 +131,7 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
'Oprindelig meddelelse', 'Oprindelig meddelelse',
))), re.I) ))), re.I)
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
u'|'.join(( u'|'.join((
# "From" in different languages. # "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', u'Från', 'From', 'Van', 'De', 'Von', 'Fra', u'Från',
@@ -139,6 +139,13 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I) ))), re.I)
# ---- John Smith wrote ----
# Splitter line made of leading dashes, arbitrary text (the sender), the word
# "wrote" and trailing dashes; matching is case-insensitive.
RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        # The trailing comma is required: without it ('wrote') is a plain
        # string and join() would build the alternation 'w|r|o|t|e'.
        'wrote',
    ))), re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
@@ -154,10 +161,10 @@ SPLITTER_PATTERNS = [
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'), '( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote: # Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote') re.compile('Sent from Samsung .*@.*> wrote'),
RE_ANDROID_WROTE
] ]
RE_LINK = re.compile('<(http://[^>]*)>') RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
@@ -172,6 +179,9 @@ MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?') QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*') NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
# Regular expression to identify if a line is a header: it matches a colon
# immediately followed by a space. _correct_splitlines_in_headers() applies it
# with re.search, so the pair may occur anywhere in the line.
RE_HEADER = re.compile(": ")
def extract_from(msg_body, content_type='text/plain'): def extract_from(msg_body, content_type='text/plain'):
try: try:
@@ -185,6 +195,19 @@ def extract_from(msg_body, content_type='text/plain'):
return msg_body return msg_body
def remove_initial_spaces_and_mark_message_lines(lines):
    """Strip leading spaces from every line, then mark the message lines.

    The stripping happens in place, so the caller's list is modified. Once
    the indentation is gone, headers that were indented with spaces can be
    recognised by mark_message_lines(), whose result is returned as is.
    """
    for index, line in enumerate(lines):
        # Only the space character is removed; other whitespace is kept.
        lines[index] = line.lstrip(' ')
    return mark_message_lines(lines)
def mark_message_lines(lines): def mark_message_lines(lines):
"""Mark message lines with markers to distinguish quotation lines. """Mark message lines with markers to distinguish quotation lines.
@@ -287,9 +310,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
Converts msg_body into a unicode. Converts msg_body into a unicode.
""" """
# normalize links i.e. replace '<', '>' wrapping the link with some symbols msg_body = _replace_link_brackets(msg_body)
# so that '>' closing the link couldn't be mistakenly taken for quotation
# marker. msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
return msg_body
def _replace_link_brackets(msg_body):
"""
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
so that '>' closing the link couldn't be mistakenly taken for quotation
marker.
Converts msg_body into a unicode
"""
if isinstance(msg_body, bytes): if isinstance(msg_body, bytes):
msg_body = msg_body.decode('utf8') msg_body = msg_body.decode('utf8')
@@ -301,7 +336,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
return "@@%s@@" % link.group(1) return "@@%s@@" % link.group(1)
msg_body = re.sub(RE_LINK, link_wrapper, msg_body) msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
return msg_body
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
"""
Splits line in two if splitter pattern preceded by some text on the same
line (done only for 'On <date> <person> wrote:' pattern.
"""
def splitter_wrapper(splitter): def splitter_wrapper(splitter):
"""Wraps splitter with new line""" """Wraps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n': if splitter.start() and msg_body[splitter.start() - 1] != '\n':
@@ -450,6 +492,82 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy) return html.tostring(html_tree_copy)
def split_emails(msg):
    """Return the line markers for a message that may contain a whole thread.

    The message (possibly an email conversation made of several emails) is
    split into lines and every line gets a marker identifying split lines,
    content lines and empty lines. Split line markers that fall inside a
    header block (header lines are recognised with RE_HEADER) are then
    corrected, and the corrected marker string is returned.
    """
    normalized = _replace_link_brackets(msg)

    # don't process too long messages
    lines = normalized.splitlines()[:MAX_LINES_COUNT]

    # The first call strips leading spaces from `lines` in place; the later
    # steps work on those stripped lines.
    markers = _mark_quoted_email_splitlines(
        remove_initial_spaces_and_mark_message_lines(lines), lines)

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(markers, lines)
def _mark_quoted_email_splitlines(markers, lines):
    """Promote 'm' lines that look like splitters to 's'.

    A line currently marked 'm' (e.g. a header indented with '>' characters)
    is re-marked as 's' when any pattern from SPLITTER_PATTERNS is found in
    it. All other markers are left untouched. Returns the new marker string.
    """
    # Strings are immutable, so work on a list of single-character markers.
    updated = list(markers)

    for index, line in enumerate(lines):
        if updated[index] == 'm' and any(
                re.search(pattern, line) for pattern in SPLITTER_PATTERNS):
            updated[index] = 's'

    return "".join(updated)
def _correct_splitlines_in_headers(markers, lines):
    """Demote splitline markers that sit inside a header block.

    A header block opens at an 's' line that matches RE_HEADER and stays open
    until a line that does not match RE_HEADER is reached. Any further 's'
    inside an open block becomes 'm' when the line matches QUOT_PATTERN and
    't' otherwise. Returns the corrected marker string.
    """
    corrected = []
    inside_headers = False

    for index, marker in enumerate(markers):
        line = lines[index]
        looks_like_header = re.search(RE_HEADER, line) is not None

        if marker == 's':
            if inside_headers:
                # A splitter inside an already open header block is just
                # another header line, not the start of a new email.
                marker = 'm' if QUOT_PATTERN.match(line) else 't'
            elif looks_like_header:
                # Only an 's' line that is itself a header opens a block.
                inside_headers = True

        # Any non-header line closes the block.
        if not looks_like_header:
            inside_headers = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree): def _readable_text_empty(html_tree):
return not bool(html_tree_to_text(html_tree).strip()) return not bool(html_tree_to_text(html_tree).strip())

View File

@@ -142,7 +142,8 @@ def _check_pattern_original_message(original_message_indicator):
-----{}----- -----{}-----
Test""" Test"""
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message(): def test_english_original_message():
_check_pattern_original_message('Original Message') _check_pattern_original_message('Original Message')
@@ -165,6 +166,17 @@ Test reply"""
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_android_wrote():
    """A '---- <name> wrote ----' line starts the quotation to be removed."""
    body = """Test reply
---- John Smith wrote ----
> quoted
> text
"""
    reply = quotations.extract_from_plain(body)
    eq_("Test reply", reply)
def test_reply_wraps_quotations(): def test_reply_wraps_quotations():
msg_body = """Test reply msg_body = """Test reply
@@ -696,3 +708,52 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \ "'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text, {'reply': reply_text, 'stripped': stripped_text,
'fn': filename} 'fn': filename}
def test_split_email():
    """split_emails() marks a multi-message thread line by line.

    The fixture combines two plain header blocks, two 'On ... wrote:' lines
    and header blocks quoted with one, two and three levels of '>'.
    """
    # The four blank lines below are significant: expected_markers holds one
    # character per line of msg (42 in total) and four of them are 'e'.
    msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none

Hello.

On 24th February 2016 at 09.32am, Conal wrote:

Hey!

On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
> Mohan,
>
> We have not yet migrated the systems.
>
> Dan
>
> > -----Original Message-----
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > Subject: Test
> > From: bob@xxx.mailgun.org
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> >
> > Hi
> >
> > > From: bob@xxx.mailgun.org
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > > Subject: Test
> > > Hi
> > >
> >
>
>
"""
    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)

View File

@@ -29,7 +29,9 @@ def test_unicode():
def test_detect_encoding(): def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower()) eq_ ('ascii', u.detect_encoding(b'qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) ok_ (u.detect_encoding(
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8 # fallback to utf-8
with patch.object(u.chardet, 'detect') as detect: with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():
def test_quick_detect_encoding(): def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) ok_ (u.quick_detect_encoding(
u'Versi\xf3n'.encode('windows-1252')).lower() in [
'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())