13 Commits

Author SHA1 Message Date
Sergey Obukhov
015c8d2a78 Merge pull request #120 from mailgun/sergey/talon-1.3.3
bump talon version
2016-11-30 18:28:39 -08:00
Sergey Obukhov
5af846c13d bump talon version 2016-11-30 12:56:06 -08:00
Sergey Obukhov
e69a9c7a54 Merge pull request #119 from conapart3/master
Addition of new split_email method for issue:115
2016-11-30 12:51:32 -08:00
conapart3
23cb2a9a53 Merge pull request #1 from conapart3/issue-115-date-split-in-headers
split_emails function added, test added
2016-11-22 20:02:54 +00:00
smitcona
b5e3397b88 Updating test to account for --original message-- case 2016-11-22 20:00:31 +00:00
smitcona
5685a4055a Improved algorithm 2016-11-22 19:56:57 +00:00
smitcona
97b72ef767 Adding in_header_block variable for reliability 2016-11-22 19:06:34 +00:00
smitcona
31489848be Remove print lines 2016-11-21 17:36:06 +00:00
smitcona
e5988d447b Add space 2016-11-21 12:48:29 +00:00
smitcona
adfed748ce split_emails function added, test added 2016-11-21 12:35:36 +00:00
Sergey Obukhov
2444ba87c0 Merge pull request #111 from mailgun/sergey/tagscount
restrict html processing to a certain number of tags
2016-09-14 11:06:29 -07:00
Sergey Obukhov
534457e713 protect html_to_text as well 2016-09-14 09:58:41 -07:00
Sergey Obukhov
ea82a9730e restrict html processing to a certain number of tags 2016-09-14 09:33:30 -07:00
6 changed files with 116 additions and 5 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.3.1', version='1.3.3',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?') QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*') NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
# Regular expression to identify if a line is a header.
RE_HEADER = re.compile(": ")
def extract_from(msg_body, content_type='text/plain'): def extract_from(msg_body, content_type='text/plain'):
try: try:
@@ -386,9 +389,6 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if len(msg_body) > MAX_HTML_LEN:
return msg_body
if msg_body.strip() == b'': if msg_body.strip() == b'':
return msg_body return msg_body
@@ -453,6 +453,54 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy) return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Mark every line of a message that may hold a whole conversation thread.

    The message is preprocessed with its detected delimiter, cut down to at
    most MAX_LINES_COUNT lines, and each line is tagged as a split line, a
    content line or an empty line. Split-line markers that land inside a
    header block (lines matching RE_HEADER) are then corrected.

    Return the corrected markers.
    """
    msg_body = preprocess(msg, get_delimiter(msg))

    # don't process too long messages
    lines = msg_body.splitlines()[:MAX_LINES_COUNT]

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(mark_message_lines(lines), lines)
def _correct_splitlines_in_headers(markers, lines):
    """Corrects markers by removing splitlines deemed to be inside header blocks.

    A header block starts at a line that is marked 's' and matches RE_HEADER,
    and lasts until the first line that does not match RE_HEADER. Any further
    's' marker met while the block is open is rewritten to 't'.

    :param markers: one marker character per line (iterated in order).
    :param lines: the message lines the markers refer to; ``lines[i]`` is
        read for every ``markers[i]``.
    :return: the corrected markers joined into a single string.
    """
    corrected = []
    in_header_block = False

    for index, marker in enumerate(markers):
        # One regex search per line is enough: both decisions below look at
        # the same line.
        is_header = RE_HEADER.search(lines[index]) is not None

        if marker == 's':
            if in_header_block:
                # A split line inside an open header block is just another
                # header line, not the start of a new email.
                marker = 't'
            elif is_header:
                # Only an 's' line that is itself a header opens a block.
                in_header_block = True

        # Any non-header line closes the current header block.
        if not is_header:
            in_header_block = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree): def _readable_text_empty(html_tree):
return not bool(html_tree_to_text(html_tree).strip()) return not bool(html_tree_to_text(html_tree).strip())

View File

@@ -178,6 +178,9 @@ def html_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed. """Parse html tree from string. Return None if the string can't be parsed.
""" """
try: try:
if html_too_big(s):
return None
return html5parser.fromstring(s, parser=_html5lib_parser()) return html5parser.fromstring(s, parser=_html5lib_parser())
except Exception: except Exception:
pass pass
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed. """Parse html tree from string. Return None if the string can't be parsed.
""" """
try: try:
if html_too_big(s):
return None
return html5parser.document_fromstring(s, parser=_html5lib_parser()) return html5parser.document_fromstring(s, parser=_html5lib_parser())
except Exception: except Exception:
pass pass
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
return CSSSelector(expr)(tree) return CSSSelector(expr)(tree)
def html_too_big(s):
    """Return True when ``s`` has more '<' characters than _MAX_TAGS_COUNT.

    The count of '<' characters is compared against the limit directly, so
    the check needs no HTML parsing.

    :param s: HTML source, either text or bytes.
    :return: bool
    """
    # On Python 3 bytes.count() rejects a str argument with TypeError, so the
    # needle has to be of the same type as the haystack.
    needle = b'<' if isinstance(s, (bytes, bytearray)) else '<'
    return s.count(needle) > _MAX_TAGS_COUNT
def _contains_charset_spec(s): def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec """Return True if the first 4KB contain charset spec
""" """
@@ -243,3 +253,7 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr'] _HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
# extensive research shows that exceeding this limit
# might lead to excessive processing time
_MAX_TAGS_COUNT = 419

View File

@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1) @patch.object(u, '_MAX_TAGS_COUNT', 4)
def test_too_large_html(): def test_too_large_html():
msg_body = 'Reply' \ msg_body = 'Reply' \
'<div class="gmail_quote">' \ '<div class="gmail_quote">' \

View File

@@ -696,3 +696,27 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \ "'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text, {'reply': reply_text, 'stripped': stripped_text,
'fn': filename} 'fn': filename}
# Checks that quotations.split_emails returns the expected marker string for
# a thread made of two header blocks (From/Date/To/Subject/Attachments)
# followed by an "-- Original Message --" separator.
# NOTE(review): expected_markers has 17 characters, two of them 'e', while
# only 15 message lines are visible here - presumably two blank lines of the
# message literal are not shown in this view; confirm against the real file.
def test_split_email():
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
-- Original Message --
On 24th February 2016 at 09.32am Conal Wrote:
Hey!
"""
expected_markers = "stttttsttttetestt"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)

View File

@@ -120,6 +120,12 @@ def test_comment_no_parent():
def test_html_fromstring_exception(): def test_html_fromstring_exception():
eq_(None, u.html_fromstring("<html></html>")) eq_(None, u.html_fromstring("<html></html>"))
# html_too_big is replaced by a Mock, so calling it yields a truthy value;
# html_fromstring must then return None without ever calling
# html5parser.fromstring.
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
eq_(None, u.html_fromstring("<html></html>"))
assert_false(fromstring.called)
@patch.object(u.html5parser, 'document_fromstring') @patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_exception(document_fromstring): def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
eq_(None, u.html_document_fromstring("<html></html>")) eq_(None, u.html_document_fromstring("<html></html>"))
# html_too_big is replaced by a Mock, so calling it yields a truthy value;
# html_document_fromstring must then return None without ever calling
# html5parser.document_fromstring.
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_too_big(document_fromstring):
eq_(None, u.html_document_fromstring("<html></html>"))
assert_false(document_fromstring.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None)) @patch.object(u, 'html_fromstring', Mock(return_value=None))
def test_bad_html_to_text(): def test_bad_html_to_text():
bad_html = "one<br>two<br>three" bad_html = "one<br>two<br>three"
eq_(None, u.html_to_text(bad_html)) eq_(None, u.html_to_text(bad_html))
# With the limit patched to 3, a document with two '<' characters is not too
# big, while one with four '<' characters is.
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_too_big():
eq_(False, u.html_too_big("<div></div>"))
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
# With the limit patched to 3, html_to_text returns the text of a two-'<'
# document and None for a four-'<' document.
# NOTE(review): the None is presumably the html_too_big guard making the
# parse step return None - confirm against html_to_text itself.
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
eq_("Hello", u.html_to_text("<div>Hello</div>"))
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))