Merge pull request #120 from mailgun/sergey/talon-1.3.3

bump talon version
2016-11-30 18:28:39 -08:00 · 2016-11-30 12:56:06 -08:00 · 2016-11-30 12:51:32 -08:00 · 2016-11-22 20:02:54 +00:00 · 2016-11-22 20:00:31 +00:00 · 2016-11-22 19:56:57 +00:00
6 changed files with 132 additions and 17 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):


 setup(name='talon',
-      version='1.3.0',
+      version='1.3.3',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')

+# Regular expression to identify if a line is a header.
+RE_HEADER = re.compile(": ")
+

 def extract_from(msg_body, content_type='text/plain'):
    try:
@@ -386,9 +389,6 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
-    if len(msg_body) > MAX_HTML_LEN:
-        return msg_body
-
    if msg_body.strip() == b'':
        return msg_body

@@ -453,6 +453,54 @@ def _extract_from_html(msg_body):
    return html.tostring(html_tree_copy)


+def split_emails(msg):
+    """
+    Given a message (which may consist of an email conversation thread with multiple emails), mark its lines to
+    identify split lines, content lines and empty lines.
+
+    Correct the split line markers inside header blocks. Header blocks are identified by the regular expression
+    RE_HEADER.
+
+    Return the corrected markers
+    """
+    delimiter = get_delimiter(msg)
+    msg_body = preprocess(msg, delimiter)
+    # don't process too long messages
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    markers = mark_message_lines(lines)
+
+    # we don't want splitlines in header blocks
+    markers = _correct_splitlines_in_headers(markers, lines)
+
+    return markers
+
+
+def _correct_splitlines_in_headers(markers, lines):
+    """Corrects markers by removing splitlines deemed to be inside header blocks"""
+    updated_markers = ""
+    i = 0
+    in_header_block = False
+
+    for m in markers:
+        # Only set in_header_block flag true when we hit an 's' and the line is a header.
+        if m == 's':
+            if not in_header_block:
+                if bool(re.search(RE_HEADER, lines[i])):
+                    in_header_block = True
+            else:
+                m = 't'
+
+        # If the line is not a header line, set in_header_block false.
+        if not bool(re.search(RE_HEADER, lines[i])):
+            in_header_block = False
+
+        # Add the marker to the new updated markers string.
+        updated_markers += m
+        i += 1
+
+    return updated_markers
+
+
 def _readable_text_empty(html_tree):
    return not bool(html_tree_to_text(html_tree).strip())

--- a/talon/utils.py
+++ b/talon/utils.py
@@ -178,7 +178,10 @@ def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
-        return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+        if html_too_big(s):
+            return None
+
+        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass

@@ -187,7 +190,10 @@ def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
-        return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+        if html_too_big(s):
+            return None
+
+        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass

@@ -196,6 +202,10 @@ def cssselect(expr, tree):
    return CSSSelector(expr)(tree)


+def html_too_big(s):
+    return s.count('<') > _MAX_TAGS_COUNT
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -220,6 +230,21 @@ def _encode_utf8(s):
    return s.encode('utf-8') if isinstance(s, six.text_type) else s


+def _html5lib_parser():
+    """
+    html5lib is a pure-python library that conforms to the WHATWG HTML spec
+    and is not vulnerable to certain attacks common for XML libraries
+    """
+    return html5lib.HTMLParser(
+        # build lxml tree
+        html5lib.treebuilders.getTreeBuilder("lxml"),
+        # remove namespace value from inside lxml.html.html5parser element tag
+        # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
+        # instead of "div", throwing the algo off
+        namespaceHTMLElements=False
+    )
+
+
 _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
                     b'charset=utf-8">')

@@ -229,13 +254,6 @@ _HARDBREAKS = ['br', 'hr', 'tr']

 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

-# html5lib is a pure-python library that conforms to the WHATWG HTML spec
-# and is not vulnarable to certain attacks common for XML libraries
-_HTML5LIB_PARSER = html5lib.HTMLParser(
-    # build lxml tree
-    html5lib.treebuilders.getTreeBuilder("lxml"),
-    # remove namespace value from inside lxml.html.html5paser element tag
-    # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
-    # instead of "div", throwing the algo off
-    namespaceHTMLElements=False
-)
+# extensive research shows that exceeding this limit
+# might lead to excessive processing time
+_MAX_TAGS_COUNT = 419
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))


-@patch.object(quotations, 'MAX_HTML_LEN', 1)
+@patch.object(u, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
    msg_body = 'Reply' \
               '<div class="gmail_quote">' \
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,27 @@ def test_standard_replies():
                "'%(reply)s' != %(stripped)s for %(fn)s" % \
                {'reply': reply_text, 'stripped': stripped_text,
                 'fn': filename}
+
+
+def test_split_email():
+    msg = """From: Mr. X
+Date: 24 February 2016
+To: Mr. Y
+Subject: Hi
+Attachments: none
+Goodbye.
+From: Mr. Y
+To: Mr. X
+Date: 24 February 2016
+Subject: Hi
+Attachments: none
+
+Hello.
+
+-- Original Message --
+On 24th February 2016 at 09.32am Conal Wrote:
+Hey!
+"""
+    expected_markers = "stttttsttttetestt"
+    markers = quotations.split_emails(msg)
+    eq_(markers, expected_markers)
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -120,6 +120,12 @@ def test_comment_no_parent():
 def test_html_fromstring_exception():
    eq_(None, u.html_fromstring("<html></html>"))

+@patch.object(u, 'html_too_big', Mock())
+@patch.object(u.html5parser, 'fromstring')
+def test_html_fromstring_too_big(fromstring):
+    eq_(None, u.html_fromstring("<html></html>"))
+    assert_false(fromstring.called)
+

@patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
    eq_(None, u.html_document_fromstring("<html></html>"))


+@patch.object(u, 'html_too_big', Mock())
+@patch.object(u.html5parser, 'document_fromstring')
+def test_html_document_fromstring_too_big(document_fromstring):
+    eq_(None, u.html_document_fromstring("<html></html>"))
+    assert_false(document_fromstring.called)
+
+
@patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
    bad_html = "one<br>two<br>three"
    eq_(None, u.html_to_text(bad_html))
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_too_big():
+    eq_(False, u.html_too_big("<div></div>"))
+    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_to_text():
+    eq_("Hello", u.html_to_text("<div>Hello</div>"))
+    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
Author	SHA1	Message	Date
Sergey Obukhov	015c8d2a78	Merge pull request #120 from mailgun/sergey/talon-1.3.3 bump talon version	2016-11-30 18:28:39 -08:00
Sergey Obukhov	5af846c13d	bump talon version	2016-11-30 12:56:06 -08:00
Sergey Obukhov	e69a9c7a54	Merge pull request #119 from conapart3/master Addition of new split_email method for issue:115	2016-11-30 12:51:32 -08:00
conapart3	23cb2a9a53	Merge pull request #1 from conapart3/issue-115-date-split-in-headers split_emails function added, test added	2016-11-22 20:02:54 +00:00
smitcona	b5e3397b88	Updating test to account for --original message-- case	2016-11-22 20:00:31 +00:00
smitcona	5685a4055a	Improved algorithm	2016-11-22 19:56:57 +00:00
smitcona	97b72ef767	Adding in_header_block variable for reliability	2016-11-22 19:06:34 +00:00
smitcona	31489848be	Remove print lines	2016-11-21 17:36:06 +00:00
smitcona	e5988d447b	Add space	2016-11-21 12:48:29 +00:00
smitcona	adfed748ce	split_emails function added, test added	2016-11-21 12:35:36 +00:00
Sergey Obukhov	2444ba87c0	Merge pull request #111 from mailgun/sergey/tagscount restrict html processing to a certain number of tags	2016-09-14 11:06:29 -07:00
Sergey Obukhov	534457e713	protect html_to_text as well	2016-09-14 09:58:41 -07:00
Sergey Obukhov	ea82a9730e	restrict html processing to a certain number of tags	2016-09-14 09:33:30 -07:00
Sergey Obukhov	f04b872e14	Merge pull request #108 from mailgun/sergey/html5lib-fix use new parser each time we parse a document	2016-08-22 18:10:35 -07:00
Sergey Obukhov	e61894e425	bump version	2016-08-22 17:34:18 -07:00
Sergey Obukhov	35fbdaadac	use new parser each time we parse a document	2016-08-22 16:25:04 -07:00