From ea82a9730ee69cea82023b05a8e10e43fe1fd885 Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Wed, 14 Sep 2016 09:33:30 -0700
Subject: [PATCH 1/2] restrict html processing to a certain number of tags

---
 setup.py                      |  2 +-
 talon/quotations.py           | 11 ++++++++++-
 tests/html_quotations_test.py |  2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/setup.py b/setup.py
index 7bef4a0..27901de 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 
 
 setup(name='talon',
-      version='1.3.1',
+      version='1.3.2',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
diff --git a/talon/quotations.py b/talon/quotations.py
index 6f77124..7472b10 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -386,7 +386,7 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if len(msg_body) > MAX_HTML_LEN:
+    if _html_too_big(msg_body):
         return msg_body
 
     if msg_body.strip() == b'':
@@ -483,3 +483,12 @@ def register_xpath_extensions():
     ns.prefix = 'mg'
     ns['text_content'] = text_content
     ns['tail'] = tail
+
+
+def _html_too_big(msg_body):
+    return msg_body.count('<') > _MAX_TAGS_COUNT
+
+
+# an extensive research shows that exceeding this limit
+# might lead to excessive processing time
+_MAX_TAGS_COUNT = 419
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index c087eef..b216710 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
 
 
-@patch.object(quotations, 'MAX_HTML_LEN', 1)
+@patch.object(quotations, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
     msg_body = 'Reply' \
                '<div class="gmail_quote">' \

From 534457e713be9db6155c1188f04254785ccaae5e Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Wed, 14 Sep 2016 09:58:41 -0700
Subject: [PATCH 2/2] protect html_to_text as well

---
 talon/quotations.py           | 12 ------------
 talon/utils.py                | 14 ++++++++++++++
 tests/html_quotations_test.py |  2 +-
 tests/utils_test.py           | 25 +++++++++++++++++++++++++
 4 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 7472b10..8ed3a15 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -386,9 +386,6 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if _html_too_big(msg_body):
-        return msg_body
-
     if msg_body.strip() == b'':
         return msg_body
 
@@ -483,12 +480,3 @@ def register_xpath_extensions():
     ns.prefix = 'mg'
     ns['text_content'] = text_content
     ns['tail'] = tail
-
-
-def _html_too_big(msg_body):
-    return msg_body.count('<') > _MAX_TAGS_COUNT
-
-
-# an extensive research shows that exceeding this limit
-# might lead to excessive processing time
-_MAX_TAGS_COUNT = 419
diff --git a/talon/utils.py b/talon/utils.py
index 91386a3..e6c884b 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -178,6 +178,9 @@ def html_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
     try:
+        if html_too_big(s):
+            return None
+
         return html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
     try:
+        if html_too_big(s):
+            return None
+
         return html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
     return CSSSelector(expr)(tree)
 
 
+def html_too_big(s):  # cheap size guard: counts '<' in str or bytes input
+    return s.count(b'<' if isinstance(s, bytes) else '<') > _MAX_TAGS_COUNT
+
+
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
@@ -243,3 +253,7 @@ _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']
 
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
+
+# extensive research shows that exceeding this limit (counted as '<'
+# characters in the input) might lead to excessive processing time
+_MAX_TAGS_COUNT = 419
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index b216710..b78409b 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
 
 
-@patch.object(quotations, '_MAX_TAGS_COUNT', 4)
+@patch.object(u, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
     msg_body = 'Reply' \
                '<div class="gmail_quote">' \
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 2ff61bc..08d34bb 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -120,6 +120,12 @@ def test_comment_no_parent():
 def test_html_fromstring_exception():
     eq_(None, u.html_fromstring("<html></html>"))
 
+@patch.object(u, 'html_too_big', Mock(return_value=True))
+@patch.object(u.html5parser, 'fromstring')
+def test_html_fromstring_too_big(fromstring):
+    eq_(None, u.html_fromstring("<html></html>"))  # oversized input: no tree
+    assert_false(fromstring.called)  # the parser must be skipped entirely
+
 
 @patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
     eq_(None, u.html_document_fromstring("<html></html>"))
 
 
+@patch.object(u, 'html_too_big', Mock(return_value=True))
+@patch.object(u.html5parser, 'document_fromstring')
+def test_html_document_fromstring_too_big(document_fromstring):
+    eq_(None, u.html_document_fromstring("<html></html>"))  # oversized: None
+    assert_false(document_fromstring.called)  # the parser must be skipped
+
+
 @patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
     bad_html = "one<br>two<br>three"
     eq_(None, u.html_to_text(bad_html))
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_too_big():
+    eq_(False, u.html_too_big("<div></div>"))  # 2 '<' chars: within limit
+    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))  # 4: over limit
+
+
+@patch.object(u, '_MAX_TAGS_COUNT', 3)
+def test_html_to_text():
+    eq_("Hello", u.html_to_text("<div>Hello</div>"))  # 2 '<': converted
+    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))  # 4 '<': too big