From cec5acf58f48d17c1c746470ab470f8c3081e925 Mon Sep 17 00:00:00 2001
From: Maxim Vladimirskiy <horkhe@gmail.com>
Date: Thu, 6 Jan 2022 14:18:11 +0300
Subject: [PATCH 1/2] Remove max tags limit

---
 setup.py                      |  2 +-
 talon/utils.py                | 16 ----------------
 tests/html_quotations_test.py | 12 ------------
 tests/utils_test.py           | 26 --------------------------
 4 files changed, 1 insertion(+), 55 deletions(-)
diff --git a/setup.py b/setup.py
index c458d98..088a41d 100755
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 
 
 setup(name='talon',
-      version='1.4.10',
+      version='1.5.0',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
diff --git a/talon/utils.py b/talon/utils.py
index d257c17..14f4509 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -180,9 +180,6 @@ def html_fromstring(s):
     if isinstance(s, six.text_type):
         s = s.encode('utf8')
     try:
-        if html_too_big(s):
-            return None
-
         return html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -194,9 +191,6 @@ def html_document_fromstring(s):
     if isinstance(s, six.text_type):
         s = s.encode('utf8')
     try:
-        if html_too_big(s):
-            return None
-
         return html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -206,12 +200,6 @@ def cssselect(expr, tree):
     return CSSSelector(expr)(tree)
 
 
-def html_too_big(s):
-    if isinstance(s, six.text_type):
-        s = s.encode('utf8')
-    return s.count(b'<') > _MAX_TAGS_COUNT
-
-
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
@@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']
 
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
-
-# an extensive research shows that exceeding this limit
-# might lead to excessive processing time
-_MAX_TAGS_COUNT = 419
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index 541d358..2e5812a 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -391,18 +391,6 @@ def test_gmail_forwarded_msg():
     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
 
 
-@patch.object(u, '_MAX_TAGS_COUNT', 4)
-def test_too_large_html():
-    msg_body = 'Reply' \
-               '<div class="gmail_quote">' \
-               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
-               '<div>Test</div>' \
-               '</div>' \
-               '</div>'
-    eq_(RE_WHITESPACE.sub('', msg_body),
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_readable_html_empty():
     msg_body = """
 <blockquote>
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 7ba4b52..e7d529d 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -125,39 +125,13 @@ def test_html_fromstring_exception():
     eq_(None, u.html_fromstring("<html></html>"))
 
 
-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'fromstring')
-def test_html_fromstring_too_big(fromstring):
-    eq_(None, u.html_fromstring("<html></html>"))
-    assert_false(fromstring.called)
-
-
 @patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
     document_fromstring.side_effect = Exception()
     eq_(None, u.html_document_fromstring("<html></html>"))
 
 
-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'document_fromstring')
-def test_html_document_fromstring_too_big(document_fromstring):
-    eq_(None, u.html_document_fromstring("<html></html>"))
-    assert_false(document_fromstring.called)
-
-
 @patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
     bad_html = "one<br>two<br>three"
     eq_(None, u.html_to_text(bad_html))
-
-
-@patch.object(u, '_MAX_TAGS_COUNT', 3)
-def test_html_too_big():
-    eq_(False, u.html_too_big("<div></div>"))
-    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
-
-
-@patch.object(u, '_MAX_TAGS_COUNT', 3)
-def test_html_to_text():
-    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
-    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))

From b30c375c5b8cf1c4432581d42f278ea438a40a23 Mon Sep 17 00:00:00 2001
From: Maxim Vladimirskiy <horkhe@gmail.com>
Date: Thu, 6 Jan 2022 15:16:43 +0300
Subject: [PATCH 2/2] Expose extract_from_html_tree

---
 talon/quotations.py | 57 +++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 31 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index aa215fe..c86809d 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
 
 SPLITTER_MAX_LINES = 6
 MAX_LINES_COUNT = 1000
-# an extensive research shows that exceeding this limit
-# leads to excessive processing time
-MAX_HTML_LEN = 2794202
 
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -421,25 +418,31 @@ def extract_from_html(msg_body):
 
     Returns a unicode string.
     """
+    msg_body_bytes = msg_body
     if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
-    elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
+        msg_body_bytes = msg_body.encode('utf8')
 
-    result = _extract_from_html(msg_body)
-    if isinstance(result, bytes):
-        result = result.decode('utf8')
+    if msg_body_bytes.strip() == b'':
+        return msg_body
+
+    msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
+    # Cut out xml and doctype tags to avoid conflict with unicode decoding.
+    msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
+    html_tree = html_document_fromstring(msg_body_bytes)
+    if html_tree is None:
+        return msg_body
+
+    result = extract_from_html_tree(html_tree)
+    if not result:
+        return msg_body
 
     return result
 
 
-def _extract_from_html(msg_body):
+def extract_from_html_tree(html_tree):
     """
-    Extract not quoted message from provided html message body
-    using tags and plain text algorithm.
-
-    Cut out first some encoding html tags such as xml and doctype
-    for avoiding conflict with unicode decoding
+    Extract the non-quoted part of the message from the provided parsed HTML
+    tree using tags and the plain text algorithm.
 
     Cut out the 'blockquote', 'gmail_quote' tags.
     Cut Microsoft quotations.
@@ -452,18 +455,6 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if msg_body.strip() == b'':
-        return msg_body
-
-    msg_body = msg_body.replace(b'\r\n', b'\n')
-
-    msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
-
-    html_tree = html_document_fromstring(msg_body)
-
-    if html_tree is None:
-        return msg_body
-
     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                       html_quotations.cut_zimbra_quote(html_tree) or
                       html_quotations.cut_blockquote(html_tree) or
@@ -481,7 +472,7 @@ def _extract_from_html(msg_body):
 
     # Don't process too long messages
     if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return None
 
     # Collect checkpoints on each line
     line_checkpoints = [
@@ -500,7 +491,7 @@ def _extract_from_html(msg_body):
     lines_were_deleted, first_deleted, last_deleted = return_flags
 
     if not lines_were_deleted and not cut_quotations:
-        return msg_body
+        return None
 
     if lines_were_deleted:
         #collect checkpoints from deleted lines
@@ -514,7 +505,7 @@ def _extract_from_html(msg_body):
         )
 
     if _readable_text_empty(html_tree_copy):
-        return msg_body
+        return None
 
     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
     # parsers do not recognize namespaces in HTML tags. As such the rendered
@@ -540,7 +531,11 @@ def _extract_from_html(msg_body):
     #    of replacing data outside the <tag> which might be essential to
     #    the customer.
     remove_namespaces(html_tree_copy)
-    return html.tostring(html_tree_copy)
+    s = html.tostring(html_tree_copy)
+    if not s:
+        return None
+
+    return s.decode('utf-8')
 
 
 def remove_namespaces(root):