From cec5acf58f48d17c1c746470ab470f8c3081e925 Mon Sep 17 00:00:00 2001 From: Maxim Vladimirskiy Date: Thu, 6 Jan 2022 14:18:11 +0300 Subject: [PATCH 1/2] Remove max tags limit --- setup.py | 2 +- talon/utils.py | 16 ---------------- tests/html_quotations_test.py | 12 ------------ tests/utils_test.py | 26 -------------------------- 4 files changed, 1 insertion(+), 55 deletions(-) diff --git a/setup.py b/setup.py index c458d98..088a41d 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.4.10', + version='1.5.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/utils.py b/talon/utils.py index d257c17..14f4509 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -180,9 +180,6 @@ def html_fromstring(s): if isinstance(s, six.text_type): s = s.encode('utf8') try: - if html_too_big(s): - return None - return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -194,9 +191,6 @@ def html_document_fromstring(s): if isinstance(s, six.text_type): s = s.encode('utf8') try: - if html_too_big(s): - return None - return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -206,12 +200,6 @@ def cssselect(expr, tree): return CSSSelector(expr)(tree) -def html_too_big(s): - if isinstance(s, six.text_type): - s = s.encode('utf8') - return s.count(b'<') > _MAX_TAGS_COUNT - - def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") - -# an extensive research shows that exceeding this limit -# might lead to excessive processing time -_MAX_TAGS_COUNT = 419 diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 541d358..2e5812a 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -391,18 +391,6 @@ def test_gmail_forwarded_msg(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) -@patch.object(u, '_MAX_TAGS_COUNT', 4) -def test_too_large_html(): - msg_body = 'Reply' \ - '
' \ - '
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ - '
Test
' \ - '
' \ - '
' - eq_(RE_WHITESPACE.sub('', msg_body), - RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) - - def test_readable_html_empty(): msg_body = """
diff --git a/tests/utils_test.py b/tests/utils_test.py index 7ba4b52..e7d529d 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -125,39 +125,13 @@ def test_html_fromstring_exception(): eq_(None, u.html_fromstring("")) -@patch.object(u, 'html_too_big', Mock()) -@patch.object(u.html5parser, 'fromstring') -def test_html_fromstring_too_big(fromstring): - eq_(None, u.html_fromstring("")) - assert_false(fromstring.called) - - @patch.object(u.html5parser, 'document_fromstring') def test_html_document_fromstring_exception(document_fromstring): document_fromstring.side_effect = Exception() eq_(None, u.html_document_fromstring("")) -@patch.object(u, 'html_too_big', Mock()) -@patch.object(u.html5parser, 'document_fromstring') -def test_html_document_fromstring_too_big(document_fromstring): - eq_(None, u.html_document_fromstring("")) - assert_false(document_fromstring.called) - - @patch.object(u, 'html_fromstring', Mock(return_value=None)) def test_bad_html_to_text(): bad_html = "one
two
three" eq_(None, u.html_to_text(bad_html)) - - -@patch.object(u, '_MAX_TAGS_COUNT', 3) -def test_html_too_big(): - eq_(False, u.html_too_big("
")) - eq_(True, u.html_too_big("
Hi
")) - - -@patch.object(u, '_MAX_TAGS_COUNT', 3) -def test_html_to_text(): - eq_(b"Hello", u.html_to_text("
Hello
")) - eq_(None, u.html_to_text("
Hi
")) From b30c375c5b8cf1c4432581d42f278ea438a40a23 Mon Sep 17 00:00:00 2001 From: Maxim Vladimirskiy Date: Thu, 6 Jan 2022 15:16:43 +0300 Subject: [PATCH 2/2] Expose extract_from_html_tree --- talon/quotations.py | 57 +++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index aa215fe..c86809d 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") SPLITTER_MAX_LINES = 6 MAX_LINES_COUNT = 1000 -# an extensive research shows that exceeding this limit -# leads to excessive processing time -MAX_HTML_LEN = 2794202 QUOT_PATTERN = re.compile('^>+ ?') NO_QUOT_LINE = re.compile('^[^>].*[\S].*') @@ -421,25 +418,31 @@ def extract_from_html(msg_body): Returns a unicode string. """ + msg_body_bytes = msg_body if isinstance(msg_body, six.text_type): - msg_body = msg_body.encode('utf8') - elif not isinstance(msg_body, bytes): - msg_body = msg_body.encode('ascii') + msg_body_bytes = msg_body.encode('utf8') - result = _extract_from_html(msg_body) - if isinstance(result, bytes): - result = result.decode('utf8') + if msg_body_bytes.strip() == b'': + return msg_body + + msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n') + # Cut out xml and doctype tags to avoid conflict with unicode decoding. + msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes) + html_tree = html_document_fromstring(msg_body_bytes) + if html_tree is None: + return msg_body + + result = extract_from_html_tree(html_tree) + if not result: + return msg_body return result -def _extract_from_html(msg_body): +def extract_from_html_tree(html_tree): """ - Extract not quoted message from provided html message body - using tags and plain text algorithm. - - Cut out first some encoding html tags such as xml and doctype - for avoiding conflict with unicode decoding + Extract not quoted message from provided parsed html tree using tags and + plain text algorithm. Cut out the 'blockquote', 'gmail_quote' tags. Cut Microsoft quotations. @@ -452,18 +455,6 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if msg_body.strip() == b'': - return msg_body - - msg_body = msg_body.replace(b'\r\n', b'\n') - - msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) - - html_tree = html_document_fromstring(msg_body) - - if html_tree is None: - return msg_body - cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or @@ -481,7 +472,7 @@ def _extract_from_html(msg_body): # Don't process too long messages if len(lines) > MAX_LINES_COUNT: - return msg_body + return None # Collect checkpoints on each line line_checkpoints = [ @@ -500,7 +491,7 @@ def _extract_from_html(msg_body): lines_were_deleted, first_deleted, last_deleted = return_flags if not lines_were_deleted and not cut_quotations: - return msg_body + return None if lines_were_deleted: #collect checkpoints from deleted lines @@ -514,7 +505,7 @@ def _extract_from_html(msg_body): ) if _readable_text_empty(html_tree_copy): - return msg_body + return None # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML # parsers do not recognize namespaces in HTML tags. As such the rendered @@ -540,7 +531,11 @@ def _extract_from_html(msg_body): # of replacing data outside the which might be essential to # the customer. remove_namespaces(html_tree_copy) - return html.tostring(html_tree_copy) + s = html.tostring(html_tree_copy) + if not s: + return None + + return s.decode('utf-8') def remove_namespaces(root):