From ea82a9730ee69cea82023b05a8e10e43fe1fd885 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Wed, 14 Sep 2016 09:33:30 -0700 Subject: [PATCH 1/2] restrict html processing to a certain number of tags --- setup.py | 2 +- talon/quotations.py | 11 ++++++++++- tests/html_quotations_test.py | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 7bef4a0..27901de 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.3.1', + version='1.3.2', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index 6f77124..7472b10 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -386,7 +386,7 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if len(msg_body) > MAX_HTML_LEN: + if _html_too_big(msg_body): return msg_body if msg_body.strip() == b'': @@ -483,3 +483,12 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail + + +def _html_too_big(msg_body): + return msg_body.count('<') > _MAX_TAGS_COUNT + + +# an extensive research shows that exceeding this limit +# might lead to excessive processing time +_MAX_TAGS_COUNT = 419 diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index c087eef..b216710 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -385,7 +385,7 @@ def test_gmail_forwarded_msg(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) -@patch.object(quotations, 'MAX_HTML_LEN', 1) +@patch.object(quotations, '_MAX_TAGS_COUNT', 4) def test_too_large_html(): msg_body = 'Reply' \ '
' \ From 534457e713be9db6155c1188f04254785ccaae5e Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Wed, 14 Sep 2016 09:58:41 -0700 Subject: [PATCH 2/2] protect html_to_text as well --- talon/quotations.py | 12 ------------ talon/utils.py | 14 ++++++++++++++ tests/html_quotations_test.py | 2 +- tests/utils_test.py | 25 +++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 7472b10..8ed3a15 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -386,9 +386,6 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if _html_too_big(msg_body): - return msg_body - if msg_body.strip() == b'': return msg_body @@ -483,12 +480,3 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail - - -def _html_too_big(msg_body): - return msg_body.count('<') > _MAX_TAGS_COUNT - - -# an extensive research shows that exceeding this limit -# might lead to excessive processing time -_MAX_TAGS_COUNT = 419 diff --git a/talon/utils.py b/talon/utils.py index 91386a3..e6c884b 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -178,6 +178,9 @@ def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: + if html_too_big(s): + return None + return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -187,6 +190,9 @@ def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: + if html_too_big(s): + return None + return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -196,6 +202,10 @@ def cssselect(expr, tree): return CSSSelector(expr)(tree) +def html_too_big(s): + return s.count('<') > _MAX_TAGS_COUNT + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -243,3 +253,7 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") + +# an extensive research shows that exceeding this limit +# might lead to excessive processing time +_MAX_TAGS_COUNT = 419 diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index b216710..b78409b 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -385,7 +385,7 @@ def test_gmail_forwarded_msg(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) -@patch.object(quotations, '_MAX_TAGS_COUNT', 4) +@patch.object(u, '_MAX_TAGS_COUNT', 4) def test_too_large_html(): msg_body = 'Reply' \ '
' \ diff --git a/tests/utils_test.py b/tests/utils_test.py index 2ff61bc..08d34bb 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -120,6 +120,12 @@ def test_comment_no_parent(): def test_html_fromstring_exception(): eq_(None, u.html_fromstring("")) +@patch.object(u, 'html_too_big', Mock()) +@patch.object(u.html5parser, 'fromstring') +def test_html_fromstring_too_big(fromstring): + eq_(None, u.html_fromstring("")) + assert_false(fromstring.called) + @patch.object(u.html5parser, 'document_fromstring') def test_html_document_fromstring_exception(document_fromstring): @@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring): eq_(None, u.html_document_fromstring("")) +@patch.object(u, 'html_too_big', Mock()) +@patch.object(u.html5parser, 'document_fromstring') +def test_html_document_fromstring_too_big(document_fromstring): + eq_(None, u.html_document_fromstring("")) + assert_false(document_fromstring.called) + + @patch.object(u, 'html_fromstring', Mock(return_value=None)) def test_bad_html_to_text(): bad_html = "one
two
three" eq_(None, u.html_to_text(bad_html)) + + +@patch.object(u, '_MAX_TAGS_COUNT', 3) +def test_html_too_big(): + eq_(False, u.html_too_big("
")) + eq_(True, u.html_too_big("
Hi
")) + + +@patch.object(u, '_MAX_TAGS_COUNT', 3) +def test_html_to_text(): + eq_("Hello", u.html_to_text("
Hello
")) + eq_(None, u.html_to_text("
Hi
"))