diff --git a/talon/quotations.py b/talon/quotations.py index 7472b10..8ed3a15 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -386,9 +386,6 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ - if _html_too_big(msg_body): - return msg_body - if msg_body.strip() == b'': return msg_body @@ -483,12 +480,3 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail - - -def _html_too_big(msg_body): - return msg_body.count('<') > _MAX_TAGS_COUNT - - -# an extensive research shows that exceeding this limit -# might lead to excessive processing time -_MAX_TAGS_COUNT = 419 diff --git a/talon/utils.py b/talon/utils.py index 91386a3..e6c884b 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -178,6 +178,9 @@ def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: + if html_too_big(s): + return None + return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -187,6 +190,9 @@ def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: + if html_too_big(s): + return None + return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -196,6 +202,10 @@ def cssselect(expr, tree): return CSSSelector(expr)(tree) +def html_too_big(s): + return s.count('<') > _MAX_TAGS_COUNT + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -243,3 +253,7 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") + +# an extensive research shows that exceeding this limit +# might lead to excessive processing time +_MAX_TAGS_COUNT = 419 diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index b216710..b78409b 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -385,7 +385,7 @@ def test_gmail_forwarded_msg(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) -@patch.object(quotations, '_MAX_TAGS_COUNT', 4) +@patch.object(u, '_MAX_TAGS_COUNT', 4) def test_too_large_html(): msg_body = 'Reply' \ '