From cec5acf58f48d17c1c746470ab470f8c3081e925 Mon Sep 17 00:00:00 2001 From: Maxim Vladimirskiy Date: Thu, 6 Jan 2022 14:18:11 +0300 Subject: [PATCH] Remove max tags limit --- setup.py | 2 +- talon/utils.py | 16 ---------------- tests/html_quotations_test.py | 12 ------------ tests/utils_test.py | 26 -------------------------- 4 files changed, 1 insertion(+), 55 deletions(-) diff --git a/setup.py b/setup.py index c458d98..088a41d 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.4.10', + version='1.5.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/utils.py b/talon/utils.py index d257c17..14f4509 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -180,9 +180,6 @@ def html_fromstring(s): if isinstance(s, six.text_type): s = s.encode('utf8') try: - if html_too_big(s): - return None - return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -194,9 +191,6 @@ def html_document_fromstring(s): if isinstance(s, six.text_type): s = s.encode('utf8') try: - if html_too_big(s): - return None - return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -206,12 +200,6 @@ def cssselect(expr, tree): return CSSSelector(expr)(tree) -def html_too_big(s): - if isinstance(s, six.text_type): - s = s.encode('utf8') - return s.count(b'<') > _MAX_TAGS_COUNT - - def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") - -# an extensive research shows that exceeding this limit -# might lead to excessive processing time -_MAX_TAGS_COUNT = 419 diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 541d358..2e5812a 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -391,18 +391,6 @@ def test_gmail_forwarded_msg(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) -@patch.object(u, '_MAX_TAGS_COUNT', 4) -def test_too_large_html(): - msg_body = 'Reply' \ - '
' \ - '
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ - '
Test
' \ - '
' \ - '
' - eq_(RE_WHITESPACE.sub('', msg_body), - RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) - - def test_readable_html_empty(): msg_body = """
diff --git a/tests/utils_test.py b/tests/utils_test.py index 7ba4b52..e7d529d 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -125,39 +125,13 @@ def test_html_fromstring_exception(): eq_(None, u.html_fromstring("")) -@patch.object(u, 'html_too_big', Mock()) -@patch.object(u.html5parser, 'fromstring') -def test_html_fromstring_too_big(fromstring): - eq_(None, u.html_fromstring("")) - assert_false(fromstring.called) - - @patch.object(u.html5parser, 'document_fromstring') def test_html_document_fromstring_exception(document_fromstring): document_fromstring.side_effect = Exception() eq_(None, u.html_document_fromstring("")) -@patch.object(u, 'html_too_big', Mock()) -@patch.object(u.html5parser, 'document_fromstring') -def test_html_document_fromstring_too_big(document_fromstring): - eq_(None, u.html_document_fromstring("")) - assert_false(document_fromstring.called) - - @patch.object(u, 'html_fromstring', Mock(return_value=None)) def test_bad_html_to_text(): bad_html = "one
two
three" eq_(None, u.html_to_text(bad_html)) - - -@patch.object(u, '_MAX_TAGS_COUNT', 3) -def test_html_too_big(): - eq_(False, u.html_too_big("
")) - eq_(True, u.html_too_big("
Hi
")) - - -@patch.object(u, '_MAX_TAGS_COUNT', 3) -def test_html_to_text(): - eq_(b"Hello", u.html_to_text("
Hello
")) - eq_(None, u.html_to_text("
Hi
"))