From 37c95ff97b5ea108d646a90863ab12110d849dbc Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Fri, 19 Aug 2016 11:38:12 -0700 Subject: [PATCH] fallback untouched html if we can not parse html tree --- talon/quotations.py | 4 ++++ talon/utils.py | 19 +++++++++++++++++-- tests/html_quotations_test.py | 6 ++++++ tests/utils_test.py | 19 ++++++++++++++++++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 82d2c91..6f77124 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -394,6 +394,10 @@ def _extract_from_html(msg_body): msg_body = msg_body.replace(b'\r\n', b'\n') html_tree = html_document_fromstring(msg_body) + + if html_tree is None: + return msg_body + cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or diff --git a/talon/utils.py b/talon/utils.py index 03314d4..093bc95 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -159,6 +159,7 @@ def html_to_text(string): NOTES: 1. the string is expected to contain UTF-8 encoded HTML! 2. returns utf-8 encoded str (not unicode) + 3. if html can't be parsed returns None """ if isinstance(string, six.text_type): string = string.encode('utf8') @@ -166,15 +167,29 @@ def html_to_text(string): s = _prepend_utf8_declaration(string) s = s.replace(b"\n", b"") tree = html_fromstring(s) + + if tree is None: + return None + return html_tree_to_text(tree) def html_fromstring(s): - return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass def html_document_fromstring(s): - return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass def cssselect(expr, tree): diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index eb52285..c087eef 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -413,3 +413,9 @@ def test_readable_html_empty(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + + +@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) +def test_bad_html(): + bad_html = "" + eq_(bad_html, quotations.extract_from_html(bad_html)) diff --git a/tests/utils_test.py b/tests/utils_test.py index a902746..2ff61bc 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -112,5 +112,22 @@ font: 13px 'Lucida Grande', Arial, sans-serif; def test_comment_no_parent(): s = " no comment" - d = html.document_fromstring(s) + d = u.html_document_fromstring(s) eq_("no comment", u.html_tree_to_text(d)) + + +@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) +def test_html_fromstring_exception(): + eq_(None, u.html_fromstring("")) + + +@patch.object(u.html5parser, 'document_fromstring') +def test_html_document_fromstring_exception(document_fromstring): + document_fromstring.side_effect = Exception() + eq_(None, u.html_document_fromstring("")) + + +@patch.object(u, 'html_fromstring', Mock(return_value=None)) +def test_bad_html_to_text(): + bad_html = "one
two
three" + eq_(None, u.html_to_text(bad_html))