diff --git a/talon/quotations.py b/talon/quotations.py
index 82d2c91..6f77124 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -394,6 +394,10 @@ def _extract_from_html(msg_body):
msg_body = msg_body.replace(b'\r\n', b'\n')
html_tree = html_document_fromstring(msg_body)
+
+ if html_tree is None:
+ return msg_body
+
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
diff --git a/talon/utils.py b/talon/utils.py
index 03314d4..093bc95 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -159,6 +159,7 @@ def html_to_text(string):
NOTES:
1. the string is expected to contain UTF-8 encoded HTML!
2. returns utf-8 encoded str (not unicode)
+ 3. returns None if the HTML can't be parsed
"""
if isinstance(string, six.text_type):
string = string.encode('utf8')
@@ -166,15 +167,29 @@ def html_to_text(string):
s = _prepend_utf8_declaration(string)
s = s.replace(b"\n", b"")
tree = html_fromstring(s)
+
+ if tree is None:
+ return None
+
return html_tree_to_text(tree)
def html_fromstring(s):
- return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+ """Parse html tree from string. Return None if the string can't be parsed.
+ """
+ try:
+ return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+ except Exception:
+ pass
def html_document_fromstring(s):
- return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+ """Parse a full html document tree from string. Return None if the string can't be parsed.
+ """
+ try:
+ return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+ except Exception:
+ pass
def cssselect(expr, tree):
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index eb52285..c087eef 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -413,3 +413,9 @@ def test_readable_html_empty():
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None))
+def test_bad_html():
+ bad_html = ""
+ eq_(bad_html, quotations.extract_from_html(bad_html))
diff --git a/tests/utils_test.py b/tests/utils_test.py
index a902746..2ff61bc 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -112,5 +112,22 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
def test_comment_no_parent():
s = " no comment"
- d = html.document_fromstring(s)
+ d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))
+
+
+@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
+def test_html_fromstring_exception():
+ eq_(None, u.html_fromstring(""))
+
+
+@patch.object(u.html5parser, 'document_fromstring')
+def test_html_document_fromstring_exception(document_fromstring):
+ document_fromstring.side_effect = Exception()
+ eq_(None, u.html_document_fromstring(""))
+
+
+@patch.object(u, 'html_fromstring', Mock(return_value=None))
+def test_bad_html_to_text():
+ bad_html = "one<br>two<br>three"
+ eq_(None, u.html_to_text(bad_html))