Fall back to the untouched HTML if we cannot parse the HTML tree
This commit is contained in:
@@ -394,6 +394,10 @@ def _extract_from_html(msg_body):
|
|||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||||
html_tree = html_document_fromstring(msg_body)
|
html_tree = html_document_fromstring(msg_body)
|
||||||
|
|
||||||
|
if html_tree is None:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
||||||
html_quotations.cut_zimbra_quote(html_tree) or
|
html_quotations.cut_zimbra_quote(html_tree) or
|
||||||
html_quotations.cut_blockquote(html_tree) or
|
html_quotations.cut_blockquote(html_tree) or
|
||||||
|
|||||||
@@ -159,6 +159,7 @@ def html_to_text(string):
|
|||||||
NOTES:
|
NOTES:
|
||||||
1. the string is expected to contain UTF-8 encoded HTML!
|
1. the string is expected to contain UTF-8 encoded HTML!
|
||||||
2. returns utf-8 encoded str (not unicode)
|
2. returns utf-8 encoded str (not unicode)
|
||||||
|
3. if html can't be parsed returns None
|
||||||
"""
|
"""
|
||||||
if isinstance(string, six.text_type):
|
if isinstance(string, six.text_type):
|
||||||
string = string.encode('utf8')
|
string = string.encode('utf8')
|
||||||
@@ -166,15 +167,29 @@ def html_to_text(string):
|
|||||||
s = _prepend_utf8_declaration(string)
|
s = _prepend_utf8_declaration(string)
|
||||||
s = s.replace(b"\n", b"")
|
s = s.replace(b"\n", b"")
|
||||||
tree = html_fromstring(s)
|
tree = html_fromstring(s)
|
||||||
|
|
||||||
|
if tree is None:
|
||||||
|
return None
|
||||||
|
|
||||||
return html_tree_to_text(tree)
|
return html_tree_to_text(tree)
|
||||||
|
|
||||||
|
|
||||||
def html_fromstring(s):
|
def html_fromstring(s):
|
||||||
return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def html_document_fromstring(s):
|
def html_document_fromstring(s):
|
||||||
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def cssselect(expr, tree):
|
def cssselect(expr, tree):
|
||||||
|
|||||||
@@ -413,3 +413,9 @@ def test_readable_html_empty():
|
|||||||
|
|
||||||
eq_(RE_WHITESPACE.sub('', msg_body),
|
eq_(RE_WHITESPACE.sub('', msg_body),
|
||||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None))
|
||||||
|
def test_bad_html():
|
||||||
|
bad_html = "<html></html>"
|
||||||
|
eq_(bad_html, quotations.extract_from_html(bad_html))
|
||||||
|
|||||||
@@ -112,5 +112,22 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
|
|||||||
|
|
||||||
def test_comment_no_parent():
|
def test_comment_no_parent():
|
||||||
s = "<!-- COMMENT 1 --> no comment"
|
s = "<!-- COMMENT 1 --> no comment"
|
||||||
d = html.document_fromstring(s)
|
d = u.html_document_fromstring(s)
|
||||||
eq_("no comment", u.html_tree_to_text(d))
|
eq_("no comment", u.html_tree_to_text(d))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
|
||||||
|
def test_html_fromstring_exception():
|
||||||
|
eq_(None, u.html_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u.html5parser, 'document_fromstring')
|
||||||
|
def test_html_document_fromstring_exception(document_fromstring):
|
||||||
|
document_fromstring.side_effect = Exception()
|
||||||
|
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||||
|
def test_bad_html_to_text():
|
||||||
|
bad_html = "one<br>two<br>three"
|
||||||
|
eq_(None, u.html_to_text(bad_html))
|
||||||
|
|||||||
Reference in New Issue
Block a user