diff --git a/setup.py b/setup.py index d8b0554..0837dea 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.16', + version='1.3.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), @@ -53,6 +53,7 @@ setup(name='talon', 'cchardet>=0.3.5', 'cssselect', 'six>=1.10.0', + 'html5lib' ], tests_require=[ "mock", diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 4aa7e74..44afb6b 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -6,6 +6,7 @@ messages (without quoted messages) from html from __future__ import absolute_import import regex as re +from talon.utils import cssselect CHECKPOINT_PREFIX = '#!%!' CHECKPOINT_SUFFIX = '!%!#' @@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): ''' Cuts the outermost block element with class gmail_quote. ''' - gmail_quote = html_message.cssselect('div.gmail_quote') + gmail_quote = cssselect('div.gmail_quote', html_message) if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): gmail_quote[0].getparent().remove(gmail_quote[0]) return True @@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message): def cut_by_id(html_message): found = False for quote_id in QUOTE_IDS: - quote = html_message.cssselect('#{}'.format(quote_id)) + quote = cssselect('#{}'.format(quote_id), html_message) if quote: found = True quote[0].getparent().remove(quote[0]) diff --git a/talon/quotations.py b/talon/quotations.py index 2834be0..6f77124 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,7 +12,8 @@ from copy import deepcopy from lxml import html, etree -from talon.utils import get_delimiter, html_tree_to_text +from talon.utils import (get_delimiter, html_tree_to_text, + html_document_fromstring) from talon import html_quotations from six.moves import range import six @@ -392,10 +393,11 @@ def _extract_from_html(msg_body): return msg_body msg_body = msg_body.replace(b'\r\n', b'\n') - html_tree = html.document_fromstring( - msg_body, - parser=html.HTMLParser(encoding="utf-8") - ) + html_tree = html_document_fromstring(msg_body) + + if html_tree is None: + return msg_body + cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or @@ -468,7 +470,7 @@ def is_splitter(line): def text_content(context): '''XPath Extension function to return a node text content.''' - return context.context_node.text_content().strip() + return context.context_node.xpath("string()").strip() def tail(context): diff --git a/talon/utils.py b/talon/utils.py index 70de98c..093bc95 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -7,9 +7,11 @@ import chardet import cchardet import regex as re -from lxml import html +from lxml.html import html5parser from lxml.cssselect import CSSSelector +import html5lib + from talon.constants import RE_DELIMITER import six @@ -112,6 +114,7 @@ def get_delimiter(msg_body): return delimiter + def html_tree_to_text(tree): for style in CSSSelector('style')(tree): style.getparent().remove(style) @@ -120,7 +123,7 @@ def html_tree_to_text(tree): parent = c.getparent() # comment with no parent does not impact produced text - if not parent: + if parent is None: continue parent.remove(c) @@ -156,17 +159,43 @@ def html_to_text(string): NOTES: 1. the string is expected to contain UTF-8 encoded HTML! 2. returns utf-8 encoded str (not unicode) + 3. if html can't be parsed returns None """ if isinstance(string, six.text_type): string = string.encode('utf8') s = _prepend_utf8_declaration(string) s = s.replace(b"\n", b"") + tree = html_fromstring(s) + + if tree is None: + return None - tree = html.fromstring(s) return html_tree_to_text(tree) +def html_fromstring(s): + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass + + +def html_document_fromstring(s): + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass + + +def cssselect(expr, tree): + return CSSSelector(expr)(tree) + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -198,5 +227,15 @@ _UTF8_DECLARATION = (b'""" - eq_("
Reply
", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote(): """ - eq_("Reply
", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -62,7 +62,7 @@ def test_regular_blockquote(): """ - eq_("Reply
Regular", + eq_("Reply
Regular", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -85,6 +85,7 @@ Reply reply = """ + Reply @@ -128,7 +129,7 @@ def test_gmail_quote(): """ - eq_("
Reply
", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -139,7 +140,7 @@ def test_gmail_quote_compact(): 'Reply
", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -166,7 +167,7 @@ def test_unicode_in_reply(): Quote """.encode("utf-8") - eq_("Reply Text
Reply
My\nreply\n
", extracted) + eq_("My\nreply\n", extracted) def test_gmail_forwarded_msg(): @@ -411,3 +413,9 @@ def test_readable_html_empty(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + + +@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) +def test_bad_html(): + bad_html = "" + eq_(bad_html, quotations.extract_from_html(bad_html)) diff --git a/tests/utils_test.py b/tests/utils_test.py index a902746..2ff61bc 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -112,5 +112,22 @@ font: 13px 'Lucida Grande', Arial, sans-serif; def test_comment_no_parent(): s = " no comment" - d = html.document_fromstring(s) + d = u.html_document_fromstring(s) eq_("no comment", u.html_tree_to_text(d)) + + +@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) +def test_html_fromstring_exception(): + eq_(None, u.html_fromstring("")) + + +@patch.object(u.html5parser, 'document_fromstring') +def test_html_document_fromstring_exception(document_fromstring): + document_fromstring.side_effect = Exception() + eq_(None, u.html_document_fromstring("")) + + +@patch.object(u, 'html_fromstring', Mock(return_value=None)) +def test_bad_html_to_text(): + bad_html = "one