From bcf97eccfa73aa5b5c37aa6d342a72887a874229 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 15 Aug 2016 19:36:21 -0700 Subject: [PATCH 1/4] use html5lib to parse html --- setup.py | 2 +- talon/quotations.py | 10 ++++------ talon/utils.py | 29 ++++++++++++++++++++++++----- tests/html_quotations_test.py | 28 +++++++++++++++------------- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/setup.py b/setup.py index d8b0554..1cb8938 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.16', + version='1.3.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index 2834be0..82d2c91 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,7 +12,8 @@ from copy import deepcopy from lxml import html, etree -from talon.utils import get_delimiter, html_tree_to_text +from talon.utils import (get_delimiter, html_tree_to_text, + html_document_fromstring) from talon import html_quotations from six.moves import range import six @@ -392,10 +393,7 @@ def _extract_from_html(msg_body): return msg_body msg_body = msg_body.replace(b'\r\n', b'\n') - html_tree = html.document_fromstring( - msg_body, - parser=html.HTMLParser(encoding="utf-8") - ) + html_tree = html_document_fromstring(msg_body) cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or @@ -468,7 +466,7 @@ def is_splitter(line): def text_content(context): '''XPath Extension function to return a node text content.''' - return context.context_node.text_content().strip() + return context.context_node.xpath("string()").strip() def tail(context): diff --git a/talon/utils.py b/talon/utils.py index 70de98c..7b118f9 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -7,9 +7,11 @@ import chardet import cchardet import regex as re -from lxml import html +from lxml.html import html5parser from lxml.cssselect import CSSSelector +import html5lib + from talon.constants import RE_DELIMITER import six @@ -120,7 +122,7 @@ def html_tree_to_text(tree): parent = c.getparent() # comment with no parent does not impact produced text - if not parent: + if parent is None: continue parent.remove(c) @@ -162,11 +164,18 @@ def html_to_text(string): s = _prepend_utf8_declaration(string) s = s.replace(b"\n", b"") - - tree = html.fromstring(s) + tree = html_fromstring(s) return html_tree_to_text(tree) +def html_fromstring(s): + return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + + +def html_document_fromstring(s): + return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -198,5 +207,15 @@ _UTF8_DECLARATION = (b'""" - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote(): """ - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -62,7 +62,7 @@ def test_regular_blockquote(): """ - eq_("

Reply

Regular
", + eq_("Reply
Regular
", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -85,6 +85,7 @@ Reply reply = """ + Reply @@ -128,7 +129,7 @@ def test_gmail_quote(): """ - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -139,7 +140,7 @@ def test_gmail_quote_compact(): '
Test
' \ '' \ '' - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -166,7 +167,7 @@ def test_unicode_in_reply(): Quote """.encode("utf-8") - eq_("

Reply  Text


" + eq_("Reply  Text

" "", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -192,6 +193,7 @@ def test_blockquote_disclaimer(): stripped_html = """ +
@@ -223,7 +225,7 @@ def test_date_block():
""" - eq_('
message
', + eq_('
message
', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!

text """ - eq_('
message
', + eq_('
message
', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block(): ''' - eq_('
Blah

', + eq_('
Blah

', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -269,13 +271,13 @@ def test_reply_quotations_share_block(): def test_OLK_SRC_BODY_SECTION_stripped(): - eq_('
Reply
', + eq_('
Reply
', RE_WHITESPACE.sub( '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) def test_reply_separated_by_hr(): - eq_('
Hi
there
', + eq_('
Hi
there
', RE_WHITESPACE.sub( '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) @@ -296,7 +298,7 @@ Reply ''' - eq_('

Reply


', + eq_('Reply

', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -373,7 +375,7 @@ reply extracted = quotations.extract_from_html(msg_body) assert_false(symbol in extracted) # Keep new lines otherwise "My reply" becomes one word - "Myreply" - eq_("

My\nreply\n

", extracted) + eq_("My\nreply\n", extracted) def test_gmail_forwarded_msg(): From ec8e09b34e859c638a00f0432dbb98d71b352e6c Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 15 Aug 2016 20:31:04 -0700 Subject: [PATCH 2/4] fix --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 1cb8938..0837dea 100755 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ setup(name='talon', 'cchardet>=0.3.5', 'cssselect', 'six>=1.10.0', + 'html5lib' ], tests_require=[ "mock", From 5b1ca33c57b1115f9a1158aa0ff75710158cc0f3 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Tue, 16 Aug 2016 17:09:07 -0700 Subject: [PATCH 3/4] fix cssselect --- talon/html_quotations.py | 5 +++-- talon/utils.py | 5 +++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 4aa7e74..44afb6b 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -6,6 +6,7 @@ messages (without quoted messages) from html from __future__ import absolute_import import regex as re +from talon.utils import cssselect CHECKPOINT_PREFIX = '#!%!' CHECKPOINT_SUFFIX = '!%!#' @@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): ''' Cuts the outermost block element with class gmail_quote. ''' - gmail_quote = html_message.cssselect('div.gmail_quote') + gmail_quote = cssselect('div.gmail_quote', html_message) if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): gmail_quote[0].getparent().remove(gmail_quote[0]) return True @@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message): def cut_by_id(html_message): found = False for quote_id in QUOTE_IDS: - quote = html_message.cssselect('#{}'.format(quote_id)) + quote = cssselect('#{}'.format(quote_id), html_message) if quote: found = True quote[0].getparent().remove(quote[0]) diff --git a/talon/utils.py b/talon/utils.py index 7b118f9..03314d4 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -114,6 +114,7 @@ def get_delimiter(msg_body): return delimiter + def html_tree_to_text(tree): for style in CSSSelector('style')(tree): style.getparent().remove(style) @@ -176,6 +177,10 @@ def html_document_fromstring(s): return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) +def cssselect(expr, tree): + return CSSSelector(expr)(tree) + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ From 37c95ff97b5ea108d646a90863ab12110d849dbc Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Fri, 19 Aug 2016 11:38:12 -0700 Subject: [PATCH 4/4] fallback untouched html if we can not parse html tree --- talon/quotations.py | 4 ++++ talon/utils.py | 19 +++++++++++++++++-- tests/html_quotations_test.py | 6 ++++++ tests/utils_test.py | 19 ++++++++++++++++++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 82d2c91..6f77124 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -394,6 +394,10 @@ def _extract_from_html(msg_body): msg_body = msg_body.replace(b'\r\n', b'\n') html_tree = html_document_fromstring(msg_body) + + if html_tree is None: + return msg_body + cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or diff --git a/talon/utils.py b/talon/utils.py index 03314d4..093bc95 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -159,6 +159,7 @@ def html_to_text(string): NOTES: 1. the string is expected to contain UTF-8 encoded HTML! 2. returns utf-8 encoded str (not unicode) + 3. if html can't be parsed returns None """ if isinstance(string, six.text_type): string = string.encode('utf8') @@ -166,15 +167,29 @@ def html_to_text(string): s = _prepend_utf8_declaration(string) s = s.replace(b"\n", b"") tree = html_fromstring(s) + + if tree is None: + return None + return html_tree_to_text(tree) def html_fromstring(s): - return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass def html_document_fromstring(s): - return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + """Parse html tree from string. Return None if the string can't be parsed. + """ + try: + return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + except Exception: + pass def cssselect(expr, tree): diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index eb52285..c087eef 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -413,3 +413,9 @@ def test_readable_html_empty(): eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + + +@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) +def test_bad_html(): + bad_html = "" + eq_(bad_html, quotations.extract_from_html(bad_html)) diff --git a/tests/utils_test.py b/tests/utils_test.py index a902746..2ff61bc 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -112,5 +112,22 @@ font: 13px 'Lucida Grande', Arial, sans-serif; def test_comment_no_parent(): s = " no comment" - d = html.document_fromstring(s) + d = u.html_document_fromstring(s) eq_("no comment", u.html_tree_to_text(d)) + + +@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) +def test_html_fromstring_exception(): + eq_(None, u.html_fromstring("")) + + +@patch.object(u.html5parser, 'document_fromstring') +def test_html_document_fromstring_exception(document_fromstring): + document_fromstring.side_effect = Exception() + eq_(None, u.html_document_fromstring("")) + + +@patch.object(u, 'html_fromstring', Mock(return_value=None)) +def test_bad_html_to_text(): + bad_html = "one
two
three" + eq_(None, u.html_to_text(bad_html))