From bcf97eccfa73aa5b5c37aa6d342a72887a874229 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 15 Aug 2016 19:36:21 -0700 Subject: [PATCH] use html5lib to parse html --- setup.py | 2 +- talon/quotations.py | 10 ++++------ talon/utils.py | 29 ++++++++++++++++++++++++----- tests/html_quotations_test.py | 28 +++++++++++++++------------- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/setup.py b/setup.py index d8b0554..1cb8938 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.16', + version='1.3.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index 2834be0..82d2c91 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,7 +12,8 @@ from copy import deepcopy from lxml import html, etree -from talon.utils import get_delimiter, html_tree_to_text +from talon.utils import (get_delimiter, html_tree_to_text, + html_document_fromstring) from talon import html_quotations from six.moves import range import six @@ -392,10 +393,7 @@ def _extract_from_html(msg_body): return msg_body msg_body = msg_body.replace(b'\r\n', b'\n') - html_tree = html.document_fromstring( - msg_body, - parser=html.HTMLParser(encoding="utf-8") - ) + html_tree = html_document_fromstring(msg_body) cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or @@ -468,7 +466,7 @@ def is_splitter(line): def text_content(context): '''XPath Extension function to return a node text content.''' - return context.context_node.text_content().strip() + return context.context_node.xpath("string()").strip() def tail(context): diff --git a/talon/utils.py b/talon/utils.py index 70de98c..7b118f9 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -7,9 +7,11 @@ import chardet import cchardet import regex as re -from lxml import html +from lxml.html import html5parser from lxml.cssselect import CSSSelector +import html5lib + from talon.constants import RE_DELIMITER import six @@ -120,7 +122,7 @@ def html_tree_to_text(tree): parent = c.getparent() # comment with no parent does not impact produced text - if not parent: + if parent is None: continue parent.remove(c) @@ -162,11 +164,18 @@ def html_to_text(string): s = _prepend_utf8_declaration(string) s = s.replace(b"\n", b"") - - tree = html.fromstring(s) + tree = html_fromstring(s) return html_tree_to_text(tree) +def html_fromstring(s): + return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + + +def html_document_fromstring(s): + return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ @@ -198,5 +207,15 @@ _UTF8_DECLARATION = (b'""" - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote(): """ - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -62,7 +62,7 @@ def test_regular_blockquote(): """ - eq_("

Reply

Regular
", + eq_("Reply
Regular
", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -85,6 +85,7 @@ Reply reply = """ + Reply @@ -128,7 +129,7 @@ def test_gmail_quote(): """ - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -139,7 +140,7 @@ def test_gmail_quote_compact(): '
Test
' \ '' \ '' - eq_("

Reply

", + eq_("Reply", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -166,7 +167,7 @@ def test_unicode_in_reply(): Quote """.encode("utf-8") - eq_("

Reply  Text


" + eq_("Reply  Text

" "", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -192,6 +193,7 @@ def test_blockquote_disclaimer(): stripped_html = """ +
@@ -223,7 +225,7 @@ def test_date_block():
""" - eq_('
message
', + eq_('
message
', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!

text """ - eq_('
message
', + eq_('
message
', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block(): ''' - eq_('
Blah

', + eq_('
Blah

', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -269,13 +271,13 @@ def test_reply_quotations_share_block(): def test_OLK_SRC_BODY_SECTION_stripped(): - eq_('
Reply
', + eq_('
Reply
', RE_WHITESPACE.sub( '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) def test_reply_separated_by_hr(): - eq_('
Hi
there
', + eq_('
Hi
there
', RE_WHITESPACE.sub( '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) @@ -296,7 +298,7 @@ Reply ''' - eq_('

Reply


', + eq_('Reply

', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -373,7 +375,7 @@ reply extracted = quotations.extract_from_html(msg_body) assert_false(symbol in extracted) # Keep new lines otherwise "My reply" becomes one word - "Myreply" - eq_("

My\nreply\n

", extracted) + eq_("My\nreply\n", extracted) def test_gmail_forwarded_msg():