use html5lib to parse html

This commit is contained in:
Sergey Obukhov
2016-08-15 19:36:21 -07:00
parent f53b5cc7a6
commit bcf97eccfa
4 changed files with 44 additions and 25 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.2.16', version='1.3.0',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -12,7 +12,8 @@ from copy import deepcopy
from lxml import html, etree from lxml import html, etree
from talon.utils import get_delimiter, html_tree_to_text from talon.utils import (get_delimiter, html_tree_to_text,
html_document_fromstring)
from talon import html_quotations from talon import html_quotations
from six.moves import range from six.moves import range
import six import six
@@ -392,10 +393,7 @@ def _extract_from_html(msg_body):
return msg_body return msg_body
msg_body = msg_body.replace(b'\r\n', b'\n') msg_body = msg_body.replace(b'\r\n', b'\n')
html_tree = html.document_fromstring( html_tree = html_document_fromstring(msg_body)
msg_body,
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
@@ -468,7 +466,7 @@ def is_splitter(line):
def text_content(context): def text_content(context):
'''XPath Extension function to return a node text content.''' '''XPath Extension function to return a node text content.'''
return context.context_node.text_content().strip() return context.context_node.xpath("string()").strip()
def tail(context): def tail(context):

View File

@@ -7,9 +7,11 @@ import chardet
import cchardet import cchardet
import regex as re import regex as re
from lxml import html from lxml.html import html5parser
from lxml.cssselect import CSSSelector from lxml.cssselect import CSSSelector
import html5lib
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
import six import six
@@ -120,7 +122,7 @@ def html_tree_to_text(tree):
parent = c.getparent() parent = c.getparent()
# comment with no parent does not impact produced text # comment with no parent does not impact produced text
if not parent: if parent is None:
continue continue
parent.remove(c) parent.remove(c)
@@ -162,11 +164,18 @@ def html_to_text(string):
s = _prepend_utf8_declaration(string) s = _prepend_utf8_declaration(string)
s = s.replace(b"\n", b"") s = s.replace(b"\n", b"")
tree = html_fromstring(s)
tree = html.fromstring(s)
return html_tree_to_text(tree) return html_tree_to_text(tree)
def html_fromstring(s):
return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
def html_document_fromstring(s):
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
def _contains_charset_spec(s): def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec """Return True if the first 4KB contain charset spec
""" """
@@ -198,5 +207,15 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr'] _HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
# html5lib is a pure-python library that conforms to the WHATWG HTML spec
# and is not vulnerable to certain attacks common for XML libraries
_HTML5LIB_PARSER = html5lib.HTMLParser(
# build lxml tree
html5lib.treebuilders.getTreeBuilder("lxml"),
# remove namespace value from inside lxml.html.html5parser element tag
# otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
# instead of "div", throwing the algo off
namespaceHTMLElements=False
)

View File

@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>""" </blockquote>"""
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -62,7 +62,7 @@ def test_regular_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -85,6 +85,7 @@ Reply
reply = """ reply = """
<html> <html>
<head></head>
<body> <body>
Reply Reply
@@ -128,7 +129,7 @@ def test_gmail_quote():
</div> </div>
</div> </div>
</div>""" </div>"""
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -139,7 +140,7 @@ def test_gmail_quote_compact():
'<div>Test</div>' \ '<div>Test</div>' \
'</div>' \ '</div>' \
'</div>' '</div>'
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -166,7 +167,7 @@ def test_unicode_in_reply():
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
eq_("<html><body><p>Reply&#160;&#160;Text<br></p><div><br></div>" eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
"</body></html>", "</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -192,6 +193,7 @@ def test_blockquote_disclaimer():
stripped_html = """ stripped_html = """
<html> <html>
<head></head>
<body> <body>
<div> <div>
<div> <div>
@@ -223,7 +225,7 @@ def test_date_block():
</div> </div>
</div> </div>
""" """
eq_('<html><body><div>message<br></div></body></html>', eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br>
text text
</div></div> </div></div>
""" """
eq_('<html><body><div>message<br></div></body></html>', eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block():
</div> </div>
</body>''' </body>'''
eq_('<html><body><div>Blah<br><br></div></body></html>', eq_('<html><head></head><body><div>Blah<br><br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -269,13 +271,13 @@ def test_reply_quotations_share_block():
def test_OLK_SRC_BODY_SECTION_stripped(): def test_OLK_SRC_BODY_SECTION_stripped():
eq_('<html><body><div>Reply</div></body></html>', eq_('<html><head></head><body><div>Reply</div></body></html>',
RE_WHITESPACE.sub( RE_WHITESPACE.sub(
'', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) '', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
def test_reply_separated_by_hr(): def test_reply_separated_by_hr():
eq_('<html><body><div>Hi<div>there</div></div></body></html>', eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
RE_WHITESPACE.sub( RE_WHITESPACE.sub(
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
@@ -296,7 +298,7 @@ Reply
</div> </div>
</div> </div>
''' '''
eq_('<html><body><p>Reply</p><div><hr></div></body></html>', eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -373,7 +375,7 @@ reply
extracted = quotations.extract_from_html(msg_body) extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted) assert_false(symbol in extracted)
# Keep new lines otherwise "My reply" becomes one word - "Myreply" # Keep new lines otherwise "My reply" becomes one word - "Myreply"
eq_("<html><body><p>My\nreply\n</p></body></html>", extracted) eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
def test_gmail_forwarded_msg(): def test_gmail_forwarded_msg():