use html5lib to parse html

2016-08-15 19:36:21 -07:00
parent f53b5cc7a6
commit bcf97eccfa
4 changed files with 44 additions and 25 deletions
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,8 @@ from copy import deepcopy

 from lxml import html, etree

-from talon.utils import get_delimiter, html_tree_to_text
+from talon.utils import (get_delimiter, html_tree_to_text,
+                         html_document_fromstring)
 from talon import html_quotations
 from six.moves import range
 import six
@@ -392,10 +393,7 @@ def _extract_from_html(msg_body):
        return msg_body

    msg_body = msg_body.replace(b'\r\n', b'\n')
-    html_tree = html.document_fromstring(
-        msg_body,
-        parser=html.HTMLParser(encoding="utf-8")
-    )
+    html_tree = html_document_fromstring(msg_body)
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
@@ -468,7 +466,7 @@ def is_splitter(line):

 def text_content(context):
    '''XPath Extension function to return a node text content.'''
-    return context.context_node.text_content().strip()
+    return context.context_node.xpath("string()").strip()


 def tail(context):
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -7,9 +7,11 @@ import chardet
 import cchardet
 import regex as re

-from lxml import html
+from lxml.html import html5parser
 from lxml.cssselect import CSSSelector

+import html5lib
+
 from talon.constants import RE_DELIMITER
 import six

@@ -120,7 +122,7 @@ def html_tree_to_text(tree):
        parent = c.getparent()

        # comment with no parent does not impact produced text
-        if not parent:
+        if parent is None:
            continue

        parent.remove(c)
@@ -162,11 +164,18 @@ def html_to_text(string):

    s = _prepend_utf8_declaration(string)
    s = s.replace(b"\n", b"")
-
-    tree = html.fromstring(s)
+    tree = html_fromstring(s)
    return html_tree_to_text(tree)


+def html_fromstring(s):
+    return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+
+
+def html_document_fromstring(s):
+    return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -198,5 +207,15 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
 _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']

-
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
+
+# html5lib is a pure-python library that conforms to the WHATWG HTML spec
+# and is not vulnerable to certain attacks common for XML libraries
+_HTML5LIB_PARSER = html5lib.HTMLParser(
+    # build lxml tree
+    html5lib.treebuilders.getTreeBuilder("lxml"),
+    # remove namespace value from inside lxml.html.html5paser element tag
+    # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
+    # instead of "div", throwing the algo off
+    namespaceHTMLElements=False
+)