Merge pull request #108 from mailgun/sergey/html5lib-fix

use new parser each time we parse a document
bump version
2016-08-22 18:10:35 -07:00 · 2016-08-22 17:34:18 -07:00 · 2016-08-22 16:25:04 -07:00
2 changed files with 18 additions and 14 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.3.0',
+      version='1.3.1',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -178,7 +178,7 @@ def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
-        return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
+        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -187,7 +187,7 @@ def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
-        return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
+        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -220,6 +220,21 @@ def _encode_utf8(s):
    return s.encode('utf-8') if isinstance(s, six.text_type) else s
 def _html5lib_parser():
    """
    Build and return a new html5lib parser that produces an lxml tree.

    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries.
    A parser object is constructed on every call rather than shared.
    """
    return html5lib.HTMLParser(
        # build an lxml tree
        html5lib.treebuilders.getTreeBuilder("lxml"),
        # remove the namespace value from inside the lxml.html.html5parser
        # element tag; otherwise it yields something like
        # "{http://www.w3.org/1999/xhtml}div" instead of "div", throwing the
        # algorithm off
        namespaceHTMLElements=False
    )
 _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
                     b'charset=utf-8">')
@@ -228,14 +243,3 @@ _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
 # html5lib is a pure-python library that conforms to the WHATWG HTML spec
 # and is not vulnerable to certain attacks common for XML libraries
 _HTML5LIB_PARSER = html5lib.HTMLParser(
    # build an lxml tree
    html5lib.treebuilders.getTreeBuilder("lxml"),
    # remove the namespace value from inside the lxml.html.html5parser
    # element tag; otherwise it yields something like
    # "{http://www.w3.org/1999/xhtml}div" instead of "div", throwing the
    # algorithm off
    namespaceHTMLElements=False
 )
Author	SHA1	Message	Date
Sergey Obukhov	f04b872e14	Merge pull request #108 from mailgun/sergey/html5lib-fix use new parser each time we parse a document	2016-08-22 18:10:35 -07:00
Sergey Obukhov	e61894e425	bump version	2016-08-22 17:34:18 -07:00
Sergey Obukhov	35fbdaadac	use new parser each time we parse a document	2016-08-22 16:25:04 -07:00