diff --git a/setup.py b/setup.py index 0837dea..7bef4a0 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.3.0', + version='1.3.1', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/utils.py b/talon/utils.py index 093bc95..91386a3 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -178,7 +178,7 @@ def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: - return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -187,7 +187,7 @@ def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: - return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -220,6 +220,21 @@ def _encode_utf8(s): return s.encode('utf-8') if isinstance(s, six.text_type) else s +def _html5lib_parser(): + """ + html5lib is a pure-python library that conforms to the WHATWG HTML spec + and is not vulnerable to certain attacks common for XML libraries + """ + return html5lib.HTMLParser( + # build lxml tree + html5lib.treebuilders.getTreeBuilder("lxml"), + # remove namespace value from inside lxml.html.html5parser element tag + # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" + # instead of "div", throwing the algo off + namespaceHTMLElements=False + ) + + _UTF8_DECLARATION = (b'') @@ -228,14 +243,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") - -# html5lib is a pure-python library that conforms to the WHATWG HTML spec -# and is not vulnarable to certain attacks common for XML libraries -_HTML5LIB_PARSER = html5lib.HTMLParser( 
- # build lxml tree - html5lib.treebuilders.getTreeBuilder("lxml"), - # remove namespace value from inside lxml.html.html5paser element tag - # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" - # instead of "div", throwing the algo off - namespaceHTMLElements=False -)