From 35fbdaadac318d5c87cb3626eaf67a010c30bee3 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 22 Aug 2016 16:25:04 -0700 Subject: [PATCH] use new parser each time we parse a document --- talon/utils.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/talon/utils.py b/talon/utils.py index 093bc95..91386a3 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -178,7 +178,7 @@ def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: - return html5parser.fromstring(s, parser=_HTML5LIB_PARSER) + return html5parser.fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -187,7 +187,7 @@ def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ try: - return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER) + return html5parser.document_fromstring(s, parser=_html5lib_parser()) except Exception: pass @@ -220,6 +220,21 @@ def _encode_utf8(s): return s.encode('utf-8') if isinstance(s, six.text_type) else s +def _html5lib_parser(): + """ + html5lib is a pure-python library that conforms to the WHATWG HTML spec + and is not vulnerable to certain attacks common for XML libraries + """ + return html5lib.HTMLParser( + # build lxml tree + html5lib.treebuilders.getTreeBuilder("lxml"), + # remove namespace value from inside lxml.html.html5parser element tag + # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" + # instead of "div", throwing the algo off + namespaceHTMLElements=False + ) + + _UTF8_DECLARATION = (b'') @@ -228,14 +243,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") - -# html5lib is a pure-python library that conforms to the WHATWG HTML spec -# and is not vulnarable to certain attacks common for XML libraries -_HTML5LIB_PARSER = html5lib.HTMLParser( - # build lxml tree - 
html5lib.treebuilders.getTreeBuilder("lxml"), - # remove namespace value from inside lxml.html.html5paser element tag - # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" - # instead of "div", throwing the algo off - namespaceHTMLElements=False -)