Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f04b872e14 | ||
|
|
e61894e425 | ||
|
|
35fbdaadac |
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.3.0',
|
version='1.3.1',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -178,7 +178,7 @@ def html_fromstring(s):
|
|||||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
|
return html5parser.fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -187,7 +187,7 @@ def html_document_fromstring(s):
|
|||||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
|
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -220,6 +220,21 @@ def _encode_utf8(s):
|
|||||||
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
||||||
|
|
||||||
|
|
||||||
|
def _html5lib_parser():
|
||||||
|
"""
|
||||||
|
html5lib is a pure-python library that conforms to the WHATWG HTML spec
|
||||||
|
and is not vulnerable to certain attacks common for XML libraries
|
||||||
|
"""
|
||||||
|
return html5lib.HTMLParser(
|
||||||
|
# build lxml tree
|
||||||
|
html5lib.treebuilders.getTreeBuilder("lxml"),
|
||||||
|
# remove namespace value from inside lxml.html.html5parser element tag
|
||||||
|
# otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
|
||||||
|
# instead of "div", throwing the algo off
|
||||||
|
namespaceHTMLElements=False
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
|
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
|
||||||
b'charset=utf-8">')
|
b'charset=utf-8">')
|
||||||
|
|
||||||
@@ -228,14 +243,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
|
|||||||
_HARDBREAKS = ['br', 'hr', 'tr']
|
_HARDBREAKS = ['br', 'hr', 'tr']
|
||||||
|
|
||||||
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
||||||
|
|
||||||
# html5lib is a pure-python library that conforms to the WHATWG HTML spec
|
|
||||||
# and is not vulnerable to certain attacks common for XML libraries
|
|
||||||
_HTML5LIB_PARSER = html5lib.HTMLParser(
|
|
||||||
# build lxml tree
|
|
||||||
html5lib.treebuilders.getTreeBuilder("lxml"),
|
|
||||||
# remove namespace value from inside lxml.html.html5parser element tag
|
|
||||||
# otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
|
|
||||||
# instead of "div", throwing the algo off
|
|
||||||
namespaceHTMLElements=False
|
|
||||||
)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user