Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2444ba87c0 | ||
|
|
534457e713 | ||
|
|
ea82a9730e | ||
|
|
f04b872e14 | ||
|
|
e61894e425 | ||
|
|
35fbdaadac |
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.3.0',
|
version='1.3.2',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -386,9 +386,6 @@ def _extract_from_html(msg_body):
|
|||||||
then checking deleted checkpoints,
|
then checking deleted checkpoints,
|
||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
if len(msg_body) > MAX_HTML_LEN:
|
|
||||||
return msg_body
|
|
||||||
|
|
||||||
if msg_body.strip() == b'':
|
if msg_body.strip() == b'':
|
||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|||||||
@@ -178,7 +178,10 @@ def html_fromstring(s):
|
|||||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return html5parser.fromstring(s, parser=_HTML5LIB_PARSER)
|
if html_too_big(s):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return html5parser.fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -187,7 +190,10 @@ def html_document_fromstring(s):
|
|||||||
"""Parse html tree from string. Return None if the string can't be parsed.
|
"""Parse html tree from string. Return None if the string can't be parsed.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
|
if html_too_big(s):
|
||||||
|
return None
|
||||||
|
|
||||||
|
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
|
|||||||
return CSSSelector(expr)(tree)
|
return CSSSelector(expr)(tree)
|
||||||
|
|
||||||
|
|
||||||
|
def html_too_big(s):
|
||||||
|
return s.count('<') > _MAX_TAGS_COUNT
|
||||||
|
|
||||||
|
|
||||||
def _contains_charset_spec(s):
|
def _contains_charset_spec(s):
|
||||||
"""Return True if the first 4KB contain charset spec
|
"""Return True if the first 4KB contain charset spec
|
||||||
"""
|
"""
|
||||||
@@ -220,6 +230,21 @@ def _encode_utf8(s):
|
|||||||
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
||||||
|
|
||||||
|
|
||||||
|
def _html5lib_parser():
|
||||||
|
"""
|
||||||
|
html5lib is a pure-python library that conforms to the WHATWG HTML spec
|
||||||
|
and is not vulnerable to certain attacks common for XML libraries
|
||||||
|
"""
|
||||||
|
return html5lib.HTMLParser(
|
||||||
|
# build lxml tree
|
||||||
|
html5lib.treebuilders.getTreeBuilder("lxml"),
|
||||||
|
# remove namespace value from inside lxml.html.html5parser element tag
|
||||||
|
# otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
|
||||||
|
# instead of "div", throwing the algo off
|
||||||
|
namespaceHTMLElements=False
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
|
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
|
||||||
b'charset=utf-8">')
|
b'charset=utf-8">')
|
||||||
|
|
||||||
@@ -229,13 +254,6 @@ _HARDBREAKS = ['br', 'hr', 'tr']
|
|||||||
|
|
||||||
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
||||||
|
|
||||||
# html5lib is a pure-python library that conforms to the WHATWG HTML spec
|
# extensive research shows that exceeding this limit
|
||||||
# and is not vulnerable to certain attacks common for XML libraries
|
# might lead to excessive processing time
|
||||||
_HTML5LIB_PARSER = html5lib.HTMLParser(
|
_MAX_TAGS_COUNT = 419
|
||||||
# build lxml tree
|
|
||||||
html5lib.treebuilders.getTreeBuilder("lxml"),
|
|
||||||
# remove namespace value from inside lxml.html.html5parser element tag
|
|
||||||
# otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
|
|
||||||
# instead of "div", throwing the algo off
|
|
||||||
namespaceHTMLElements=False
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
|
|||||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(quotations, 'MAX_HTML_LEN', 1)
|
@patch.object(u, '_MAX_TAGS_COUNT', 4)
|
||||||
def test_too_large_html():
|
def test_too_large_html():
|
||||||
msg_body = 'Reply' \
|
msg_body = 'Reply' \
|
||||||
'<div class="gmail_quote">' \
|
'<div class="gmail_quote">' \
|
||||||
|
|||||||
@@ -120,6 +120,12 @@ def test_comment_no_parent():
|
|||||||
def test_html_fromstring_exception():
|
def test_html_fromstring_exception():
|
||||||
eq_(None, u.html_fromstring("<html></html>"))
|
eq_(None, u.html_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
@patch.object(u, 'html_too_big', Mock())
|
||||||
|
@patch.object(u.html5parser, 'fromstring')
|
||||||
|
def test_html_fromstring_too_big(fromstring):
|
||||||
|
eq_(None, u.html_fromstring("<html></html>"))
|
||||||
|
assert_false(fromstring.called)
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u.html5parser, 'document_fromstring')
|
@patch.object(u.html5parser, 'document_fromstring')
|
||||||
def test_html_document_fromstring_exception(document_fromstring):
|
def test_html_document_fromstring_exception(document_fromstring):
|
||||||
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
|
|||||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u, 'html_too_big', Mock())
|
||||||
|
@patch.object(u.html5parser, 'document_fromstring')
|
||||||
|
def test_html_document_fromstring_too_big(document_fromstring):
|
||||||
|
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||||
|
assert_false(document_fromstring.called)
|
||||||
|
|
||||||
|
|
||||||
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||||
def test_bad_html_to_text():
|
def test_bad_html_to_text():
|
||||||
bad_html = "one<br>two<br>three"
|
bad_html = "one<br>two<br>three"
|
||||||
eq_(None, u.html_to_text(bad_html))
|
eq_(None, u.html_to_text(bad_html))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||||
|
def test_html_too_big():
|
||||||
|
eq_(False, u.html_too_big("<div></div>"))
|
||||||
|
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||||
|
def test_html_to_text():
|
||||||
|
eq_("Hello", u.html_to_text("<div>Hello</div>"))
|
||||||
|
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
|
||||||
|
|||||||
Reference in New Issue
Block a user