Merge pull request #111 from mailgun/sergey/tagscount

restrict html processing to a certain number of tags
This commit is contained in:
Sergey Obukhov
2016-09-14 11:06:29 -07:00
committed by GitHub
5 changed files with 41 additions and 5 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.3.1',
version='1.3.2',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -386,9 +386,6 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints,
then deleting necessary tags.
"""
if len(msg_body) > MAX_HTML_LEN:
return msg_body
if msg_body.strip() == b'':
return msg_body

View File

@@ -178,6 +178,9 @@ def html_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed.
"""
try:
if html_too_big(s):
return None
return html5parser.fromstring(s, parser=_html5lib_parser())
except Exception:
pass
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed.
"""
try:
if html_too_big(s):
return None
return html5parser.document_fromstring(s, parser=_html5lib_parser())
except Exception:
pass
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
return CSSSelector(expr)(tree)
def html_too_big(s):
    """Return True if ``s`` has more ``<`` characters than _MAX_TAGS_COUNT.

    The number of ``<`` characters is used as a cheap stand-in for the
    number of tags; no parsing is performed.

    ``s`` may be a text string or a byte string.
    """
    # On Python 3 ``bytes.count`` raises TypeError when given a text needle,
    # so pick a needle of the same type as the input.
    needle = b'<' if isinstance(s, bytes) else u'<'
    return s.count(needle) > _MAX_TAGS_COUNT
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
@@ -243,3 +253,7 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
# NOTE(review): presumably the tags treated as forced line breaks when
# converting HTML to text -- confirm against the code that reads this list.
_HARDBREAKS = ['br', 'hr', 'tr']
# Matches a run of 2 to 10 consecutive newline characters.
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
# Extensive research shows that exceeding this limit
# might lead to excessive processing time.
# html_too_big() compares the number of '<' characters in its input
# against this value.
_MAX_TAGS_COUNT = 419

View File

@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1)
@patch.object(u, '_MAX_TAGS_COUNT', 4)
def test_too_large_html():
msg_body = 'Reply' \
'<div class="gmail_quote">' \

View File

@@ -120,6 +120,12 @@ def test_comment_no_parent():
def test_html_fromstring_exception():
eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock(return_value=True))
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
    """html_fromstring() returns None without parsing oversized input."""
    # html_too_big is stubbed to report True explicitly; a bare Mock() only
    # worked because the Mock instance it returns happens to be truthy.
    eq_(None, u.html_fromstring("<html></html>"))
    # The size check must short-circuit before the html5 parser is called.
    assert_false(fromstring.called)
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
eq_(None, u.html_document_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock(return_value=True))
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_too_big(document_fromstring):
    """html_document_fromstring() returns None without parsing big input."""
    # html_too_big is stubbed to report True explicitly; a bare Mock() only
    # worked because the Mock instance it returns happens to be truthy.
    eq_(None, u.html_document_fromstring("<html></html>"))
    # The size check must short-circuit before the html5 parser is called.
    assert_false(document_fromstring.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None))
def test_bad_html_to_text():
    """html_to_text() yields None when html_fromstring() gives back None."""
    unparsable = "one<br>two<br>three"
    result = u.html_to_text(unparsable)
    eq_(None, result)
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_too_big():
    """html_too_big() is True only when the '<' count exceeds the limit."""
    # 2 '<' characters: below the patched limit of 3.
    eq_(False, u.html_too_big("<div></div>"))
    # Exactly 3 '<' characters: the comparison is strict, so still allowed.
    eq_(False, u.html_too_big("<br><br><br>"))
    # 4 '<' characters: over the limit.
    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
    """html_to_text() converts small markup and gives None when too big."""
    within_limit = "<div>Hello</div>"
    over_limit = "<div><span>Hi</span></div>"
    # Two '<' characters: under the patched limit, so text is extracted.
    eq_("Hello", u.html_to_text(within_limit))
    # Four '<' characters: over the patched limit, so nothing is returned.
    eq_(None, u.html_to_text(over_limit))