restrict html processing to a certain number of tags

This commit is contained in:
Sergey Obukhov
2016-09-14 09:33:30 -07:00
parent f04b872e14
commit ea82a9730e
3 changed files with 12 additions and 3 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.3.1', version='1.3.2',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -386,7 +386,7 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if len(msg_body) > MAX_HTML_LEN: if _html_too_big(msg_body):
return msg_body return msg_body
if msg_body.strip() == b'': if msg_body.strip() == b'':
@@ -483,3 +483,12 @@ def register_xpath_extensions():
ns.prefix = 'mg' ns.prefix = 'mg'
ns['text_content'] = text_content ns['text_content'] = text_content
ns['tail'] = tail ns['tail'] = tail
def _html_too_big(msg_body):
    """Tell whether ``msg_body`` contains too many ``<`` characters.

    The number of ``<`` characters is compared against the module-level
    ``_MAX_TAGS_COUNT``; the body itself is not parsed.

    :param msg_body: HTML message body, given either as text or as bytes.
    :returns: ``True`` if the number of ``<`` characters is strictly
        greater than ``_MAX_TAGS_COUNT``, ``False`` otherwise.
    """
    # The needle must have the same type as the body: on Python 3
    # ``bytes.count`` raises TypeError when handed a text argument.
    marker = b'<' if isinstance(msg_body, bytes) else u'<'
    return msg_body.count(marker) > _MAX_TAGS_COUNT


# Extensive research shows that exceeding this limit
# might lead to excessive processing time.
_MAX_TAGS_COUNT = 419

View File

@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1) @patch.object(quotations, '_MAX_TAGS_COUNT', 4)
def test_too_large_html(): def test_too_large_html():
msg_body = 'Reply' \ msg_body = 'Reply' \
'<div class="gmail_quote">' \ '<div class="gmail_quote">' \