Restrict HTML processing to a certain number of tags
This commit is contained in:
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.3.1',
|
version='1.3.2',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -386,7 +386,7 @@ def _extract_from_html(msg_body):
|
|||||||
then checking deleted checkpoints,
|
then checking deleted checkpoints,
|
||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
if len(msg_body) > MAX_HTML_LEN:
|
if _html_too_big(msg_body):
|
||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
if msg_body.strip() == b'':
|
if msg_body.strip() == b'':
|
||||||
@@ -483,3 +483,12 @@ def register_xpath_extensions():
|
|||||||
ns.prefix = 'mg'
|
ns.prefix = 'mg'
|
||||||
ns['text_content'] = text_content
|
ns['text_content'] = text_content
|
||||||
ns['tail'] = tail
|
ns['tail'] = tail
|
||||||
|
|
||||||
|
|
||||||
|
def _html_too_big(msg_body):
|
||||||
|
return msg_body.count('<') > _MAX_TAGS_COUNT
|
||||||
|
|
||||||
|
|
||||||
|
# extensive research shows that exceeding this limit
|
||||||
|
# might lead to excessive processing time
|
||||||
|
_MAX_TAGS_COUNT = 419
|
||||||
|
|||||||
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
|
|||||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||||
|
|
||||||
|
|
||||||
@patch.object(quotations, 'MAX_HTML_LEN', 1)
|
@patch.object(quotations, '_MAX_TAGS_COUNT', 4)
|
||||||
def test_too_large_html():
|
def test_too_large_html():
|
||||||
msg_body = 'Reply' \
|
msg_body = 'Reply' \
|
||||||
'<div class="gmail_quote">' \
|
'<div class="gmail_quote">' \
|
||||||
|
|||||||
Reference in New Issue
Block a user