Merge pull request #100 from mailgun/sergey/restrict

do not parse html quotations if html is longer then certain threshold
This commit is contained in:
Sergey Obukhov
2016-08-11 16:08:03 -07:00
committed by GitHub
3 changed files with 19 additions and 1 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.2.12', version='1.2.14',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4 SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000 MAX_LINES_COUNT = 1000
# an extensive research shows that exceeding this limit
# leads to excessive processing time
MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?') QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*') NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -382,6 +385,9 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if len(msg_body) > MAX_HTML_LEN:
return msg_body
if msg_body.strip() == b'': if msg_body.strip() == b'':
return msg_body return msg_body

View File

@@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
</div><br></div>""" </div><br></div>"""
extracted = quotations.extract_from_html(msg_body) extracted = quotations.extract_from_html(msg_body)
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1)
def test_too_large_html():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
'<div>Test</div>' \
'</div>' \
'</div>'
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))