Merge pull request #100 from mailgun/sergey/restrict
do not parse html quotations if html is longer than a certain threshold
This commit is contained in:
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.2.12',
|
||||
version='1.2.14',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
|
||||
@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
||||
|
||||
SPLITTER_MAX_LINES = 4
|
||||
MAX_LINES_COUNT = 1000
|
||||
# extensive research shows that exceeding this limit
|
||||
# leads to excessive processing time
|
||||
MAX_HTML_LEN = 2794202
|
||||
|
||||
QUOT_PATTERN = re.compile('^>+ ?')
|
||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||
@@ -382,6 +385,9 @@ def _extract_from_html(msg_body):
|
||||
then checking deleted checkpoints,
|
||||
then deleting necessary tags.
|
||||
"""
|
||||
if len(msg_body) > MAX_HTML_LEN:
|
||||
return msg_body
|
||||
|
||||
if msg_body.strip() == b'':
|
||||
return msg_body
|
||||
|
||||
|
||||
@@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
|
||||
</div><br></div>"""
|
||||
extracted = quotations.extract_from_html(msg_body)
|
||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||
|
||||
|
||||
@patch.object(quotations, 'MAX_HTML_LEN', 1)
|
||||
def test_too_large_html():
|
||||
msg_body = 'Reply' \
|
||||
'<div class="gmail_quote">' \
|
||||
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \
|
||||
'<div>Test</div>' \
|
||||
'</div>' \
|
||||
'</div>'
|
||||
eq_(RE_WHITESPACE.sub('', msg_body),
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
Reference in New Issue
Block a user