diff --git a/setup.py b/setup.py index 83f0714..8f054df 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.12', + version='1.2.14', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index d9dba21..9999e6a 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") SPLITTER_MAX_LINES = 4 MAX_LINES_COUNT = 1000 +# extensive research shows that HTML bodies longer than this limit +# lead to excessive processing time +MAX_HTML_LEN = 2794202 QUOT_PATTERN = re.compile('^>+ ?') NO_QUOT_LINE = re.compile('^[^>].*[\S].*') @@ -382,6 +385,9 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ + if len(msg_body) > MAX_HTML_LEN: + return msg_body + if msg_body.strip() == b'': return msg_body diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index c155cbb..03c66a8 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
""" extracted = quotations.extract_from_html(msg_body) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) + + +@patch.object(quotations, 'MAX_HTML_LEN', 1) +def test_too_large_html(): + msg_body = 'Reply' \ + '
' \ + '
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ + '
Test
' \ + '
' \ + '
' + eq_(RE_WHITESPACE.sub('', msg_body), + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))