From 4ee46c0a977477e889209f3f5940f1f759f5bd5d Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Tue, 9 Aug 2016 17:08:58 -0700 Subject: [PATCH 1/3] do not parse html quotations if html is longer than a certain threshold --- talon/quotations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/talon/quotations.py b/talon/quotations.py index d9dba21..f6122ff 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -164,6 +164,7 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") SPLITTER_MAX_LINES = 4 MAX_LINES_COUNT = 1000 +MAX_HTML_LEN = 2794202 QUOT_PATTERN = re.compile('^>+ ?') NO_QUOT_LINE = re.compile('^[^>].*[\S].*') @@ -382,6 +383,9 @@ def _extract_from_html(msg_body): then checking deleted checkpoints, then deleting necessary tags. """ + if len(msg_body) > MAX_HTML_LEN: + return msg_body + if msg_body.strip() == b'': return msg_body From 21e9a31ffe981ae9d58b4143b478b89232875d8d Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Tue, 9 Aug 2016 17:15:49 -0700 Subject: [PATCH 2/3] add test --- tests/html_quotations_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index c155cbb..03c66a8 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
""" extracted = quotations.extract_from_html(msg_body) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) + + +@patch.object(quotations, 'MAX_HTML_LEN', 1) +def test_too_large_html(): + msg_body = 'Reply' \ + '
' \ + '
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ + '
Test
' \ + '
' \ + '
' \ + eq_(RE_WHITESPACE.sub('', msg_body), + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) From a0d7236d0b8d7c2635d0aff040a485b8b0365d36 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Thu, 11 Aug 2016 15:49:09 -0700 Subject: [PATCH 3/3] bump version and add a comment --- setup.py | 2 +- talon/quotations.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83f0714..8f054df 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.12', + version='1.2.14', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index f6122ff..9999e6a 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -164,6 +164,8 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") SPLITTER_MAX_LINES = 4 MAX_LINES_COUNT = 1000 +# extensive research shows that exceeding this limit +# leads to excessive processing time MAX_HTML_LEN = 2794202 QUOT_PATTERN = re.compile('^>+ ?')