Merge pull request #100 from mailgun/sergey/restrict

do not parse html quotations if html is longer than a certain threshold
2016-08-11 16:08:03 -07:00
parent 10d9a930f9 a0d7236d0b
commit 5a9bc967f1
3 changed files with 19 additions and 1 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.2.12',
+      version='1.2.14',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
 SPLITTER_MAX_LINES = 4
 MAX_LINES_COUNT = 1000
 # an extensive research shows that exceeding this limit
 # leads to excessive processing time
 MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -382,6 +385,9 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
    if len(msg_body) > MAX_HTML_LEN:
        return msg_body
    if msg_body.strip() == b'':
        return msg_body
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
 </div><br></div>"""
    extracted = quotations.extract_from_html(msg_body)
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1)
 def test_too_large_html():
    msg_body = 'Reply' \
               '<div class="gmail_quote">' \
               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
               '<div>Test</div>' \
               '</div>' \
               '</div>'
    eq_(RE_WHITESPACE.sub('', msg_body),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))