Merge pull request #105 from mailgun/sergey/fromstring

html with comment that has no parent crashes html_tree_to_text
bump version
2016-08-15 13:40:37 -07:00 · 2016-08-15 13:21:10 -07:00 · 2016-08-12 17:40:12 -07:00 · 2016-08-12 12:18:50 -07:00 · 2016-08-11 23:59:18 -07:00 · 2016-08-11 23:58:11 -07:00
5 changed files with 94 additions and 37 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):


 setup(name='talon',
-      version='1.2.12',
+      version='1.2.16',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,7 @@ from copy import deepcopy

 from lxml import html, etree

-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter, html_tree_to_text
 from talon import html_quotations
 from six.moves import range
 import six
@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")

 SPLITTER_MAX_LINES = 4
 MAX_LINES_COUNT = 1000
+# extensive research shows that exceeding this limit
+# leads to excessive processing time
+MAX_HTML_LEN = 2794202

 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -382,10 +385,13 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
+    if len(msg_body) > MAX_HTML_LEN:
+        return msg_body
+
    if msg_body.strip() == b'':
        return msg_body

-    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
+    msg_body = msg_body.replace(b'\r\n', b'\n')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
@@ -401,8 +407,7 @@ def _extract_from_html(msg_body):

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
-    msg_with_checkpoints = html.tostring(html_tree)
-    plain_text = html_to_text(msg_with_checkpoints)
+    plain_text = html_tree_to_text(html_tree)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

@@ -425,25 +430,31 @@ def _extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
+
+    if not lines_were_deleted and not cut_quotations:
+        return msg_body
+
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
-    else:
-        if cut_quotations:
-            return html.tostring(html_tree_copy)
-        else:
-            return msg_body

        # Remove tags with quotation checkpoints
        html_quotations.delete_quotation_tags(
            html_tree_copy, 0, quotation_checkpoints
        )

+    if _readable_text_empty(html_tree_copy):
+        return msg_body
+
    return html.tostring(html_tree_copy)


+def _readable_text_empty(html_tree):
+    return not bool(html_tree_to_text(html_tree).strip())
+
+
 def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -112,30 +112,18 @@ def get_delimiter(msg_body):

    return delimiter

-
-def html_to_text(string):
-    """
-    Dead-simple HTML-to-text converter:
-        >>> html_to_text("one<br>two<br>three")
-        >>> "one\ntwo\nthree"
-
-    NOTES:
-        1. the string is expected to contain UTF-8 encoded HTML!
-        2. returns utf-8 encoded str (not unicode)
-    """
-    if isinstance(string, six.text_type):
-        string = string.encode('utf8')
-
-    s = _prepend_utf8_declaration(string)
-    s = s.replace(b"\n", b"")
-
-    tree = html.fromstring(s)
-
+def html_tree_to_text(tree):
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)

    for c in tree.xpath('//comment()'):
-        c.getparent().remove(c)
+        parent = c.getparent()
+
+        # comment with no parent does not impact produced text
+        if not parent:
+            continue
+
+        parent.remove(c)

    text   = ""
    for el in tree.iter():
@@ -159,6 +147,26 @@ def html_to_text(string):
    return _encode_utf8(retval)


+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        >>> "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    if isinstance(string, six.text_type):
+        string = string.encode('utf8')
+
+    s = _prepend_utf8_declaration(string)
+    s = s.replace(b"\n", b"")
+
+    tree = html.fromstring(s)
+    return html_tree_to_text(tree)
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -356,7 +356,8 @@ def test_CRLF():
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))

-    msg_body = """Reply
+    msg_body = """My
+reply
 <blockquote>

  <div>
@@ -371,8 +372,8 @@ def test_CRLF():
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', extracted))
+    # Keep newlines; otherwise "My reply" becomes one word - "Myreply"
+    eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)


 def test_gmail_forwarded_msg():
@@ -380,3 +381,33 @@ def test_gmail_forwarded_msg():
 </div><br></div>"""
    extracted = quotations.extract_from_html(msg_body)
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
+
+
+@patch.object(quotations, 'MAX_HTML_LEN', 1)
+def test_too_large_html():
+    msg_body = 'Reply' \
+               '<div class="gmail_quote">' \
+               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
+               '<div>Test</div>' \
+               '</div>' \
+               '</div>'
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_readable_html_empty():
+    msg_body = """
+<blockquote>
+  Reply
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -6,6 +6,7 @@ from . import *
 from talon import utils as u
 import cchardet
 import six
+from lxml import html


 def test_get_delimiter():
@@ -107,3 +108,9 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
 <p>TEXT 2 <!-- COMMENT 2 --></p>
 </div>"""
    eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
+
+
+def test_comment_no_parent():
+    s = "<!-- COMMENT 1 --> no comment"
+    d = html.document_fromstring(s)
+    eq_("no comment", u.html_tree_to_text(d))
Author	SHA1	Message	Date
Sergey Obukhov	f53b5cc7a6	Merge pull request #105 from mailgun/sergey/fromstring html with comment that has no parent crashes html_tree_to_text	2016-08-15 13:40:37 -07:00
Sergey Obukhov	27adde7aa7	bump version	2016-08-15 13:21:10 -07:00
Sergey Obukhov	a9719833e0	html with comment that has no parent crashes html_tree_to_text	2016-08-12 17:40:12 -07:00
Sergey Obukhov	7bf37090ca	Merge pull request #101 from mailgun/sergey/empty-html if html stripped off quotations does not have readable text fallback …	2016-08-12 12:18:50 -07:00
Sergey Obukhov	44fcef7123	bump version	2016-08-11 23:59:18 -07:00
Sergey Obukhov	69a44b10a1	Merge branch 'master' into sergey/empty-html	2016-08-11 23:58:11 -07:00
Sergey Obukhov	b085e3d049	Merge pull request #104 from mailgun/sergey/spaces fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 23:56:26 -07:00
Sergey Obukhov	4b953bcddc	fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 20:17:37 -07:00
Sergey Obukhov	315eaa7080	if html stripped off quotations does not have readable text fallback to unparsed html	2016-08-11 19:55:23 -07:00
Sergey Obukhov	5a9bc967f1	Merge pull request #100 from mailgun/sergey/restrict do not parse html quotations if html is longer than a certain threshold	2016-08-11 16:08:03 -07:00
Sergey Obukhov	a0d7236d0b	bump version and add a comment	2016-08-11 15:49:09 -07:00
Sergey Obukhov	21e9a31ffe	add test	2016-08-09 17:15:49 -07:00
Sergey Obukhov	4ee46c0a97	do not parse html quotations if html is longer than a certain threshold	2016-08-09 17:08:58 -07:00