If the HTML stripped of quotations has no readable text, fall back to the unparsed HTML

2016-08-11 19:54:53 -07:00
parent 5a9bc967f1
commit 315eaa7080
3 changed files with 56 additions and 31 deletions
@@ -12,7 +12,7 @@ from copy import deepcopy

 from lxml import html, etree

-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter, html_tree_to_text
 from talon import html_quotations
 from six.moves import range
 import six
@@ -407,8 +407,7 @@ def _extract_from_html(msg_body):

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
-    msg_with_checkpoints = html.tostring(html_tree)
-    plain_text = html_to_text(msg_with_checkpoints)
+    plain_text = html_tree_to_text(html_tree)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

@@ -431,25 +430,31 @@ def _extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
+
+    if not lines_were_deleted and not cut_quotations:
+        return msg_body
+
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
-    else:
-        if cut_quotations:
-            return html.tostring(html_tree_copy)
-        else:
-            return msg_body

-    # Remove tags with quotation checkpoints
-    html_quotations.delete_quotation_tags(
-        html_tree_copy, 0, quotation_checkpoints
-    )
+        # Remove tags with quotation checkpoints
+        html_quotations.delete_quotation_tags(
+            html_tree_copy, 0, quotation_checkpoints
+        )
+
+    if _readable_text_empty(html_tree_copy):
+        return msg_body

    return html.tostring(html_tree_copy)


+def _readable_text_empty(html_tree):  # True when the tree's extracted text is empty or whitespace-only
+    return not bool(html_tree_to_text(html_tree).strip())
+
+
 def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
@@ -112,25 +112,7 @@ def get_delimiter(msg_body):

    return delimiter

-
-def html_to_text(string):
-    """
-    Dead-simple HTML-to-text converter:
-        >>> html_to_text("one<br>two<br>three")
-        >>> "one\ntwo\nthree"
-
-    NOTES:
-        1. the string is expected to contain UTF-8 encoded HTML!
-        2. returns utf-8 encoded str (not unicode)
-    """
-    if isinstance(string, six.text_type):
-        string = string.encode('utf8')
-
-    s = _prepend_utf8_declaration(string)
-    s = s.replace(b"\n", b"")
-
-    tree = html.fromstring(s)
-
+def html_tree_to_text(tree):
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)

@@ -159,6 +141,26 @@ def html_to_text(string):
    return _encode_utf8(retval)


+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    if isinstance(string, six.text_type):
+        string = string.encode('utf8')  # work on UTF-8 bytes from here on
+
+    s = _prepend_utf8_declaration(string)
+    s = s.replace(b"\n", b"")  # drop raw newline bytes before parsing
+
+    tree = html.fromstring(s)
+    return html_tree_to_text(tree)  # text extraction is shared with callers that already hold a tree
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -392,3 +392,21 @@ def test_too_large_html():
               '</div>'
    eq_(RE_WHITESPACE.sub('', msg_body),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_readable_html_empty():  # whole body is a single blockquote; expect it back unchanged (ignoring whitespace)
+    msg_body = """
+<blockquote>
+  Reply
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))