Merge pull request #101 from mailgun/sergey/empty-html

If the HTML stripped of quotations has no readable text, fall back to the unparsed HTML
bump version
2016-08-12 12:18:50 -07:00 · 2016-08-11 23:59:18 -07:00 · 2016-08-11 23:58:11 -07:00 · 2016-08-11 23:56:26 -07:00 · 2016-08-11 20:17:37 -07:00 · 2016-08-11 19:55:23 -07:00
4 changed files with 62 additions and 36 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.2.14',
+      version='1.2.15',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,7 @@ from copy import deepcopy
 from lxml import html, etree
-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter, html_tree_to_text
 from talon import html_quotations
 from six.moves import range
 import six
@@ -391,7 +391,7 @@ def _extract_from_html(msg_body):
    if msg_body.strip() == b'':
        return msg_body
-    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
+    msg_body = msg_body.replace(b'\r\n', b'\n')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
@@ -407,8 +407,7 @@ def _extract_from_html(msg_body):
    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
-    msg_with_checkpoints = html.tostring(html_tree)
+    plain_text = html_tree_to_text(html_tree)
    plain_text = html_to_text(msg_with_checkpoints)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()
@@ -431,25 +430,31 @@ def _extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
    if not lines_were_deleted and not cut_quotations:
        return msg_body
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body
        # Remove tags with quotation checkpoints
        html_quotations.delete_quotation_tags(
            html_tree_copy, 0, quotation_checkpoints
        )
    if _readable_text_empty(html_tree_copy):
        return msg_body
    return html.tostring(html_tree_copy)
 def _readable_text_empty(html_tree):
    """Return True if the text rendering of ``html_tree`` is empty.

    The tree is converted with ``html_tree_to_text`` and the result is
    stripped; an empty result after stripping counts as "no readable text".
    """
    text = html_tree_to_text(html_tree)
    return not text.strip()
 def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -112,25 +112,7 @@ def get_delimiter(msg_body):
    return delimiter
-
+def html_tree_to_text(tree):
 def html_to_text(string):
    """
    Dead-simple HTML-to-text converter:
        >>> html_to_text("one<br>two<br>three")
        >>> "one\ntwo\nthree"
    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    if isinstance(string, six.text_type):
        string = string.encode('utf8')
    s = _prepend_utf8_declaration(string)
    s = s.replace(b"\n", b"")
    tree = html.fromstring(s)
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)
@@ -159,6 +141,26 @@ def html_to_text(string):
    return _encode_utf8(retval)
 def html_to_text(string):
    r"""
    Dead-simple HTML-to-text converter.

    Example: ``html_to_text("one<br>two<br>three")`` is expected to give
    ``"one\ntwo\nthree"``.

    The input is encoded to UTF-8 bytes when it is a text string, passed
    through ``_prepend_utf8_declaration``, stripped of newline bytes,
    parsed with ``html.fromstring`` and finally rendered by
    ``html_tree_to_text``.

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    raw = string.encode('utf8') if isinstance(string, six.text_type) else string
    declared = _prepend_utf8_declaration(raw)
    # Newlines are dropped before parsing, exactly as before the refactoring.
    flattened = declared.replace(b"\n", b"")
    return html_tree_to_text(html.fromstring(flattened))
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -356,7 +356,8 @@ def test_CRLF():
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
-    msg_body = """Reply
+    msg_body = """My
 reply
 <blockquote>
  <div>
@@ -371,8 +372,8 @@ def test_CRLF():
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
-        RE_WHITESPACE.sub('', extracted))
+    eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)
 def test_gmail_forwarded_msg():
@@ -392,3 +393,21 @@ def test_too_large_html():
               '</div>'
    eq_(RE_WHITESPACE.sub('', msg_body),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 def test_readable_html_empty():
    # The whole message, reply text included, sits inside the blockquote.
    # The extractor is expected to hand back the original body (compared
    # with whitespace removed) rather than markup with nothing readable.
    msg_body = """
 <blockquote>
  Reply
  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>
  <div>
    Test
  </div>
 </blockquote>"""
    extracted = quotations.extract_from_html(msg_body)
    expected = RE_WHITESPACE.sub('', msg_body)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
Author	SHA1	Message	Date
Sergey Obukhov	7bf37090ca	Merge pull request #101 from mailgun/sergey/empty-html if html stripped off quotations does not have readable text fallback …	2016-08-12 12:18:50 -07:00
Sergey Obukhov	44fcef7123	bump version	2016-08-11 23:59:18 -07:00
Sergey Obukhov	69a44b10a1	Merge branch 'master' into sergey/empty-html	2016-08-11 23:58:11 -07:00
Sergey Obukhov	b085e3d049	Merge pull request #104 from mailgun/sergey/spaces fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 23:56:26 -07:00
Sergey Obukhov	4b953bcddc	fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 20:17:37 -07:00
Sergey Obukhov	315eaa7080	if html stripped off quotations does not have readable text fallback to unparsed html	2016-08-11 19:55:23 -07:00