Merge pull request #60 from mailgun/sergey/26

fixes mailgun/talon#26
2015-09-21 09:54:35 -07:00
parent 2cb9b5399c e4c1c11845
commit 3b0c9273c1
3 changed files with 81 additions and 12 deletions
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
    return msg_body
-def extract_from_html(msg_body):
+def extract_from_html(s):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
    then deleting necessary tags.
    """
-    if msg_body.strip() == '':
+    if s.strip() == '':
-        return msg_body
+        return s
    # replace CRLF with LF temporarily otherwise CR will be converted to '&#13;'
    # when doing deepcopy on html tree
    msg_body, replaced = _CRLF_to_LF(s)
    html_tree = html.document_fromstring(
        msg_body,
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')
-
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    delimiter = get_delimiter(plain_text)
    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()
    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return s
    # Collect checkpoints on each line
    line_checkpoints = [
@@ -397,9 +398,9 @@ def extract_from_html(msg_body):
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
-            return html.tostring(html_tree_copy)
+            return _restore_CRLF(html.tostring(html_tree_copy), replaced)
        else:
-            return msg_body
+            return s
    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
@@ -435,3 +436,37 @@ def register_xpath_extensions():
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
 def _restore_CRLF(s, replaced=True):
    """Restore CRLF if previously CRLF was replaced with LF
    >>> _restore_CRLF('a\nb')
    'a\r\nb'
    >>> _restore_CRLF('a\nb', replaced=False)
    'a\nb'
    """
    if replaced:
        return s.replace('\n', '\r\n')
    return s
 def _CRLF_to_LF(s):
    """Replace CRLF with LF
    >>> s, changed = _CRLF_to_LF('a\r\n'b)
    >>> s
    'a\nb'
    >>> changed
    True
    >>> s, changed = _CRLF_to_LF('a\n'b)
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
 </blockquote>"""
-    eq_("<html><body><p>Reply</p></body></html>",
+    eq_("<html><body><p>Reply\n</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+        quotations.extract_from_html(msg_body))
 def test_quotation_splitter_outside_blockquote():
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
 def test_yandex_ru_reply():
    """Run `extract_reply_and_check` on the Yandex.ru HTML reply fixture."""
    fixture_path = "tests/fixtures/html_replies/yandex_ru.html"
    extract_reply_and_check(fixture_path)
 def test_CRLF():
    """CR is not converted to '&#13;'

    Checks that `extract_from_html` keeps CRLF line endings as CRLF in its
    output, first for a tiny document and then for a reply followed by a
    blockquote.
    """
    # The expected output is identical to the input, CRLF included.
    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
    msg_body = """Reply
 <blockquote>
  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>
  <div>
    Test
  </div>
 </blockquote>"""
    # Turn every LF of the fixture into CRLF before extraction.
    msg_body = msg_body.replace('\n', '\r\n')
    # The blockquote is absent from the expected result and the line break
    # after "Reply" is still CRLF.
    eq_("<html><body><p>Reply\r\n</p></body></html>",
        quotations.extract_from_html(msg_body))
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
 def test_empty_body():
    """An empty plain-text body is expected back as an empty string."""
    empty = ''
    eq_(empty, quotations.extract_from_plain(empty))
 def test__CRLF_to_LF():
    """Check the `(text, changed)` result of `_CRLF_to_LF` on two inputs."""
    cases = [
        # (input, expected result)
        ('\r\n\r', ('\n\r', True)),
        ('\n\r', ('\n\r', False)),
    ]
    for raw, expected in cases:
        eq_(expected, quotations._CRLF_to_LF(raw))
 def test__restore_CRLF():
    """Cover `_restore_CRLF` with `replaced` false, true, and omitted."""
    lf, crlf = '\n', '\r\n'
    eq_(lf, quotations._restore_CRLF(lf, replaced=False))
    eq_(crlf, quotations._restore_CRLF(lf, replaced=True))
    # `replaced` omitted: the default is expected to restore CRLF
    eq_(crlf, quotations._restore_CRLF(lf))