fixes mailgun/talon#26

2015-09-21 09:51:26 -07:00
parent 2cb9b5399c
commit ae508fe0e5
3 changed files with 82 additions and 12 deletions
@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
    return msg_body


-def extract_from_html(msg_body):
+def extract_from_html(s):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
    then deleting necessary tags.
    """

-    if msg_body.strip() == '':
-        return msg_body
+    if s.strip() == '':
+        return s
+
+    # replace CRLF with LF temporarily, otherwise CR will be converted to
+    # '&#13;' when doing deepcopy on html tree
+    msg_body, replaced = _CRLF_to_LF(s)

    html_tree = html.document_fromstring(
        msg_body,
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')
-
-    delimiter = get_delimiter(plain_text)
-
-    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return s

    # Collect checkpoints on each line
    line_checkpoints = [
@@ -397,9 +398,10 @@ def extract_from_html(msg_body):
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
-            return html.tostring(html_tree_copy)
+            print 1111111111, replaced
+            return _restore_CRLF(html.tostring(html_tree_copy), replaced)
        else:
-            return msg_body
+            return s

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
@@ -435,3 +437,37 @@ def register_xpath_extensions():
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
+
+
+def _restore_CRLF(s, replaced=True):
+    r"""Restore CRLF if previously CRLF was replaced with LF.
+
+    `s` is the string to process. When `replaced` is true (the default)
+    every LF in `s` is turned into CRLF and the result is returned;
+    otherwise `s` is returned unchanged.
+
+    The docstring is raw so that the backslash escapes in the examples
+    below reach doctest as written.
+
+    >>> _restore_CRLF('a\nb')
+    'a\r\nb'
+    >>> _restore_CRLF('a\nb', replaced=False)
+    'a\nb'
+    """
+    if replaced:
+        # Every LF is expanded, not only those that came from a CRLF pair.
+        return s.replace('\n', '\r\n')
+    return s
+
+
+def _CRLF_to_LF(s):
+    r"""Replace CRLF with LF.
+
+    Returns a `(string, changed)` tuple. When `get_delimiter(s)` returns
+    '\r\n', every '\r\n' in `s` is replaced with '\n' and `changed` is
+    True; otherwise `s` is returned as is and `changed` is False.
+
+    NOTE(review): `get_delimiter` is defined elsewhere; it presumably
+    detects the line delimiter used by `s` -- confirm against its
+    definition.
+
+    >>> s, changed = _CRLF_to_LF('a\r\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    True
+
+    >>> s, changed = _CRLF_to_LF('a\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    False
+    """
+    delimiter = get_delimiter(s)
+    if delimiter == '\r\n':
+        # Only CRLF pairs are rewritten; a lone CR is left untouched.
+        return s.replace(delimiter, '\n'), True
+    return s, False
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():

 </blockquote>"""

-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+    eq_("<html><body><p>Reply\n</p></body></html>",
+        quotations.extract_from_html(msg_body))


 def test_quotation_splitter_outside_blockquote():
@@ -310,3 +310,25 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
+
+
+def test_CRLF():
+    """CRLF survives extract_from_html: CR is not converted to '&#13;'
+    """
+    # Output is expected to equal the CRLF input exactly.
+    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
+
+    msg_body = """Reply
+<blockquote>
+
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+    # Switch the fixture to CRLF line endings before extraction.
+    msg_body = msg_body.replace('\n', '\r\n')
+    # The blockquote is expected to be cut and the line ending after
+    # "Reply" to come back as CRLF.
+    eq_("<html><body><p>Reply\r\n</p></body></html>",
+        quotations.extract_from_html(msg_body))
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():

 def test_empty_body():
    eq_('', quotations.extract_from_plain(''))
+
+
+def test__CRLF_to_LF():
+    """_CRLF_to_LF returns the converted string and a 'changed' flag."""
+    # CRLF pair becomes LF; the trailing lone CR is expected to be kept.
+    eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
+    # No CRLF pair in the input: same string expected back, flag False.
+    eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
+
+
+def test__restore_CRLF():
+    """_restore_CRLF expands LF to CRLF only when `replaced` is true."""
+    # replaced=False: the string is expected back unchanged
+    eq_('\n', quotations._restore_CRLF('\n', replaced=False))
+    # replaced=True: LF is expected to become CRLF
+    eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))    
+    # default
+    eq_('\r\n', quotations._restore_CRLF('\n'))