From ae508fe0e5fd793bfcec05bbe4ea65da27a69f80 Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Mon, 21 Sep 2015 09:51:26 -0700
Subject: [PATCH 1/2] fixes mailgun/talon#26

---
 talon/quotations.py           | 56 ++++++++++++++++++++++++++++-------
 tests/html_quotations_test.py | 26 ++++++++++++++--
 tests/quotations_test.py      | 12 ++++++++
 3 files changed, 82 insertions(+), 12 deletions(-)
diff --git a/talon/quotations.py b/talon/quotations.py
index db6e0dc..51db576 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
     return msg_body
 
 
-def extract_from_html(msg_body):
+def extract_from_html(s):
     """
     Extract not quoted message from provided html message body
     using tags and plain text algorithm.
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
     then deleting necessary tags.
     """
 
-    if msg_body.strip() == '':
-        return msg_body
+    if s.strip() == '':
+        return s
+
+    # replace CRLF with LF temporarily otherwise CR will be converted to '&#13;'
+    # when doing deepcopy on html tree
+    msg_body, replaced = _CRLF_to_LF(s)
 
     html_tree = html.document_fromstring(
         msg_body,
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
     plain_text = plain_text.replace('*', '')
     # Unmask saved star symbols
     plain_text = plain_text.replace('3423oorkg432', '*')
-
-    delimiter = get_delimiter(plain_text)
-
-    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
     lines = plain_text.splitlines()
 
     # Don't process too long messages
     if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return s
 
     # Collect checkpoints on each line
     line_checkpoints = [
@@ -397,9 +398,10 @@ def extract_from_html(msg_body):
                 quotation_checkpoints[checkpoint] = True
     else:
         if cut_quotations:
-            return html.tostring(html_tree_copy)
+            print 1111111111, replaced
+            return _restore_CRLF(html.tostring(html_tree_copy), replaced)
         else:
-            return msg_body
+            return s
 
     # Remove tags with quotation checkpoints
     html_quotations.delete_quotation_tags(
@@ -435,3 +437,37 @@ def register_xpath_extensions():
     ns.prefix = 'mg'
     ns['text_content'] = text_content
     ns['tail'] = tail
+
+
+def _restore_CRLF(s, replaced=True):
+    r"""Restore CRLF if previously CRLF was replaced with LF
+
+    >>> _restore_CRLF('a\nb')
+    'a\r\nb'
+    >>> _restore_CRLF('a\nb', replaced=False)
+    'a\nb'
+    """
+    if replaced:
+        return s.replace('\n', '\r\n')
+    return s
+
+
+def _CRLF_to_LF(s):
+    r"""Replace CRLF with LF
+
+    >>> s, changed = _CRLF_to_LF('a\r\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    True
+
+    >>> s, changed = _CRLF_to_LF('a\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    False
+    """
+    delimiter = get_delimiter(s)
+    if delimiter == '\r\n':
+        return s.replace(delimiter, '\n'), True
+    return s, False
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index b7b4885..5c4118e 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
 
 </blockquote>"""
 
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+    eq_("<html><body><p>Reply\n</p></body></html>",
+        quotations.extract_from_html(msg_body))
 
 
 def test_quotation_splitter_outside_blockquote():
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
 
 def test_yandex_ru_reply():
     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
+
+
+def test_CRLF():
+    """CR is not converted to '&#13;'
+    """
+    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
+
+    msg_body = """Reply
+<blockquote>
+
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+    msg_body = msg_body.replace('\n', '\r\n')
+    eq_("<html><body><p>Reply\r\n</p></body></html>",
+        quotations.extract_from_html(msg_body))
diff --git a/tests/quotations_test.py b/tests/quotations_test.py
index 7184368..0cd18b2 100644
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
 
 def test_empty_body():
     eq_('', quotations.extract_from_plain(''))
+
+
+def test__CRLF_to_LF():
+    eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
+    eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
+
+
+def test__restore_CRLF():
+    eq_('\n', quotations._restore_CRLF('\n', replaced=False))
+    eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))    
+    # default
+    eq_('\r\n', quotations._restore_CRLF('\n'))

From e4c1c118455533cfa576e6902f4b35d452f3e3eb Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Mon, 21 Sep 2015 09:52:47 -0700
Subject: [PATCH 2/2] remove print

---
 talon/quotations.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 51db576..d699acd 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -398,7 +398,6 @@ def extract_from_html(s):
                 quotation_checkpoints[checkpoint] = True
     else:
         if cut_quotations:
-            print 1111111111, replaced
             return _restore_CRLF(html.tostring(html_tree_copy), replaced)
         else:
             return s