Merge pull request #104 from mailgun/sergey/spaces

Fixes mailgun/talon#103: keep newlines when parsing HTML quotations
2016-08-11 23:56:26 -07:00
parent 5a9bc967f1 4b953bcddc
commit b085e3d049
2 changed files with 5 additions and 4 deletions
@@ -391,7 +391,7 @@ def _extract_from_html(msg_body):
    if msg_body.strip() == b'':
        return msg_body

-    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
+    msg_body = msg_body.replace(b'\r\n', b'\n')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
@@ -356,7 +356,8 @@ def test_CRLF():
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))

-    msg_body = """Reply
+    msg_body = """My
+reply
 <blockquote>

  <div>
@@ -371,8 +372,8 @@ def test_CRLF():
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', extracted))
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
+    eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)


 def test_gmail_forwarded_msg():