Merge pull request #104 from mailgun/sergey/spaces

Fixes mailgun/talon#103: keep newlines when parsing HTML quotations.
This commit is contained in:
Sergey Obukhov
2016-08-11 23:56:26 -07:00
committed by GitHub
2 changed files with 5 additions and 4 deletions

View File

@@ -391,7 +391,7 @@ def _extract_from_html(msg_body):
if msg_body.strip() == b'':
return msg_body
msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
msg_body = msg_body.replace(b'\r\n', b'\n')
html_tree = html.document_fromstring(
msg_body,
parser=html.HTMLParser(encoding="utf-8")

View File

@@ -356,7 +356,8 @@ def test_CRLF():
assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
msg_body = """Reply
msg_body = """My
reply
<blockquote>
<div>
@@ -371,8 +372,8 @@ def test_CRLF():
msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))
# Keep new lines otherwise "My reply" becomes one word - "Myreply"
eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)
def test_gmail_forwarded_msg():