fixes mailgun/talon#26
This commit is contained in:
@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
|
||||
return msg_body
|
||||
|
||||
|
||||
def extract_from_html(msg_body):
|
||||
def extract_from_html(s):
|
||||
"""
|
||||
Extract not quoted message from provided html message body
|
||||
using tags and plain text algorithm.
|
||||
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
|
||||
then deleting necessary tags.
|
||||
"""
|
||||
|
||||
if msg_body.strip() == '':
|
||||
return msg_body
|
||||
if s.strip() == '':
|
||||
return s
|
||||
|
||||
# replace CRLF with LF temporarily, otherwise CR will be converted to ' '
|
||||
# when doing deepcopy on html tree
|
||||
msg_body, replaced = _CRLF_to_LF(s)
|
||||
|
||||
html_tree = html.document_fromstring(
|
||||
msg_body,
|
||||
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
|
||||
plain_text = plain_text.replace('*', '')
|
||||
# Unmask saved star symbols
|
||||
plain_text = plain_text.replace('3423oorkg432', '*')
|
||||
|
||||
delimiter = get_delimiter(plain_text)
|
||||
|
||||
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
|
||||
plain_text = preprocess(plain_text, '\n', content_type='text/html')
|
||||
lines = plain_text.splitlines()
|
||||
|
||||
# Don't process too long messages
|
||||
if len(lines) > MAX_LINES_COUNT:
|
||||
return msg_body
|
||||
return s
|
||||
|
||||
# Collect checkpoints on each line
|
||||
line_checkpoints = [
|
||||
@@ -397,9 +398,10 @@ def extract_from_html(msg_body):
|
||||
quotation_checkpoints[checkpoint] = True
|
||||
else:
|
||||
if cut_quotations:
|
||||
return html.tostring(html_tree_copy)
|
||||
print 1111111111, replaced
|
||||
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
|
||||
else:
|
||||
return msg_body
|
||||
return s
|
||||
|
||||
# Remove tags with quotation checkpoints
|
||||
html_quotations.delete_quotation_tags(
|
||||
@@ -435,3 +437,37 @@ def register_xpath_extensions():
|
||||
ns.prefix = 'mg'
|
||||
ns['text_content'] = text_content
|
||||
ns['tail'] = tail
|
||||
|
||||
|
||||
def _restore_CRLF(s, replaced=True):
|
||||
"""Restore CRLF if previously CRLF was replaced with LF
|
||||
|
||||
>>> _restore_CRLF('a\nb')
|
||||
'a\r\nb'
|
||||
>>> _restore_CRLF('a\nb', replaced=False)
|
||||
'a\nb'
|
||||
"""
|
||||
if replaced:
|
||||
return s.replace('\n', '\r\n')
|
||||
return s
|
||||
|
||||
|
||||
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    Returns a ``(text, changed)`` tuple. When ``get_delimiter(s)`` reports
    ``'\r\n'``, every CRLF in ``s`` is replaced with LF and ``changed`` is
    ``True``; otherwise ``s`` is returned as is and ``changed`` is ``False``.

    The docstring is raw so that the escapes in the examples below reach
    doctest as written instead of being turned into control characters.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False
|
||||
|
||||
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
|
||||
|
||||
</blockquote>"""
|
||||
|
||||
eq_("<html><body><p>Reply</p></body></html>",
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
eq_("<html><body><p>Reply\n</p></body></html>",
|
||||
quotations.extract_from_html(msg_body))
|
||||
|
||||
|
||||
def test_quotation_splitter_outside_blockquote():
|
||||
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
|
||||
|
||||
def test_yandex_ru_reply():
|
||||
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
|
||||
|
||||
|
||||
def test_CRLF():
|
||||
"""CR is not converted to ' '
|
||||
"""
|
||||
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
|
||||
|
||||
msg_body = """Reply
|
||||
<blockquote>
|
||||
|
||||
<div>
|
||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
||||
</div>
|
||||
|
||||
<div>
|
||||
Test
|
||||
</div>
|
||||
|
||||
</blockquote>"""
|
||||
msg_body = msg_body.replace('\n', '\r\n')
|
||||
eq_("<html><body><p>Reply\r\n</p></body></html>",
|
||||
quotations.extract_from_html(msg_body))
|
||||
|
||||
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
|
||||
|
||||
def test_empty_body():
|
||||
eq_('', quotations.extract_from_plain(''))
|
||||
|
||||
|
||||
def test__CRLF_to_LF():
    """_CRLF_to_LF returns the converted text and a "was replaced" flag."""
    cases = (
        # (input, expected (text, changed) result)
        ('\r\n\r', ('\n\r', True)),
        ('\n\r', ('\n\r', False)),
    )
    for raw, expected in cases:
        eq_(expected, quotations._CRLF_to_LF(raw))
|
||||
|
||||
|
||||
def test__restore_CRLF():
    """_restore_CRLF converts LF to CRLF only when asked to."""
    restore = quotations._restore_CRLF

    eq_('\n', restore('\n', replaced=False))
    eq_('\r\n', restore('\n', replaced=True))
    # `replaced` is on by default
    eq_('\r\n', restore('\n'))
|
||||
|
||||
Reference in New Issue
Block a user