From ae508fe0e5fd793bfcec05bbe4ea65da27a69f80 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 21 Sep 2015 09:51:26 -0700 Subject: [PATCH 1/2] fixes mailgun/talon#26 --- talon/quotations.py | 56 ++++++++++++++++++++++++++++------- tests/html_quotations_test.py | 26 ++++++++++++++-- tests/quotations_test.py | 12 ++++++++ 3 files changed, 82 insertions(+), 12 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index db6e0dc..51db576 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -315,7 +315,7 @@ def extract_from_plain(msg_body): return msg_body -def extract_from_html(msg_body): +def extract_from_html(s): """ Extract not quoted message from provided html message body using tags and plain text algorithm. @@ -332,8 +332,12 @@ def extract_from_html(msg_body): then deleting necessary tags. """ - if msg_body.strip() == '': - return msg_body + if s.strip() == '': + return s + + # replace CRLF with LF temporarily otherwise CR will be converted to '&#13;' + # when doing deepcopy on html tree + msg_body, replaced = _CRLF_to_LF(s) html_tree = html.document_fromstring( msg_body, @@ -364,15 +368,12 @@ def extract_from_html(msg_body): plain_text = plain_text.replace('*', '') # Unmask saved star symbols plain_text = plain_text.replace('3423oorkg432', '*') - - delimiter = get_delimiter(plain_text) - - plain_text = preprocess(plain_text, delimiter, content_type='text/html') + plain_text = preprocess(plain_text, '\n', content_type='text/html') lines = plain_text.splitlines() # Don't process too long messages if len(lines) > MAX_LINES_COUNT: - return msg_body + return s # Collect checkpoints on each line line_checkpoints = [ @@ -397,9 +398,10 @@ def extract_from_html(msg_body): quotation_checkpoints[checkpoint] = True else: if cut_quotations: - print 1111111111, replaced + return _restore_CRLF(html.tostring(html_tree_copy), replaced) else: - return msg_body + return s # Remove tags with quotation checkpoints 
html_quotations.delete_quotation_tags( @@ -435,3 +437,37 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail + + +def _restore_CRLF(s, replaced=True): + """Restore CRLF if previously CRLF was replaced with LF + + >>> _restore_CRLF('a\nb') + 'a\r\nb' + >>> _restore_CRLF('a\nb', replaced=False) + 'a\nb' + """ + if replaced: + return s.replace('\n', '\r\n') + return s + + +def _CRLF_to_LF(s): + """Replace CRLF with LF + + >>> s, changed = _CRLF_to_LF('a\r\nb') + >>> s + 'a\nb' + >>> changed + True + + >>> s, changed = _CRLF_to_LF('a\nb') + >>> s + 'a\nb' + >>> changed + False + """ + delimiter = get_delimiter(s) + if delimiter == '\r\n': + return s.replace(delimiter, '\n'), True + return s, False diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index b7b4885..5c4118e 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote(): """ - eq_("

Reply

", - RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + eq_("

Reply\n

", + quotations.extract_from_html(msg_body)) def test_quotation_splitter_outside_blockquote(): @@ -310,3 +310,25 @@ def test_windows_mail_reply(): def test_yandex_ru_reply(): extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") + + +def test_CRLF(): + """CR is not converted to '&#13;' + """ + eq_('\r\n', quotations.extract_from_html('\r\n')) + + msg_body = """Reply
+ +
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+ Test +
+ +
""" + msg_body = msg_body.replace('\n', '\r\n') + eq_("

Reply\r\n

", + quotations.extract_from_html(msg_body)) diff --git a/tests/quotations_test.py b/tests/quotations_test.py index 7184368..0cd18b2 100644 --- a/tests/quotations_test.py +++ b/tests/quotations_test.py @@ -29,3 +29,15 @@ def test_crash_inside_extract_from(): def test_empty_body(): eq_('', quotations.extract_from_plain('')) + + +def test__CRLF_to_LF(): + eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r')) + eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r')) + + +def test__restore_CRLF(): + eq_('\n', quotations._restore_CRLF('\n', replaced=False)) + eq_('\r\n', quotations._restore_CRLF('\n', replaced=True)) + # default + eq_('\r\n', quotations._restore_CRLF('\n')) From e4c1c118455533cfa576e6902f4b35d452f3e3eb Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Mon, 21 Sep 2015 09:52:47 -0700 Subject: [PATCH 2/2] remove print --- talon/quotations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/talon/quotations.py b/talon/quotations.py index 51db576..d699acd 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -398,7 +398,6 @@ def extract_from_html(s): quotation_checkpoints[checkpoint] = True else: if cut_quotations: - print 1111111111, replaced return _restore_CRLF(html.tostring(html_tree_copy), replaced) else: return s