# -*- coding: utf-8 -*-
"""Tests for ``talon.quotations`` HTML (and one plain-text) reply extraction.

NOTE(review): several fixture and expected-value literals below look as if
their HTML markup was stripped at some point (e.g. ``'' in out`` next to a
message that talks about a missing tag). The literal text is kept exactly as
found; confirm against the upstream fixtures and restore the markup.
"""

from . import *
from . fixtures import *

import regex as re

from talon import quotations

import html2text


# Raw strings keep ``\s`` a regex escape rather than a (deprecated) invalid
# string escape; the compiled patterns are unchanged.
RE_WHITESPACE = re.compile(r"\s")
RE_DOUBLE_WHITESPACE = re.compile(r"\s")


def test_quotation_splitter_inside_blockquote():
    """Compare the raw extraction result for a reply followed by an
    'On ... wrote:' header with the expected text."""
    msg_body = """Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""

    eq_("""

Reply\n

""", quotations.extract_from_html(msg_body))


def test_quotation_splitter_outside_blockquote():
    """Same shape of message; the result is compared after whitespace
    removal."""
    msg_body = """Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""

    eq_("""

Reply

""", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_regular_blockquote():
    """'Regular' text before the 'On ... wrote:' header is expected to stay,
    'Nested' text after it is not."""
    msg_body = """Reply
Regular
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Nested
"""

    eq_("""

Reply

Regular
""", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_no_blockquote():
    """The extracted reply must equal ``reply`` once whitespace is removed
    from both sides."""
    msg_body = """ Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""

    reply = """ Reply """

    eq_(RE_WHITESPACE.sub('', reply),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_empty_body():
    """An empty body yields an empty result."""
    eq_('', quotations.extract_from_html(''))


def test_validate_output_html():
    """Sanity-check the structure of the extracted output."""
    msg_body = """Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""

    out = quotations.extract_from_html(msg_body)
    # NOTE(review): both needles are empty strings as found, so this check is
    # vacuous; presumably they were tag names - TODO restore.
    ok_('' in out and '' in out,
        'Invalid HTML - / tag not present')
    ok_('''
''' not in out,
        '''Invalid HTML output -
element is not valid''')


def test_gmail_quote():
    """Reply followed by an 'On ... wrote:' header and quoted text; compared
    after whitespace removal."""
    msg_body = """Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""

    eq_("""

Reply

""", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_unicode_in_reply():
    """A UTF-8 encoded body containing non-breaking spaces is handled."""
    msg_body = u"""Reply \xa0 \xa0 Text

Quote
""".encode("utf-8")

    eq_("""

Reply  Text


"""
        "",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_blockquote_disclaimer():
    """'Quote' is removed while the surrounding 'message' and 'disclaimer'
    text is kept."""
    msg_body = """
message
Quote
disclaimer
"""

    stripped_html = """
message
disclaimer
"""

    eq_(RE_WHITESPACE.sub('', stripped_html),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_date_block():
    """A header block starting with 'Date:' and everything after it is
    removed."""
    msg_body = """
message

Date: Fri, 23 Mar 2012 12:35:31 -0600
To: bob@example.com
From: rob@example.com
Subject: You Have New Mail From Mary!

text
"""

    eq_('''
message
''', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_from_block():
    """A header block starting with 'From:' and everything after it is
    removed."""
    msg_body = """
message

From: bob@example.com
Date: Fri, 23 Mar 2012 12:35:31 -0600
To: rob@example.com
Subject: You Have New Mail From Mary!

text
"""

    eq_('''
message
''', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_reply_shares_div_with_from_block():
    """The reply text 'Blah' is kept when a Date/To/From/Subject header block
    follows it."""
    msg_body = '''
Blah


Date: Tue, 22 May 2012 18:29:16 -0600
To: xx@hotmail.ca
From: quickemail@ashleymadison.com
Subject: You Have New Mail From x!

'''

    eq_('''
Blah

''', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


def test_reply_quotations_share_block():
    """Plain-text extraction returns something non-empty that no longer
    contains 'From'."""
    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    ok_(stripped_html)
    ok_('From' not in stripped_html)


def test_OLK_SRC_BODY_SECTION_stripped():
    """Extraction from the OLK_SRC_BODY_SECTION fixture leaves only the
    reply."""
    eq_('''
Reply
''', RE_WHITESPACE.sub(
        '', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))


def test_reply_separated_by_hr():
    """Extraction from the REPLY_SEPARATED_BY_HR fixture leaves only the
    reply."""
    eq_('''
Hi
there
''', RE_WHITESPACE.sub(
        '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))


# Expected plain-text rendering of the reply in every html_replies fixture,
# tolerant of extra whitespace around the line breaks.
RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")


def extract_reply_and_check(filename):
    """Extract the reply from the HTML file at *filename* and verify it.

    The extracted HTML is converted to plain text with html2text (line
    wrapping disabled), non-breaking spaces are replaced by ordinary spaces,
    and the result must match ``RE_REPLY``.
    """
    # Context manager so the fixture file is closed even if reading fails.
    with open(filename) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)

    h = html2text.HTML2Text()
    h.body_width = 0  # do not wrap lines
    plain_reply = h.handle(reply)

    # replace non-breaking spaces with regular spaces
    plain_reply = plain_reply.replace(u'\xa0', u' ')

    if not RE_REPLY.match(plain_reply):
        # Always fails here; eq_ is used so the failure shows the expected
        # text next to what was actually extracted.
        eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)


def test_gmail_reply():
    """Check reply extraction for the gmail fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/gmail.html")


def test_mail_ru_reply():
    """Check reply extraction for the mail_ru fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/mail_ru.html")


def test_hotmail_reply():
    """Check reply extraction for the hotmail fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/hotmail.html")


def test_ms_outlook_2003_reply():
    """Check reply extraction for the ms_outlook_2003 fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2003.html")


def test_ms_outlook_2007_reply():
    """Check reply extraction for the ms_outlook_2007 fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")


def test_thunderbird_reply():
    """Check reply extraction for the thunderbird fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")


def test_windows_mail_reply():
    """Check reply extraction for the windows_mail fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/windows_mail.html")


def test_yandex_ru_reply():
    """Check reply extraction for the yandex_ru fixture."""
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")


def test_CRLF():
    """CRLF line endings are preserved in the extracted output."""
    eq_('\r\n', quotations.extract_from_html('\r\n'))

    msg_body = """Reply
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
Test
"""
    msg_body = msg_body.replace('\n', '\r\n')

    eq_("""

Reply\r\n

""", quotations.extract_from_html(msg_body))