diff --git a/setup.py b/setup.py index f9305ff..b1750a1 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.4.5', + version='1.4.7', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index 8b368e5..aec5643 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -22,7 +22,7 @@ import six log = logging.getLogger(__name__) -RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) +RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+\s*$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( @@ -139,13 +139,17 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( 'Oprindelig meddelelse', ))), re.I) -RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format( +RE_FROM_COLON_OR_DATE_COLON = re.compile(u'((_+\r?\n)?[\s]*:?[*]?({})[\s]?:([^\n$]+\n){{1,2}}){{2,}}'.format( u'|'.join(( # "From" in different languages. 'From', 'Van', 'De', 'Von', 'Fra', u'Från', # "Date" in different languages. - 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', - ))), re.I) + 'Date', '[S]ent', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Gesendet', + # "Subject" in different languages. + 'Subject', 'Betreff', 'Objet', 'Emne', u'Ämne', + # "To" in different languages. + 'To', 'An', 'Til', u'À', 'Till' + ))), re.I | re.M) # ---- John Smith wrote ---- RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( @@ -567,7 +571,6 @@ def _correct_splitlines_in_headers(markers, lines): updated_markers = "" i = 0 in_header_block = False - for m in markers: # Only set in_header_block flag when we hit an 's' and line is a header if m == 's': diff --git a/talon/utils.py b/talon/utils.py index 34a21c6..d257c17 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -131,7 +131,7 @@ def html_tree_to_text(tree): for el in tree.iter(): el_text = (el.text or '') + (el.tail or '') if len(el_text) > 1: - if el.tag in _BLOCKTAGS: + if el.tag in _BLOCKTAGS + _HARDBREAKS: text += "\n" if el.tag == 'li': text += " * " @@ -142,7 +142,8 @@ def html_tree_to_text(tree): if href: text += "(%s) " % href - if el.tag in _HARDBREAKS and text and not text.endswith("\n"): + if (el.tag in _HARDBREAKS and text and + not text.endswith("\n") and not el_text): text += "\n" retval = _rm_excessive_newlines(text) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 84be09d..89a7974 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -453,7 +453,8 @@ def test_link_closed_with_quotation_marker_on_new_line(): msg_body = '''8.45am-1pm From: somebody@example.com - +Date: Wed, 16 May 2012 00:15:02 -0600 + > @@ -494,7 +495,9 @@ def test_from_block_starts_with_date(): msg_body = """Blah Date: Wed, 16 May 2012 00:15:02 -0600 -To: klizhentas@example.com""" +To: klizhentas@example.com + +""" eq_('Blah', quotations.extract_from_plain(msg_body)) @@ -564,11 +567,12 @@ def test_mark_message_lines(): # next line should be marked as splitter '_____________', 'From: foo@bar.com', + 'Date: Wed, 16 May 2012 00:15:02 -0600', '', '> Hi', '', 'Signature'] - eq_('tessemet', quotations.mark_message_lines(lines)) + eq_('tesssemet', quotations.mark_message_lines(lines)) lines = ['Just testing the email reply', '', @@ -807,7 +811,7 @@ def test_split_email(): > > """ - expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" + expected_markers = "stttttsttttetesetesmmmmmmsmmmmmmmmmmmmmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) @@ -823,3 +827,15 @@ that this line is intact.""" parsed = quotations.extract_from_plain(msg_body) eq_(msg_body, parsed.decode('utf8')) + + +def test_appointment(): + msg_body = """Invitation for an interview: + +Date: Wednesday 3, October 2011 +Time: 7 : 00am +Address: 130 Fox St + +Please bring in your ID.""" + parsed = quotations.extract_from_plain(msg_body) + eq_(msg_body, parsed.decode('utf8'))