From b16060261aa8cd1b2dc5a4396ff431f2fd66ae7c Mon Sep 17 00:00:00 2001 From: szymonsobczak Date: Tue, 24 Feb 2015 11:39:12 +0100 Subject: [PATCH] support some polish and french formats --- talon/quotations.py | 35 +++++++++++++++++++------ tests/text_quotations_test.py | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 0f34617..58f4465 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -23,14 +23,33 @@ log = logging.getLogger(__name__) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - r''' - ( - -* # could include dashes - [ ]?On[ ].*, # date part ends with comma - (.*\n){0,2} # splitter takes 4 lines at most - .*(wrote|sent): - ) - ''', re.VERBOSE) + u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format( + # Beginning of the line + u'|'.join(( + # English + 'On', + # French + 'Le', + # Polish + 'W dniu' + )), + # Date and sender separator + u'|'.join(( + # most languages separate date and sender address by comma + ',', + # polish date and sender address separator + u'użytkownik' + )), + # Ending of the line + u'|'.join(( + # English + 'wrote', 'sent', + # French + u'a écrit', + # Polish + u'napisał' + )) + )) RE_QUOTATION = re.compile( r''' diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 26939c9..a19cee9 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -202,6 +202,24 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: eq_("Hi", quotations.extract_from_plain(msg_body)) +def test_short_quotation_with_newline(): + msg_body = """Btw blah blah... + +On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" wrote: + +Hi Mark, +Blah blah?  +Thanks,Christine  + +On Jan 27, 2015, at 11:55 AM, Mark XXX wrote: + +Lorem ipsum? +Mark + +Sent from Acompli""" + eq_("Btw blah blah...", quotations.extract_from_plain(msg_body)) + + def test_pattern_date_email_with_unicode(): msg_body = """Replying ok 2011/4/7 Nathan \xd0\xb8ova @@ -233,6 +251,36 @@ Betreff: The manager has commented on your Loop Blah-blah-blah """)) +def test_french_multiline_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +De: Brendan xxx [mailto:brendan.xxx@xxx.com] +Envoyé: vendredi 23 janvier 2015 16:39 +À: Camille XXX +Objet: Follow Up + +Blah-blah-blah +""")) + +def test_french_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +Le 23 janv. 2015 à 22:03, Brendan xxx > a écrit: + +Bonjour!""")) + +def test_polish_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx +napisał: + +Blah! +""")) + def test_danish_from_block(): eq_('Allo! Follow up MIME!', quotations.extract_from_plain( """Allo! Follow up MIME!