From b16060261aa8cd1b2dc5a4396ff431f2fd66ae7c Mon Sep 17 00:00:00 2001 From: szymonsobczak Date: Tue, 24 Feb 2015 11:39:12 +0100 Subject: [PATCH 1/2] support some polish and french formats --- talon/quotations.py | 35 +++++++++++++++++++------ tests/text_quotations_test.py | 48 +++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 0f34617..58f4465 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -23,14 +23,33 @@ log = logging.getLogger(__name__) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - r''' - ( - -* # could include dashes - [ ]?On[ ].*, # date part ends with comma - (.*\n){0,2} # splitter takes 4 lines at most - .*(wrote|sent): - ) - ''', re.VERBOSE) + u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format( + # Beginning of the line + u'|'.join(( + # English + 'On', + # French + 'Le', + # Polish + 'W dniu' + )), + # Date and sender separator + u'|'.join(( + # most languages separate date and sender address by comma + ',', + # polish date and sender address separator + u'użytkownik' + )), + # Ending of the line + u'|'.join(( + # English + 'wrote', 'sent', + # French + u'a écrit', + # Polish + u'napisał' + )) + )) RE_QUOTATION = re.compile( r''' diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 26939c9..a19cee9 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -202,6 +202,24 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: eq_("Hi", quotations.extract_from_plain(msg_body)) +def test_short_quotation_with_newline(): + msg_body = """Btw blah blah... + +On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" wrote: + +Hi Mark, +Blah blah?  +Thanks,Christine  + +On Jan 27, 2015, at 11:55 AM, Mark XXX wrote: + +Lorem ipsum? +Mark + +Sent from Acompli""" + eq_("Btw blah blah...", quotations.extract_from_plain(msg_body)) + + def test_pattern_date_email_with_unicode(): msg_body = """Replying ok 2011/4/7 Nathan \xd0\xb8ova @@ -233,6 +251,36 @@ Betreff: The manager has commented on your Loop Blah-blah-blah """)) +def test_french_multiline_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +De: Brendan xxx [mailto:brendan.xxx@xxx.com] +Envoyé: vendredi 23 janvier 2015 16:39 +À: Camille XXX +Objet: Follow Up + +Blah-blah-blah +""")) + +def test_french_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +Le 23 janv. 2015 à 22:03, Brendan xxx > a écrit: + +Bonjour!""")) + +def test_polish_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx +napisał: + +Blah! +""")) + def test_danish_from_block(): eq_('Allo! Follow up MIME!', quotations.extract_from_plain( """Allo! Follow up MIME! From 3c9ef4653f589f8795b796a09c45504519233c28 Mon Sep 17 00:00:00 2001 From: szymonsobczak Date: Tue, 24 Feb 2015 12:18:54 +0100 Subject: [PATCH 2/2] some more french fromats --- talon/quotations.py | 6 +++--- tests/text_quotations_test.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 58f4465..292b39a 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -97,12 +97,12 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( 'Oprindelig meddelelse', ))), re.I) -RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{}):[*]? .*'.format( - '|'.join(( +RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( + u'|'.join(( # "From" in different languages. 'From', 'Van', 'De', 'Von', 'Fra', # "Date" in different languages. - 'Date', 'Datum', + 'Date', 'Datum', u'Envoyé' ))), re.I) SPLITTER_PATTERNS = [ diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index a19cee9..b68b132 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -255,10 +255,10 @@ def test_french_multiline_from_block(): eq_('Lorem ipsum', quotations.extract_from_plain( u"""Lorem ipsum -De: Brendan xxx [mailto:brendan.xxx@xxx.com] -Envoyé: vendredi 23 janvier 2015 16:39 -À: Camille XXX -Objet: Follow Up +De : Brendan xxx [mailto:brendan.xxx@xxx.com] +Envoyé : vendredi 23 janvier 2015 16:39 +À : Camille XXX +Objet : Follow Up Blah-blah-blah """))