# Patch summary (talon/quotations.py and tests/text_quotations_test.py).
#
# talon/quotations.py
#   * RE_ON_DATE_SMB_WROTE: the re.VERBOSE literal is replaced by a
#     str.format() template filled with three '|'-joined alternations:
#       {0} line opener           -> On | Le | W dniu
#       {1} date/sender separator -> , | użytkownik
#       {2} closing verb          -> wrote | sent | a écrit | napisał
#     The doubled braces in "{{0,2}}" are format() escapes for the regex
#     quantifier {0,2}. The new pattern is compiled without flags.
#   * RE_FROM_COLON_OR_DATE_COLON: pattern and join separator become unicode
#     literals, "[\s]?" is inserted before the colon (so "De :" style headers
#     match), and u'Envoyé' is added to the "Date" keywords.
#
# tests/text_quotations_test.py
#   * Adds test_short_quotation_with_newline, test_french_multiline_from_block,
#     test_french_from_block and test_polish_from_block.
#
# NOTE(review): "(:?[*]?{})" in RE_FROM_COLON_OR_DATE_COLON opens a capturing
#   group with an optional ':'; possibly "(?:" was intended -- confirm. The
#   construct is present on both the removed and the added line.
# NOTE(review): in this copy the hunk lines appear joined by single spaces
#   (original newlines/indentation collapsed); presumably the whitespace must
#   be restored from the original commit before the patch can be applied --
#   verify.
diff --git a/talon/quotations.py b/talon/quotations.py index 0f34617..292b39a 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -23,14 +23,33 @@ log = logging.getLogger(__name__) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - r''' - ( - -* # could include dashes - [ ]?On[ ].*, # date part ends with comma - (.*\n){0,2} # splitter takes 4 lines at most - .*(wrote|sent): - ) - ''', re.VERBOSE) + u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format( + # Beginning of the line + u'|'.join(( + # English + 'On', + # French + 'Le', + # Polish + 'W dniu' + )), + # Date and sender separator + u'|'.join(( + # most languages separate date and sender address by comma + ',', + # polish date and sender address separator + u'użytkownik' + )), + # Ending of the line + u'|'.join(( + # English + 'wrote', 'sent', + # French + u'a écrit', + # Polish + u'napisał' + )) + )) RE_QUOTATION = re.compile( r''' @@ -78,12 +97,12 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( 'Oprindelig meddelelse', ))), re.I) -RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{}):[*]? .*'.format( - '|'.join(( +RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( + u'|'.join(( # "From" in different languages. 'From', 'Van', 'De', 'Von', 'Fra', # "Date" in different languages. - 'Date', 'Datum', + 'Date', 'Datum', u'Envoyé' ))), re.I) SPLITTER_PATTERNS = [ diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 26939c9..b68b132 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -202,6 +202,24 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: eq_("Hi", quotations.extract_from_plain(msg_body)) +def test_short_quotation_with_newline(): + msg_body = """Btw blah blah... + +On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" wrote: + +Hi Mark, +Blah blah?  
+Thanks,Christine  + +On Jan 27, 2015, at 11:55 AM, Mark XXX wrote: + +Lorem ipsum? +Mark + +Sent from Acompli""" + eq_("Btw blah blah...", quotations.extract_from_plain(msg_body)) + + def test_pattern_date_email_with_unicode(): msg_body = """Replying ok 2011/4/7 Nathan \xd0\xb8ova @@ -233,6 +251,36 @@ Betreff: The manager has commented on your Loop Blah-blah-blah """)) +def test_french_multiline_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +De : Brendan xxx [mailto:brendan.xxx@xxx.com] +Envoyé : vendredi 23 janvier 2015 16:39 +À : Camille XXX +Objet : Follow Up + +Blah-blah-blah +""")) + +def test_french_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +Le 23 janv. 2015 à 22:03, Brendan xxx > a écrit: + +Bonjour!""")) + +def test_polish_from_block(): + eq_('Lorem ipsum', quotations.extract_from_plain( + u"""Lorem ipsum + +W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx +napisał: + +Blah! +""")) + def test_danish_from_block(): eq_('Allo! Follow up MIME!', quotations.extract_from_plain( """Allo! Follow up MIME!