From 613d1fc815bdb791e733df5635c75bff4d0d7edf Mon Sep 17 00:00:00 2001 From: Jeremy Schlatter Date: Tue, 23 Dec 2014 15:44:04 -0800 Subject: [PATCH] Add extra splitter expressions and tests for German and Danish. Also some refactoring to make it a bit easier to add more languages. --- talon/quotations.py | 28 +++++++++++++++----- tests/text_quotations_test.py | 49 +++++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 7278513..0f34617 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -66,16 +66,32 @@ RE_EMPTY_QUOTATION = re.compile( e* ''', re.VERBOSE) +# ------Original Message------ or ---- Reply Message ---- +# With variations in other languages. +RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( + u'|'.join(( + # English + 'Original Message', 'Reply Message', + # German + u'Ursprüngliche Nachricht', 'Antwort Nachricht', + # Danish + 'Oprindelig meddelelse', + ))), re.I) + +RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{}):[*]? .*'.format( + '|'.join(( + # "From" in different languages. + 'From', 'Van', 'De', 'Von', 'Fra', + # "Date" in different languages. + 'Date', 'Datum', + ))), re.I) + SPLITTER_PATTERNS = [ - # ------Original Message------ or ---- Reply Message ---- - re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I), + RE_ORIGINAL_MESSAGE, # re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), RE_ON_DATE_SMB_WROTE, - re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'), - re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'), - re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'), - re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'), + RE_FROM_COLON_OR_DATE_COLON, re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' '( \S+){3,6}@\S+:') ] diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index e94cc58..02d2590 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -99,21 +99,21 @@ bla-bla - bla""" def test_pattern_original_message(): - msg_body = """Test reply + languages = ( + 'Original Message', # English + 'Reply Message', + u'Ursprüngliche Nachricht', # German + 'Antwort Nachricht', + 'Oprindelig meddelelse', # Danish + ) + msg_body = u"""Test reply ------Original Message----- +-----{}----- Test""" - eq_("Test reply", quotations.extract_from_plain(msg_body)) - - msg_body = """Test reply - - -----Original Message----- - -Test""" - - eq_("Test reply", quotations.extract_from_plain(msg_body)) + for language in languages: + eq_("Test reply", quotations.extract_from_plain(msg_body.format(unicode(language)))) def test_reply_after_quotations(): @@ -209,7 +209,7 @@ def test_pattern_date_email_with_unicode(): def test_pattern_from_block(): - msg_body = """Allo! Follow up MIME! + english = """Allo! Follow up MIME! From: somebody@example.com Sent: March-19-11 5:42 PM @@ -218,7 +218,30 @@ Subject: The manager has commented on your Loop Blah-blah-blah """ - eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body)) + + german = """Allo! Follow up MIME! + +Von: somebody@example.com +Gesendet: Dienstag, 25. November 2014 14:59 +An: Somebody +Betreff: The manager has commented on your Loop + +Blah-blah-blah +""" + + danish = """Allo! Follow up MIME! + +Fra: somebody@example.com +Sendt: 19. march 2011 12:10 +Til: Somebody +Emne: The manager has commented on your Loop + + +Blah-blah-blah +""" + + for language in (english, german, danish): + eq_("Allo! Follow up MIME!", quotations.extract_from_plain(language)) def test_quotation_marker_false_positive():