Add extra splitter expressions and tests for German and Danish.

Also refactor the splitter patterns to make it easier to add more languages.
This commit is contained in:
Jeremy Schlatter
2014-12-23 15:44:04 -08:00
parent 52505bba8a
commit 613d1fc815
2 changed files with 58 additions and 19 deletions

View File

@@ -66,16 +66,32 @@ RE_EMPTY_QUOTATION = re.compile(
e*
''', re.VERBOSE)
SPLITTER_PATTERNS = [
# ------Original Message------ or ---- Reply Message ----
re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
# With variations in other languages.
# Matches separator lines such as "------Original Message------" or
# "---- Reply Message ----": optional leading whitespace, a run of dashes,
# one of the phrases listed below (captured as group 1), and another run of
# dashes.
#
# The template is a unicode literal with an escaped backslash instead of a raw
# string: the alternatives contain non-ASCII text, so the template must be
# unicode on Python 2, and the ``ur''`` prefix is not valid on Python 3.
#
# re.UNICODE is combined with re.I so that, on Python 2, case-insensitive
# matching also applies to non-ASCII letters (e.g. "Ü" vs. "ü"); it also lets
# ``\s`` match Unicode whitespace there. On Python 3 the flag is already the
# default for text patterns.
RE_ORIGINAL_MESSAGE = re.compile(u'[\\s]*[-]+[ ]*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        u'Original Message', u'Reply Message',
        # German
        u'Ursprüngliche Nachricht', u'Antwort Nachricht',
        # Danish
        u'Oprindelig meddelelse',
    ))), re.I | re.U)
# Matches a "From: ..." / "Date: ..." style header line (case-insensitively,
# in the languages listed below), optionally preceded by a line made of
# underscores and optionally wrapped in "*" emphasis markers, e.g.
# "*Von:* somebody@example.com".
#
# The keyword alternatives are wrapped in their own non-capturing group so
# that the optional ":" / "*" prefix applies to every keyword. Without it,
# alternation precedence attaches the prefix to the first keyword only, and
# lines such as "*Von: ..." or "*Datum: ..." do not match. Group numbering is
# unchanged: group 1 is the underscore line, group 2 the (prefixed) keyword.
#
# NOTE(review): the leading "(:?" of group 2 reads like a typo for "(?:", but
# it is kept as-is because changing it would alter the group count and stop
# accepting a leading ":" - confirm intent before touching it.
RE_FROM_COLON_OR_DATE_COLON = re.compile(
    r'(_+\r?\n)?[\s]*(:?[*]?(?:{})):[*]? .*'.format(
        '|'.join((
            # "From" in different languages.
            'From', 'Van', 'De', 'Von', 'Fra',
            # "Date" in different languages.
            'Date', 'Datum',
        ))), re.I)
# Compiled patterns, each describing one way the header of a quoted message
# can look.
# NOTE(review): how this list is consumed (match vs. search, whether the
# first hit wins) is not visible here - confirm before reordering entries.
SPLITTER_PATTERNS = [
# "-----Original Message-----" style separators and their translations.
RE_ORIGINAL_MESSAGE,
# <date> <person>
# A numeric date written with slashes or dots (e.g. 12/23/2014 or
# 23.12.2014) followed, later on the same line, by an "@". re.VERBOSE has no
# effect on this pattern because it contains no whitespace or comments.
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
# Defined outside this view; presumably an "On <date> <somebody> wrote:"
# line - verify against its definition.
RE_ON_DATE_SMB_WROTE,
# Case-sensitive "From:" / "Date:" header lines, one pattern per language,
# each optionally preceded by a line of underscores. In every pattern the
# optional ":" / "*" prefix belongs to the first alternative only (From, Van,
# De, Von), not to Date / Datum.
# NOTE(review): these overlap with RE_FROM_COLON_OR_DATE_COLON below and may
# be leftovers - confirm whether they are still needed.
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'),
# Case-insensitive, multi-language version of the "From:" / "Date:" header.
RE_FROM_COLON_OR_DATE_COLON,
# "<word>, <day> <word> 20xx[,] <h>:<mm>[:<ss>]" followed by three to six
# space-separated tokens, the last of which contains "@" and is followed by
# ":", e.g. "Tue, 23 Dec 2014 15:44 John Smith <john@example.com>:".
# Only years 2000-2099 are recognised.
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
]

View File

@@ -99,21 +99,21 @@ bla-bla - bla"""
def test_pattern_original_message():
msg_body = """Test reply
languages = (
'Original Message', # English
'Reply Message',
u'Ursprüngliche Nachricht', # German
'Antwort Nachricht',
'Oprindelig meddelelse', # Danish
)
msg_body = u"""Test reply
-----Original Message-----
-----{}-----
Test"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
msg_body = """Test reply
-----Original Message-----
Test"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
for language in languages:
eq_("Test reply", quotations.extract_from_plain(msg_body.format(unicode(language))))
def test_reply_after_quotations():
@@ -209,7 +209,7 @@ def test_pattern_date_email_with_unicode():
def test_pattern_from_block():
msg_body = """Allo! Follow up MIME!
english = """Allo! Follow up MIME!
From: somebody@example.com
Sent: March-19-11 5:42 PM
@@ -218,7 +218,30 @@ Subject: The manager has commented on your Loop
Blah-blah-blah
"""
eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body))
german = """Allo! Follow up MIME!
Von: somebody@example.com
Gesendet: Dienstag, 25. November 2014 14:59
An: Somebody
Betreff: The manager has commented on your Loop
Blah-blah-blah
"""
danish = """Allo! Follow up MIME!
Fra: somebody@example.com
Sendt: 19. march 2011 12:10
Til: Somebody
Emne: The manager has commented on your Loop
Blah-blah-blah
"""
for language in (english, german, danish):
eq_("Allo! Follow up MIME!", quotations.extract_from_plain(language))
def test_quotation_marker_false_positive():