Add extra splitter expressions and tests for German and Danish.

Also some refactoring to make it a bit easier to add more languages.
This commit is contained in:
Jeremy Schlatter
2014-12-23 15:44:04 -08:00
parent 52505bba8a
commit 613d1fc815
2 changed files with 58 additions and 19 deletions

View File

@@ -66,16 +66,32 @@ RE_EMPTY_QUOTATION = re.compile(
e*
''', re.VERBOSE)
# Separator line that introduces a quoted message, e.g.
# "------Original Message------" or "---- Reply Message ----",
# with variations in other languages.
#
# The backslash in the template is doubled so the literal contains no invalid
# "\s" string escape; the compiled pattern text is unchanged.
# re.U is combined with re.I so that non-ASCII letters such as "ü" are matched
# case-insensitively on Python 2 as well (for text patterns on Python 3 this
# is already the default, so the flag is a no-op there).
RE_ORIGINAL_MESSAGE = re.compile(
    u'[\\s]*[-]+[ ]*({})[ ]*[-]+'.format(
        u'|'.join((
            # English
            u'Original Message', u'Reply Message',
            # German
            u'Ursprüngliche Nachricht', u'Antwort Nachricht',
            # Danish
            u'Oprindelig meddelelse',
        ))),
    re.I | re.U)
# Header line of a quoted message such as "From: ..." or "Date: ...",
# optionally preceded by a line of underscores.
#
# The sender keywords are wrapped in their own non-capturing group so that the
# optional ":" / "*" prefix (as in "*Von:* ...") applies to every sender
# keyword rather than only to the first alternative. The date keywords take
# no prefix. Capture-group numbering is unchanged: group 1 is the underscore
# line, group 2 the (possibly prefixed) keyword.
# NOTE(review): the leading ":?" matches an optional literal colon; it may
# have been intended as "(?:" -- kept as is to preserve existing matches.
RE_FROM_COLON_OR_DATE_COLON = re.compile(
    r'(_+\r?\n)?[\s]*(:?[*]?(?:{})|{}):[*]? .*'.format(
        '|'.join((
            # "From" in different languages.
            'From', 'Van', 'De', 'Von', 'Fra',
        )),
        '|'.join((
            # "Date" in different languages.
            'Date', 'Datum',
        ))),
    re.I)
# List of compiled regular expressions for splitter lines, i.e. the lines
# that introduce a quoted message.
# NOTE(review): this span reads like a diff hunk whose +/- markers were lost.
# The inline "(Original|Reply) Message" and "From|Date"-style entries look
# like earlier forms of RE_ORIGINAL_MESSAGE and RE_FROM_COLON_OR_DATE_COLON --
# confirm which entries are meant to remain before relying on this list.
SPLITTER_PATTERNS = [
# ------Original Message------ or ---- Reply Message ----
re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
RE_ORIGINAL_MESSAGE,
# <date> <person>
# A date written with slashes or dots, followed later on the line by "@".
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
# Defined outside this view.
RE_ON_DATE_SMB_WROTE,
# "<keyword>: ..." header lines, optionally preceded by a line of underscores.
# In each pattern the optional ":" / "*" prefix belongs to the first keyword
# only, because "|" binds more loosely than the prefix.
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'),
RE_FROM_COLON_OR_DATE_COLON,
# "<word>, <1-2 digit day> <word> 20xx[,] <h>:<mm>[:<ss>]" followed by three
# to six space-separated tokens, then "@", more non-space text and a ":".
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
]