Add better support for Scandinavian languages

This is a port of https://github.com/tictail/claw/pull/6 by @simonflore.
This commit is contained in:
Adam Renberg
2015-09-21 21:41:59 +02:00
parent d62d633215
commit 14e3a0d80b
2 changed files with 37 additions and 4 deletions

View File

@@ -34,7 +34,11 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Dutch
'Op',
# German
'Am'
'Am',
# Norwegian
u'På',
# Swedish, Danish
'Den',
)),
# Date and sender separator
u'|'.join((
@@ -54,7 +58,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Dutch
'schreef','verzond','geschreven',
# German
'schrieb'
'schrieb',
# Norwegian, Swedish
'skrev',
))
))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
@@ -125,9 +131,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
u'|'.join((
# "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra',
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
# "Date" in different languages.
'Date', 'Datum', u'Envoyé'
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I)
SPLITTER_PATTERNS = [

View File

@@ -311,6 +311,33 @@ Emne: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_swedish_from_block():
    # A reply followed by a Swedish mail-client header block
    # (From / Sent / To / Subject lines in Swedish) and the quoted text.
    msg_body = u"""Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
Ämne: RE: Week 36
Blah-blah-blah
"""
    # Only the first line is expected to survive extraction.
    reply = quotations.extract_from_plain(msg_body)
    eq_('Allo! Follow up MIME!', reply)
def test_swedish_from_line():
    # A reply followed by a Swedish "Den <date>, <sender> skrev:" line
    # and the quoted text.
    msg_body = """Lorem
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""
    # Only the text above the "Den ... skrev:" line is expected back.
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem', reply)
def test_norwegian_from_line():
    # A reply followed by a Norwegian "<date>, <sender> skrev:" line
    # and the quoted text.
    msg_body = u"""Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""
    # Only the text above the "... skrev:" line is expected back.
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem', reply)
def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud.