From f0ed5d6c07a7baa90f0964e1d5efe0595683ceda Mon Sep 17 00:00:00 2001
From: Simon
Date: Tue, 14 Apr 2015 18:22:48 +0200
Subject: [PATCH] New splitter pattern for Dutch mail replies

---
Review note: the one-element Dutch alternations in RE_ON_DATE_WROTE_SMB
carry a trailing comma so that u'|'.join() receives a tuple. Without the
comma, ('Op') is just the string 'Op' and join() iterates its characters,
producing 'O|p' (and 's|c|h|r|e|e|f' for 'schreef'), which makes the
splitter match nearly any line that starts with 'O' or 'p' and contains
a colon.

 talon/quotations.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 292b39a..742f1dc 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -31,7 +31,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # French
             'Le',
             # Polish
-            'W dniu'
+            'W dniu',
+            # Dutch
+            'Op'
         )),
         # Date and sender separator
         u'|'.join((
@@ -47,9 +49,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # French
             u'a écrit',
             # Polish
-            u'napisał'
+            u'napisał',
+            # Dutch
+            'schreef', 'verzond'
         ))
     ))
+# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
+RE_ON_DATE_WROTE_SMB = re.compile(
+    u'(-*[ ]?({0}).*(.*\n){{0,2}}.*({1}).*:)'.format(
+        # Beginning of the line
+        u'|'.join((
+            # Dutch
+            'Op',
+        )),
+        # Ending of the line
+        u'|'.join((
+            # Dutch
+            'schreef',
+        ))
+    )
+)
 
 RE_QUOTATION = re.compile(
     r'''
@@ -110,6 +129,7 @@ SPLITTER_PATTERNS = [
     #
     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
     RE_ON_DATE_SMB_WROTE,
+    RE_ON_DATE_WROTE_SMB,
     RE_FROM_COLON_OR_DATE_COLON,
     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
                '( \S+){3,6}@\S+:')