diff --git a/talon/quotations.py b/talon/quotations.py index 292b39a..dc77fd4 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -23,7 +23,7 @@ log = logging.getLogger(__name__) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_ON_DATE_SMB_WROTE = re.compile( - u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format( + u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( # Beginning of the line u'|'.join(( # English @@ -31,7 +31,9 @@ RE_ON_DATE_SMB_WROTE = re.compile( # French 'Le', # Polish - 'W dniu' + 'W dniu', + # Dutch + 'Op' )), # Date and sender separator u'|'.join(( @@ -47,9 +49,23 @@ RE_ON_DATE_SMB_WROTE = re.compile( # French u'a écrit', # Polish - u'napisał' + u'napisał', + # Dutch + 'schreef','verzond','geschreven' )) )) +# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' +RE_ON_DATE_WROTE_SMB = re.compile( + u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format( + # Beginning of the line + 'Op', + # Ending of the line + u'|'.join(( + # Dutch + 'schreef','verzond','geschreven' + )) + ) + ) RE_QUOTATION = re.compile( r''' @@ -110,6 +126,7 @@ SPLITTER_PATTERNS = [ # re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), RE_ON_DATE_SMB_WROTE, + RE_ON_DATE_WROTE_SMB, RE_FROM_COLON_OR_DATE_COLON, re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' '( \S+){3,6}@\S+:') diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index b68b132..918ed29 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -33,6 +33,16 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko wrote: eq_("Test reply", quotations.extract_from_plain(msg_body)) +def test_pattern_on_date_wrote_somebody(): + eq_('Lorem', quotations.extract_from_plain( + """Lorem + +Op 13-02-2014 3:18 schreef Julius Caesar : + +Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. +""")) + + def test_pattern_on_date_somebody_wrote_date_with_slashes(): msg_body = """Test reply @@ -201,6 +211,15 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: > Hello""" eq_("Hi", quotations.extract_from_plain(msg_body)) +def test_with_indent(): + msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. + +------On 12/29/1987 17:32 PM, Julius Caesar wrote----- + +Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. + """ + eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body)) + def test_short_quotation_with_newline(): msg_body = """Btw blah blah... @@ -293,6 +312,15 @@ Emne: The manager has commented on your Loop Blah-blah-blah """)) +def test_dutch_from_block(): + eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( + """Gluten-free culpa lo-fi et nesciunt nostrud. + +Op 17-feb.-2015, om 13:18 heeft Julius Caesar het volgende geschreven: + +Small batch beard laboris tempor, non listicle hella Tumblr heirloom. +""")) + def test_quotation_marker_false_positive(): msg_body = """Visit us now for assistance...