Merge pull request #41 from simonflore/master
New splitter pattern for Dutch mail replies
This commit is contained in:
@@ -23,7 +23,7 @@ log = logging.getLogger(__name__)
|
|||||||
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
||||||
|
|
||||||
RE_ON_DATE_SMB_WROTE = re.compile(
|
RE_ON_DATE_SMB_WROTE = re.compile(
|
||||||
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format(
|
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
||||||
# Beginning of the line
|
# Beginning of the line
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# English
|
# English
|
||||||
@@ -31,7 +31,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# French
|
# French
|
||||||
'Le',
|
'Le',
|
||||||
# Polish
|
# Polish
|
||||||
'W dniu'
|
'W dniu',
|
||||||
|
# Dutch
|
||||||
|
'Op'
|
||||||
)),
|
)),
|
||||||
# Date and sender separator
|
# Date and sender separator
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
@@ -47,9 +49,23 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# French
|
# French
|
||||||
u'a écrit',
|
u'a écrit',
|
||||||
# Polish
|
# Polish
|
||||||
u'napisał'
|
u'napisał',
|
||||||
|
# Dutch
|
||||||
|
'schreef','verzond','geschreven'
|
||||||
))
|
))
|
||||||
))
|
))
|
||||||
|
# Special case for languages where the reply header is worded as
# 'on {date} wrote {somebody}:' (the verb comes before the sender), e.g.
#   'Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:'
#
# Capture groups:
#   1 - the whole header, including any leading dashes
#   2 - the opening word
#   3 - the last continuation line consumed, if the header is wrapped
#   4 - the "wrote" verb
#
# The header may wrap, so up to two newlines are allowed between the opening
# word and the verb. This is spelled `.*(\n.*){0,2}` rather than
# `.*(.*\n){0,2}.*`: both accept exactly the same text, but the latter puts
# several unbounded `.*` next to each other, which makes the engine try every
# way of splitting a long line between them before it can give up.
RE_ON_DATE_WROTE_SMB = re.compile(
    u'(-*[ ]?({0})[ ].*(\n.*){{0,2}}({1})[ ].*:)'.format(
        # Beginning of the line
        'Op',
        # Ending of the line
        u'|'.join((
            # Dutch
            'schreef', 'verzond', 'geschreven',
        ))
    )
)
|
||||||
|
|
||||||
RE_QUOTATION = re.compile(
|
RE_QUOTATION = re.compile(
|
||||||
r'''
|
r'''
|
||||||
@@ -110,6 +126,7 @@ SPLITTER_PATTERNS = [
|
|||||||
# <date> <person>
|
# <date> <person>
|
||||||
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
|
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
|
||||||
RE_ON_DATE_SMB_WROTE,
|
RE_ON_DATE_SMB_WROTE,
|
||||||
|
RE_ON_DATE_WROTE_SMB,
|
||||||
RE_FROM_COLON_OR_DATE_COLON,
|
RE_FROM_COLON_OR_DATE_COLON,
|
||||||
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
||||||
'( \S+){3,6}@\S+:')
|
'( \S+){3,6}@\S+:')
|
||||||
|
|||||||
@@ -33,6 +33,16 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
|
|||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
|
def test_pattern_on_date_wrote_somebody():
    # Dutch-style header where the verb precedes the sender:
    # 'Op <date> schreef <sender>:'. Only the text above that header is
    # expected back from the extractor.
    msg_body = """Lorem

Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:

Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem', reply)
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_on_date_somebody_wrote_date_with_slashes():
|
def test_pattern_on_date_somebody_wrote_date_with_slashes():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
|
|
||||||
@@ -201,6 +211,15 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
|
|||||||
> Hello"""
|
> Hello"""
|
||||||
eq_("Hi", quotations.extract_from_plain(msg_body))
|
eq_("Hi", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
def test_with_indent():
    # The 'On <date>, <sender> wrote' header is wrapped in dashes and has no
    # trailing colon; the text above it is the expected reply.
    expected = "YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin."
    reply = quotations.extract_from_plain(
        """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.

------On 12/29/1987 17:32 PM, Julius Caesar wrote-----

Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
""")
    eq_(expected, reply)
|
||||||
|
|
||||||
|
|
||||||
def test_short_quotation_with_newline():
|
def test_short_quotation_with_newline():
|
||||||
msg_body = """Btw blah blah...
|
msg_body = """Btw blah blah...
|
||||||
@@ -293,6 +312,15 @@ Emne: The manager has commented on your Loop
|
|||||||
Blah-blah-blah
|
Blah-blah-blah
|
||||||
"""))
|
"""))
|
||||||
|
|
||||||
|
def test_dutch_from_block():
    # Dutch header of the form
    # 'Op <date>, om <time> heeft <sender> het volgende geschreven:'.
    # Only the text above that header is expected back from the extractor.
    msg_body = """Gluten-free culpa lo-fi et nesciunt nostrud.

Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:

Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
"""
    reply = quotations.extract_from_plain(msg_body)
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', reply)
|
||||||
|
|
||||||
|
|
||||||
def test_quotation_marker_false_positive():
|
def test_quotation_marker_false_positive():
|
||||||
msg_body = """Visit us now for assistance...
|
msg_body = """Visit us now for assistance...
|
||||||
|
|||||||
Reference in New Issue
Block a user