Merge pull request #56 from mailgun/sergey/1000+German+NL

process first 1000 lines for long messages, support for German and Dutch
This commit is contained in:
Sergey Obukhov
2015-09-11 06:20:34 -07:00
2 changed files with 18 additions and 11 deletions

View File

@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
'W dniu', 'W dniu',
# Dutch # Dutch
'Op' 'Op',
# German
'Am'
)), )),
# Date and sender separator # Date and sender separator
u'|'.join(( u'|'.join((
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
u'napisał', u'napisał',
# Dutch # Dutch
'schreef','verzond','geschreven' 'schreef','verzond','geschreven',
# German
'schrieb'
)) ))
)) ))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile( RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format( u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
# Beginning of the line # Beginning of the line
u'|'.join((
'Op', 'Op',
#German
'Am'
)),
# Ending of the line # Ending of the line
u'|'.join(( u'|'.join((
# Dutch # Dutch
'schreef','verzond','geschreven' 'schreef','verzond','geschreven',
# German
'schrieb'
)) ))
) )
) )
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
else: else:
# in case splitter is spread across several lines # in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter: if splitter:
# append as many splitter markers as lines in splitter # append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines() splitter_lines = splitter.group().splitlines()
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
delimiter = get_delimiter(msg_body) delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter) msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages # don't process too long messages
if len(lines) > MAX_LINES_COUNT: lines = msg_body.splitlines()[:MAX_LINES_COUNT]
return stripped_text
markers = mark_message_lines(lines) markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers) lines = process_marked_lines(lines, markers)

View File

@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1) @patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines(): def test_too_many_lines():
msg_body = """Test reply msg_body = """Test reply
Hi
-----Original Message----- -----Original Message-----
Test""" Test"""
eq_(msg_body, quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote(): def test_pattern_on_date_somebody_wrote():