Merge pull request #56 from mailgun/sergey/1000+German+NL
Process only the first 1000 lines of long messages; add support for German and Dutch.
This commit is contained in:
@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
'W dniu',
|
||||
# Dutch
|
||||
'Op'
|
||||
'Op',
|
||||
# German
|
||||
'Am'
|
||||
)),
|
||||
# Date and sender separator
|
||||
u'|'.join((
|
||||
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
u'napisał',
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven'
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb'
|
||||
))
|
||||
))
|
||||
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
|
||||
# Beginning of the line
|
||||
u'|'.join((
|
||||
'Op',
|
||||
#German
|
||||
'Am'
|
||||
)),
|
||||
# Ending of the line
|
||||
u'|'.join((
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven'
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb'
|
||||
))
|
||||
)
|
||||
)
|
||||
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
|
||||
else:
|
||||
# in case splitter is spread across several lines
|
||||
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
||||
|
||||
if splitter:
|
||||
# append as many splitter markers as lines in splitter
|
||||
splitter_lines = splitter.group().splitlines()
|
||||
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
|
||||
|
||||
delimiter = get_delimiter(msg_body)
|
||||
msg_body = preprocess(msg_body, delimiter)
|
||||
lines = msg_body.splitlines()
|
||||
|
||||
# don't process too long messages
|
||||
if len(lines) > MAX_LINES_COUNT:
|
||||
return stripped_text
|
||||
|
||||
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
||||
markers = mark_message_lines(lines)
|
||||
lines = process_marked_lines(lines, markers)
|
||||
|
||||
|
||||
@@ -12,11 +12,11 @@ from talon import quotations
|
||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||
def test_too_many_lines():
|
||||
msg_body = """Test reply
|
||||
|
||||
Hi
|
||||
-----Original Message-----
|
||||
|
||||
Test"""
|
||||
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_pattern_on_date_somebody_wrote():
|
||||
|
||||
Reference in New Issue
Block a user