Process only the first 1000 lines of long messages; add support for German and Dutch

This commit is contained in:
Sergey Obukhov
2015-09-11 06:17:14 -07:00
parent 127771dac9
commit 385285e5de
2 changed files with 18 additions and 11 deletions

View File

@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish
'W dniu',
# Dutch
'Op'
'Op',
# German
'Am'
)),
# Date and sender separator
u'|'.join((
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish
u'napisał',
# Dutch
'schreef','verzond','geschreven'
'schreef','verzond','geschreven',
# German
'schrieb'
))
))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
# Beginning of the line
u'|'.join((
'Op',
# German
'Am'
)),
# Ending of the line
u'|'.join((
# Dutch
'schreef','verzond','geschreven'
'schreef','verzond','geschreven',
# German
'schrieb'
))
)
)
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
else:
# in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter:
# append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines()
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return stripped_text
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers)

View File

@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines():
msg_body = """Test reply
Hi
-----Original Message-----
Test"""
eq_(msg_body, quotations.extract_from_plain(msg_body))
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote():