Process only the first 1000 lines of long messages; add support for German and Dutch
This commit is contained in:
@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# Polish
|
# Polish
|
||||||
'W dniu',
|
'W dniu',
|
||||||
# Dutch
|
# Dutch
|
||||||
'Op'
|
'Op',
|
||||||
|
# German
|
||||||
|
'Am'
|
||||||
)),
|
)),
|
||||||
# Date and sender separator
|
# Date and sender separator
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# Polish
|
# Polish
|
||||||
u'napisał',
|
u'napisał',
|
||||||
# Dutch
|
# Dutch
|
||||||
'schreef','verzond','geschreven'
|
'schreef','verzond','geschreven',
|
||||||
|
# German
|
||||||
|
'schrieb'
|
||||||
))
|
))
|
||||||
))
|
))
|
||||||
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||||
RE_ON_DATE_WROTE_SMB = re.compile(
|
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
|
||||||
# Beginning of the line
|
# Beginning of the line
|
||||||
|
u'|'.join((
|
||||||
'Op',
|
'Op',
|
||||||
|
# German
|
||||||
|
'Am'
|
||||||
|
)),
|
||||||
# Ending of the line
|
# Ending of the line
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# Dutch
|
# Dutch
|
||||||
'schreef','verzond','geschreven'
|
'schreef','verzond','geschreven',
|
||||||
|
# German
|
||||||
|
'schrieb'
|
||||||
))
|
))
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
|
|||||||
else:
|
else:
|
||||||
# in case splitter is spread across several lines
|
# in case splitter is spread across several lines
|
||||||
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
||||||
|
|
||||||
if splitter:
|
if splitter:
|
||||||
# append as many splitter markers as lines in splitter
|
# append as many splitter markers as lines in splitter
|
||||||
splitter_lines = splitter.group().splitlines()
|
splitter_lines = splitter.group().splitlines()
|
||||||
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
|
|||||||
|
|
||||||
delimiter = get_delimiter(msg_body)
|
delimiter = get_delimiter(msg_body)
|
||||||
msg_body = preprocess(msg_body, delimiter)
|
msg_body = preprocess(msg_body, delimiter)
|
||||||
lines = msg_body.splitlines()
|
|
||||||
|
|
||||||
# don't process too long messages
|
# for messages that are too long, process only the first MAX_LINES_COUNT lines
|
||||||
if len(lines) > MAX_LINES_COUNT:
|
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
||||||
return stripped_text
|
|
||||||
|
|
||||||
markers = mark_message_lines(lines)
|
markers = mark_message_lines(lines)
|
||||||
lines = process_marked_lines(lines, markers)
|
lines = process_marked_lines(lines, markers)
|
||||||
|
|
||||||
|
|||||||
@@ -12,11 +12,11 @@ from talon import quotations
|
|||||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||||
def test_too_many_lines():
|
def test_too_many_lines():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
|
Hi
|
||||||
-----Original Message-----
|
-----Original Message-----
|
||||||
|
|
||||||
Test"""
|
Test"""
|
||||||
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_on_date_somebody_wrote():
|
def test_pattern_on_date_somebody_wrote():
|
||||||
|
|||||||
Reference in New Issue
Block a user