Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3a37d8b649 | ||
|
|
f9f428f4c3 | ||
|
|
84a83e865e | ||
|
|
b4c180b9ff | ||
|
|
072a440837 | ||
|
|
105d16644d | ||
|
|
df3338192a | ||
|
|
f0ed5d6c07 | ||
|
|
790463821f | ||
|
|
763d3b308e | ||
|
|
3c9ef4653f | ||
|
|
b16060261a | ||
|
|
13dc43e960 | ||
|
|
3768d7ba31 | ||
|
|
613d1fc815 |
@@ -23,14 +23,49 @@ log = logging.getLogger(__name__)
|
|||||||
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
||||||
|
|
||||||
RE_ON_DATE_SMB_WROTE = re.compile(
|
RE_ON_DATE_SMB_WROTE = re.compile(
|
||||||
r'''
|
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
||||||
(
|
# Beginning of the line
|
||||||
-* # could include dashes
|
u'|'.join((
|
||||||
[ ]?On[ ].*, # date part ends with comma
|
# English
|
||||||
(.*\n){0,2} # splitter takes 4 lines at most
|
'On',
|
||||||
.*(wrote|sent):
|
# French
|
||||||
|
'Le',
|
||||||
|
# Polish
|
||||||
|
'W dniu',
|
||||||
|
# Dutch
|
||||||
|
'Op'
|
||||||
|
)),
|
||||||
|
# Date and sender separator
|
||||||
|
u'|'.join((
|
||||||
|
# most languages separate date and sender address by comma
|
||||||
|
',',
|
||||||
|
# polish date and sender address separator
|
||||||
|
u'użytkownik'
|
||||||
|
)),
|
||||||
|
# Ending of the line
|
||||||
|
u'|'.join((
|
||||||
|
# English
|
||||||
|
'wrote', 'sent',
|
||||||
|
# French
|
||||||
|
u'a écrit',
|
||||||
|
# Polish
|
||||||
|
u'napisał',
|
||||||
|
# Dutch
|
||||||
|
'schreef','verzond','geschreven'
|
||||||
|
))
|
||||||
|
))
|
||||||
|
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||||
|
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||||
|
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
||||||
|
# Beginning of the line
|
||||||
|
'Op',
|
||||||
|
# Ending of the line
|
||||||
|
u'|'.join((
|
||||||
|
# Dutch
|
||||||
|
'schreef','verzond','geschreven'
|
||||||
|
))
|
||||||
|
)
|
||||||
)
|
)
|
||||||
''', re.VERBOSE)
|
|
||||||
|
|
||||||
RE_QUOTATION = re.compile(
|
RE_QUOTATION = re.compile(
|
||||||
r'''
|
r'''
|
||||||
@@ -66,16 +101,33 @@ RE_EMPTY_QUOTATION = re.compile(
|
|||||||
e*
|
e*
|
||||||
''', re.VERBOSE)
|
''', re.VERBOSE)
|
||||||
|
|
||||||
|
# ------Original Message------ or ---- Reply Message ----
|
||||||
|
# With variations in other languages.
|
||||||
|
RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
|
||||||
|
u'|'.join((
|
||||||
|
# English
|
||||||
|
'Original Message', 'Reply Message',
|
||||||
|
# German
|
||||||
|
u'Ursprüngliche Nachricht', 'Antwort Nachricht',
|
||||||
|
# Danish
|
||||||
|
'Oprindelig meddelelse',
|
||||||
|
))), re.I)
|
||||||
|
|
||||||
|
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
||||||
|
u'|'.join((
|
||||||
|
# "From" in different languages.
|
||||||
|
'From', 'Van', 'De', 'Von', 'Fra',
|
||||||
|
# "Date" in different languages.
|
||||||
|
'Date', 'Datum', u'Envoyé'
|
||||||
|
))), re.I)
|
||||||
|
|
||||||
SPLITTER_PATTERNS = [
|
SPLITTER_PATTERNS = [
|
||||||
# ------Original Message------ or ---- Reply Message ----
|
RE_ORIGINAL_MESSAGE,
|
||||||
re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
|
|
||||||
# <date> <person>
|
# <date> <person>
|
||||||
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
|
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
|
||||||
RE_ON_DATE_SMB_WROTE,
|
RE_ON_DATE_SMB_WROTE,
|
||||||
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
|
RE_ON_DATE_WROTE_SMB,
|
||||||
re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'),
|
RE_FROM_COLON_OR_DATE_COLON,
|
||||||
re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'),
|
|
||||||
re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'),
|
|
||||||
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
||||||
'( \S+){3,6}@\S+:')
|
'( \S+){3,6}@\S+:')
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -7,7 +7,8 @@ The body and the message sender string are converted into unicode before
|
|||||||
applying features to them.
|
applying features to them.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from talon.signature.constants import SIGNATURE_MAX_LINES
|
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
||||||
|
TOO_LONG_SIGNATURE_LINE)
|
||||||
from talon.signature.learning.helpers import *
|
from talon.signature.learning.helpers import *
|
||||||
|
|
||||||
|
|
||||||
@@ -20,7 +21,7 @@ def features(sender=''):
|
|||||||
# This one is not from paper.
|
# This one is not from paper.
|
||||||
# Line is too long.
|
# Line is too long.
|
||||||
# This one is less aggressive than `Line is too short`
|
# This one is less aggressive than `Line is too short`
|
||||||
lambda line: 1 if len(line) > 60 else 0,
|
lambda line: 1 if len(line) > TOO_LONG_SIGNATURE_LINE else 0,
|
||||||
# Line contains email pattern.
|
# Line contains email pattern.
|
||||||
binary_regex_search(RE_EMAIL),
|
binary_regex_search(RE_EMAIL),
|
||||||
# Line contains url.
|
# Line contains url.
|
||||||
|
|||||||
@@ -33,6 +33,16 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
|
|||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
|
def test_pattern_on_date_wrote_somebody():
|
||||||
|
eq_('Lorem', quotations.extract_from_plain(
|
||||||
|
"""Lorem
|
||||||
|
|
||||||
|
Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
|
||||||
|
|
||||||
|
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
||||||
|
"""))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_on_date_somebody_wrote_date_with_slashes():
|
def test_pattern_on_date_somebody_wrote_date_with_slashes():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
|
|
||||||
@@ -98,22 +108,24 @@ bla-bla - bla"""
|
|||||||
eq_(reply, quotations.extract_from_plain(msg_body))
|
eq_(reply, quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_original_message():
|
def _check_pattern_original_message(original_message_indicator):
|
||||||
msg_body = """Test reply
|
msg_body = u"""Test reply
|
||||||
|
|
||||||
-----Original Message-----
|
-----{}-----
|
||||||
|
|
||||||
Test"""
|
Test"""
|
||||||
|
eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))
|
||||||
|
|
||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
def test_english_original_message():
|
||||||
|
_check_pattern_original_message('Original Message')
|
||||||
|
_check_pattern_original_message('Reply Message')
|
||||||
|
|
||||||
msg_body = """Test reply
|
def test_german_original_message():
|
||||||
|
_check_pattern_original_message(u'Ursprüngliche Nachricht')
|
||||||
|
_check_pattern_original_message('Antwort Nachricht')
|
||||||
|
|
||||||
-----Original Message-----
|
def test_danish_original_message():
|
||||||
|
_check_pattern_original_message('Oprindelig meddelelse')
|
||||||
Test"""
|
|
||||||
|
|
||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
|
||||||
|
|
||||||
|
|
||||||
def test_reply_after_quotations():
|
def test_reply_after_quotations():
|
||||||
@@ -199,6 +211,33 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
|
|||||||
> Hello"""
|
> Hello"""
|
||||||
eq_("Hi", quotations.extract_from_plain(msg_body))
|
eq_("Hi", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
def test_with_indent():
|
||||||
|
msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
|
||||||
|
|
||||||
|
------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
|
||||||
|
|
||||||
|
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
|
||||||
|
"""
|
||||||
|
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
|
def test_short_quotation_with_newline():
|
||||||
|
msg_body = """Btw blah blah...
|
||||||
|
|
||||||
|
On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
|
||||||
|
|
||||||
|
Hi Mark,
|
||||||
|
Blah blah?
|
||||||
|
Thanks,Christine
|
||||||
|
|
||||||
|
On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
|
||||||
|
|
||||||
|
Lorem ipsum?
|
||||||
|
Mark
|
||||||
|
|
||||||
|
Sent from Acompli"""
|
||||||
|
eq_("Btw blah blah...", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_date_email_with_unicode():
|
def test_pattern_date_email_with_unicode():
|
||||||
msg_body = """Replying ok
|
msg_body = """Replying ok
|
||||||
@@ -208,8 +247,8 @@ def test_pattern_date_email_with_unicode():
|
|||||||
eq_("Replying ok", quotations.extract_from_plain(msg_body))
|
eq_("Replying ok", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_from_block():
|
def test_english_from_block():
|
||||||
msg_body = """Allo! Follow up MIME!
|
eq_('Allo! Follow up MIME!', quotations.extract_from_plain("""Allo! Follow up MIME!
|
||||||
|
|
||||||
From: somebody@example.com
|
From: somebody@example.com
|
||||||
Sent: March-19-11 5:42 PM
|
Sent: March-19-11 5:42 PM
|
||||||
@@ -217,8 +256,70 @@ To: Somebody
|
|||||||
Subject: The manager has commented on your Loop
|
Subject: The manager has commented on your Loop
|
||||||
|
|
||||||
Blah-blah-blah
|
Blah-blah-blah
|
||||||
"""
|
"""))
|
||||||
eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body))
|
|
||||||
|
def test_german_from_block():
|
||||||
|
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
|
||||||
|
"""Allo! Follow up MIME!
|
||||||
|
|
||||||
|
Von: somebody@example.com
|
||||||
|
Gesendet: Dienstag, 25. November 2014 14:59
|
||||||
|
An: Somebody
|
||||||
|
Betreff: The manager has commented on your Loop
|
||||||
|
|
||||||
|
Blah-blah-blah
|
||||||
|
"""))
|
||||||
|
|
||||||
|
def test_french_multiline_from_block():
|
||||||
|
eq_('Lorem ipsum', quotations.extract_from_plain(
|
||||||
|
u"""Lorem ipsum
|
||||||
|
|
||||||
|
De : Brendan xxx [mailto:brendan.xxx@xxx.com]
|
||||||
|
Envoyé : vendredi 23 janvier 2015 16:39
|
||||||
|
À : Camille XXX
|
||||||
|
Objet : Follow Up
|
||||||
|
|
||||||
|
Blah-blah-blah
|
||||||
|
"""))
|
||||||
|
|
||||||
|
def test_french_from_block():
|
||||||
|
eq_('Lorem ipsum', quotations.extract_from_plain(
|
||||||
|
u"""Lorem ipsum
|
||||||
|
|
||||||
|
Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
|
||||||
|
|
||||||
|
Bonjour!"""))
|
||||||
|
|
||||||
|
def test_polish_from_block():
|
||||||
|
eq_('Lorem ipsum', quotations.extract_from_plain(
|
||||||
|
u"""Lorem ipsum
|
||||||
|
|
||||||
|
W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
|
||||||
|
napisał:
|
||||||
|
|
||||||
|
Blah!
|
||||||
|
"""))
|
||||||
|
|
||||||
|
def test_danish_from_block():
|
||||||
|
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
|
||||||
|
"""Allo! Follow up MIME!
|
||||||
|
|
||||||
|
Fra: somebody@example.com
|
||||||
|
Sendt: 19. march 2011 12:10
|
||||||
|
Til: Somebody
|
||||||
|
Emne: The manager has commented on your Loop
|
||||||
|
|
||||||
|
Blah-blah-blah
|
||||||
|
"""))
|
||||||
|
|
||||||
|
def test_dutch_from_block():
|
||||||
|
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
|
||||||
|
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
||||||
|
|
||||||
|
Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
|
||||||
|
|
||||||
|
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
|
||||||
|
"""))
|
||||||
|
|
||||||
|
|
||||||
def test_quotation_marker_false_positive():
|
def test_quotation_marker_false_positive():
|
||||||
|
|||||||
Reference in New Issue
Block a user