Merge pull request #35 from futuresimple/more_formats

Support some Polish and French formats
This commit is contained in:
Sergey Obukhov
2015-03-02 14:25:26 -08:00
2 changed files with 78 additions and 11 deletions

View File

@@ -23,14 +23,33 @@ log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile( RE_ON_DATE_SMB_WROTE = re.compile(
r''' u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):)'.format(
( # Beginning of the line
-* # could include dashes u'|'.join((
[ ]?On[ ].*, # date part ends with comma # English
(.*\n){0,2} # splitter takes 4 lines at most 'On',
.*(wrote|sent): # French
) 'Le',
''', re.VERBOSE) # Polish
'W dniu'
)),
# Date and sender separator
u'|'.join((
# most languages separate date and sender address by comma
',',
# Polish date and sender address separator
u'użytkownik'
)),
# Ending of the line
u'|'.join((
# English
'wrote', 'sent',
# French
u'a écrit',
# Polish
u'napisał'
))
))
RE_QUOTATION = re.compile( RE_QUOTATION = re.compile(
r''' r'''
@@ -78,12 +97,12 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
'Oprindelig meddelelse', 'Oprindelig meddelelse',
))), re.I) ))), re.I)
RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{}):[*]? .*'.format( RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
'|'.join(( u'|'.join((
# "From" in different languages. # "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', 'From', 'Van', 'De', 'Von', 'Fra',
# "Date" in different languages. # "Date" in different languages.
'Date', 'Datum', 'Date', 'Datum', u'Envoyé'
))), re.I) ))), re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [

View File

@@ -202,6 +202,24 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
eq_("Hi", quotations.extract_from_plain(msg_body)) eq_("Hi", quotations.extract_from_plain(msg_body))
def test_short_quotation_with_newline():
    # The reply is followed by two "On <date>, <sender> wrote:" splitter
    # lines; only the text above the first one is expected back.
    expected_reply = "Btw blah blah..."
    extracted = quotations.extract_from_plain("""Btw blah blah...
On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
Hi Mark,
Blah blah? 
Thanks,Christine 
On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
Lorem ipsum?
Mark
Sent from Acompli""")
    eq_(expected_reply, extracted)
def test_pattern_date_email_with_unicode(): def test_pattern_date_email_with_unicode():
msg_body = """Replying ok msg_body = """Replying ok
2011/4/7 Nathan \xd0\xb8ova <support@example.com> 2011/4/7 Nathan \xd0\xb8ova <support@example.com>
@@ -233,6 +251,36 @@ Betreff: The manager has commented on your Loop
Blah-blah-blah Blah-blah-blah
""")) """))
def test_french_multiline_from_block():
    # The reply is followed by a French header block
    # ("De :", "Envoyé :", "À :", "Objet :"), one field per line.
    msg_body = u"""Lorem ipsum
De : Brendan xxx [mailto:brendan.xxx@xxx.com]
Envoyé : vendredi 23 janvier 2015 16:39
À : Camille XXX
Objet : Follow Up
Blah-blah-blah
"""
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem ipsum', reply)
def test_french_from_block():
    # The reply is followed by a French "Le <date>, <sender> a écrit:" line.
    msg_body = u"""Lorem ipsum
Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
Bonjour!"""
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem ipsum', reply)
def test_polish_from_block():
    # The reply is followed by a Polish "W dniu <date> użytkownik <sender>"
    # line whose closing "napisał:" sits on the next line.
    msg_body = u"""Lorem ipsum
W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
napisał:
Blah!
"""
    reply = quotations.extract_from_plain(msg_body)
    eq_('Lorem ipsum', reply)
def test_danish_from_block(): def test_danish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain( eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
"""Allo! Follow up MIME! """Allo! Follow up MIME!