11 Commits

Author SHA1 Message Date
Sergey Obukhov
f16ae5110b Merge pull request #138 from mailgun/sergey/v1.3.7
bumped talon version
2017-04-25 11:49:29 -07:00
Sergey Obukhov
ab5cbe5ec3 bumped talon version 2017-04-25 11:43:55 -07:00
Sergey Obukhov
be5da92f16 Merge pull request #135 from esetnik/polymail_support
Polymail Quote Support
2017-04-25 11:34:47 -07:00
Sergey Obukhov
95954a65a0 Merge branch 'master' into polymail_support 2017-04-25 11:30:53 -07:00
Sergey Obukhov
0b55e8fa77 Merge pull request #137 from mailgun/sergey/chardet
loosen the encoding requirement for detect_encoding
2017-04-25 11:29:06 -07:00
Sergey Obukhov
6f159e8959 loosen the encoding requirement for detect_encoding 2017-04-25 11:19:01 -07:00
Ethan Setnik
5c413b4b00 allow more lines since polymail has extra whitespace 2017-04-12 00:07:29 -04:00
Ethan Setnik
cca64d3ed1 add test case 2017-04-11 23:36:36 -04:00
Ethan Setnik
e11eaf6ff8 add support for polymail reply format 2017-04-11 22:38:29 -04:00
Sergey Obukhov
85a4c1d855 Merge pull request #133 from mailgun/sergey/android
add android quotation pattern
2017-04-10 16:37:17 -07:00
Sergey Obukhov
0f5e72623b add android quotation pattern 2017-04-10 16:33:21 -07:00
4 changed files with 58 additions and 13 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.3.4', version='1.3.7',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
"regex>=1", "regex>=1",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1', 'chardet>=1.0.1',
'cchardet>=0.3.5', 'cchardet>=0.3.5',
'cssselect', 'cssselect',

View File

@@ -139,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I) ))), re.I)
# Android mail clients introduce the quoted message with a dashed header:
#     ---- John Smith wrote ----
#
# The keyword alternatives are kept in a tuple so more languages can be
# appended later.  The trailing comma is essential: without it ('wrote')
# is just a parenthesised string and u'|'.join() would interleave its
# characters into 'w|r|o|t|e', so the group would match any single one of
# those letters and lines such as "---- hello ----" would be treated as
# splitters.
RE_ANDROID_WROTE = re.compile(u'[\\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        'wrote',
    ))), re.I)
# Support the polymail.io reply format, which spreads the attribution
# header over several lines:
#
#     On Tue, Apr 11, 2017 at 10:07 PM John Smith
#
#     <
#     mailto:John Smith <johnsmith@gmail.com>
#     > wrote:
#
# The pattern is a raw string so that \s reaches the regex engine verbatim
# instead of being parsed as an (invalid) string escape sequence.
RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
@@ -154,16 +169,17 @@ SPLITTER_PATTERNS = [
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'), '( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote: # Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote') re.compile('Sent from Samsung .*@.*> wrote'),
RE_ANDROID_WROTE,
RE_POLYMAIL
] ]
RE_LINK = re.compile('<(http://[^>]*)>') RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
RE_PARENTHESIS_LINK = re.compile("\(https?://") RE_PARENTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4 SPLITTER_MAX_LINES = 6
MAX_LINES_COUNT = 1000 MAX_LINES_COUNT = 1000
# an extensive research shows that exceeding this limit # an extensive research shows that exceeding this limit
# leads to excessive processing time # leads to excessive processing time

View File

@@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
# Checks that a message whose quotation starts with a multi-line
# "On <date> <name> / < / mailto:... / > wrote:" header yields only the
# reply text "Test reply" from quotations.extract_from_plain.
# NOTE(review): indentation (and possibly blank lines inside the fixture
# string) look stripped in this rendering -- confirm against the real file.
def test_pattern_on_date_polymail():
msg_body = """Test reply
On Tue, Apr 11, 2017 at 10:07 PM John Smith
<
mailto:John Smith <johnsmith@gmail.com>
> wrote:
Test quoted data
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_sent_from_samsung_smb_wrote(): def test_pattern_sent_from_samsung_smb_wrote():
msg_body = """Test reply msg_body = """Test reply
@@ -54,7 +67,7 @@ def test_pattern_on_date_wrote_somebody():
"""Lorem """Lorem
Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>: Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
""")) """))
@@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator):
-----{}----- -----{}-----
Test""" Test"""
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message(): def test_english_original_message():
_check_pattern_original_message('Original Message') _check_pattern_original_message('Original Message')
@@ -165,6 +179,17 @@ Test reply"""
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
# Checks that a "---- John Smith wrote ----" line followed by ">"-quoted
# text is cut off, so quotations.extract_from_plain returns only
# "Test reply".
# NOTE(review): indentation (and possibly blank lines inside the fixture
# string) look stripped in this rendering -- confirm against the real file.
def test_android_wrote():
msg_body = """Test reply
---- John Smith wrote ----
> quoted
> text
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_reply_wraps_quotations(): def test_reply_wraps_quotations():
msg_body = """Test reply msg_body = """Test reply
@@ -244,7 +269,7 @@ def test_with_indent():
------On 12/29/1987 17:32 PM, Julius Caesar wrote----- ------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
""" """
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body)) eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
@@ -369,11 +394,11 @@ Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny p
def test_dutch_from_block(): def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud. """Gluten-free culpa lo-fi et nesciunt nostrud.
Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven: Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
Small batch beard laboris tempor, non listicle hella Tumblr heirloom. Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
""")) """))

View File

@@ -29,7 +29,9 @@ def test_unicode():
def test_detect_encoding(): def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower()) eq_ ('ascii', u.detect_encoding(b'qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) ok_ (u.detect_encoding(
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8 # fallback to utf-8
with patch.object(u.chardet, 'detect') as detect: with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():
def test_quick_detect_encoding(): def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) ok_ (u.quick_detect_encoding(
u'Versi\xf3n'.encode('windows-1252')).lower() in [
'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())