4 Commits

Author SHA1 Message Date
Sergey Obukhov
0b55e8fa77 Merge pull request #137 from mailgun/sergey/chardet
loosen the encoding requirement for detect_encoding
2017-04-25 11:29:06 -07:00
Sergey Obukhov
6f159e8959 loosen the encoding requirement for detect_encoding 2017-04-25 11:19:01 -07:00
Sergey Obukhov
85a4c1d855 Merge pull request #133 from mailgun/sergey/android
add android quotation pattern
2017-04-10 16:37:17 -07:00
Sergey Obukhov
0f5e72623b add android quotation pattern 2017-04-10 16:33:21 -07:00
4 changed files with 30 additions and 7 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
-version='1.3.4',
+version='1.3.6',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
"regex>=1",
"numpy",
"scipy",
-"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect',

View File

@@ -139,6 +139,13 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I)
+# ---- John Smith wrote ----
+RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
+u'|'.join((
+# English
+'wrote'
+))), re.I)
SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE,
RE_ON_DATE_SMB_WROTE,
@@ -154,10 +161,10 @@ SPLITTER_PATTERNS = [
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
-re.compile('Sent from Samsung .*@.*> wrote')
+re.compile('Sent from Samsung .*@.*> wrote'),
+RE_ANDROID_WROTE
]
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

View File

@@ -142,7 +142,8 @@ def _check_pattern_original_message(original_message_indicator):
-----{}-----
Test"""
-eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
+eq_('Test reply', quotations.extract_from_plain(
+msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message():
_check_pattern_original_message('Original Message')
@@ -165,6 +166,17 @@ Test reply"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
+def test_android_wrote():
+msg_body = """Test reply
+---- John Smith wrote ----
+> quoted
+> text
+"""
+eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_reply_wraps_quotations():
msg_body = """Test reply

View File

@@ -29,7 +29,9 @@ def test_unicode():
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
-eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
+ok_ (u.detect_encoding(
+u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
+'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8
with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
-eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
+ok_ (u.quick_detect_encoding(
+u'Versi\xf3n'.encode('windows-1252')).lower() in [
+'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())