13 Commits

Author           SHA1         Message                                                                        Date
Sergey Obukhov   b5af9c03a5   bump up version                                                                2015-09-11 10:42:26 -07:00
Sergey Obukhov   176c7e7532   Merge pull request #57 from mailgun/sergey/to_unicode                          2015-09-11 10:40:52 -07:00
                              (use precise encoding when converting to unicode)
Sergey Obukhov   15976888a0   use precise encoding when converting to unicode                                2015-09-11 10:38:28 -07:00
Sergey Obukhov   9bee502903   bump up version                                                                2015-09-11 06:27:12 -07:00
Sergey Obukhov   e3cb8dc3e6   Merge pull request #56 from mailgun/sergey/1000+German+NL                      2015-09-11 06:20:34 -07:00
                              (process first 1000 lines for long messages, support for German and Dutch)
Sergey Obukhov   385285e5de   process first 1000 lines for long messages, support for German and Dutch      2015-09-11 06:17:14 -07:00
Sergey Obukhov   127771dac9   bump up version                                                                2015-09-11 04:51:39 -07:00
Sergey Obukhov   cc98befba5   Merge pull request #50 from Easy-D/preserve-regular-blockquotes                2015-09-11 04:49:36 -07:00
                              (Preserve regular blockquotes)
Sergey Obukhov   567549cba4   bump up talon version                                                          2015-09-10 10:47:16 -07:00
Sergey Obukhov   76c4f49be8   Merge pull request #55 from mailgun/sergey/lxml                                2015-09-10 10:44:59 -07:00
                              (unpin lxml version)
Sergey Obukhov   d9d89dc250   unpin lxml version                                                             2015-09-10 10:44:05 -07:00
Easy-D           390b0a6dc9   preserve regular blockquotes                                                   2015-07-16 21:31:41 +02:00
Easy-D           ed6b861a47   add failing test that shows how regular blockquotes are removed                2015-07-16 21:24:49 +02:00
8 changed files with 137 additions and 27 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(name='talon',
-      version='1.0.5',
+      version='1.0.7',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
@@ -14,12 +14,14 @@ setup(name='talon',
       include_package_data=True,
       zip_safe=True,
       install_requires=[
-          "lxml==2.3.3",
+          "lxml>=2.3.3",
           "regex>=1",
           "html2text",
           "numpy",
           "scipy",
           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+          'chardet>=1.0.1',
+          'cchardet>=0.3.5',
           ],
       tests_require=[
           "mock",

View File

@@ -138,9 +138,10 @@ def cut_by_id(html_message):
 def cut_blockquote(html_message):
-    ''' Cuts blockquote with wrapping elements. '''
-    quote = html_message.find('.//blockquote')
-    if quote is not None:
+    ''' Cuts the last non-nested blockquote with wrapping elements. '''
+    quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
+    if quote:
+        quote = quote[0]
         quote.getparent().remove(quote)
         return True
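For context, a minimal sketch (not part of the diff) of what the new XPath selects, assuming lxml is available:

import lxml.html

html = lxml.html.fromstring(
    '<div><blockquote>first</blockquote>'
    '<blockquote>reply<blockquote>nested</blockquote></blockquote></div>')

# [not(ancestor::blockquote)] drops nested quotes; [last()] keeps only the
# final top-level blockquote -- here the one whose text is 'reply'.
quote = html.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
print(quote[0].text)  # -> 'reply'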

View File

@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # Polish
             'W dniu',
             # Dutch
-            'Op'
+            'Op',
+            # German
+            'Am'
         )),
         # Date and sender separator
         u'|'.join((
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # Polish
             u'napisał',
             # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
         ))
     ))

 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
         # Beginning of the line
         u'|'.join((
             'Op',
+            #German
+            'Am'
         )),
         # Ending of the line
         u'|'.join((
             # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
         ))
     )
 )
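To make the new alternatives concrete, here is a rough sketch (illustrative strings, not taken from the test suite) of the kind of German and Dutch reply headers these patterns are meant to flag as quotation splitters:

# Hypothetical reply headers; with 'Am'/'schrieb' and 'Op'/'schreef' in the
# alternatives above, lines like these should now be treated as splitters.
german_header = u'Am 11.09.2015 um 10:40 schrieb Bob Schmidt <bob@example.com>:'
dutch_header = u'Op 11 september 2015 10:40 schreef Bob Jansen <bob@example.com>:'

# e.g. quotations.extract_from_plain(u'Danke!\n\n' + german_header + u'\n> Zitat')
# should now return just u'Danke!'.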
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
         else:
             # in case splitter is spread across several lines
             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
+
             if splitter:
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
     delimiter = get_delimiter(msg_body)
     msg_body = preprocess(msg_body, delimiter)
-    lines = msg_body.splitlines()
-
-    # don't process too long messages
-    if len(lines) > MAX_LINES_COUNT:
-        return stripped_text
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
     lines = process_marked_lines(lines, markers)
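The behavioural shift here: a very long message is no longer returned untouched; only its first MAX_LINES_COUNT lines (1000, per the commit message) are scanned and returned. A small hedged sketch of the expected effect:

from talon import quotations

# A short reply followed by a huge quoted tail.
msg_body = u'Test reply\n\n-----Original Message-----\n\n' + u'quoted line\n' * 5000

# Previously this exceeded MAX_LINES_COUNT and came back unchanged; with the
# slice above, the splitter near the top is still found and only the reply
# should be returned.
print(quotations.extract_from_plain(msg_body))  # expected: u'Test reply'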

View File

@@ -134,7 +134,7 @@ def extract_names(sender):
     >>> extract_names('')
     []
     """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
     # Remove non-alphabetical characters
     sender = "".join([char if char.isalpha() else ' ' for char in sender])
     # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
     50.0
     '''
     count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     for c in s:
         if unicodedata.category(c) in categories:
             count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
     '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     words = re.split('\s', s)
     words = [w for w in words if w.strip()]
     capitalized_words_counter = 0
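Why precise=True matters for these helpers: sender strings often arrive as Latin-1/Windows-1252 bytes, and decoding them as UTF-8 with 'replace' silently mangles accented characters. A minimal sketch (Python 2 semantics, matching the code above):

raw = 'Versi\xf3n'  # Windows-1252 / Latin-1 bytes for u'Versión'

# Old behaviour: assume UTF-8; the 0xf3 byte is invalid UTF-8 and becomes U+FFFD.
print(repr(unicode(raw, 'utf-8', 'replace')))         # u'Versi\ufffdn'

# With precise=True the encoding is detected first, so the character survives.
print(repr(unicode(raw, 'windows-1252', 'replace')))  # u'Versi\xf3n'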

View File

@@ -2,13 +2,12 @@
 import logging
 from random import shuffle

 import chardet
+import cchardet

 from talon.constants import RE_DELIMITER

 log = logging.getLogger(__name__)


 def safe_format(format_string, *args, **kwargs):
     """
     Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
         u'привет'

     If `precise` flag is True, tries to guess the correct encoding first.
     """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
     if isinstance(str_or_unicode, str):
         return unicode(str_or_unicode, encoding, 'replace')
     return str_or_unicode


+def detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Defaults to UTF-8.
+    """
+    try:
+        detected = chardet.detect(string)
+        if detected:
+            return detected.get('encoding') or 'utf-8'
+    except Exception, e:
+        print 11111111111, e
+        pass
+    return 'utf-8'
+
+
+def quick_detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Uses cchardet. Fallbacks to detect_encoding.
+    """
+    try:
+        detected = cchardet.detect(string)
+        if detected:
+            return detected.get('encoding') or detect_encoding(string)
+    except Exception, e:
+        print 222222222222, e
+        pass
+    return detect_encoding(string)
+
+
 def to_utf8(str_or_unicode):
     """
     Safely returns a UTF-8 version of a given string
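A short usage sketch of the new detection chain (cchardet first, chardet as fallback, UTF-8 as the last resort), mirroring the tests further down; exact encoding names can vary in case:

from talon import utils as u

print(u.quick_detect_encoding('qwe'))                  # 'ascii' (case may vary)
print(u.quick_detect_encoding('Versi\xf3n'))           # a Latin-1 family answer, e.g. 'WINDOWS-1252'
print(repr(u.to_unicode('Versi\xf3n', precise=True)))  # u'Versi\xf3n', i.e. u'Versión'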

View File

@@ -49,6 +49,24 @@ def test_quotation_splitter_outside_blockquote():
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


+def test_regular_blockquote():
+    msg_body = """Reply
+<blockquote>Regular</blockquote>
+
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<blockquote>
+<div>
+<blockquote>Nested</blockquote>
+</div>
+</blockquote>
+"""
+    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def test_no_blockquote():
     msg_body = """
 <html>

View File

@@ -12,11 +12,11 @@ from talon import quotations
 @patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
     msg_body = """Test reply

 Hi
 -----Original Message-----

 Test"""
-    eq_(msg_body, quotations.extract_from_plain(msg_body))
+    eq_("Test reply", quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():

View File

@@ -1,9 +1,60 @@
 # coding:utf-8

 from . import *

-from talon import utils
+from talon import utils as u
+import cchardet


 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc'))
+
+
+def test_unicode():
+    eq_ (u'hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), unicode)
+    eq_ (type(u.to_unicode(u'hi')), unicode)
+    eq_ (type(u.to_unicode('привет')), unicode)
+    eq_ (type(u.to_unicode(u'привет')), unicode)
+    eq_ (u"привет", u.to_unicode('привет'))
+    eq_ (u"привет", u.to_unicode(u'привет'))
+    # some latin1 stuff
+    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
+
+
+def test_detect_encoding():
+    eq_ ('ascii', u.detect_encoding('qwe').lower())
+    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.detect_encoding('привет').lower())
+    # fallback to utf-8
+    with patch.object(u.chardet, 'detect') as detect:
+        detect.side_effect = Exception
+        eq_ ('utf-8', u.detect_encoding('qwe').lower())
+
+
+def test_quick_detect_encoding():
+    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
+    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
+
+
+@patch.object(cchardet, 'detect')
+@patch.object(u, 'detect_encoding')
+def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
+    cchardet_detect.return_value = {'encoding': 'ascii'}
+    eq_('ascii', u.quick_detect_encoding("qwe"))
+    cchardet_detect.assert_called_once_with("qwe")
+
+    # fallback to detect_encoding
+    cchardet_detect.return_value = {}
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+
+    # exception
+    detect_encoding.reset_mock()
+    cchardet_detect.side_effect = Exception()
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+    ok_(detect_encoding.called)