Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2377c387c7 |
6
setup.py
6
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.0.7',
|
||||
version='1.0.5',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
@@ -14,14 +14,12 @@ setup(name='talon',
|
||||
include_package_data=True,
|
||||
zip_safe=True,
|
||||
install_requires=[
|
||||
"lxml>=2.3.3",
|
||||
"lxml==2.3.3",
|
||||
"regex>=1",
|
||||
"html2text",
|
||||
"numpy",
|
||||
"scipy",
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
],
|
||||
tests_require=[
|
||||
"mock",
|
||||
|
||||
@@ -138,10 +138,9 @@ def cut_by_id(html_message):
|
||||
|
||||
|
||||
def cut_blockquote(html_message):
|
||||
''' Cuts the last non-nested blockquote with wrapping elements. '''
|
||||
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
|
||||
if quote:
|
||||
quote = quote[0]
|
||||
''' Cuts blockquote with wrapping elements. '''
|
||||
quote = html_message.find('.//blockquote')
|
||||
if quote is not None:
|
||||
quote.getparent().remove(quote)
|
||||
return True
|
||||
|
||||
|
||||
@@ -32,9 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
'W dniu',
|
||||
# Dutch
|
||||
'Op',
|
||||
# German
|
||||
'Am'
|
||||
'Op'
|
||||
)),
|
||||
# Date and sender separator
|
||||
u'|'.join((
|
||||
@@ -52,26 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
u'napisał',
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb'
|
||||
'schreef','verzond','geschreven'
|
||||
))
|
||||
))
|
||||
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
|
||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
||||
# Beginning of the line
|
||||
u'|'.join((
|
||||
'Op',
|
||||
#German
|
||||
'Am'
|
||||
)),
|
||||
# Ending of the line
|
||||
u'|'.join((
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb'
|
||||
'schreef','verzond','geschreven'
|
||||
))
|
||||
)
|
||||
)
|
||||
@@ -191,7 +181,6 @@ def mark_message_lines(lines):
|
||||
else:
|
||||
# in case splitter is spread across several lines
|
||||
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
||||
|
||||
if splitter:
|
||||
# append as many splitter markers as lines in splitter
|
||||
splitter_lines = splitter.group().splitlines()
|
||||
@@ -304,8 +293,12 @@ def extract_from_plain(msg_body):
|
||||
|
||||
delimiter = get_delimiter(msg_body)
|
||||
msg_body = preprocess(msg_body, delimiter)
|
||||
lines = msg_body.splitlines()
|
||||
|
||||
# don't process too long messages
|
||||
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
||||
if len(lines) > MAX_LINES_COUNT:
|
||||
return stripped_text
|
||||
|
||||
markers = mark_message_lines(lines)
|
||||
lines = process_marked_lines(lines, markers)
|
||||
|
||||
|
||||
@@ -134,7 +134,7 @@ def extract_names(sender):
|
||||
>>> extract_names('')
|
||||
[]
|
||||
"""
|
||||
sender = to_unicode(sender, precise=True)
|
||||
sender = to_unicode(sender)
|
||||
# Remove non-alphabetical characters
|
||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||
# Remove too short words and words from "black" list i.e.
|
||||
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
|
||||
50.0
|
||||
'''
|
||||
count = 0
|
||||
s = to_unicode(s, precise=True)
|
||||
s = to_unicode(s)
|
||||
for c in s:
|
||||
if unicodedata.category(c) in categories:
|
||||
count += 1
|
||||
@@ -181,7 +181,7 @@ def punctuation_percent(s):
|
||||
|
||||
def capitalized_words_percent(s):
|
||||
'''Returns capitalized words percent.'''
|
||||
s = to_unicode(s, precise=True)
|
||||
s = to_unicode(s)
|
||||
words = re.split('\s', s)
|
||||
words = [w for w in words if w.strip()]
|
||||
capitalized_words_counter = 0
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
import logging
|
||||
from random import shuffle
|
||||
import chardet
|
||||
import cchardet
|
||||
|
||||
from talon.constants import RE_DELIMITER
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def safe_format(format_string, *args, **kwargs):
|
||||
"""
|
||||
Helper: formats string with any combination of bytestrings/unicode
|
||||
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
|
||||
u'привет'
|
||||
If `precise` flag is True, tries to guess the correct encoding first.
|
||||
"""
|
||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
if isinstance(str_or_unicode, str):
|
||||
return unicode(str_or_unicode, encoding, 'replace')
|
||||
return str_or_unicode
|
||||
|
||||
|
||||
def detect_encoding(string):
|
||||
"""
|
||||
Tries to detect the encoding of the passed string.
|
||||
|
||||
Defaults to UTF-8.
|
||||
"""
|
||||
try:
|
||||
detected = chardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or 'utf-8'
|
||||
except Exception, e:
|
||||
print 11111111111, e
|
||||
pass
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
def quick_detect_encoding(string):
|
||||
"""
|
||||
Tries to detect the encoding of the passed string.
|
||||
|
||||
Uses cchardet. Fallbacks to detect_encoding.
|
||||
"""
|
||||
try:
|
||||
detected = cchardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or detect_encoding(string)
|
||||
except Exception, e:
|
||||
print 222222222222, e
|
||||
pass
|
||||
return detect_encoding(string)
|
||||
|
||||
|
||||
def to_utf8(str_or_unicode):
|
||||
"""
|
||||
Safely returns a UTF-8 version of a given string
|
||||
|
||||
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_regular_blockquote():
|
||||
msg_body = """Reply
|
||||
<blockquote>Regular</blockquote>
|
||||
|
||||
<div>
|
||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
||||
</div>
|
||||
|
||||
<blockquote>
|
||||
<div>
|
||||
<blockquote>Nested</blockquote>
|
||||
</div>
|
||||
</blockquote>
|
||||
"""
|
||||
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_no_blockquote():
|
||||
msg_body = """
|
||||
<html>
|
||||
|
||||
@@ -12,11 +12,11 @@ from talon import quotations
|
||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||
def test_too_many_lines():
|
||||
msg_body = """Test reply
|
||||
Hi
|
||||
|
||||
-----Original Message-----
|
||||
|
||||
Test"""
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_pattern_on_date_somebody_wrote():
|
||||
|
||||
@@ -1,60 +1,9 @@
|
||||
# coding:utf-8
|
||||
|
||||
from . import *
|
||||
|
||||
from talon import utils as u
|
||||
import cchardet
|
||||
from talon import utils
|
||||
|
||||
|
||||
def test_get_delimiter():
|
||||
eq_('\r\n', u.get_delimiter('abc\r\n123'))
|
||||
eq_('\n', u.get_delimiter('abc\n123'))
|
||||
eq_('\n', u.get_delimiter('abc'))
|
||||
|
||||
|
||||
def test_unicode():
|
||||
eq_ (u'hi', u.to_unicode('hi'))
|
||||
eq_ (type(u.to_unicode('hi')), unicode )
|
||||
eq_ (type(u.to_unicode(u'hi')), unicode )
|
||||
eq_ (type(u.to_unicode('привет')), unicode )
|
||||
eq_ (type(u.to_unicode(u'привет')), unicode )
|
||||
eq_ (u"привет", u.to_unicode('привет'))
|
||||
eq_ (u"привет", u.to_unicode(u'привет'))
|
||||
# some latin1 stuff
|
||||
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
|
||||
|
||||
|
||||
def test_detect_encoding():
|
||||
eq_ ('ascii', u.detect_encoding('qwe').lower())
|
||||
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
|
||||
eq_ ('utf-8', u.detect_encoding('привет').lower())
|
||||
# fallback to utf-8
|
||||
with patch.object(u.chardet, 'detect') as detect:
|
||||
detect.side_effect = Exception
|
||||
eq_ ('utf-8', u.detect_encoding('qwe').lower())
|
||||
|
||||
|
||||
def test_quick_detect_encoding():
|
||||
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
|
||||
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
|
||||
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
|
||||
|
||||
|
||||
@patch.object(cchardet, 'detect')
|
||||
@patch.object(u, 'detect_encoding')
|
||||
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
|
||||
cchardet_detect.return_value = {'encoding': 'ascii'}
|
||||
eq_('ascii', u.quick_detect_encoding("qwe"))
|
||||
cchardet_detect.assert_called_once_with("qwe")
|
||||
|
||||
# fallback to detect_encoding
|
||||
cchardet_detect.return_value = {}
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
||||
|
||||
# exception
|
||||
detect_encoding.reset_mock()
|
||||
cchardet_detect.side_effect = Exception()
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
||||
ok_(detect_encoding.called)
|
||||
eq_('\r\n', utils.get_delimiter('abc\r\n123'))
|
||||
eq_('\n', utils.get_delimiter('abc\n123'))
|
||||
eq_('\n', utils.get_delimiter('abc'))
|
||||
|
||||
Reference in New Issue
Block a user