1 Commits

Author SHA1 Message Date
Ralph Meijer
2377c387c7 Actually bump up talon's version up to 1.0.5 to match the tag. 2015-09-09 22:46:18 +02:00
15 changed files with 47 additions and 283 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon',
version='1.0.9',
version='1.0.5',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
@@ -14,15 +14,12 @@ setup(name='talon',
include_package_data=True,
zip_safe=True,
install_requires=[
"lxml>=2.3.3",
"lxml==2.3.3",
"regex>=1",
"html2text",
"numpy",
"scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect'
],
tests_require=[
"mock",

View File

@@ -138,10 +138,9 @@ def cut_by_id(html_message):
def cut_blockquote(html_message):
''' Cuts the last non-nested blockquote with wrapping elements. '''
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
if quote:
quote = quote[0]
''' Cuts blockquote with wrapping elements. '''
quote = html_message.find('.//blockquote')
if quote is not None:
quote.getparent().remove(quote)
return True

View File

@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
# Beginning of the line
u'|'.join((
# English
@@ -32,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish
'W dniu',
# Dutch
'Op',
# German
'Am',
# Norwegian
u'',
# Swedish, Danish
'Den',
'Op'
)),
# Date and sender separator
u'|'.join((
@@ -56,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish
u'napisał',
# Dutch
'schreef','verzond','geschreven',
# German
'schrieb',
# Norwegian, Swedish
'skrev',
'schreef','verzond','geschreven'
))
))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
# Beginning of the line
u'|'.join((
'Op',
#German
'Am'
)),
# Ending of the line
u'|'.join((
# Dutch
'schreef','verzond','geschreven',
# German
'schrieb'
'schreef','verzond','geschreven'
))
)
)
@@ -131,9 +115,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
u'|'.join((
# "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
'From', 'Van', 'De', 'Von', 'Fra',
# "Date" in different languages.
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
'Date', 'Datum', u'Envoyé'
))), re.I)
SPLITTER_PATTERNS = [
@@ -197,7 +181,6 @@ def mark_message_lines(lines):
else:
# in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter:
# append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines()
@@ -310,8 +293,12 @@ def extract_from_plain(msg_body):
delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
if len(lines) > MAX_LINES_COUNT:
return stripped_text
markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers)
@@ -321,7 +308,7 @@ def extract_from_plain(msg_body):
return msg_body
def extract_from_html(s):
def extract_from_html(msg_body):
"""
Extract not quoted message from provided html message body
using tags and plain text algorithm.
@@ -338,12 +325,8 @@ def extract_from_html(s):
then deleting necessary tags.
"""
if s.strip() == '':
return s
# replace CRLF with LF temporarily, otherwise CR will be converted to '&#13;'
# when doing deepcopy on html tree
msg_body, replaced = _CRLF_to_LF(s)
if msg_body.strip() == '':
return msg_body
html_tree = html.document_fromstring(
msg_body,
@@ -374,12 +357,15 @@ def extract_from_html(s):
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
plain_text = preprocess(plain_text, '\n', content_type='text/html')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines()
# Don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return s
return msg_body
# Collect checkpoints on each line
line_checkpoints = [
@@ -404,9 +390,9 @@ def extract_from_html(s):
quotation_checkpoints[checkpoint] = True
else:
if cut_quotations:
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
return html.tostring(html_tree_copy)
else:
return s
return msg_body
# Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags(
@@ -442,37 +428,3 @@ def register_xpath_extensions():
ns.prefix = 'mg'
ns['text_content'] = text_content
ns['tail'] = tail
def _restore_CRLF(s, replaced=True):
"""Restore CRLF if previously CRLF was replaced with LF
>>> _restore_CRLF('a\nb')
'a\r\nb'
>>> _restore_CRLF('a\nb', replaced=False)
'a\nb'
"""
if replaced:
return s.replace('\n', '\r\n')
return s
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    The line delimiter is whatever ``get_delimiter`` reports for ``s``;
    the text is rewritten only when that delimiter is ``\r\n``.

    The docstring is a raw string so that the backslash escapes in the
    examples below reach doctest verbatim.

    :param s: text to normalise.
    :returns: a ``(text, changed)`` tuple, where ``changed`` is True when
        CRLF sequences were replaced with LF and False when ``s`` is
        returned as is.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False

Binary file not shown.

View File

@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile
RE_EMAIL = rc('\S@\S')
RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
names = names or sender
if names != '':
return binary_regex_search(re.compile(names))
return lambda s: 0
return lambda s: False
def extract_names(sender):
@@ -134,7 +134,7 @@ def extract_names(sender):
>>> extract_names('')
[]
"""
sender = to_unicode(sender, precise=True)
sender = to_unicode(sender)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
50.0
'''
count = 0
s = to_unicode(s, precise=True)
s = to_unicode(s)
for c in s:
if unicodedata.category(c) in categories:
count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s, precise=True)
s = to_unicode(s)
words = re.split('\s', s)
words = [w for w in words if w.strip()]
capitalized_words_counter = 0

View File

@@ -2,12 +2,13 @@
import logging
from random import shuffle
import chardet
import cchardet
from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs):
"""
Helper: formats string with any combination of bytestrings/unicode
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
u'привет'
If `precise` flag is True, tries to guess the correct encoding first.
"""
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode
def detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Asks ``chardet`` for a guess and returns the reported encoding name.
    Defaults to UTF-8 when nothing is detected or when detection raises.
    """
    try:
        detected = chardet.detect(string)
        if detected:
            return detected.get('encoding') or 'utf-8'
    except Exception:
        # Detection is best-effort: record the failure through the module
        # logger instead of printing to stdout, then fall back to UTF-8.
        log.warning('chardet failed to detect encoding, assuming utf-8',
                    exc_info=True)
    return 'utf-8'
def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet. Falls back to detect_encoding when cchardet reports no
    encoding or raises.
    """
    try:
        detected = cchardet.detect(string)
        if detected:
            return detected.get('encoding') or detect_encoding(string)
    except Exception:
        # Detection is best-effort: record the failure through the module
        # logger instead of printing to stdout, then use the fallback.
        log.warning('cchardet failed to detect encoding, '
                    'falling back to detect_encoding', exc_info=True)
    return detect_encoding(string)
def to_utf8(str_or_unicode):
"""
Safely returns a UTF-8 version of a given string

View File

@@ -1,4 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<html>
<head>
<style><!--

View File

@@ -1,19 +0,0 @@
Content-Type: text/plain;
charset=us-ascii
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
Subject: Re: Hello there
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
From: Adam Renberg <adam@tictail.com>
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
Date: Sat, 22 Aug 2015 19:22:20 +0200
Content-Transfer-Encoding: 7bit
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
To: Adam Renberg <tgwizard@gmail.com>
Hello
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
>
> Hi there!

View File

@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>"""
eq_("<html><body><p>Reply\n</p></body></html>",
quotations.extract_from_html(msg_body))
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_quotation_splitter_outside_blockquote():
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_regular_blockquote():
msg_body = """Reply
<blockquote>Regular</blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<blockquote>
<div>
<blockquote>Nested</blockquote>
</div>
</blockquote>
"""
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_no_blockquote():
msg_body = """
<html>
@@ -264,7 +246,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
def extract_reply_and_check(filename):
f = open(filename)
msg_body = f.read()
msg_body = f.read().decode("utf-8")
reply = quotations.extract_from_html(msg_body)
h = html2text.HTML2Text()
@@ -310,25 +292,3 @@ def test_windows_mail_reply():
def test_yandex_ru_reply():
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
def test_CRLF():
"""CR is not converted to '&#13;'
"""
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
msg_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
msg_body = msg_body.replace('\n', '\r\n')
eq_("<html><body><p>Reply\r\n</p></body></html>",
quotations.extract_from_html(msg_body))

View File

@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():
def test_empty_body():
eq_('', quotations.extract_from_plain(''))
def test__CRLF_to_LF():
eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
def test__restore_CRLF():
eq_('\n', quotations._restore_CRLF('\n', replaced=False))
eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))
# default
eq_('\r\n', quotations._restore_CRLF('\n'))

View File

@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs
def test_apply_features():
s = '''This is John Doe
Tuesday @3pm suits. I'll chat to you then.
s = '''John Doe
VP Research and Development, Xxxx Xxxx Xxxxx
@@ -21,12 +19,11 @@ john@example.com'''
# note that we don't consider the first line because signatures don't
# usually take all the text, empty lines are not considered
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
features = fs.features(sender)
new_result = fs.apply_features(s, features)
# result remains the same because we don't consider empty lines

View File

@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines():
msg_body = """Test reply
Hi
-----Original Message-----
Test"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
eq_(msg_body, quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote():
@@ -311,33 +311,6 @@ Emne: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
Ämne: RE: Week 36
Blah-blah-blah
"""))
def test_swedish_from_line():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud.

View File

@@ -1,60 +1,9 @@
# coding:utf-8
from . import *
from talon import utils as u
import cchardet
from talon import utils
def test_get_delimiter():
eq_('\r\n', u.get_delimiter('abc\r\n123'))
eq_('\n', u.get_delimiter('abc\n123'))
eq_('\n', u.get_delimiter('abc'))
def test_unicode():
eq_ (u'hi', u.to_unicode('hi'))
eq_ (type(u.to_unicode('hi')), unicode )
eq_ (type(u.to_unicode(u'hi')), unicode )
eq_ (type(u.to_unicode('привет')), unicode )
eq_ (type(u.to_unicode(u'привет')), unicode )
eq_ (u"привет", u.to_unicode('привет'))
eq_ (u"привет", u.to_unicode(u'привет'))
# some latin1 stuff
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding('qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.detect_encoding('привет').lower())
# fallback to utf-8
with patch.object(u.chardet, 'detect') as detect:
detect.side_effect = Exception
eq_ ('utf-8', u.detect_encoding('qwe').lower())
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
cchardet_detect.return_value = {'encoding': 'ascii'}
eq_('ascii', u.quick_detect_encoding("qwe"))
cchardet_detect.assert_called_once_with("qwe")
# fallback to detect_encoding
cchardet_detect.return_value = {}
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
# exception
detect_encoding.reset_mock()
cchardet_detect.side_effect = Exception()
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
ok_(detect_encoding.called)
eq_('\r\n', utils.get_delimiter('abc\r\n123'))
eq_('\n', utils.get_delimiter('abc\n123'))
eq_('\n', utils.get_delimiter('abc'))