15 Commits

Author SHA1 Message Date
Sergey Obukhov
2c416ecc0e Merge pull request #62 from tgwizard/better-support-for-scandinavian-languages
Add better support for Scandinavian languages
2015-10-14 21:48:10 -07:00
Sergey Obukhov
3ab33c557b Merge pull request #65 from mailgun/sergey/cssselect
add cssselect to dependencies
2015-10-14 20:34:02 -07:00
Sergey Obukhov
8db05f4950 add cssselect to dependencies 2015-10-14 20:31:26 -07:00
Sergey Obukhov
3d5bc82a03 Merge pull request #61 from tgwizard/fix-for-apple-mail
Add fix for Apple Mail email format
2015-10-14 12:38:06 -07:00
Adam Renberg
14e3a0d80b Add better support for Scandinavian languages
This is a port of https://github.com/tictail/claw/pull/6 by @simonflore.
2015-09-21 21:42:01 +02:00
Adam Renberg
fcd9e2716a Add fix for Apple Mail email format
Where they have an initial > on the "date line".
2015-09-21 21:33:57 +02:00
Sergey Obukhov
d62d633215 bump up version 2015-09-21 09:55:51 -07:00
Sergey Obukhov
3b0c9273c1 Merge pull request #60 from mailgun/sergey/26
fixes mailgun/talon#26
2015-09-21 09:54:35 -07:00
Sergey Obukhov
e4c1c11845 remove print 2015-09-21 09:52:47 -07:00
Sergey Obukhov
ae508fe0e5 fixes mailgun/talon#26 2015-09-21 09:51:26 -07:00
Sergey Obukhov
2cb9b5399c bump up version 2015-09-18 05:23:29 -07:00
Sergey Obukhov
134c47f515 Merge pull request #59 from mailgun/sergey/43
fixes mailgun/talon#43
2015-09-18 05:20:51 -07:00
Sergey Obukhov
d328c9d128 fixes mailgun/talon#43 2015-09-18 05:19:59 -07:00
Sergey Obukhov
77b62b0fef Merge pull request #58 from mailgun/sergey/52
fixes mailgun/talon#52
2015-09-18 04:48:50 -07:00
Sergey Obukhov
ad09b18f3f fixes mailgun/talon#52 2015-09-18 04:47:23 -07:00
12 changed files with 150 additions and 24 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.0.7', version='1.0.9',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -22,6 +22,7 @@ setup(name='talon',
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1', 'chardet>=1.0.1',
'cchardet>=0.3.5', 'cchardet>=0.3.5',
'cssselect'
], ],
tests_require=[ tests_require=[
"mock", "mock",

View File

@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile( RE_ON_DATE_SMB_WROTE = re.compile(
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
# Beginning of the line # Beginning of the line
u'|'.join(( u'|'.join((
# English # English
@@ -34,7 +34,11 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Dutch # Dutch
'Op', 'Op',
# German # German
'Am' 'Am',
# Norwegian
u'',
# Swedish, Danish
'Den',
)), )),
# Date and sender separator # Date and sender separator
u'|'.join(( u'|'.join((
@@ -54,12 +58,14 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Dutch # Dutch
'schreef','verzond','geschreven', 'schreef','verzond','geschreven',
# German # German
'schrieb' 'schrieb',
# Norwegian, Swedish
'skrev',
)) ))
)) ))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile( RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
# Beginning of the line # Beginning of the line
u'|'.join(( u'|'.join((
'Op', 'Op',
@@ -125,9 +131,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
u'|'.join(( u'|'.join((
# "From" in different languages. # "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', 'From', 'Van', 'De', 'Von', 'Fra', u'Från',
# "Date" in different languages. # "Date" in different languages.
'Date', 'Datum', u'Envoyé' 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I) ))), re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
@@ -315,7 +321,7 @@ def extract_from_plain(msg_body):
return msg_body return msg_body
def extract_from_html(msg_body): def extract_from_html(s):
""" """
Extract not quoted message from provided html message body Extract not quoted message from provided html message body
using tags and plain text algorithm. using tags and plain text algorithm.
@@ -332,8 +338,12 @@ def extract_from_html(msg_body):
then deleting necessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '': if s.strip() == '':
return msg_body return s
# replace CRLF with LF temporarily, otherwise CR will be converted to '&#13;'
# when doing deepcopy on html tree
msg_body, replaced = _CRLF_to_LF(s)
html_tree = html.document_fromstring( html_tree = html.document_fromstring(
msg_body, msg_body,
@@ -364,15 +374,12 @@ def extract_from_html(msg_body):
plain_text = plain_text.replace('*', '') plain_text = plain_text.replace('*', '')
# Unmask saved star symbols # Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*') plain_text = plain_text.replace('3423oorkg432', '*')
plain_text = preprocess(plain_text, '\n', content_type='text/html')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
# Don't process too long messages # Don't process too long messages
if len(lines) > MAX_LINES_COUNT: if len(lines) > MAX_LINES_COUNT:
return msg_body return s
# Collect checkpoints on each line # Collect checkpoints on each line
line_checkpoints = [ line_checkpoints = [
@@ -397,9 +404,9 @@ def extract_from_html(msg_body):
quotation_checkpoints[checkpoint] = True quotation_checkpoints[checkpoint] = True
else: else:
if cut_quotations: if cut_quotations:
return html.tostring(html_tree_copy) return _restore_CRLF(html.tostring(html_tree_copy), replaced)
else: else:
return msg_body return s
# Remove tags with quotation checkpoints # Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags( html_quotations.delete_quotation_tags(
@@ -435,3 +442,37 @@ def register_xpath_extensions():
ns.prefix = 'mg' ns.prefix = 'mg'
ns['text_content'] = text_content ns['text_content'] = text_content
ns['tail'] = tail ns['tail'] = tail
def _restore_CRLF(s, replaced=True):
"""Restore CRLF if previously CRLF was replaced with LF
>>> _restore_CRLF('a\nb')
'a\r\nb'
>>> _restore_CRLF('a\nb', replaced=False)
'a\nb'
"""
if replaced:
return s.replace('\n', '\r\n')
return s
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    The delimiter reported by ``get_delimiter`` decides whether anything is
    replaced: only when it is CRLF are the CRLF sequences in ``s`` turned
    into LF.

    :param s: text to normalise.
    :return: a ``(text, replaced)`` pair; ``replaced`` is True only when
        the CRLF replacement was performed, so the caller knows whether
        CRLF has to be restored afterwards.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False

Binary file not shown.

View File

@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile rc = re.compile
RE_EMAIL = rc('@') RE_EMAIL = rc('\S@\S')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
names = names or sender names = names or sender
if names != '': if names != '':
return binary_regex_search(re.compile(names)) return binary_regex_search(re.compile(names))
return lambda s: False return lambda s: 0
def extract_names(sender): def extract_names(sender):

View File

@@ -1,3 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<html> <html>
<head> <head>
<style><!-- <style><!--

View File

@@ -0,0 +1,19 @@
Content-Type: text/plain;
charset=us-ascii
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
Subject: Re: Hello there
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
From: Adam Renberg <adam@tictail.com>
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
Date: Sat, 22 Aug 2015 19:22:20 +0200
Content-Transfer-Encoding: 7bit
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
To: Adam Renberg <tgwizard@gmail.com>
Hello
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
>
> Hi there!

View File

@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>""" </blockquote>"""
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><body><p>Reply\n</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) quotations.extract_from_html(msg_body))
def test_quotation_splitter_outside_blockquote(): def test_quotation_splitter_outside_blockquote():
@@ -264,7 +264,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) f = open(filename)
msg_body = f.read().decode("utf-8") msg_body = f.read()
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)
h = html2text.HTML2Text() h = html2text.HTML2Text()
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
def test_yandex_ru_reply(): def test_yandex_ru_reply():
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
def test_CRLF():
"""CR is not converted to '&#13;'

CRLF line endings in the input are expected to come back as CRLF from
extract_from_html, both for a document returned as is and for one where
a quotation is cut.
"""
# Document without any quotation: expected back unchanged, CRLF included.
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
msg_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
# Switch the fixture to CRLF line endings before extraction.
msg_body = msg_body.replace('\n', '\r\n')
# The blockquote is cut and the remaining line break is still CRLF.
eq_("<html><body><p>Reply\r\n</p></body></html>",
quotations.extract_from_html(msg_body))

View File

@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
def test_empty_body(): def test_empty_body():
eq_('', quotations.extract_from_plain('')) eq_('', quotations.extract_from_plain(''))
def test__CRLF_to_LF():
    """_CRLF_to_LF returns the converted text and a flag telling whether
    CRLF was replaced."""
    cases = (
        # (input, expected (text, replaced) pair)
        ('\r\n\r', ('\n\r', True)),
        ('\n\r', ('\n\r', False)),
    )
    for raw, expected in cases:
        eq_(expected, quotations._CRLF_to_LF(raw))
def test__restore_CRLF():
    """_restore_CRLF turns LF into CRLF only when ``replaced`` is true,
    which is also its default."""
    restore = quotations._restore_CRLF
    untouched = restore('\n', replaced=False)
    eq_('\n', untouched)
    restored = restore('\n', replaced=True)
    eq_('\r\n', restored)
    # default
    by_default = restore('\n')
    eq_('\r\n', by_default)

View File

@@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs
def test_apply_features(): def test_apply_features():
s = '''John Doe s = '''This is John Doe
Tuesday @3pm suits. I'll chat to you then.
VP Research and Development, Xxxx Xxxx Xxxxx VP Research and Development, Xxxx Xxxx Xxxxx
@@ -19,11 +21,12 @@ john@example.com'''
# note that we don't consider the first line because signatures don't # note that we don't consider the first line because signatures don't
# usually take all the text, empty lines are not considered # usually take all the text, empty lines are not considered
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
with patch.object(fs, 'SIGNATURE_MAX_LINES', 4): with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
features = fs.features(sender) features = fs.features(sender)
new_result = fs.apply_features(s, features) new_result = fs.apply_features(s, features)
# result remains the same because we don't consider empty lines # result remains the same because we don't consider empty lines

View File

@@ -311,6 +311,33 @@ Emne: The manager has commented on your Loop
Blah-blah-blah Blah-blah-blah
""")) """))
def test_swedish_from_block():
# A Swedish header block (Från / Skickat / Till / Ämne) follows the reply;
# only the reply text before that block is expected back.
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
Ämne: RE: Week 36
Blah-blah-blah
"""))
def test_swedish_from_line():
# A Swedish "Den <date>, <sender> skrev:" line follows the reply; only
# the reply text before that line is expected back.
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_norwegian_from_line():
# A Norwegian "På <date> på <time>, <sender> skrev:" line follows the
# reply; only the reply text before that line is expected back.
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block(): def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud. """Gluten-free culpa lo-fi et nesciunt nostrud.