Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2377c387c7 |
7
setup.py
7
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.0.9',
|
version='1.0.5',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
@@ -14,15 +14,12 @@ setup(name='talon',
|
|||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"lxml>=2.3.3",
|
"lxml==2.3.3",
|
||||||
"regex>=1",
|
"regex>=1",
|
||||||
"html2text",
|
"html2text",
|
||||||
"numpy",
|
"numpy",
|
||||||
"scipy",
|
"scipy",
|
||||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||||
'chardet>=1.0.1',
|
|
||||||
'cchardet>=0.3.5',
|
|
||||||
'cssselect'
|
|
||||||
],
|
],
|
||||||
tests_require=[
|
tests_require=[
|
||||||
"mock",
|
"mock",
|
||||||
|
|||||||
@@ -138,10 +138,9 @@ def cut_by_id(html_message):
|
|||||||
|
|
||||||
|
|
||||||
def cut_blockquote(html_message):
|
def cut_blockquote(html_message):
|
||||||
''' Cuts the last non-nested blockquote with wrapping elements. '''
|
''' Cuts blockquote with wrapping elements. '''
|
||||||
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
|
quote = html_message.find('.//blockquote')
|
||||||
if quote:
|
if quote is not None:
|
||||||
quote = quote[0]
|
|
||||||
quote.getparent().remove(quote)
|
quote.getparent().remove(quote)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
|
|||||||
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
||||||
|
|
||||||
RE_ON_DATE_SMB_WROTE = re.compile(
|
RE_ON_DATE_SMB_WROTE = re.compile(
|
||||||
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
||||||
# Beginning of the line
|
# Beginning of the line
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# English
|
# English
|
||||||
@@ -32,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# Polish
|
# Polish
|
||||||
'W dniu',
|
'W dniu',
|
||||||
# Dutch
|
# Dutch
|
||||||
'Op',
|
'Op'
|
||||||
# German
|
|
||||||
'Am',
|
|
||||||
# Norwegian
|
|
||||||
u'På',
|
|
||||||
# Swedish, Danish
|
|
||||||
'Den',
|
|
||||||
)),
|
)),
|
||||||
# Date and sender separator
|
# Date and sender separator
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
@@ -56,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
# Polish
|
# Polish
|
||||||
u'napisał',
|
u'napisał',
|
||||||
# Dutch
|
# Dutch
|
||||||
'schreef','verzond','geschreven',
|
'schreef','verzond','geschreven'
|
||||||
# German
|
|
||||||
'schrieb',
|
|
||||||
# Norwegian, Swedish
|
|
||||||
'skrev',
|
|
||||||
))
|
))
|
||||||
))
|
))
|
||||||
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||||
RE_ON_DATE_WROTE_SMB = re.compile(
|
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||||
u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
|
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
||||||
# Beginning of the line
|
# Beginning of the line
|
||||||
u'|'.join((
|
|
||||||
'Op',
|
'Op',
|
||||||
#German
|
|
||||||
'Am'
|
|
||||||
)),
|
|
||||||
# Ending of the line
|
# Ending of the line
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# Dutch
|
# Dutch
|
||||||
'schreef','verzond','geschreven',
|
'schreef','verzond','geschreven'
|
||||||
# German
|
|
||||||
'schrieb'
|
|
||||||
))
|
))
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -131,9 +115,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
|
|||||||
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
||||||
u'|'.join((
|
u'|'.join((
|
||||||
# "From" in different languages.
|
# "From" in different languages.
|
||||||
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
|
'From', 'Van', 'De', 'Von', 'Fra',
|
||||||
# "Date" in different languages.
|
# "Date" in different languages.
|
||||||
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
|
'Date', 'Datum', u'Envoyé'
|
||||||
))), re.I)
|
))), re.I)
|
||||||
|
|
||||||
SPLITTER_PATTERNS = [
|
SPLITTER_PATTERNS = [
|
||||||
@@ -197,7 +181,6 @@ def mark_message_lines(lines):
|
|||||||
else:
|
else:
|
||||||
# in case splitter is spread across several lines
|
# in case splitter is spread across several lines
|
||||||
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
||||||
|
|
||||||
if splitter:
|
if splitter:
|
||||||
# append as many splitter markers as lines in splitter
|
# append as many splitter markers as lines in splitter
|
||||||
splitter_lines = splitter.group().splitlines()
|
splitter_lines = splitter.group().splitlines()
|
||||||
@@ -310,8 +293,12 @@ def extract_from_plain(msg_body):
|
|||||||
|
|
||||||
delimiter = get_delimiter(msg_body)
|
delimiter = get_delimiter(msg_body)
|
||||||
msg_body = preprocess(msg_body, delimiter)
|
msg_body = preprocess(msg_body, delimiter)
|
||||||
|
lines = msg_body.splitlines()
|
||||||
|
|
||||||
# don't process too long messages
|
# don't process too long messages
|
||||||
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
if len(lines) > MAX_LINES_COUNT:
|
||||||
|
return stripped_text
|
||||||
|
|
||||||
markers = mark_message_lines(lines)
|
markers = mark_message_lines(lines)
|
||||||
lines = process_marked_lines(lines, markers)
|
lines = process_marked_lines(lines, markers)
|
||||||
|
|
||||||
@@ -321,7 +308,7 @@ def extract_from_plain(msg_body):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
def extract_from_html(s):
|
def extract_from_html(msg_body):
|
||||||
"""
|
"""
|
||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided html message body
|
||||||
using tags and plain text algorithm.
|
using tags and plain text algorithm.
|
||||||
@@ -338,12 +325,8 @@ def extract_from_html(s):
|
|||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if s.strip() == '':
|
if msg_body.strip() == '':
|
||||||
return s
|
return msg_body
|
||||||
|
|
||||||
# replace CRLF with LF temporaraly otherwise CR will be converted to ' '
|
|
||||||
# when doing deepcopy on html tree
|
|
||||||
msg_body, replaced = _CRLF_to_LF(s)
|
|
||||||
|
|
||||||
html_tree = html.document_fromstring(
|
html_tree = html.document_fromstring(
|
||||||
msg_body,
|
msg_body,
|
||||||
@@ -374,12 +357,15 @@ def extract_from_html(s):
|
|||||||
plain_text = plain_text.replace('*', '')
|
plain_text = plain_text.replace('*', '')
|
||||||
# Unmask saved star symbols
|
# Unmask saved star symbols
|
||||||
plain_text = plain_text.replace('3423oorkg432', '*')
|
plain_text = plain_text.replace('3423oorkg432', '*')
|
||||||
plain_text = preprocess(plain_text, '\n', content_type='text/html')
|
|
||||||
|
delimiter = get_delimiter(plain_text)
|
||||||
|
|
||||||
|
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
|
||||||
lines = plain_text.splitlines()
|
lines = plain_text.splitlines()
|
||||||
|
|
||||||
# Don't process too long messages
|
# Don't process too long messages
|
||||||
if len(lines) > MAX_LINES_COUNT:
|
if len(lines) > MAX_LINES_COUNT:
|
||||||
return s
|
return msg_body
|
||||||
|
|
||||||
# Collect checkpoints on each line
|
# Collect checkpoints on each line
|
||||||
line_checkpoints = [
|
line_checkpoints = [
|
||||||
@@ -404,9 +390,9 @@ def extract_from_html(s):
|
|||||||
quotation_checkpoints[checkpoint] = True
|
quotation_checkpoints[checkpoint] = True
|
||||||
else:
|
else:
|
||||||
if cut_quotations:
|
if cut_quotations:
|
||||||
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
|
return html.tostring(html_tree_copy)
|
||||||
else:
|
else:
|
||||||
return s
|
return msg_body
|
||||||
|
|
||||||
# Remove tags with quotation checkpoints
|
# Remove tags with quotation checkpoints
|
||||||
html_quotations.delete_quotation_tags(
|
html_quotations.delete_quotation_tags(
|
||||||
@@ -442,37 +428,3 @@ def register_xpath_extensions():
|
|||||||
ns.prefix = 'mg'
|
ns.prefix = 'mg'
|
||||||
ns['text_content'] = text_content
|
ns['text_content'] = text_content
|
||||||
ns['tail'] = tail
|
ns['tail'] = tail
|
||||||
|
|
||||||
|
|
||||||
def _restore_CRLF(s, replaced=True):
|
|
||||||
"""Restore CRLF if previously CRLF was replaced with LF
|
|
||||||
|
|
||||||
>>> _restore_CRLF('a\nb')
|
|
||||||
'a\r\nb'
|
|
||||||
>>> _restore_CRLF('a\nb', replaced=False)
|
|
||||||
'a\nb'
|
|
||||||
"""
|
|
||||||
if replaced:
|
|
||||||
return s.replace('\n', '\r\n')
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
def _CRLF_to_LF(s):
|
|
||||||
"""Replace CRLF with LF
|
|
||||||
|
|
||||||
>>> s, changed = _CRLF_to_LF('a\r\n'b)
|
|
||||||
>>> s
|
|
||||||
'a\nb'
|
|
||||||
>>> changed
|
|
||||||
True
|
|
||||||
|
|
||||||
>>> s, changed = _CRLF_to_LF('a\n'b)
|
|
||||||
>>> s
|
|
||||||
'a\nb'
|
|
||||||
>>> changed
|
|
||||||
False
|
|
||||||
"""
|
|
||||||
delimiter = get_delimiter(s)
|
|
||||||
if delimiter == '\r\n':
|
|
||||||
return s.replace(delimiter, '\n'), True
|
|
||||||
return s, False
|
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
|
|||||||
|
|
||||||
rc = re.compile
|
rc = re.compile
|
||||||
|
|
||||||
RE_EMAIL = rc('\S@\S')
|
RE_EMAIL = rc('@')
|
||||||
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
|
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
|
||||||
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
|
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
|
||||||
|
|
||||||
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
|
|||||||
names = names or sender
|
names = names or sender
|
||||||
if names != '':
|
if names != '':
|
||||||
return binary_regex_search(re.compile(names))
|
return binary_regex_search(re.compile(names))
|
||||||
return lambda s: 0
|
return lambda s: False
|
||||||
|
|
||||||
|
|
||||||
def extract_names(sender):
|
def extract_names(sender):
|
||||||
@@ -134,7 +134,7 @@ def extract_names(sender):
|
|||||||
>>> extract_names('')
|
>>> extract_names('')
|
||||||
[]
|
[]
|
||||||
"""
|
"""
|
||||||
sender = to_unicode(sender, precise=True)
|
sender = to_unicode(sender)
|
||||||
# Remove non-alphabetical characters
|
# Remove non-alphabetical characters
|
||||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||||
# Remove too short words and words from "black" list i.e.
|
# Remove too short words and words from "black" list i.e.
|
||||||
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
|
|||||||
50.0
|
50.0
|
||||||
'''
|
'''
|
||||||
count = 0
|
count = 0
|
||||||
s = to_unicode(s, precise=True)
|
s = to_unicode(s)
|
||||||
for c in s:
|
for c in s:
|
||||||
if unicodedata.category(c) in categories:
|
if unicodedata.category(c) in categories:
|
||||||
count += 1
|
count += 1
|
||||||
@@ -181,7 +181,7 @@ def punctuation_percent(s):
|
|||||||
|
|
||||||
def capitalized_words_percent(s):
|
def capitalized_words_percent(s):
|
||||||
'''Returns capitalized words percent.'''
|
'''Returns capitalized words percent.'''
|
||||||
s = to_unicode(s, precise=True)
|
s = to_unicode(s)
|
||||||
words = re.split('\s', s)
|
words = re.split('\s', s)
|
||||||
words = [w for w in words if w.strip()]
|
words = [w for w in words if w.strip()]
|
||||||
capitalized_words_counter = 0
|
capitalized_words_counter = 0
|
||||||
|
|||||||
@@ -2,12 +2,13 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
from random import shuffle
|
from random import shuffle
|
||||||
import chardet
|
|
||||||
import cchardet
|
|
||||||
|
|
||||||
from talon.constants import RE_DELIMITER
|
from talon.constants import RE_DELIMITER
|
||||||
|
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def safe_format(format_string, *args, **kwargs):
|
def safe_format(format_string, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Helper: formats string with any combination of bytestrings/unicode
|
Helper: formats string with any combination of bytestrings/unicode
|
||||||
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
|
|||||||
u'привет'
|
u'привет'
|
||||||
If `precise` flag is True, tries to guess the correct encoding first.
|
If `precise` flag is True, tries to guess the correct encoding first.
|
||||||
"""
|
"""
|
||||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||||
if isinstance(str_or_unicode, str):
|
if isinstance(str_or_unicode, str):
|
||||||
return unicode(str_or_unicode, encoding, 'replace')
|
return unicode(str_or_unicode, encoding, 'replace')
|
||||||
return str_or_unicode
|
return str_or_unicode
|
||||||
|
|
||||||
|
|
||||||
def detect_encoding(string):
|
|
||||||
"""
|
|
||||||
Tries to detect the encoding of the passed string.
|
|
||||||
|
|
||||||
Defaults to UTF-8.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
detected = chardet.detect(string)
|
|
||||||
if detected:
|
|
||||||
return detected.get('encoding') or 'utf-8'
|
|
||||||
except Exception, e:
|
|
||||||
print 11111111111, e
|
|
||||||
pass
|
|
||||||
return 'utf-8'
|
|
||||||
|
|
||||||
|
|
||||||
def quick_detect_encoding(string):
|
|
||||||
"""
|
|
||||||
Tries to detect the encoding of the passed string.
|
|
||||||
|
|
||||||
Uses cchardet. Fallbacks to detect_encoding.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
detected = cchardet.detect(string)
|
|
||||||
if detected:
|
|
||||||
return detected.get('encoding') or detect_encoding(string)
|
|
||||||
except Exception, e:
|
|
||||||
print 222222222222, e
|
|
||||||
pass
|
|
||||||
return detect_encoding(string)
|
|
||||||
|
|
||||||
|
|
||||||
def to_utf8(str_or_unicode):
|
def to_utf8(str_or_unicode):
|
||||||
"""
|
"""
|
||||||
Safely returns a UTF-8 version of a given string
|
Safely returns a UTF-8 version of a given string
|
||||||
|
|||||||
1
tests/fixtures/html_replies/hotmail.html
vendored
1
tests/fixtures/html_replies/hotmail.html
vendored
@@ -1,4 +1,3 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<style><!--
|
<style><!--
|
||||||
|
|||||||
19
tests/fixtures/standard_replies/apple_mail_2.eml
vendored
19
tests/fixtures/standard_replies/apple_mail_2.eml
vendored
@@ -1,19 +0,0 @@
|
|||||||
Content-Type: text/plain;
|
|
||||||
charset=us-ascii
|
|
||||||
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
|
|
||||||
Subject: Re: Hello there
|
|
||||||
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
|
|
||||||
From: Adam Renberg <adam@tictail.com>
|
|
||||||
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
|
|
||||||
Date: Sat, 22 Aug 2015 19:22:20 +0200
|
|
||||||
Content-Transfer-Encoding: 7bit
|
|
||||||
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
|
|
||||||
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
|
|
||||||
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
|
|
||||||
To: Adam Renberg <tgwizard@gmail.com>
|
|
||||||
|
|
||||||
Hello
|
|
||||||
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
|
|
||||||
>
|
|
||||||
> Hi there!
|
|
||||||
|
|
||||||
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
|
|||||||
|
|
||||||
</blockquote>"""
|
</blockquote>"""
|
||||||
|
|
||||||
eq_("<html><body><p>Reply\n</p></body></html>",
|
eq_("<html><body><p>Reply</p></body></html>",
|
||||||
quotations.extract_from_html(msg_body))
|
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||||
|
|
||||||
|
|
||||||
def test_quotation_splitter_outside_blockquote():
|
def test_quotation_splitter_outside_blockquote():
|
||||||
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
|
|||||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||||
|
|
||||||
|
|
||||||
def test_regular_blockquote():
|
|
||||||
msg_body = """Reply
|
|
||||||
<blockquote>Regular</blockquote>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<blockquote>
|
|
||||||
<div>
|
|
||||||
<blockquote>Nested</blockquote>
|
|
||||||
</div>
|
|
||||||
</blockquote>
|
|
||||||
"""
|
|
||||||
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
|
|
||||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_blockquote():
|
def test_no_blockquote():
|
||||||
msg_body = """
|
msg_body = """
|
||||||
<html>
|
<html>
|
||||||
@@ -264,7 +246,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
|
|||||||
def extract_reply_and_check(filename):
|
def extract_reply_and_check(filename):
|
||||||
f = open(filename)
|
f = open(filename)
|
||||||
|
|
||||||
msg_body = f.read()
|
msg_body = f.read().decode("utf-8")
|
||||||
reply = quotations.extract_from_html(msg_body)
|
reply = quotations.extract_from_html(msg_body)
|
||||||
|
|
||||||
h = html2text.HTML2Text()
|
h = html2text.HTML2Text()
|
||||||
@@ -310,25 +292,3 @@ def test_windows_mail_reply():
|
|||||||
|
|
||||||
def test_yandex_ru_reply():
|
def test_yandex_ru_reply():
|
||||||
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
|
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
|
||||||
|
|
||||||
|
|
||||||
def test_CRLF():
|
|
||||||
"""CR is not converted to ' '
|
|
||||||
"""
|
|
||||||
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
|
|
||||||
|
|
||||||
msg_body = """Reply
|
|
||||||
<blockquote>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div>
|
|
||||||
Test
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</blockquote>"""
|
|
||||||
msg_body = msg_body.replace('\n', '\r\n')
|
|
||||||
eq_("<html><body><p>Reply\r\n</p></body></html>",
|
|
||||||
quotations.extract_from_html(msg_body))
|
|
||||||
|
|||||||
@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():
|
|||||||
|
|
||||||
def test_empty_body():
|
def test_empty_body():
|
||||||
eq_('', quotations.extract_from_plain(''))
|
eq_('', quotations.extract_from_plain(''))
|
||||||
|
|
||||||
|
|
||||||
def test__CRLF_to_LF():
|
|
||||||
eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
|
|
||||||
eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
|
|
||||||
|
|
||||||
|
|
||||||
def test__restore_CRLF():
|
|
||||||
eq_('\n', quotations._restore_CRLF('\n', replaced=False))
|
|
||||||
eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))
|
|
||||||
# default
|
|
||||||
eq_('\r\n', quotations._restore_CRLF('\n'))
|
|
||||||
|
|||||||
@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs
|
|||||||
|
|
||||||
|
|
||||||
def test_apply_features():
|
def test_apply_features():
|
||||||
s = '''This is John Doe
|
s = '''John Doe
|
||||||
|
|
||||||
Tuesday @3pm suits. I'll chat to you then.
|
|
||||||
|
|
||||||
VP Research and Development, Xxxx Xxxx Xxxxx
|
VP Research and Development, Xxxx Xxxx Xxxxx
|
||||||
|
|
||||||
@@ -21,12 +19,11 @@ john@example.com'''
|
|||||||
# note that we don't consider the first line because signatures don't
|
# note that we don't consider the first line because signatures don't
|
||||||
# usually take all the text, empty lines are not considered
|
# usually take all the text, empty lines are not considered
|
||||||
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
||||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
|
||||||
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||||
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
|
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
|
||||||
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
|
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
|
||||||
|
|
||||||
with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
|
with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
|
||||||
features = fs.features(sender)
|
features = fs.features(sender)
|
||||||
new_result = fs.apply_features(s, features)
|
new_result = fs.apply_features(s, features)
|
||||||
# result remains the same because we don't consider empty lines
|
# result remains the same because we don't consider empty lines
|
||||||
|
|||||||
@@ -12,11 +12,11 @@ from talon import quotations
|
|||||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||||
def test_too_many_lines():
|
def test_too_many_lines():
|
||||||
msg_body = """Test reply
|
msg_body = """Test reply
|
||||||
Hi
|
|
||||||
-----Original Message-----
|
-----Original Message-----
|
||||||
|
|
||||||
Test"""
|
Test"""
|
||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_pattern_on_date_somebody_wrote():
|
def test_pattern_on_date_somebody_wrote():
|
||||||
@@ -311,33 +311,6 @@ Emne: The manager has commented on your Loop
|
|||||||
Blah-blah-blah
|
Blah-blah-blah
|
||||||
"""))
|
"""))
|
||||||
|
|
||||||
def test_swedish_from_block():
|
|
||||||
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
|
|
||||||
u"""Allo! Follow up MIME!
|
|
||||||
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
|
|
||||||
Skickat: den 26 augusti 2015 14:45
|
|
||||||
Till: Isacson Leiff
|
|
||||||
Ämne: RE: Week 36
|
|
||||||
|
|
||||||
Blah-blah-blah
|
|
||||||
"""))
|
|
||||||
|
|
||||||
def test_swedish_from_line():
|
|
||||||
eq_('Lorem', quotations.extract_from_plain(
|
|
||||||
"""Lorem
|
|
||||||
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
|
|
||||||
|
|
||||||
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
|
||||||
"""))
|
|
||||||
|
|
||||||
def test_norwegian_from_line():
|
|
||||||
eq_('Lorem', quotations.extract_from_plain(
|
|
||||||
u"""Lorem
|
|
||||||
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
|
|
||||||
|
|
||||||
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
|
||||||
"""))
|
|
||||||
|
|
||||||
def test_dutch_from_block():
|
def test_dutch_from_block():
|
||||||
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
|
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
|
||||||
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
||||||
|
|||||||
@@ -1,60 +1,9 @@
|
|||||||
# coding:utf-8
|
|
||||||
|
|
||||||
from . import *
|
from . import *
|
||||||
|
|
||||||
from talon import utils as u
|
from talon import utils
|
||||||
import cchardet
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_delimiter():
|
def test_get_delimiter():
|
||||||
eq_('\r\n', u.get_delimiter('abc\r\n123'))
|
eq_('\r\n', utils.get_delimiter('abc\r\n123'))
|
||||||
eq_('\n', u.get_delimiter('abc\n123'))
|
eq_('\n', utils.get_delimiter('abc\n123'))
|
||||||
eq_('\n', u.get_delimiter('abc'))
|
eq_('\n', utils.get_delimiter('abc'))
|
||||||
|
|
||||||
|
|
||||||
def test_unicode():
|
|
||||||
eq_ (u'hi', u.to_unicode('hi'))
|
|
||||||
eq_ (type(u.to_unicode('hi')), unicode )
|
|
||||||
eq_ (type(u.to_unicode(u'hi')), unicode )
|
|
||||||
eq_ (type(u.to_unicode('привет')), unicode )
|
|
||||||
eq_ (type(u.to_unicode(u'привет')), unicode )
|
|
||||||
eq_ (u"привет", u.to_unicode('привет'))
|
|
||||||
eq_ (u"привет", u.to_unicode(u'привет'))
|
|
||||||
# some latin1 stuff
|
|
||||||
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
|
|
||||||
|
|
||||||
|
|
||||||
def test_detect_encoding():
|
|
||||||
eq_ ('ascii', u.detect_encoding('qwe').lower())
|
|
||||||
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
|
|
||||||
eq_ ('utf-8', u.detect_encoding('привет').lower())
|
|
||||||
# fallback to utf-8
|
|
||||||
with patch.object(u.chardet, 'detect') as detect:
|
|
||||||
detect.side_effect = Exception
|
|
||||||
eq_ ('utf-8', u.detect_encoding('qwe').lower())
|
|
||||||
|
|
||||||
|
|
||||||
def test_quick_detect_encoding():
|
|
||||||
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
|
|
||||||
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
|
|
||||||
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
|
|
||||||
|
|
||||||
|
|
||||||
@patch.object(cchardet, 'detect')
|
|
||||||
@patch.object(u, 'detect_encoding')
|
|
||||||
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
|
|
||||||
cchardet_detect.return_value = {'encoding': 'ascii'}
|
|
||||||
eq_('ascii', u.quick_detect_encoding("qwe"))
|
|
||||||
cchardet_detect.assert_called_once_with("qwe")
|
|
||||||
|
|
||||||
# fallback to detect_encoding
|
|
||||||
cchardet_detect.return_value = {}
|
|
||||||
detect_encoding.return_value = 'utf-8'
|
|
||||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
|
||||||
|
|
||||||
# exception
|
|
||||||
detect_encoding.reset_mock()
|
|
||||||
cchardet_detect.side_effect = Exception()
|
|
||||||
detect_encoding.return_value = 'utf-8'
|
|
||||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
|
||||||
ok_(detect_encoding.called)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user