1 Commits

Author SHA1 Message Date
Ralph Meijer
2377c387c7 Actually bump up talon's version up to 1.0.5 to match the tag. 2015-09-09 22:46:18 +02:00
14 changed files with 80 additions and 385 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.2.0', version='1.0.5',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -14,14 +14,12 @@ setup(name='talon',
include_package_data=True, include_package_data=True,
zip_safe=True, zip_safe=True,
install_requires=[ install_requires=[
"lxml>=2.3.3", "lxml==2.3.3",
"regex>=1", "regex>=1",
"html2text",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect'
], ],
tests_require=[ tests_require=[
"mock", "mock",

View File

@@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message): def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. ''' ''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote') gmail_quote = html_message.cssselect('.gmail_quote')
if gmail_quote: if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0]) gmail_quote[0].getparent().remove(gmail_quote[0])
return True return True
@@ -138,14 +138,9 @@ def cut_by_id(html_message):
def cut_blockquote(html_message): def cut_blockquote(html_message):
''' Cuts the last non-nested blockquote with wrapping elements.''' ''' Cuts blockquote with wrapping elements. '''
quote = html_message.xpath( quote = html_message.find('.//blockquote')
'(.//blockquote)' if quote is not None:
'[not(@class="gmail_quote") and not(ancestor::blockquote)]'
'[last()]')
if quote:
quote = quote[0]
quote.getparent().remove(quote) quote.getparent().remove(quote)
return True return True

View File

@@ -10,8 +10,9 @@ import logging
from copy import deepcopy from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import html2text
from talon.utils import get_delimiter, html_to_text from talon.utils import get_delimiter
from talon import html_quotations from talon import html_quotations
@@ -21,7 +22,7 @@ log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile( RE_ON_DATE_SMB_WROTE = re.compile(
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
# Beginning of the line # Beginning of the line
u'|'.join(( u'|'.join((
# English # English
@@ -31,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
'W dniu', 'W dniu',
# Dutch # Dutch
'Op', 'Op'
# German
'Am',
# Norwegian
u'',
# Swedish, Danish
'Den',
)), )),
# Date and sender separator # Date and sender separator
u'|'.join(( u'|'.join((
@@ -55,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
u'napisał', u'napisał',
# Dutch # Dutch
'schreef','verzond','geschreven', 'schreef','verzond','geschreven'
# German
'schrieb',
# Norwegian, Swedish
'skrev',
)) ))
)) ))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile( RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
# Beginning of the line # Beginning of the line
u'|'.join((
'Op', 'Op',
#German
'Am'
)),
# Ending of the line # Ending of the line
u'|'.join(( u'|'.join((
# Dutch # Dutch
'schreef','verzond','geschreven', 'schreef','verzond','geschreven'
# German
'schrieb'
)) ))
) )
) )
@@ -107,7 +92,7 @@ RE_EMPTY_QUOTATION = re.compile(
( (
# quotation border: splitter line or a number of quotation marker lines # quotation border: splitter line or a number of quotation marker lines
(?: (?:
(?:se*)+ s
| |
(?:me*){2,} (?:me*){2,}
) )
@@ -130,15 +115,15 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
u'|'.join(( u'|'.join((
# "From" in different languages. # "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', u'Från', 'From', 'Van', 'De', 'Von', 'Fra',
# "Date" in different languages. # "Date" in different languages.
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Date', 'Datum', u'Envoyé'
))), re.I) ))), re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
# <date> <person> # <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB, RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON, RE_FROM_COLON_OR_DATE_COLON,
@@ -196,7 +181,6 @@ def mark_message_lines(lines):
else: else:
# in case splitter is spread across several lines # in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter: if splitter:
# append as many splitter markers as lines in splitter # append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines() splitter_lines = splitter.group().splitlines()
@@ -309,8 +293,12 @@ def extract_from_plain(msg_body):
delimiter = get_delimiter(msg_body) delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter) msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages # don't process too long messages
lines = msg_body.splitlines()[:MAX_LINES_COUNT] if len(lines) > MAX_LINES_COUNT:
return stripped_text
markers = mark_message_lines(lines) markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers) lines = process_marked_lines(lines, markers)
@@ -336,27 +324,43 @@ def extract_from_html(msg_body):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '': if msg_body.strip() == '':
return msg_body return msg_body
msg_body = msg_body.replace('\r\n', '').replace('\n', '')
html_tree = html.document_fromstring( html_tree = html.document_fromstring(
msg_body, msg_body,
parser=html.HTMLParser(encoding="utf-8") parser=html.HTMLParser(encoding="utf-8")
) )
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree) html_quotations.cut_from_block(html_tree)
) )
html_tree_copy = deepcopy(html_tree) html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) msg_with_checkpoints = html.tostring(html_tree)
plain_text = html_to_text(msg_with_checkpoints)
plain_text = preprocess(plain_text, '\n', content_type='text/html') h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
# Don't process too long messages # Don't process too long messages
@@ -378,6 +382,7 @@ def extract_from_html(msg_body):
return_flags = [] return_flags = []
process_marked_lines(lines, markers, return_flags) process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted: if lines_were_deleted:
#collect checkpoints from deleted lines #collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted): for i in xrange(first_deleted, last_deleted):

Binary file not shown.

View File

@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile rc = re.compile
RE_EMAIL = rc('\S@\S') RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
names = names or sender names = names or sender
if names != '': if names != '':
return binary_regex_search(re.compile(names)) return binary_regex_search(re.compile(names))
return lambda s: 0 return lambda s: False
def extract_names(sender): def extract_names(sender):
@@ -134,7 +134,7 @@ def extract_names(sender):
>>> extract_names('') >>> extract_names('')
[] []
""" """
sender = to_unicode(sender, precise=True) sender = to_unicode(sender)
# Remove non-alphabetical characters # Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender]) sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e. # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
50.0 50.0
''' '''
count = 0 count = 0
s = to_unicode(s, precise=True) s = to_unicode(s)
for c in s: for c in s:
if unicodedata.category(c) in categories: if unicodedata.category(c) in categories:
count += 1 count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
def capitalized_words_percent(s): def capitalized_words_percent(s):
'''Returns capitalized words percent.''' '''Returns capitalized words percent.'''
s = to_unicode(s, precise=True) s = to_unicode(s)
words = re.split('\s', s) words = re.split('\s', s)
words = [w for w in words if w.strip()] words = [w for w in words if w.strip()]
capitalized_words_counter = 0 capitalized_words_counter = 0

View File

@@ -2,16 +2,13 @@
import logging import logging
from random import shuffle from random import shuffle
import chardet
import cchardet
import regex as re
from lxml import html
from lxml.cssselect import CSSSelector
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs): def safe_format(format_string, *args, **kwargs):
""" """
Helper: formats string with any combination of bytestrings/unicode Helper: formats string with any combination of bytestrings/unicode
@@ -45,42 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
u'привет' u'привет'
If `precise` flag is True, tries to guess the correct encoding first. If `precise` flag is True, tries to guess the correct encoding first.
""" """
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str): if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace') return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode return str_or_unicode
def detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Asks ``chardet.detect`` and returns the encoding name it reports.
    Defaults to UTF-8 when nothing is detected, when the reported encoding
    is empty, or when detection raises.
    """
    try:
        detected = chardet.detect(string)
        if detected:
            return detected.get('encoding') or 'utf-8'
    except Exception:
        # Best-effort by design: a failing detector must not break the
        # caller, so fall through to the UTF-8 default below.
        pass
    return 'utf-8'
def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet first. Falls back to detect_encoding when cchardet
    reports nothing, reports an empty encoding, or raises.
    """
    try:
        detected = cchardet.detect(string)
        if detected:
            return detected.get('encoding') or detect_encoding(string)
    except Exception:
        # Best-effort by design: a cchardet failure is not fatal, the
        # fallback detector below is used instead.
        pass
    return detect_encoding(string)
def to_utf8(str_or_unicode): def to_utf8(str_or_unicode):
""" """
Safely returns a UTF-8 version of a given string Safely returns a UTF-8 version of a given string
@@ -107,81 +74,3 @@ def get_delimiter(msg_body):
delimiter = '\n' delimiter = '\n'
return delimiter return delimiter
def html_to_text(string):
    r"""
    Dead-simple HTML-to-text converter.

    For example ``html_to_text("one<br>two<br>three")`` yields roughly
    ``"one\ntwo\nthree"`` (every text chunk is followed by a single space
    before the final strip).

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    s = _prepend_utf8_declaration(string)
    s = s.replace("\n", "")

    tree = html.fromstring(s)

    # NOTE(review): lxml's remove() also discards the removed node's tail
    # text, so text that directly follows a <style> element or a comment is
    # dropped as well -- confirm this is intended.
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)

    for c in tree.xpath('//comment()'):
        c.getparent().remove(c)

    # Collect the output pieces and join them once at the end instead of
    # growing a string inside the loop. Every piece is non-empty, so
    # "parts is non-empty" and "last piece ends with a newline" are exactly
    # the conditions the accumulated text would satisfy.
    parts = []
    for el in tree.iter():
        el_text = (el.text or '') + (el.tail or '')
        if len(el_text) > 1:
            if el.tag in _BLOCKTAGS:
                parts.append("\n")
            if el.tag == 'li':
                parts.append(" * ")
            parts.append(el_text.strip() + " ")

            # add href to the output
            href = el.attrib.get('href')
            if href:
                parts.append("(%s) " % href)

        if el.tag in _HARDBREAKS and parts and not parts[-1].endswith("\n"):
            parts.append("\n")

    retval = _rm_excessive_newlines("".join(parts))
    return _encode_utf8(retval)
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
return s.lower().find('html; charset=', 0, 4096) != -1
def _prepend_utf8_declaration(s):
    """Return `s` as is when its first 4KB already carry a charset spec,
    otherwise return it with the 'utf-8' encoding declaration in front.
    """
    if _contains_charset_spec(s):
        return s
    return _UTF8_DECLARATION + s
def _rm_excessive_newlines(s):
    """Squash the newline runs that tons of divs tend to produce into a
    single blank line, then strip surrounding whitespace.
    """
    squashed = _RE_EXCESSIVE_NEWLINES.sub("\n\n", s)
    return squashed.strip()
def _encode_utf8(s):
    """Return unicode input encoded as 'utf-8'; anything else is passed
    through untouched.
    """
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return s
# Declaration that _prepend_utf8_declaration() puts in front of HTML whose
# first 4KB lack a charset spec.
# NOTE(review): the two literals join as "text/html;charset=utf-8" (no space
# after ';'), while _contains_charset_spec() searches for 'html; charset='
# with a space -- confirm the mismatch is intended.
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
'charset=utf-8">')
# Tags for which html_to_text() emits a newline before the element's text.
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
# Tags after which html_to_text() forces a newline unless the output already
# ends with one.
_HARDBREAKS = ['br', 'hr', 'tr']
# Runs of 2 to 10 newlines; _rm_excessive_newlines() replaces each run with
# a single blank line.
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

View File

@@ -1,4 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<html> <html>
<head> <head>
<style><!-- <style><!--

View File

@@ -1,19 +0,0 @@
Content-Type: text/plain;
charset=us-ascii
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
Subject: Re: Hello there
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
From: Adam Renberg <adam@tictail.com>
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
Date: Sat, 22 Aug 2015 19:22:20 +0200
Content-Transfer-Encoding: 7bit
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
To: Adam Renberg <tgwizard@gmail.com>
Hello
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
>
> Hi there!

View File

@@ -5,7 +5,9 @@ from . fixtures import *
import regex as re import regex as re
from talon import quotations, utils as u from talon import quotations
import html2text
RE_WHITESPACE = re.compile("\s") RE_WHITESPACE = re.compile("\s")
@@ -43,25 +45,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><body><p>Reply</p><div></div></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_regular_blockquote():
msg_body = """Reply
<blockquote>Regular</blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<blockquote>
<div>
<blockquote>Nested</blockquote>
</div>
</blockquote>
"""
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -131,18 +115,6 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
# A <blockquote> that carries class "gmail_quote" must survive extraction:
# the result has to equal the input once whitespace is removed from both.
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
<div class="gmail_default">
My name is William Shakespeare.
<br/>
</div>
</blockquote>"""
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_unicode_in_reply(): def test_unicode_in_reply():
msg_body = u"""Reply \xa0 \xa0 Text<br> msg_body = u"""Reply \xa0 \xa0 Text<br>
@@ -150,7 +122,7 @@ def test_unicode_in_reply():
<br> <br>
</div> </div>
<blockquote> <blockquote class="gmail_quote">
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
@@ -268,15 +240,26 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) f = open(filename)
msg_body = f.read() msg_body = f.read().decode("utf-8")
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)
plain_reply = u.html_to_text(reply)
eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), h = html2text.HTML2Text()
RE_WHITESPACE.sub('', plain_reply)) h.body_width = 0
plain_reply = h.handle(reply)
#remove &nbsp; spaces
plain_reply = plain_reply.replace(u'\xa0', u' ')
if RE_REPLY.match(plain_reply):
eq_(1, 1)
else:
eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
def test_gmail_reply(): def test_gmail_reply():
@@ -309,30 +292,3 @@ def test_windows_mail_reply():
def test_yandex_ru_reply(): def test_yandex_ru_reply():
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
def test_CRLF():
"""CR is not converted to '&#13;'
"""
symbol = '&#13;'
# Case 1: a bare document containing one CRLF -- the entity must not show up
# and the markup must be otherwise unchanged (ignoring whitespace).
extracted = quotations.extract_from_html('<html>\r\n</html>')
assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
# Case 2: a reply followed by a quoted block, with every LF turned into CRLF;
# the quotation must still be cut and no entity may appear.
msg_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))

View File

@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs
def test_apply_features(): def test_apply_features():
s = '''This is John Doe s = '''John Doe
Tuesday @3pm suits. I'll chat to you then.
VP Research and Development, Xxxx Xxxx Xxxxx VP Research and Development, Xxxx Xxxx Xxxxx
@@ -21,12 +19,11 @@ john@example.com'''
# note that we don't consider the first line because signatures don't # note that we don't consider the first line because signatures don't
# usually take all the text, empty lines are not considered # usually take all the text, empty lines are not considered
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
features = fs.features(sender) features = fs.features(sender)
new_result = fs.apply_features(s, features) new_result = fs.apply_features(s, features)
# result remains the same because we don't consider empty lines # result remains the same because we don't consider empty lines

View File

@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1) @patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines(): def test_too_many_lines():
msg_body = """Test reply msg_body = """Test reply
Hi
-----Original Message----- -----Original Message-----
Test""" Test"""
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_(msg_body, quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote(): def test_pattern_on_date_somebody_wrote():
@@ -311,33 +311,6 @@ Emne: The manager has commented on your Loop
Blah-blah-blah Blah-blah-blah
""")) """))
# A Swedish header block (Från / Skickat / Till / Ämne) must be treated as the
# start of the quotation, leaving only the first line as the reply.
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
Ämne: RE: Week 36
Blah-blah-blah
"""))
# A Swedish "Den <date>, <sender> skrev:" line must be recognised as a
# splitter, so only 'Lorem' remains.
def test_swedish_from_line():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
# A Norwegian "På <date>, <sender> skrev:" line must be recognised as a
# splitter, so only 'Lorem' remains.
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block(): def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud. """Gluten-free culpa lo-fi et nesciunt nostrud.

View File

@@ -1,107 +1,9 @@
# coding:utf-8
from . import * from . import *
from talon import utils as u from talon import utils
import cchardet
def test_get_delimiter(): def test_get_delimiter():
eq_('\r\n', u.get_delimiter('abc\r\n123')) eq_('\r\n', utils.get_delimiter('abc\r\n123'))
eq_('\n', u.get_delimiter('abc\n123')) eq_('\n', utils.get_delimiter('abc\n123'))
eq_('\n', u.get_delimiter('abc')) eq_('\n', utils.get_delimiter('abc'))
# to_unicode() must return a unicode object for both byte-string and unicode
# input, with ASCII and Cyrillic content alike.
def test_unicode():
eq_ (u'hi', u.to_unicode('hi'))
eq_ (type(u.to_unicode('hi')), unicode )
eq_ (type(u.to_unicode(u'hi')), unicode )
eq_ (type(u.to_unicode('привет')), unicode )
eq_ (type(u.to_unicode(u'привет')), unicode )
eq_ (u"привет", u.to_unicode('привет'))
eq_ (u"привет", u.to_unicode(u'привет'))
# some latin1 stuff
# (only decodes correctly with precise=True, i.e. with encoding detection)
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
# detect_encoding() must report the expected (lower-cased) encoding name for
# ASCII text, a byte string containing \xf3, and Cyrillic text.
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding('qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.detect_encoding('привет').lower())
# fallback to utf-8
# (chardet.detect is patched to raise, so the default has to be returned)
with patch.object(u.chardet, 'detect') as detect:
detect.side_effect = Exception
eq_ ('utf-8', u.detect_encoding('qwe').lower())
# quick_detect_encoding() on the same three inputs as test_detect_encoding;
# note the \xf3 byte string is expected to come back as windows-1252 here.
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
# Exercises quick_detect_encoding() with both cchardet.detect and
# utils.detect_encoding replaced by mocks.
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
# cchardet reports an encoding: it is returned as is
cchardet_detect.return_value = {'encoding': 'ascii'}
eq_('ascii', u.quick_detect_encoding("qwe"))
cchardet_detect.assert_called_once_with("qwe")
# fallback to detect_encoding
# (cchardet returns an empty result)
cchardet_detect.return_value = {}
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
# exception
# (cchardet raises, so detect_encoding must have been called)
detect_encoding.reset_mock()
cchardet_detect.side_effect = Exception()
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
ok_(detect_encoding.called)
def test_html_to_text():
# paragraphs, a <br> and a bulleted list
html = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
text = u.html_to_text(html)
eq_("Hello world! \n\n * One! \n * Two \nHaha", text)
# non-ASCII text inside an inline tag
eq_("привет!", u.html_to_text("<b>привет!</b>"))
# leading line breaks leave no leading newlines in the result
html = '<body><br/><br/>Hi</body>'
eq_ ('Hi', u.html_to_text(html))
# the content of <style> elements is left out of the output
html = """Hi
<style type="text/css">
div, p, li {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>
<style type="text/css">
h1 {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
eq_ ('Hi', u.html_to_text(html))
# HTML comments are left out of the output
html = """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""
eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))