@@ -1,9 +1,7 @@
|
||||
recursive-include tests *
|
||||
recursive-include talon *
|
||||
recursive-exclude tests *.pyc *~
|
||||
recursive-exclude talon *.pyc *~
|
||||
include train.data
|
||||
include classifier
|
||||
include LICENSE
|
||||
include MANIFEST.in
|
||||
include README.rst
|
||||
include README.rst
|
||||
|
||||
35
setup.py
35
setup.py
@@ -1,4 +1,31 @@
|
||||
from __future__ import absolute_import
|
||||
from setuptools import setup, find_packages
|
||||
from setuptools.command.install import install
|
||||
|
||||
|
||||
class InstallCommand(install):
    """``install`` command with an opt-out for the Machine Learning extras.

    Passing ``--no-ml`` leaves the ``tests`` and ``talon.signature`` packages
    out of the distribution and drops the numpy / scipy / scikit-learn
    entries from ``install_requires``.
    """

    user_options = install.user_options + [
        ('no-ml', None, "Install without Machine Learning modules."),
    ]

    boolean_options = install.boolean_options + ['no-ml']

    # Distribution names of the requirements dropped by --no-ml.  Matching is
    # done on the bare name so a changed version pin in setup() cannot make
    # the removal fail.
    _ML_REQUIREMENTS = ('numpy', 'scipy', 'scikit-learn')

    @staticmethod
    def _requirement_name(requirement):
        """Return the lower-cased distribution name of a requirement string.

        Everything from the first version operator, extras bracket,
        environment marker or space onwards is discarded, e.g.
        ``'scikit-learn==0.16.1'`` -> ``'scikit-learn'``.
        """
        for index, char in enumerate(requirement):
            if char in '<>=!~;[ ':
                return requirement[:index].strip().lower()
        return requirement.strip().lower()

    def initialize_options(self):
        """Initialize the base options and leave ``no_ml`` unset."""
        install.initialize_options(self)
        self.no_ml = None

    def finalize_options(self):
        """Finalize the base options, then apply ``--no-ml`` if requested."""
        install.finalize_options(self)
        if not self.no_ml:
            return

        dist = self.distribution
        dist.packages = find_packages(exclude=[
            'tests',
            'tests.*',
            'talon.signature',
            'talon.signature.*',
        ])

        requires = dist.install_requires
        if requires:
            # Filter in place so any other reference to the list sees the
            # change; absent entries are simply not matched instead of
            # raising ValueError as list.remove() would.
            requires[:] = [
                requirement for requirement in requires
                if self._requirement_name(requirement)
                not in self._ML_REQUIREMENTS
            ]
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
@@ -10,7 +37,10 @@ setup(name='talon',
|
||||
author_email='admin@mailgunhq.com',
|
||||
url='https://github.com/mailgun/talon',
|
||||
license='APACHE2',
|
||||
packages=find_packages(exclude=['tests']),
|
||||
cmdclass={
|
||||
'install': InstallCommand,
|
||||
},
|
||||
packages=find_packages(exclude=['tests', 'tests.*']),
|
||||
include_package_data=True,
|
||||
zip_safe=True,
|
||||
install_requires=[
|
||||
@@ -21,7 +51,8 @@ setup(name='talon',
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
'cssselect'
|
||||
'cssselect',
|
||||
'six>=1.10.0',
|
||||
],
|
||||
tests_require=[
|
||||
"mock",
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
from __future__ import absolute_import
|
||||
from talon.quotations import register_xpath_extensions
|
||||
from talon import signature
|
||||
try:
|
||||
from talon import signature
|
||||
ML_ENABLED = True
|
||||
except ImportError:
|
||||
ML_ENABLED = False
|
||||
|
||||
|
||||
def init():
    """Prepare talon for use.

    Registers the XPath extensions imported from ``talon.quotations`` and,
    when the optional ``talon.signature`` package could be imported
    (``ML_ENABLED`` is True), initializes it as well.
    """
    register_xpath_extensions()
    # ``signature`` is only bound when its import succeeded, so it must never
    # be touched outside this guard.
    if ML_ENABLED:
        signature.initialize()
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ The module's functions operate on message bodies trying to extract original
|
||||
messages (without quoted messages) from html
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ The module's functions operate on message bodies trying to extract
|
||||
original messages (without quoted messages)
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
import logging
|
||||
from copy import deepcopy
|
||||
@@ -13,6 +14,8 @@ from lxml import html, etree
|
||||
|
||||
from talon.utils import get_delimiter, html_to_text
|
||||
from talon import html_quotations
|
||||
from six.moves import range
|
||||
import six
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -191,7 +194,7 @@ def mark_message_lines(lines):
|
||||
>>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
|
||||
'tsem'
|
||||
"""
|
||||
markers = bytearray(len(lines))
|
||||
markers = ['e' for _ in lines]
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
if not lines[i].strip():
|
||||
@@ -207,7 +210,7 @@ def mark_message_lines(lines):
|
||||
if splitter:
|
||||
# append as many splitter markers as lines in splitter
|
||||
splitter_lines = splitter.group().splitlines()
|
||||
for j in xrange(len(splitter_lines)):
|
||||
for j in range(len(splitter_lines)):
|
||||
markers[i + j] = 's'
|
||||
|
||||
# skip splitter lines
|
||||
@@ -217,7 +220,7 @@ def mark_message_lines(lines):
|
||||
markers[i] = 't'
|
||||
i += 1
|
||||
|
||||
return markers
|
||||
return ''.join(markers)
|
||||
|
||||
|
||||
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
|
||||
@@ -231,6 +234,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
|
||||
return_flags = [were_lines_deleted, first_deleted_line,
|
||||
last_deleted_line]
|
||||
"""
|
||||
markers = ''.join(markers)
|
||||
# if there are no splitter there should be no markers
|
||||
if 's' not in markers and not re.search('(me*){3}', markers):
|
||||
markers = markers.replace('m', 't')
|
||||
@@ -276,10 +280,15 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
||||
Replaces link brackets so that they couldn't be taken for quotation marker.
|
||||
Splits line in two if splitter pattern preceded by some text on the same
|
||||
line (done only for 'On <date> <person> wrote:' pattern).
|
||||
|
||||
Converts msg_body into a unicode.
|
||||
"""
|
||||
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
||||
# so that '>' closing the link couldn't be mistakenly taken for quotation
|
||||
# marker.
|
||||
if isinstance(msg_body, bytes):
|
||||
msg_body = msg_body.decode('utf8')
|
||||
|
||||
def link_wrapper(link):
|
||||
newline_index = msg_body[:link.start()].rfind("\n")
|
||||
if msg_body[newline_index + 1] == ">":
|
||||
@@ -342,11 +351,41 @@ def extract_from_html(msg_body):
|
||||
then extracting quotations from text,
|
||||
then checking deleted checkpoints,
|
||||
then deleting necessary tags.
|
||||
|
||||
Returns a unicode string.
|
||||
"""
|
||||
if msg_body.strip() == '':
|
||||
if isinstance(msg_body, six.text_type):
|
||||
msg_body = msg_body.encode('utf8')
|
||||
elif not isinstance(msg_body, bytes):
|
||||
msg_body = msg_body.encode('ascii')
|
||||
|
||||
result = _extract_from_html(msg_body)
|
||||
if isinstance(result, bytes):
|
||||
result = result.decode('utf8')
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def _extract_from_html(msg_body):
|
||||
"""
|
||||
Extract not quoted message from provided html message body
|
||||
using tags and plain text algorithm.
|
||||
|
||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||
Cut Microsoft quotations.
|
||||
|
||||
Then use plain text algorithm to cut out splitter or
|
||||
leftover quotation.
|
||||
This works by adding checkpoint text to all html tags,
|
||||
then converting html to text,
|
||||
then extracting quotations from text,
|
||||
then checking deleted checkpoints,
|
||||
then deleting necessary tags.
|
||||
"""
|
||||
if msg_body.strip() == b'':
|
||||
return msg_body
|
||||
|
||||
msg_body = msg_body.replace('\r\n', '').replace('\n', '')
|
||||
msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
|
||||
html_tree = html.document_fromstring(
|
||||
msg_body,
|
||||
parser=html.HTMLParser(encoding="utf-8")
|
||||
@@ -388,7 +427,7 @@ def extract_from_html(msg_body):
|
||||
lines_were_deleted, first_deleted, last_deleted = return_flags
|
||||
if lines_were_deleted:
|
||||
#collect checkpoints from deleted lines
|
||||
for i in xrange(first_deleted, last_deleted):
|
||||
for i in range(first_deleted, last_deleted):
|
||||
for checkpoint in line_checkpoints[i]:
|
||||
quotation_checkpoints[checkpoint] = True
|
||||
else:
|
||||
|
||||
@@ -20,6 +20,7 @@ trained against, don't forget to regenerate:
|
||||
* signature/data/classifier
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
from . import extraction
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
|
||||
import regex as re
|
||||
@@ -111,7 +112,7 @@ def extract_signature(msg_body):
|
||||
|
||||
return (stripped_body.strip(),
|
||||
signature.strip())
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
log.exception('ERROR extracting signature')
|
||||
return (msg_body, None)
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
|
||||
import regex as re
|
||||
|
||||
@@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message
|
||||
body belongs to the signature.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from numpy import genfromtxt
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.externals import joblib
|
||||
|
||||
@@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the
|
||||
suffix which should be `_sender`.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
import regex as re
|
||||
|
||||
from talon.signature.constants import SIGNATURE_MAX_LINES
|
||||
from talon.signature.learning.featurespace import build_pattern, features
|
||||
from six.moves import range
|
||||
|
||||
|
||||
SENDER_SUFFIX = '_sender'
|
||||
@@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename,
|
||||
if not sender or not msg:
|
||||
continue
|
||||
lines = msg.splitlines()
|
||||
for i in xrange(1, min(SIGNATURE_MAX_LINES,
|
||||
for i in range(1, min(SIGNATURE_MAX_LINES,
|
||||
len(lines)) + 1):
|
||||
line = lines[-i]
|
||||
label = -1
|
||||
|
||||
@@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before
|
||||
applying features to them.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
||||
TOO_LONG_SIGNATURE_LINE)
|
||||
from talon.signature.learning.helpers import *
|
||||
from six.moves import zip
|
||||
from functools import reduce
|
||||
|
||||
|
||||
def features(sender=''):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import unicodedata
|
||||
import regex as re
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# coding:utf-8
|
||||
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
from random import shuffle
|
||||
import chardet
|
||||
@@ -10,6 +11,7 @@ from lxml import html
|
||||
from lxml.cssselect import CSSSelector
|
||||
|
||||
from talon.constants import RE_DELIMITER
|
||||
import six
|
||||
|
||||
|
||||
def safe_format(format_string, *args, **kwargs):
|
||||
@@ -28,7 +30,7 @@ def safe_format(format_string, *args, **kwargs):
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
format_string = to_utf8(format_string)
|
||||
args = [to_utf8(p) for p in args]
|
||||
kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
|
||||
kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)}
|
||||
return format_string.format(*args, **kwargs)
|
||||
|
||||
# ignore other errors
|
||||
@@ -45,9 +47,9 @@ def to_unicode(str_or_unicode, precise=False):
|
||||
u'привет'
|
||||
If `precise` flag is True, tries to guess the correct encoding first.
|
||||
"""
|
||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
if isinstance(str_or_unicode, str):
|
||||
return unicode(str_or_unicode, encoding, 'replace')
|
||||
if not isinstance(str_or_unicode, six.text_type):
|
||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
return six.text_type(str_or_unicode, encoding, 'replace')
|
||||
return str_or_unicode
|
||||
|
||||
|
||||
@@ -57,11 +59,12 @@ def detect_encoding(string):
|
||||
|
||||
Defaults to UTF-8.
|
||||
"""
|
||||
assert isinstance(string, bytes)
|
||||
try:
|
||||
detected = chardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or 'utf-8'
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
pass
|
||||
return 'utf-8'
|
||||
|
||||
@@ -72,11 +75,12 @@ def quick_detect_encoding(string):
|
||||
|
||||
Uses cchardet. Fallbacks to detect_encoding.
|
||||
"""
|
||||
assert isinstance(string, bytes)
|
||||
try:
|
||||
detected = cchardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or detect_encoding(string)
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
pass
|
||||
return detect_encoding(string)
|
||||
|
||||
@@ -87,7 +91,7 @@ def to_utf8(str_or_unicode):
|
||||
>>> utils.to_utf8(u'hi')
|
||||
'hi'
|
||||
"""
|
||||
if isinstance(str_or_unicode, unicode):
|
||||
if not isinstance(str_or_unicode, six.text_type):
|
||||
return str_or_unicode.encode("utf-8", "ignore")
|
||||
return str(str_or_unicode)
|
||||
|
||||
@@ -119,8 +123,11 @@ def html_to_text(string):
|
||||
1. the string is expected to contain UTF-8 encoded HTML!
|
||||
2. returns utf-8 encoded str (not unicode)
|
||||
"""
|
||||
if isinstance(string, six.text_type):
|
||||
string = string.encode('utf8')
|
||||
|
||||
s = _prepend_utf8_declaration(string)
|
||||
s = s.replace("\n", "")
|
||||
s = s.replace(b"\n", b"")
|
||||
|
||||
tree = html.fromstring(s)
|
||||
|
||||
@@ -155,7 +162,7 @@ def html_to_text(string):
|
||||
def _contains_charset_spec(s):
|
||||
"""Return True if the first 4KB contain charset spec
|
||||
"""
|
||||
return s.lower().find('html; charset=', 0, 4096) != -1
|
||||
return s.lower().find(b'html; charset=', 0, 4096) != -1
|
||||
|
||||
|
||||
def _prepend_utf8_declaration(s):
|
||||
@@ -173,11 +180,11 @@ def _rm_excessive_newlines(s):
|
||||
def _encode_utf8(s):
    """Encode in 'utf-8' if unicode

    Text (``six.text_type``) is returned as UTF-8 encoded bytes; any other
    value is returned unchanged.
    """
    # six.text_type is ``unicode`` on Python 2 and ``str`` on Python 3, so
    # this check works on both without naming ``unicode`` directly.
    return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
||||
|
||||
|
||||
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
|
||||
'charset=utf-8">')
|
||||
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
|
||||
b'charset=utf-8">')
|
||||
|
||||
|
||||
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from nose.tools import *
|
||||
from mock import *
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from . import *
|
||||
from . fixtures import *
|
||||
|
||||
@@ -305,6 +306,7 @@ def extract_reply_and_check(filename):
|
||||
msg_body = f.read()
|
||||
reply = quotations.extract_from_html(msg_body)
|
||||
plain_reply = u.html_to_text(reply)
|
||||
plain_reply = plain_reply.decode('utf8')
|
||||
|
||||
eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
|
||||
RE_WHITESPACE.sub('', plain_reply))
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from . import *
|
||||
from . fixtures import *
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from .. import *
|
||||
|
||||
from talon.signature import bruteforce
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from .. import *
|
||||
|
||||
import os
|
||||
@@ -8,6 +9,7 @@ from talon.signature.learning import dataset
|
||||
from talon import signature
|
||||
from talon.signature import extraction as e
|
||||
from talon.signature import bruteforce
|
||||
from six.moves import range
|
||||
|
||||
|
||||
def test_message_shorter_SIGNATURE_MAX_LINES():
|
||||
@@ -127,20 +129,20 @@ def test_mark_lines():
|
||||
|
||||
def test_process_marked_lines():
    """Check how ``_process_marked_lines`` splits lines into text/signature.

    Lists are built with ``list(range(...))`` because a Python 3 ``range``
    object never compares equal to a list.
    """
    # no signature found
    eq_((list(range(5)), None),
        e._process_marked_lines(list(range(5)), 'telt'))

    # signature in the middle of the text
    eq_((list(range(9)), None),
        e._process_marked_lines(list(range(9)), 'tesestelt'))

    # long line splits signature
    eq_((list(range(7)), [7, 8]),
        e._process_marked_lines(list(range(9)), 'tsslsless'))

    eq_((list(range(20)), [20]),
        e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets'))

    # some signature lines could be identified as text
    eq_(([0], list(range(1, 9))),
        e._process_marked_lines(list(range(9)), 'tsetetest'))

    eq_(([], list(range(5))),
        e._process_marked_lines(list(range(5)), "ststt"))
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from ... import *
|
||||
import os
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from ... import *
|
||||
|
||||
from talon.signature.learning import featurespace as fs
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from ... import *
|
||||
|
||||
import regex as re
|
||||
|
||||
from talon.signature.learning import helpers as h
|
||||
from talon.signature.learning.helpers import *
|
||||
from six.moves import range
|
||||
|
||||
# First testing regex constants.
|
||||
VALID = '''
|
||||
@@ -154,7 +156,7 @@ def test_extract_names():
|
||||
# check that extracted names could be compiled
|
||||
try:
|
||||
re.compile("|".join(extracted_names))
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
ok_(False, ("Failed to compile extracted names {}"
|
||||
"\n\nReason: {}").format(extracted_names, e))
|
||||
if expected_names:
|
||||
@@ -204,7 +206,7 @@ def test_has_signature():
|
||||
'sender@example.com'))
|
||||
assert_false(h.has_signature('http://www.example.com/555-555-5555',
|
||||
'sender@example.com'))
|
||||
long_line = ''.join(['q' for e in xrange(28)])
|
||||
long_line = ''.join(['q' for e in range(28)])
|
||||
assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))
|
||||
# wont crash on an empty string
|
||||
assert_false(h.has_signature('', ''))
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from . import *
|
||||
from . fixtures import *
|
||||
|
||||
@@ -7,6 +8,9 @@ import os
|
||||
|
||||
import email.iterators
|
||||
from talon import quotations
|
||||
import six
|
||||
from six.moves import range
|
||||
from six import StringIO
|
||||
|
||||
|
||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||
@@ -138,7 +142,7 @@ def _check_pattern_original_message(original_message_indicator):
|
||||
-----{}-----
|
||||
|
||||
Test"""
|
||||
eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))
|
||||
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
|
||||
|
||||
def test_english_original_message():
|
||||
_check_pattern_original_message('Original Message')
|
||||
@@ -662,6 +666,15 @@ def test_preprocess_postprocess_2_links():
|
||||
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def body_iterator(msg, decode=False):
    """Yield the payload of each non-multipart part of ``msg`` as text.

    :param msg: an ``email.message.Message``; it is traversed with ``walk()``.
    :param decode: forwarded to ``get_payload``.
    :returns: a generator of text strings; payloads that are not already
        text are decoded as UTF-8.
    """
    for subpart in msg.walk():
        # A multipart container has no body of its own: get_payload() gives
        # a list of sub-messages (or None when decode=True), neither of
        # which can be decoded.  Its children are visited by walk() anyway.
        if subpart.is_multipart():
            continue
        payload = subpart.get_payload(decode=decode)
        if payload is None:
            continue
        if isinstance(payload, six.text_type):
            yield payload
        else:
            yield payload.decode('utf8')
|
||||
|
||||
|
||||
def test_standard_replies():
|
||||
for filename in os.listdir(STANDARD_REPLIES):
|
||||
filename = os.path.join(STANDARD_REPLIES, filename)
|
||||
@@ -669,8 +682,8 @@ def test_standard_replies():
|
||||
continue
|
||||
with open(filename) as f:
|
||||
message = email.message_from_file(f)
|
||||
body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
|
||||
text = ''.join(email.iterators.body_line_iterator(body, True))
|
||||
body = next(email.iterators.typed_subpart_iterator(message, subtype='plain'))
|
||||
text = ''.join(body_iterator(body, True))
|
||||
|
||||
stripped_text = quotations.extract_from_plain(text)
|
||||
reply_text_fn = filename[:-4] + '_reply_text'
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
# coding:utf-8
|
||||
|
||||
from __future__ import absolute_import
|
||||
from . import *
|
||||
|
||||
from talon import utils as u
|
||||
import cchardet
|
||||
import six
|
||||
|
||||
|
||||
def test_get_delimiter():
|
||||
@@ -14,49 +16,49 @@ def test_get_delimiter():
|
||||
|
||||
def test_unicode():
    """``to_unicode`` returns ``six.text_type`` for the inputs below."""
    eq_(u'hi', u.to_unicode('hi'))
    eq_(type(u.to_unicode('hi')), six.text_type)
    eq_(type(u.to_unicode(u'hi')), six.text_type)
    eq_(type(u.to_unicode('привет')), six.text_type)
    eq_(type(u.to_unicode(u'привет')), six.text_type)
    eq_(u"привет", u.to_unicode('привет'))
    eq_(u"привет", u.to_unicode(u'привет'))
    # some latin1 stuff; encoded explicitly so the input is bytes on both
    # Python 2 and Python 3
    eq_(u"Versión",
        u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
|
||||
|
||||
|
||||
def test_detect_encoding():
    """``detect_encoding`` is always given bytes, as it asserts on its input."""
    eq_('ascii', u.detect_encoding(b'qwe').lower())
    eq_('iso-8859-2',
        u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
    eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
    # fallback to utf-8
    with patch.object(u.chardet, 'detect') as detect:
        detect.side_effect = Exception
        eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
|
||||
|
||||
|
||||
def test_quick_detect_encoding():
    """``quick_detect_encoding`` is exercised with bytes inputs only."""
    eq_('ascii', u.quick_detect_encoding(b'qwe').lower())
    eq_('windows-1252',
        u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
    eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
|
||||
|
||||
|
||||
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
    """Cover the cchardet result, empty-result and exception paths."""
    cchardet_detect.return_value = {'encoding': 'ascii'}
    # exactly one call precedes assert_called_once_with, otherwise it fails
    eq_('ascii', u.quick_detect_encoding(b"qwe"))
    cchardet_detect.assert_called_once_with(b"qwe")

    # fallback to detect_encoding
    cchardet_detect.return_value = {}
    detect_encoding.return_value = 'utf-8'
    eq_('utf-8', u.quick_detect_encoding(b"qwe"))

    # exception
    detect_encoding.reset_mock()
    cchardet_detect.side_effect = Exception()
    detect_encoding.return_value = 'utf-8'
    eq_('utf-8', u.quick_detect_encoding(b"qwe"))
    ok_(detect_encoding.called)
|
||||
|
||||
|
||||
@@ -73,11 +75,11 @@ Haha
|
||||
</p>
|
||||
</body>"""
|
||||
text = u.html_to_text(html)
|
||||
eq_("Hello world! \n\n * One! \n * Two \nHaha", text)
|
||||
eq_("привет!", u.html_to_text("<b>привет!</b>"))
|
||||
eq_(b"Hello world! \n\n * One! \n * Two \nHaha", text)
|
||||
eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
|
||||
|
||||
html = '<body><br/><br/>Hi</body>'
|
||||
eq_ ('Hi', u.html_to_text(html))
|
||||
eq_ (b'Hi', u.html_to_text(html))
|
||||
|
||||
html = """Hi
|
||||
<style type="text/css">
|
||||
@@ -97,11 +99,11 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
|
||||
|
||||
}
|
||||
</style>"""
|
||||
eq_ ('Hi', u.html_to_text(html))
|
||||
eq_ (b'Hi', u.html_to_text(html))
|
||||
|
||||
html = """<div>
|
||||
<!-- COMMENT 1 -->
|
||||
<span>TEXT 1</span>
|
||||
<p>TEXT 2 <!-- COMMENT 2 --></p>
|
||||
</div>"""
|
||||
eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))
|
||||
eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
|
||||
|
||||
Reference in New Issue
Block a user