Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2377c387c7 |
7
setup.py
7
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.0.9',
|
||||
version='1.0.5',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
@@ -14,15 +14,12 @@ setup(name='talon',
|
||||
include_package_data=True,
|
||||
zip_safe=True,
|
||||
install_requires=[
|
||||
"lxml>=2.3.3",
|
||||
"lxml==2.3.3",
|
||||
"regex>=1",
|
||||
"html2text",
|
||||
"numpy",
|
||||
"scipy",
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
'cssselect'
|
||||
],
|
||||
tests_require=[
|
||||
"mock",
|
||||
|
||||
@@ -138,10 +138,9 @@ def cut_by_id(html_message):
|
||||
|
||||
|
||||
def cut_blockquote(html_message):
|
||||
''' Cuts the last non-nested blockquote with wrapping elements. '''
|
||||
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
|
||||
if quote:
|
||||
quote = quote[0]
|
||||
''' Cuts blockquote with wrapping elements. '''
|
||||
quote = html_message.find('.//blockquote')
|
||||
if quote is not None:
|
||||
quote.getparent().remove(quote)
|
||||
return True
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
|
||||
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
|
||||
|
||||
RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
||||
u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
|
||||
# Beginning of the line
|
||||
u'|'.join((
|
||||
# English
|
||||
@@ -32,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
'W dniu',
|
||||
# Dutch
|
||||
'Op',
|
||||
# German
|
||||
'Am',
|
||||
# Norwegian
|
||||
u'På',
|
||||
# Swedish, Danish
|
||||
'Den',
|
||||
'Op'
|
||||
)),
|
||||
# Date and sender separator
|
||||
u'|'.join((
|
||||
@@ -56,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
# Polish
|
||||
u'napisał',
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb',
|
||||
# Norwegian, Swedish
|
||||
'skrev',
|
||||
'schreef','verzond','geschreven'
|
||||
))
|
||||
))
|
||||
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
|
||||
RE_ON_DATE_WROTE_SMB = re.compile(
|
||||
u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
|
||||
u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
|
||||
# Beginning of the line
|
||||
u'|'.join((
|
||||
'Op',
|
||||
#German
|
||||
'Am'
|
||||
)),
|
||||
# Ending of the line
|
||||
u'|'.join((
|
||||
# Dutch
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb'
|
||||
'schreef','verzond','geschreven'
|
||||
))
|
||||
)
|
||||
)
|
||||
@@ -131,9 +115,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
|
||||
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
|
||||
u'|'.join((
|
||||
# "From" in different languages.
|
||||
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
|
||||
'From', 'Van', 'De', 'Von', 'Fra',
|
||||
# "Date" in different languages.
|
||||
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
|
||||
'Date', 'Datum', u'Envoyé'
|
||||
))), re.I)
|
||||
|
||||
SPLITTER_PATTERNS = [
|
||||
@@ -197,7 +181,6 @@ def mark_message_lines(lines):
|
||||
else:
|
||||
# in case splitter is spread across several lines
|
||||
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
|
||||
|
||||
if splitter:
|
||||
# append as many splitter markers as lines in splitter
|
||||
splitter_lines = splitter.group().splitlines()
|
||||
@@ -310,8 +293,12 @@ def extract_from_plain(msg_body):
|
||||
|
||||
delimiter = get_delimiter(msg_body)
|
||||
msg_body = preprocess(msg_body, delimiter)
|
||||
lines = msg_body.splitlines()
|
||||
|
||||
# don't process too long messages
|
||||
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
||||
if len(lines) > MAX_LINES_COUNT:
|
||||
return stripped_text
|
||||
|
||||
markers = mark_message_lines(lines)
|
||||
lines = process_marked_lines(lines, markers)
|
||||
|
||||
@@ -321,7 +308,7 @@ def extract_from_plain(msg_body):
|
||||
return msg_body
|
||||
|
||||
|
||||
def extract_from_html(s):
|
||||
def extract_from_html(msg_body):
|
||||
"""
|
||||
Extract not quoted message from provided html message body
|
||||
using tags and plain text algorithm.
|
||||
@@ -338,12 +325,8 @@ def extract_from_html(s):
|
||||
then deleting necessary tags.
|
||||
"""
|
||||
|
||||
if s.strip() == '':
|
||||
return s
|
||||
|
||||
# replace CRLF with LF temporaraly otherwise CR will be converted to ' '
|
||||
# when doing deepcopy on html tree
|
||||
msg_body, replaced = _CRLF_to_LF(s)
|
||||
if msg_body.strip() == '':
|
||||
return msg_body
|
||||
|
||||
html_tree = html.document_fromstring(
|
||||
msg_body,
|
||||
@@ -374,12 +357,15 @@ def extract_from_html(s):
|
||||
plain_text = plain_text.replace('*', '')
|
||||
# Unmask saved star symbols
|
||||
plain_text = plain_text.replace('3423oorkg432', '*')
|
||||
plain_text = preprocess(plain_text, '\n', content_type='text/html')
|
||||
|
||||
delimiter = get_delimiter(plain_text)
|
||||
|
||||
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
|
||||
lines = plain_text.splitlines()
|
||||
|
||||
# Don't process too long messages
|
||||
if len(lines) > MAX_LINES_COUNT:
|
||||
return s
|
||||
return msg_body
|
||||
|
||||
# Collect checkpoints on each line
|
||||
line_checkpoints = [
|
||||
@@ -404,9 +390,9 @@ def extract_from_html(s):
|
||||
quotation_checkpoints[checkpoint] = True
|
||||
else:
|
||||
if cut_quotations:
|
||||
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
|
||||
return html.tostring(html_tree_copy)
|
||||
else:
|
||||
return s
|
||||
return msg_body
|
||||
|
||||
# Remove tags with quotation checkpoints
|
||||
html_quotations.delete_quotation_tags(
|
||||
@@ -442,37 +428,3 @@ def register_xpath_extensions():
|
||||
ns.prefix = 'mg'
|
||||
ns['text_content'] = text_content
|
||||
ns['tail'] = tail
|
||||
|
||||
|
||||
def _restore_CRLF(s, replaced=True):
|
||||
"""Restore CRLF if previously CRLF was replaced with LF
|
||||
|
||||
>>> _restore_CRLF('a\nb')
|
||||
'a\r\nb'
|
||||
>>> _restore_CRLF('a\nb', replaced=False)
|
||||
'a\nb'
|
||||
"""
|
||||
if replaced:
|
||||
return s.replace('\n', '\r\n')
|
||||
return s
|
||||
|
||||
|
||||
def _CRLF_to_LF(s):
|
||||
"""Replace CRLF with LF
|
||||
|
||||
>>> s, changed = _CRLF_to_LF('a\r\n'b)
|
||||
>>> s
|
||||
'a\nb'
|
||||
>>> changed
|
||||
True
|
||||
|
||||
>>> s, changed = _CRLF_to_LF('a\n'b)
|
||||
>>> s
|
||||
'a\nb'
|
||||
>>> changed
|
||||
False
|
||||
"""
|
||||
delimiter = get_delimiter(s)
|
||||
if delimiter == '\r\n':
|
||||
return s.replace(delimiter, '\n'), True
|
||||
return s, False
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
|
||||
|
||||
rc = re.compile
|
||||
|
||||
RE_EMAIL = rc('\S@\S')
|
||||
RE_EMAIL = rc('@')
|
||||
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
|
||||
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
|
||||
|
||||
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
|
||||
names = names or sender
|
||||
if names != '':
|
||||
return binary_regex_search(re.compile(names))
|
||||
return lambda s: 0
|
||||
return lambda s: False
|
||||
|
||||
|
||||
def extract_names(sender):
|
||||
@@ -134,7 +134,7 @@ def extract_names(sender):
|
||||
>>> extract_names('')
|
||||
[]
|
||||
"""
|
||||
sender = to_unicode(sender, precise=True)
|
||||
sender = to_unicode(sender)
|
||||
# Remove non-alphabetical characters
|
||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||
# Remove too short words and words from "black" list i.e.
|
||||
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
|
||||
50.0
|
||||
'''
|
||||
count = 0
|
||||
s = to_unicode(s, precise=True)
|
||||
s = to_unicode(s)
|
||||
for c in s:
|
||||
if unicodedata.category(c) in categories:
|
||||
count += 1
|
||||
@@ -181,7 +181,7 @@ def punctuation_percent(s):
|
||||
|
||||
def capitalized_words_percent(s):
|
||||
'''Returns capitalized words percent.'''
|
||||
s = to_unicode(s, precise=True)
|
||||
s = to_unicode(s)
|
||||
words = re.split('\s', s)
|
||||
words = [w for w in words if w.strip()]
|
||||
capitalized_words_counter = 0
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
|
||||
import logging
|
||||
from random import shuffle
|
||||
import chardet
|
||||
import cchardet
|
||||
|
||||
from talon.constants import RE_DELIMITER
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def safe_format(format_string, *args, **kwargs):
|
||||
"""
|
||||
Helper: formats string with any combination of bytestrings/unicode
|
||||
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
|
||||
u'привет'
|
||||
If `precise` flag is True, tries to guess the correct encoding first.
|
||||
"""
|
||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
if isinstance(str_or_unicode, str):
|
||||
return unicode(str_or_unicode, encoding, 'replace')
|
||||
return str_or_unicode
|
||||
|
||||
|
||||
def detect_encoding(string):
|
||||
"""
|
||||
Tries to detect the encoding of the passed string.
|
||||
|
||||
Defaults to UTF-8.
|
||||
"""
|
||||
try:
|
||||
detected = chardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or 'utf-8'
|
||||
except Exception, e:
|
||||
print 11111111111, e
|
||||
pass
|
||||
return 'utf-8'
|
||||
|
||||
|
||||
def quick_detect_encoding(string):
|
||||
"""
|
||||
Tries to detect the encoding of the passed string.
|
||||
|
||||
Uses cchardet. Fallbacks to detect_encoding.
|
||||
"""
|
||||
try:
|
||||
detected = cchardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or detect_encoding(string)
|
||||
except Exception, e:
|
||||
print 222222222222, e
|
||||
pass
|
||||
return detect_encoding(string)
|
||||
|
||||
|
||||
def to_utf8(str_or_unicode):
|
||||
"""
|
||||
Safely returns a UTF-8 version of a given string
|
||||
|
||||
1
tests/fixtures/html_replies/hotmail.html
vendored
1
tests/fixtures/html_replies/hotmail.html
vendored
@@ -1,4 +1,3 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<html>
|
||||
<head>
|
||||
<style><!--
|
||||
|
||||
19
tests/fixtures/standard_replies/apple_mail_2.eml
vendored
19
tests/fixtures/standard_replies/apple_mail_2.eml
vendored
@@ -1,19 +0,0 @@
|
||||
Content-Type: text/plain;
|
||||
charset=us-ascii
|
||||
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
|
||||
Subject: Re: Hello there
|
||||
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
|
||||
From: Adam Renberg <adam@tictail.com>
|
||||
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
|
||||
Date: Sat, 22 Aug 2015 19:22:20 +0200
|
||||
Content-Transfer-Encoding: 7bit
|
||||
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
|
||||
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
|
||||
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
|
||||
To: Adam Renberg <tgwizard@gmail.com>
|
||||
|
||||
Hello
|
||||
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
|
||||
>
|
||||
> Hi there!
|
||||
|
||||
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
|
||||
|
||||
</blockquote>"""
|
||||
|
||||
eq_("<html><body><p>Reply\n</p></body></html>",
|
||||
quotations.extract_from_html(msg_body))
|
||||
eq_("<html><body><p>Reply</p></body></html>",
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_quotation_splitter_outside_blockquote():
|
||||
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_regular_blockquote():
|
||||
msg_body = """Reply
|
||||
<blockquote>Regular</blockquote>
|
||||
|
||||
<div>
|
||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
||||
</div>
|
||||
|
||||
<blockquote>
|
||||
<div>
|
||||
<blockquote>Nested</blockquote>
|
||||
</div>
|
||||
</blockquote>
|
||||
"""
|
||||
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_no_blockquote():
|
||||
msg_body = """
|
||||
<html>
|
||||
@@ -264,7 +246,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
|
||||
def extract_reply_and_check(filename):
|
||||
f = open(filename)
|
||||
|
||||
msg_body = f.read()
|
||||
msg_body = f.read().decode("utf-8")
|
||||
reply = quotations.extract_from_html(msg_body)
|
||||
|
||||
h = html2text.HTML2Text()
|
||||
@@ -310,25 +292,3 @@ def test_windows_mail_reply():
|
||||
|
||||
def test_yandex_ru_reply():
|
||||
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
|
||||
|
||||
|
||||
def test_CRLF():
|
||||
"""CR is not converted to ' '
|
||||
"""
|
||||
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
|
||||
|
||||
msg_body = """Reply
|
||||
<blockquote>
|
||||
|
||||
<div>
|
||||
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
|
||||
</div>
|
||||
|
||||
<div>
|
||||
Test
|
||||
</div>
|
||||
|
||||
</blockquote>"""
|
||||
msg_body = msg_body.replace('\n', '\r\n')
|
||||
eq_("<html><body><p>Reply\r\n</p></body></html>",
|
||||
quotations.extract_from_html(msg_body))
|
||||
|
||||
@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():
|
||||
|
||||
def test_empty_body():
|
||||
eq_('', quotations.extract_from_plain(''))
|
||||
|
||||
|
||||
def test__CRLF_to_LF():
|
||||
eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
|
||||
eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
|
||||
|
||||
|
||||
def test__restore_CRLF():
|
||||
eq_('\n', quotations._restore_CRLF('\n', replaced=False))
|
||||
eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))
|
||||
# default
|
||||
eq_('\r\n', quotations._restore_CRLF('\n'))
|
||||
|
||||
@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs
|
||||
|
||||
|
||||
def test_apply_features():
|
||||
s = '''This is John Doe
|
||||
|
||||
Tuesday @3pm suits. I'll chat to you then.
|
||||
s = '''John Doe
|
||||
|
||||
VP Research and Development, Xxxx Xxxx Xxxxx
|
||||
|
||||
@@ -21,12 +19,11 @@ john@example.com'''
|
||||
# note that we don't consider the first line because signatures don't
|
||||
# usually take all the text, empty lines are not considered
|
||||
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
|
||||
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
|
||||
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
|
||||
|
||||
with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
|
||||
with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
|
||||
features = fs.features(sender)
|
||||
new_result = fs.apply_features(s, features)
|
||||
# result remains the same because we don't consider empty lines
|
||||
|
||||
@@ -12,11 +12,11 @@ from talon import quotations
|
||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||
def test_too_many_lines():
|
||||
msg_body = """Test reply
|
||||
Hi
|
||||
|
||||
-----Original Message-----
|
||||
|
||||
Test"""
|
||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||
eq_(msg_body, quotations.extract_from_plain(msg_body))
|
||||
|
||||
|
||||
def test_pattern_on_date_somebody_wrote():
|
||||
@@ -311,33 +311,6 @@ Emne: The manager has commented on your Loop
|
||||
Blah-blah-blah
|
||||
"""))
|
||||
|
||||
def test_swedish_from_block():
|
||||
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
|
||||
u"""Allo! Follow up MIME!
|
||||
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
|
||||
Skickat: den 26 augusti 2015 14:45
|
||||
Till: Isacson Leiff
|
||||
Ämne: RE: Week 36
|
||||
|
||||
Blah-blah-blah
|
||||
"""))
|
||||
|
||||
def test_swedish_from_line():
|
||||
eq_('Lorem', quotations.extract_from_plain(
|
||||
"""Lorem
|
||||
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
|
||||
|
||||
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
||||
"""))
|
||||
|
||||
def test_norwegian_from_line():
|
||||
eq_('Lorem', quotations.extract_from_plain(
|
||||
u"""Lorem
|
||||
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
|
||||
|
||||
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
|
||||
"""))
|
||||
|
||||
def test_dutch_from_block():
|
||||
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
|
||||
"""Gluten-free culpa lo-fi et nesciunt nostrud.
|
||||
|
||||
@@ -1,60 +1,9 @@
|
||||
# coding:utf-8
|
||||
|
||||
from . import *
|
||||
|
||||
from talon import utils as u
|
||||
import cchardet
|
||||
from talon import utils
|
||||
|
||||
|
||||
def test_get_delimiter():
|
||||
eq_('\r\n', u.get_delimiter('abc\r\n123'))
|
||||
eq_('\n', u.get_delimiter('abc\n123'))
|
||||
eq_('\n', u.get_delimiter('abc'))
|
||||
|
||||
|
||||
def test_unicode():
|
||||
eq_ (u'hi', u.to_unicode('hi'))
|
||||
eq_ (type(u.to_unicode('hi')), unicode )
|
||||
eq_ (type(u.to_unicode(u'hi')), unicode )
|
||||
eq_ (type(u.to_unicode('привет')), unicode )
|
||||
eq_ (type(u.to_unicode(u'привет')), unicode )
|
||||
eq_ (u"привет", u.to_unicode('привет'))
|
||||
eq_ (u"привет", u.to_unicode(u'привет'))
|
||||
# some latin1 stuff
|
||||
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
|
||||
|
||||
|
||||
def test_detect_encoding():
|
||||
eq_ ('ascii', u.detect_encoding('qwe').lower())
|
||||
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
|
||||
eq_ ('utf-8', u.detect_encoding('привет').lower())
|
||||
# fallback to utf-8
|
||||
with patch.object(u.chardet, 'detect') as detect:
|
||||
detect.side_effect = Exception
|
||||
eq_ ('utf-8', u.detect_encoding('qwe').lower())
|
||||
|
||||
|
||||
def test_quick_detect_encoding():
|
||||
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
|
||||
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
|
||||
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
|
||||
|
||||
|
||||
@patch.object(cchardet, 'detect')
|
||||
@patch.object(u, 'detect_encoding')
|
||||
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
|
||||
cchardet_detect.return_value = {'encoding': 'ascii'}
|
||||
eq_('ascii', u.quick_detect_encoding("qwe"))
|
||||
cchardet_detect.assert_called_once_with("qwe")
|
||||
|
||||
# fallback to detect_encoding
|
||||
cchardet_detect.return_value = {}
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
||||
|
||||
# exception
|
||||
detect_encoding.reset_mock()
|
||||
cchardet_detect.side_effect = Exception()
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding("qwe"))
|
||||
ok_(detect_encoding.called)
|
||||
eq_('\r\n', utils.get_delimiter('abc\r\n123'))
|
||||
eq_('\n', utils.get_delimiter('abc\n123'))
|
||||
eq_('\n', utils.get_delimiter('abc'))
|
||||
|
||||
Reference in New Issue
Block a user