Updated talon for Python 3

This commit is contained in:
Yacine Filali
2017-05-23 15:39:50 -07:00
parent f16ae5110b
commit 086f5ba43b
10 changed files with 75 additions and 61 deletions

View File

@@ -1,13 +1,12 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
import regex as re
from talon import quotations, utils as u
from . import *
from .fixtures import *
# Raw string literals are used because "\s" in a normal string literal is an
# invalid escape sequence, which Python 3 reports with a warning at compile
# time. The compiled pattern is unchanged: a single whitespace character.
RE_WHITESPACE = re.compile(r"\s")
# NOTE(review): despite its name this is the same single-whitespace pattern
# as RE_WHITESPACE; it is kept as-is so matching behavior does not change.
RE_DOUBLE_WHITESPACE = re.compile(r"\s")
@@ -303,7 +302,7 @@ Reply
def extract_reply_and_check(filename):
f = open(filename)
f = open(filename, encoding='utf8')
msg_body = f.read()
reply = quotations.extract_from_html(msg_body)
@@ -373,7 +372,7 @@ reply
</blockquote>"""
msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
assert_false(symbol in extracted)
# Keep new lines otherwise "My reply" becomes one word - "Myreply"
eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)

View File

@@ -31,7 +31,7 @@ def test_messages_longer_SIGNATURE_MAX_LINES():
sender, body = dataset.parse_msg_sender(filename)
text, extracted_signature = signature.extract(body, sender)
extracted_signature = extracted_signature or ''
with open(filename[:-len('body')] + 'signature') as ms:
with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms:
msg_signature = ms.read()
eq_(msg_signature.strip(), extracted_signature.strip())
stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)]

View File

@@ -1,12 +1,12 @@
# coding:utf-8
from __future__ import absolute_import
from . import *
from talon import utils as u
import cchardet
import six
from lxml import html
from talon import utils as u
from . import *
def test_get_delimiter():
@@ -16,35 +16,35 @@ def test_get_delimiter():
# Verifies u.to_unicode(): the result type is six.text_type for both plain and
# u-prefixed literals (ASCII and Cyrillic), and the text itself is unchanged.
# NOTE(review): this span is a diff hunk rendered without +/- markers; every
# assertion appears twice, presumably the pre-change `eq_ (...)` spelling
# followed by the post-change `eq_(...)` spelling - confirm against the commit.
def test_unicode():
eq_ (u'hi', u.to_unicode('hi'))
eq_ (type(u.to_unicode('hi')), six.text_type )
eq_ (type(u.to_unicode(u'hi')), six.text_type )
eq_ (type(u.to_unicode('привет')), six.text_type )
eq_ (type(u.to_unicode(u'привет')), six.text_type )
eq_ (u"привет", u.to_unicode('привет'))
eq_ (u"привет", u.to_unicode(u'привет'))
eq_(u'hi', u.to_unicode('hi'))
eq_(type(u.to_unicode('hi')), six.text_type)
eq_(type(u.to_unicode(u'hi')), six.text_type)
eq_(type(u.to_unicode('привет')), six.text_type)
eq_(type(u.to_unicode(u'привет')), six.text_type)
eq_(u"привет", u.to_unicode('привет'))
eq_(u"привет", u.to_unicode(u'привет'))
# some latin1 stuff
# Bytes produced with the iso-8859-2 codec are expected to come back as
# u"Versión" when precise=True is passed.
eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
# Verifies u.detect_encoding(): b'qwe' is reported as ascii, "Versión" encoded
# with iso-8859-2 as either iso-8859-1 or iso-8859-2, and UTF-8 encoded
# Cyrillic as utf-8. All comparisons are case-insensitive via .lower().
# NOTE(review): this span is a diff hunk rendered without +/- markers; the
# `eq_ (`/`ok_ (` lines are presumably the pre-change spelling and the
# `eq_(`/`ok_(` lines the post-change spelling - confirm against the commit.
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
ok_ (u.detect_encoding(
eq_('ascii', u.detect_encoding(b'qwe').lower())
ok_(u.detect_encoding(
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8
# u.chardet.detect is patched to raise, and the result is still expected to
# be utf-8.
with patch.object(u.chardet, 'detect') as detect:
detect.side_effect = Exception
eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
# Verifies u.quick_detect_encoding(): b'qwe' is reported as ascii, "Versión"
# encoded with windows-1252 as either windows-1252 or windows-1250, and UTF-8
# encoded Cyrillic as utf-8. All comparisons are case-insensitive via .lower().
# NOTE(review): this span is a diff hunk rendered without +/- markers; the
# `eq_ (`/`ok_ (` lines are presumably the pre-change spelling and the
# `eq_(`/`ok_(` lines the post-change spelling - confirm against the commit.
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
ok_ (u.quick_detect_encoding(
eq_('ascii', u.quick_detect_encoding(b'qwe').lower())
ok_(u.quick_detect_encoding(
u'Versi\xf3n'.encode('windows-1252')).lower() in [
'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
@patch.object(cchardet, 'detect')
@@ -84,7 +84,7 @@ Haha
eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
html = '<body><br/><br/>Hi</body>'
eq_ (b'Hi', u.html_to_text(html))
eq_(b'Hi', u.html_to_text(html))
html = """Hi
<style type="text/css">
@@ -104,7 +104,7 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
eq_ (b'Hi', u.html_to_text(html))
eq_(b'Hi', u.html_to_text(html))
html = """<div>
<!-- COMMENT 1 -->
@@ -115,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
# Input starts with an HTML comment that has no enclosing element; the text
# extracted by u.html_tree_to_text() from the parsed document must be just
# "no comment".
# NOTE(review): `s` and the final assertion each appear in a str and a bytes
# spelling; in this marker-less diff hunk the str lines are presumably the
# pre-change version and the bytes lines the post-change version - confirm.
def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment"
s = b'<!-- COMMENT 1 --> no comment'
d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))
eq_(b"no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
def test_html_fromstring_exception():
    """u.html_fromstring() yields None when the html5 parser raises."""
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
@@ -158,5 +159,5 @@ def test_html_too_big():
# Runs u.html_to_text() with u._MAX_TAGS_COUNT patched to 3: the single-<div>
# document yields its text, while the <div><span> document yields None.
# NOTE(review): the None result presumably comes from the patched tag limit
# being exceeded - verify against u.html_to_text.
# NOTE(review): the "Hello" assertion appears in a str and a bytes spelling;
# in this marker-less diff hunk these are presumably the pre-change and
# post-change versions of the same line - confirm against the commit.
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
eq_("Hello", u.html_to_text("<div>Hello</div>"))
eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))