Operate on unicode data exclusively
This commit is contained in:
@@ -4,14 +4,17 @@ from __future__ import absolute_import
|
||||
|
||||
# noinspection PyUnresolvedReferences
|
||||
import re
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from nose.tools import assert_false, assert_true, eq_, ok_
|
||||
|
||||
from tests.fixtures import (OLK_SRC_BODY_SECTION,
|
||||
REPLY_QUOTATIONS_SHARE_BLOCK,
|
||||
REPLY_SEPARATED_BY_HR)
|
||||
from talon import quotations, utils as u
|
||||
from . import *
|
||||
from .fixtures import *
|
||||
from lxml import html
|
||||
|
||||
RE_WHITESPACE = re.compile("\s")
|
||||
RE_DOUBLE_WHITESPACE = re.compile("\s")
|
||||
RE_WHITESPACE = re.compile(r"\s")
|
||||
RE_DOUBLE_WHITESPACE = re.compile(r"\s")
|
||||
|
||||
|
||||
def test_quotation_splitter_inside_blockquote():
|
||||
@@ -166,7 +169,7 @@ def test_unicode_in_reply():
|
||||
|
||||
<blockquote>
|
||||
Quote
|
||||
</blockquote>""".encode("utf-8")
|
||||
</blockquote>"""
|
||||
|
||||
eq_("<html><head></head><body>Reply  Text<br><div><br></div>"
|
||||
"</body></html>",
|
||||
@@ -314,7 +317,6 @@ def extract_reply_and_check(filename):
|
||||
msg_body = f.read()
|
||||
reply = quotations.extract_from_html(msg_body)
|
||||
plain_reply = u.html_to_text(reply)
|
||||
plain_reply = plain_reply.decode('utf8')
|
||||
|
||||
eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
|
||||
RE_WHITESPACE.sub('', plain_reply))
|
||||
|
||||
@@ -2,9 +2,6 @@
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import cchardet
|
||||
import six
|
||||
|
||||
from talon import utils as u
|
||||
from . import *
|
||||
|
||||
@@ -15,58 +12,6 @@ def test_get_delimiter():
|
||||
eq_('\n', u.get_delimiter('abc'))
|
||||
|
||||
|
||||
def test_unicode():
|
||||
eq_(u'hi', u.to_unicode('hi'))
|
||||
eq_(type(u.to_unicode('hi')), six.text_type)
|
||||
eq_(type(u.to_unicode(u'hi')), six.text_type)
|
||||
eq_(type(u.to_unicode('привет')), six.text_type)
|
||||
eq_(type(u.to_unicode(u'привет')), six.text_type)
|
||||
eq_(u"привет", u.to_unicode('привет'))
|
||||
eq_(u"привет", u.to_unicode(u'привет'))
|
||||
# some latin1 stuff
|
||||
eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
|
||||
|
||||
|
||||
def test_detect_encoding():
|
||||
eq_('ascii', u.detect_encoding(b'qwe').lower())
|
||||
ok_(u.detect_encoding(
|
||||
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
|
||||
'iso-8859-1', 'iso-8859-2'])
|
||||
eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
|
||||
# fallback to utf-8
|
||||
with patch.object(u.chardet, 'detect') as detect:
|
||||
detect.side_effect = Exception
|
||||
eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
|
||||
|
||||
|
||||
def test_quick_detect_encoding():
|
||||
eq_('ascii', u.quick_detect_encoding(b'qwe').lower())
|
||||
ok_(u.quick_detect_encoding(
|
||||
u'Versi\xf3n'.encode('windows-1252')).lower() in [
|
||||
'windows-1252', 'windows-1250'])
|
||||
eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
|
||||
|
||||
|
||||
@patch.object(cchardet, 'detect')
|
||||
@patch.object(u, 'detect_encoding')
|
||||
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
|
||||
cchardet_detect.return_value = {'encoding': 'ascii'}
|
||||
eq_('ascii', u.quick_detect_encoding(b"qwe"))
|
||||
cchardet_detect.assert_called_once_with(b"qwe")
|
||||
|
||||
# fallback to detect_encoding
|
||||
cchardet_detect.return_value = {}
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding(b"qwe"))
|
||||
|
||||
# exception
|
||||
detect_encoding.reset_mock()
|
||||
cchardet_detect.side_effect = Exception()
|
||||
detect_encoding.return_value = 'utf-8'
|
||||
eq_('utf-8', u.quick_detect_encoding(b"qwe"))
|
||||
ok_(detect_encoding.called)
|
||||
|
||||
|
||||
def test_html_to_text():
|
||||
html = """<body>
|
||||
<p>Hello world!</p>
|
||||
@@ -80,11 +25,11 @@ Haha
|
||||
</p>
|
||||
</body>"""
|
||||
text = u.html_to_text(html)
|
||||
eq_(b"Hello world! \n\n * One! \n * Two \nHaha", text)
|
||||
eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
|
||||
eq_("Hello world! \n\n * One! \n * Two \nHaha", text)
|
||||
eq_(u"привет!", u.html_to_text("<b>привет!</b>"))
|
||||
|
||||
html = '<body><br/><br/>Hi</body>'
|
||||
eq_(b'Hi', u.html_to_text(html))
|
||||
eq_('Hi', u.html_to_text(html))
|
||||
|
||||
html = """Hi
|
||||
<style type="text/css">
|
||||
@@ -104,31 +49,20 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
|
||||
|
||||
}
|
||||
</style>"""
|
||||
eq_(b'Hi', u.html_to_text(html))
|
||||
eq_('Hi', u.html_to_text(html))
|
||||
|
||||
html = """<div>
|
||||
<!-- COMMENT 1 -->
|
||||
<span>TEXT 1</span>
|
||||
<p>TEXT 2 <!-- COMMENT 2 --></p>
|
||||
</div>"""
|
||||
eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
|
||||
eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))
|
||||
|
||||
|
||||
def test_comment_no_parent():
|
||||
s = b'<!-- COMMENT 1 --> no comment'
|
||||
s = '<!-- COMMENT 1 --> no comment'
|
||||
d = u.html_document_fromstring(s)
|
||||
eq_(b"no comment", u.html_tree_to_text(d))
|
||||
|
||||
|
||||
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
|
||||
def test_html_fromstring_exception():
|
||||
eq_(None, u.html_fromstring("<html></html>"))
|
||||
|
||||
|
||||
@patch.object(u.html5parser, 'document_fromstring')
|
||||
def test_html_document_fromstring_exception(document_fromstring):
|
||||
document_fromstring.side_effect = Exception()
|
||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||
eq_("no comment", u.html_tree_to_text(d))
|
||||
|
||||
|
||||
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||
|
||||
Reference in New Issue
Block a user