diff --git a/talon/utils.py b/talon/utils.py index 0e50c8f..ebabdf1 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -59,6 +59,7 @@ def detect_encoding(string): Defaults to UTF-8. """ + assert isinstance(string, bytes) try: detected = chardet.detect(string) if detected: @@ -74,6 +75,7 @@ def quick_detect_encoding(string): Uses cchardet. Fallbacks to detect_encoding. """ + assert isinstance(string, bytes) try: detected = cchardet.detect(string) if detected: diff --git a/tests/utils_test.py b/tests/utils_test.py index 472e498..37f4d7b 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -23,42 +23,42 @@ def test_unicode(): eq_ (u"привет", u.to_unicode('привет')) eq_ (u"привет", u.to_unicode(u'привет')) # some latin1 stuff - eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True)) + eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) def test_detect_encoding(): - eq_ ('ascii', u.detect_encoding('qwe').lower()) - eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower()) - eq_ ('utf-8', u.detect_encoding('привет').lower()) + eq_ ('ascii', u.detect_encoding(b'qwe').lower()) + eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) + eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) # fallback to utf-8 with patch.object(u.chardet, 'detect') as detect: detect.side_effect = Exception - eq_ ('utf-8', u.detect_encoding('qwe').lower()) + eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) def test_quick_detect_encoding(): - eq_ ('ascii', u.quick_detect_encoding('qwe').lower()) - eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower()) - eq_ ('utf-8', u.quick_detect_encoding('привет').lower()) + eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) + eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) + eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) @patch.object(cchardet, 'detect') @patch.object(u, 'detect_encoding') def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): cchardet_detect.return_value = {'encoding': 'ascii'} - eq_('ascii', u.quick_detect_encoding("qwe")) - cchardet_detect.assert_called_once_with("qwe") + eq_('ascii', u.quick_detect_encoding(b"qwe")) + cchardet_detect.assert_called_once_with(b"qwe") # fallback to detect_encoding cchardet_detect.return_value = {} detect_encoding.return_value = 'utf-8' - eq_('utf-8', u.quick_detect_encoding("qwe")) + eq_('utf-8', u.quick_detect_encoding(b"qwe")) # exception detect_encoding.reset_mock() cchardet_detect.side_effect = Exception() detect_encoding.return_value = 'utf-8' - eq_('utf-8', u.quick_detect_encoding("qwe")) + eq_('utf-8', u.quick_detect_encoding(b"qwe")) ok_(detect_encoding.called) @@ -75,11 +75,11 @@ Haha

""" text = u.html_to_text(html) - eq_("Hello world! \n\n * One! \n * Two \nHaha", text) - eq_("привет!", u.html_to_text("привет!")) + eq_(b"Hello world! \n\n * One! \n * Two \nHaha", text) + eq_(u"привет!", u.html_to_text("привет!").decode('utf8')) html = '

Hi' - eq_ ('Hi', u.html_to_text(html)) + eq_ (b'Hi', u.html_to_text(html)) html = """Hi """ - eq_ ('Hi', u.html_to_text(html)) + eq_ (b'Hi', u.html_to_text(html)) html = """
TEXT 1

TEXT 2

""" - eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) + eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))