From 15976888a03cd36fce33a8cda196acfc19f7cfe2 Mon Sep 17 00:00:00 2001
From: Sergey Obukhov
Date: Fri, 11 Sep 2015 10:38:28 -0700
Subject: [PATCH] use precise encoding when converting to unicode

---
Review note: the leftover debug `print` statements in detect_encoding and
quick_detect_encoding are replaced with log.debug(..., exc_info=True), and
the module-level logger in talon/utils.py is therefore kept instead of
removed. The post-image blob id on the talon/utils.py index line predates
this edit.

 setup.py                            |  2 +
 talon/signature/learning/helpers.py |  6 +--
 talon/utils.py                      | 36 +++++++++++++++++-
 tests/utils_test.py                 | 59 +++++++++++++++++++++++++++--
 4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/setup.py b/setup.py
index 1d00952..52ca913 100755
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,8 @@ setup(name='talon',
           "numpy",
           "scipy",
           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+          'chardet>=1.0.1',
+          'cchardet>=0.3.5',
           ],
       tests_require=[
           "mock",
diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py
index 51a9227..953662b 100644
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -134,7 +134,7 @@ def extract_names(sender):
     >>> extract_names('')
     []
     """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
     # Remove non-alphabetical characters
     sender = "".join([char if char.isalpha() else ' ' for char in sender])
     # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
     50.0
     '''
     count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     for c in s:
         if unicodedata.category(c) in categories:
             count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
 
 def capitalized_words_percent(s):
     '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     words = re.split('\s', s)
     words = [w for w in words if w.strip()]
     capitalized_words_counter = 0
diff --git a/talon/utils.py b/talon/utils.py
index d1bf103..2092d8e 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,6 +2,8 @@
 
 import logging
 from random import shuffle
+import chardet
+import cchardet
 
 from talon.constants import RE_DELIMITER
 
@@ -42,12 +44,44 @@ def to_unicode(str_or_unicode, precise=False):
         u'привет'
     If `precise` flag is True, tries to guess the correct encoding first.
     """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
     if isinstance(str_or_unicode, str):
         return unicode(str_or_unicode, encoding, 'replace')
     return str_or_unicode
 
 
+def detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Defaults to UTF-8.
+    """
+    try:
+        detected = chardet.detect(string)
+        if detected:
+            return detected.get('encoding') or 'utf-8'
+    except Exception:
+        # detection is best effort: log the failure and default to UTF-8
+        log.debug('chardet failed to detect encoding', exc_info=True)
+    return 'utf-8'
+
+
+def quick_detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Uses cchardet. Falls back to detect_encoding.
+    """
+    try:
+        detected = cchardet.detect(string)
+        if detected:
+            return detected.get('encoding') or detect_encoding(string)
+    except Exception:
+        # cchardet is only the fast path: log and defer to detect_encoding
+        log.debug('cchardet failed to detect encoding', exc_info=True)
+    return detect_encoding(string)
+
+
 def to_utf8(str_or_unicode):
     """
     Safely returns a UTF-8 version of a given string
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 79f09b5..519efe1 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,9 +1,60 @@
+# coding:utf-8
+
 from . import *
 
-from talon import utils
+from talon import utils as u
+import cchardet
 
 
 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc'))
+
+
+def test_unicode():
+    eq_ (u'hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), unicode )
+    eq_ (type(u.to_unicode(u'hi')), unicode )
+    eq_ (type(u.to_unicode('привет')), unicode )
+    eq_ (type(u.to_unicode(u'привет')), unicode )
+    eq_ (u"привет", u.to_unicode('привет'))
+    eq_ (u"привет", u.to_unicode(u'привет'))
+    # some latin1 stuff
+    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
+
+
+def test_detect_encoding():
+    eq_ ('ascii', u.detect_encoding('qwe').lower())
+    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.detect_encoding('привет').lower())
+    # fallback to utf-8
+    with patch.object(u.chardet, 'detect') as detect:
+        detect.side_effect = Exception
+        eq_ ('utf-8', u.detect_encoding('qwe').lower())
+
+
+def test_quick_detect_encoding():
+    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
+    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
+
+
+@patch.object(cchardet, 'detect')
+@patch.object(u, 'detect_encoding')
+def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
+    cchardet_detect.return_value = {'encoding': 'ascii'}
+    eq_('ascii', u.quick_detect_encoding("qwe"))
+    cchardet_detect.assert_called_once_with("qwe")
+
+    # fallback to detect_encoding
+    cchardet_detect.return_value = {}
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+
+    # exception
+    detect_encoding.reset_mock()
+    cchardet_detect.side_effect = Exception()
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+    ok_(detect_encoding.called)