Merge pull request #57 from mailgun/sergey/to_unicode

use precise encoding when converting to unicode
2015-09-11 10:40:52 -07:00
parent 9bee502903 15976888a0
commit 176c7e7532
4 changed files with 95 additions and 11 deletions
@@ -20,6 +20,8 @@ setup(name='talon',
          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
          ],
      tests_require=[
          "mock",
@@ -134,7 +134,7 @@ def extract_names(sender):
    >>> extract_names('')
    []
    """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
    50.0
    '''
    count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
 def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
@@ -2,13 +2,12 @@
 import logging
 from random import shuffle
 import chardet
 import cchardet
 from talon.constants import RE_DELIMITER
 log = logging.getLogger(__name__)
 def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode
 def detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Asks `chardet.detect` for a guess and returns the reported encoding
    name. Defaults to UTF-8 when chardet returns nothing, reports no
    encoding, or raises.
    """
    try:
        detected = chardet.detect(string)
        if detected:
            return detected.get('encoding') or 'utf-8'
    except Exception:
        # Detection is best-effort: report the failure through the module
        # logger (not stdout) and fall through to the default below.
        log.warning('chardet failed to detect encoding, using utf-8',
                    exc_info=True)
    return 'utf-8'
 def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet first and returns the encoding name it reports. Falls
    back to detect_encoding when cchardet returns nothing, reports no
    encoding, or raises.
    """
    try:
        detected = cchardet.detect(string)
        if detected:
            return detected.get('encoding') or detect_encoding(string)
    except Exception:
        # Detection is best-effort: report the failure through the module
        # logger (not stdout) and fall through to the fallback below.
        log.warning('cchardet failed to detect encoding, '
                    'falling back to chardet', exc_info=True)
    return detect_encoding(string)
 def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
@@ -1,9 +1,60 @@
 # coding:utf-8
 from . import *
-from talon import utils
+from talon import utils as u
 import cchardet
 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\n', u.get_delimiter('abc'))
 def test_unicode():
    """to_unicode returns unicode for both bytestring and unicode input."""
    # plain ASCII bytestring converts to the equal unicode string
    eq_(u'hi', u.to_unicode('hi'))

    # the result type is unicode whatever the input type was
    for value in ('hi', u'hi', 'привет', u'привет'):
        eq_(type(u.to_unicode(value)), unicode)

    # non-ASCII text comes back intact with the default (non-precise) mode
    for value in ('привет', u'привет'):
        eq_(u"привет", u.to_unicode(value))

    # some latin1 stuff
    eq_(u"Versión", u.to_unicode('Versi\xf3n', precise=True))
 def test_detect_encoding():
    """detect_encoding reports chardet's guess and defaults to utf-8."""
    expectations = (
        ('ascii', 'qwe'),
        ('iso-8859-2', 'Versi\xf3n'),
        ('utf-8', 'привет'),
    )
    for expected, raw in expectations:
        eq_(expected, u.detect_encoding(raw).lower())

    # fallback to utf-8
    with patch.object(u.chardet, 'detect') as detect:
        detect.side_effect = Exception
        eq_('utf-8', u.detect_encoding('qwe').lower())
 def test_quick_detect_encoding():
    """quick_detect_encoding reports the expected name for each sample."""
    expectations = (
        ('ascii', 'qwe'),
        ('windows-1252', 'Versi\xf3n'),
        ('utf-8', 'привет'),
    )
    for expected, raw in expectations:
        eq_(expected, u.quick_detect_encoding(raw).lower())
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
    """Exercise quick_detect_encoding with both detectors mocked out."""
    sample = "qwe"

    # cchardet reports an encoding: it is returned as is
    cchardet_detect.return_value = {'encoding': 'ascii'}
    eq_('ascii', u.quick_detect_encoding(sample))
    cchardet_detect.assert_called_once_with(sample)

    # fallback to detect_encoding
    detect_encoding.return_value = 'utf-8'
    cchardet_detect.return_value = {}
    eq_('utf-8', u.quick_detect_encoding(sample))

    # exception
    detect_encoding.reset_mock()
    detect_encoding.return_value = 'utf-8'
    cchardet_detect.side_effect = Exception()
    eq_('utf-8', u.quick_detect_encoding(sample))
    ok_(detect_encoding.called)