From 15976888a03cd36fce33a8cda196acfc19f7cfe2 Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Fri, 11 Sep 2015 10:38:28 -0700
Subject: [PATCH] use precise encoding when converting to unicode

---
 setup.py                            |  2 +
 talon/signature/learning/helpers.py |  6 +--
 talon/utils.py                      | 39 +++++++++++++++++--
 tests/utils_test.py                 | 59 +++++++++++++++++++++++++++--
 4 files changed, 95 insertions(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index 1d00952..52ca913 100755
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,8 @@ setup(name='talon',
           "numpy",
           "scipy",
           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+          'chardet>=1.0.1',
+          'cchardet>=0.3.5',
           ],
       tests_require=[
           "mock",
diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py
index 51a9227..953662b 100644
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -134,7 +134,7 @@ def extract_names(sender):
     >>> extract_names('')
     []
     """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
     # Remove non-alphabetical characters
     sender = "".join([char if char.isalpha() else ' ' for char in sender])
     # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
     50.0
     '''
     count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     for c in s:
         if unicodedata.category(c) in categories:
             count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
 
 def capitalized_words_percent(s):
     '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     words = re.split('\s', s)
     words = [w for w in words if w.strip()]
     capitalized_words_counter = 0
diff --git a/talon/utils.py b/talon/utils.py
index d1bf103..2092d8e 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,13 +2,12 @@
 
 import logging
 from random import shuffle
+import chardet
+import cchardet
 
 from talon.constants import RE_DELIMITER
 
 
-log = logging.getLogger(__name__)
-
-
 def safe_format(format_string, *args, **kwargs):
     """
     Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
         u'привет'
     If `precise` flag is True, tries to guess the correct encoding first.
     """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
     if isinstance(str_or_unicode, str):
         return unicode(str_or_unicode, encoding, 'replace')
     return str_or_unicode
 
 
+def detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed bytestring using chardet.
+
+    Defaults to UTF-8 if chardet fails or cannot name an encoding.
+    """
+    try:
+        detected = chardet.detect(string)
+        if detected:
+            return detected.get('encoding') or 'utf-8'
+    except Exception:
+        # best-effort detection: note the failure, fall back to the default
+        logging.getLogger(__name__).debug('chardet failed', exc_info=True)
+    return 'utf-8'
+
+
+def quick_detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed bytestring using cchardet.
+
+    Falls back to detect_encoding if cchardet fails or names no encoding.
+    """
+    try:
+        detected = cchardet.detect(string)
+        if detected:
+            return detected.get('encoding') or detect_encoding(string)
+    except Exception:
+        # best-effort detection: note the failure, defer to detect_encoding
+        logging.getLogger(__name__).debug('cchardet failed', exc_info=True)
+    return detect_encoding(string)
+
+
 def to_utf8(str_or_unicode):
     """
     Safely returns a UTF-8 version of a given string
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 79f09b5..519efe1 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,9 +1,60 @@
+# coding:utf-8
+
 from . import *
 
-from talon import utils
+from talon import utils as u
+import cchardet
 
 
 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc'))
+
+
+def test_unicode():
+    """to_unicode always returns unicode, decoding bytestrings as needed."""
+    for raw in ('hi', u'hi', 'привет', u'привет'):
+        eq_(unicode, type(u.to_unicode(raw)))
+    eq_(u'hi', u.to_unicode('hi'))
+    # UTF-8 bytestrings are decoded; unicode input comes back unchanged
+    eq_(u"привет", u.to_unicode('привет'))
+    eq_(u"привет", u.to_unicode(u'привет'))
+    # latin-1 bytes: the encoding has to be detected, hence precise=True
+    eq_(u"Versión", u.to_unicode('Versi\xf3n', precise=True))
+
+
+def test_detect_encoding():
+    """detect_encoding reports chardet's guess, or utf-8 if chardet fails."""
+    eq_('ascii', u.detect_encoding('qwe').lower())
+    eq_('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
+    eq_('utf-8', u.detect_encoding('привет').lower())
+    # a chardet failure must not propagate: utf-8 is the fallback
+    with patch.object(u.chardet, 'detect', side_effect=Exception):
+        eq_('utf-8', u.detect_encoding('qwe').lower())
+
+
+def test_quick_detect_encoding():
+    eq_('ascii', u.quick_detect_encoding('qwe').lower())
+    eq_('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
+    eq_('utf-8', u.quick_detect_encoding('привет').lower())
+
+
+@patch.object(cchardet, 'detect')
+@patch.object(u, 'detect_encoding')
+def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
+    cchardet_detect.return_value = {'encoding': 'ascii'}
+    eq_('ascii', u.quick_detect_encoding("qwe"))
+    cchardet_detect.assert_called_once_with("qwe")
+
+    # cchardet returned no encoding: the result comes from detect_encoding
+    cchardet_detect.return_value = {}
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+
+    # cchardet raised: detect_encoding must still be consulted
+    detect_encoding.reset_mock()
+    cchardet_detect.side_effect = Exception()
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+    ok_(detect_encoding.called)