Merge pull request #57 from mailgun/sergey/to_unicode
use precise encoding when converting to unicode
This commit is contained in:
2
setup.py
2
setup.py
@@ -20,6 +20,8 @@ setup(name='talon',
|
|||||||
"numpy",
|
"numpy",
|
||||||
"scipy",
|
"scipy",
|
||||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||||
|
'chardet>=1.0.1',
|
||||||
|
'cchardet>=0.3.5',
|
||||||
],
|
],
|
||||||
tests_require=[
|
tests_require=[
|
||||||
"mock",
|
"mock",
|
||||||
|
|||||||
@@ -134,7 +134,7 @@ def extract_names(sender):
|
|||||||
>>> extract_names('')
|
>>> extract_names('')
|
||||||
[]
|
[]
|
||||||
"""
|
"""
|
||||||
sender = to_unicode(sender)
|
sender = to_unicode(sender, precise=True)
|
||||||
# Remove non-alphabetical characters
|
# Remove non-alphabetical characters
|
||||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||||
# Remove too short words and words from "black" list i.e.
|
# Remove too short words and words from "black" list i.e.
|
||||||
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
|
|||||||
50.0
|
50.0
|
||||||
'''
|
'''
|
||||||
count = 0
|
count = 0
|
||||||
s = to_unicode(s)
|
s = to_unicode(s, precise=True)
|
||||||
for c in s:
|
for c in s:
|
||||||
if unicodedata.category(c) in categories:
|
if unicodedata.category(c) in categories:
|
||||||
count += 1
|
count += 1
|
||||||
@@ -181,7 +181,7 @@ def punctuation_percent(s):
|
|||||||
|
|
||||||
def capitalized_words_percent(s):
|
def capitalized_words_percent(s):
|
||||||
'''Returns capitalized words percent.'''
|
'''Returns capitalized words percent.'''
|
||||||
s = to_unicode(s)
|
s = to_unicode(s, precise=True)
|
||||||
words = re.split('\s', s)
|
words = re.split('\s', s)
|
||||||
words = [w for w in words if w.strip()]
|
words = [w for w in words if w.strip()]
|
||||||
capitalized_words_counter = 0
|
capitalized_words_counter = 0
|
||||||
|
|||||||
@@ -2,13 +2,12 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
from random import shuffle
|
from random import shuffle
|
||||||
|
import chardet
|
||||||
|
import cchardet
|
||||||
|
|
||||||
from talon.constants import RE_DELIMITER
|
from talon.constants import RE_DELIMITER
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def safe_format(format_string, *args, **kwargs):
|
def safe_format(format_string, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
Helper: formats string with any combination of bytestrings/unicode
|
Helper: formats string with any combination of bytestrings/unicode
|
||||||
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
|
|||||||
u'привет'
|
u'привет'
|
||||||
If `precise` flag is True, tries to guess the correct encoding first.
|
If `precise` flag is True, tries to guess the correct encoding first.
|
||||||
"""
|
"""
|
||||||
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
|
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||||
if isinstance(str_or_unicode, str):
|
if isinstance(str_or_unicode, str):
|
||||||
return unicode(str_or_unicode, encoding, 'replace')
|
return unicode(str_or_unicode, encoding, 'replace')
|
||||||
return str_or_unicode
|
return str_or_unicode
|
||||||
|
|
||||||
|
|
||||||
|
def detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses chardet. Defaults to UTF-8 when chardet reports no encoding or
    when detection fails for any reason, so the caller always receives a
    codec name.
    """
    try:
        detected = chardet.detect(string)
        if detected:
            return detected.get('encoding') or 'utf-8'
    except Exception:
        # Detection is best effort: any failure falls through to the UTF-8
        # default. The failure is reported through logging rather than
        # printed to stdout.
        logging.getLogger(__name__).warning(
            "chardet failed to detect encoding, defaulting to utf-8",
            exc_info=True)
    return 'utf-8'
|
||||||
|
|
||||||
|
|
||||||
|
def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet. Falls back to detect_encoding when cchardet reports no
    encoding or when it fails for any reason.
    """
    try:
        detected = cchardet.detect(string)
        if detected:
            return detected.get('encoding') or detect_encoding(string)
    except Exception:
        # Detection is best effort: any cchardet failure falls through to
        # detect_encoding below. The failure is reported through logging
        # rather than printed to stdout.
        logging.getLogger(__name__).warning(
            "cchardet failed to detect encoding, "
            "falling back to detect_encoding",
            exc_info=True)
    return detect_encoding(string)
|
||||||
|
|
||||||
|
|
||||||
def to_utf8(str_or_unicode):
|
def to_utf8(str_or_unicode):
|
||||||
"""
|
"""
|
||||||
Safely returns a UTF-8 version of a given string
|
Safely returns a UTF-8 version of a given string
|
||||||
|
|||||||
@@ -1,9 +1,60 @@
|
|||||||
|
# coding:utf-8
|
||||||
|
|
||||||
from . import *
|
from . import *
|
||||||
|
|
||||||
from talon import utils
|
from talon import utils as u
|
||||||
|
import cchardet
|
||||||
|
|
||||||
|
|
||||||
def test_get_delimiter():
|
def test_get_delimiter():
|
||||||
eq_('\r\n', utils.get_delimiter('abc\r\n123'))
|
eq_('\r\n', u.get_delimiter('abc\r\n123'))
|
||||||
eq_('\n', utils.get_delimiter('abc\n123'))
|
eq_('\n', u.get_delimiter('abc\n123'))
|
||||||
eq_('\n', utils.get_delimiter('abc'))
|
eq_('\n', u.get_delimiter('abc'))
|
||||||
|
|
||||||
|
|
||||||
|
def test_unicode():
    """to_unicode yields unicode objects and decodes byte strings."""
    eq_(u'hi', u.to_unicode('hi'))

    # Byte strings and unicode objects alike must come back as unicode.
    for value in ('hi', u'hi', 'привет', u'привет'):
        eq_(type(u.to_unicode(value)), unicode)

    # Both the byte-string and the unicode spelling give the same text.
    for value in ('привет', u'привет'):
        eq_(u"привет", u.to_unicode(value))

    # some latin1 stuff
    eq_(u"Versión", u.to_unicode('Versi\xf3n', precise=True))
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_encoding():
    """detect_encoding names the codec and falls back to UTF-8 on errors."""
    expectations = [
        ('ascii', 'qwe'),
        ('iso-8859-2', 'Versi\xf3n'),
        ('utf-8', 'привет'),
    ]
    for expected, raw in expectations:
        eq_(expected, u.detect_encoding(raw).lower())

    # fallback to utf-8
    with patch.object(u.chardet, 'detect') as chardet_detect:
        chardet_detect.side_effect = Exception
        eq_('utf-8', u.detect_encoding('qwe').lower())
|
||||||
|
|
||||||
|
|
||||||
|
def test_quick_detect_encoding():
    """quick_detect_encoding names the codec for typical inputs."""
    expectations = [
        ('ascii', 'qwe'),
        ('windows-1252', 'Versi\xf3n'),
        ('utf-8', 'привет'),
    ]
    for expected, raw in expectations:
        eq_(expected, u.quick_detect_encoding(raw).lower())
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
    """quick_detect_encoding delegates to detect_encoding only when needed."""
    # cchardet gives an answer: it is returned as is, no fallback involved
    cchardet_detect.return_value = {'encoding': 'ascii'}
    eq_('ascii', u.quick_detect_encoding("qwe"))
    cchardet_detect.assert_called_once_with("qwe")
    ok_(not detect_encoding.called)

    # fallback to detect_encoding
    cchardet_detect.return_value = {}
    detect_encoding.return_value = 'utf-8'
    eq_('utf-8', u.quick_detect_encoding("qwe"))
    # the fallback must receive the very same input
    detect_encoding.assert_called_once_with("qwe")

    # exception
    detect_encoding.reset_mock()
    cchardet_detect.side_effect = Exception()
    detect_encoding.return_value = 'utf-8'
    eq_('utf-8', u.quick_detect_encoding("qwe"))
    ok_(detect_encoding.called)
    detect_encoding.assert_called_once_with("qwe")
|
||||||
|
|||||||
Reference in New Issue
Block a user