use precise encoding when converting to unicode

This commit is contained in:
Sergey Obukhov
2015-09-11 10:38:28 -07:00
parent 9bee502903
commit 15976888a0
4 changed files with 95 additions and 11 deletions

View File

@@ -134,7 +134,7 @@ def extract_names(sender):
>>> extract_names('')
[]
"""
sender = to_unicode(sender)
sender = to_unicode(sender, precise=True)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
50.0
'''
count = 0
s = to_unicode(s)
s = to_unicode(s, precise=True)
for c in s:
if unicodedata.category(c) in categories:
count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s)
s = to_unicode(s, precise=True)
words = re.split('\s', s)
words = [w for w in words if w.strip()]
capitalized_words_counter = 0

View File

@@ -2,13 +2,12 @@
import logging
from random import shuffle
import chardet
import cchardet
from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs):
"""
Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
u'привет'
If `precise` flag is True, tries to guess the correct encoding first.
"""
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode
def detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Runs chardet over `string` and returns the encoding name it reports.
    Defaults to 'utf-8' when chardet reports no encoding or raises.
    """
    try:
        detected = chardet.detect(string)
        if detected:
            # chardet may report `None` as the encoding; treat it as unknown.
            return detected.get('encoding') or 'utf-8'
    except Exception as exc:
        # Detection is best-effort: record the failure through the module
        # logger (instead of printing to stdout) and use the default.
        log.debug('chardet failed to detect encoding: %s', exc,
                  exc_info=True)

    return 'utf-8'
def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet first. Falls back to detect_encoding when cchardet
    reports no encoding or raises.
    """
    try:
        detected = cchardet.detect(string)
        if detected:
            # An empty/None encoding means cchardet could not decide;
            # let detect_encoding have a go.
            return detected.get('encoding') or detect_encoding(string)
    except Exception as exc:
        # Detection is best-effort: record the failure through the module
        # logger (instead of printing to stdout) and use the fallback.
        log.debug('cchardet failed to detect encoding: %s', exc,
                  exc_info=True)

    return detect_encoding(string)
def to_utf8(str_or_unicode):
"""
Safely returns a UTF-8 version of a given string