13 Commits

Author           SHA1         Message                                                                        Date
Sergey Obukhov   b5af9c03a5   bump up version                                                                2015-09-11 10:42:26 -07:00
Sergey Obukhov   176c7e7532   Merge pull request #57 from mailgun/sergey/to_unicode                          2015-09-11 10:40:52 -07:00
                              (use precise encoding when converting to unicode)
Sergey Obukhov   15976888a0   use precise encoding when converting to unicode                                2015-09-11 10:38:28 -07:00
Sergey Obukhov   9bee502903   bump up version                                                                2015-09-11 06:27:12 -07:00
Sergey Obukhov   e3cb8dc3e6   Merge pull request #56 from mailgun/sergey/1000+German+NL                      2015-09-11 06:20:34 -07:00
                              (process first 1000 lines for long messages, support for German and Dutch)
Sergey Obukhov   385285e5de   process first 1000 lines for long messages, support for German and Dutch      2015-09-11 06:17:14 -07:00
Sergey Obukhov   127771dac9   bump up version                                                                2015-09-11 04:51:39 -07:00
Sergey Obukhov   cc98befba5   Merge pull request #50 from Easy-D/preserve-regular-blockquotes                2015-09-11 04:49:36 -07:00
                              (Preserve regular blockquotes)
Sergey Obukhov   567549cba4   bump up talon version                                                          2015-09-10 10:47:16 -07:00
Sergey Obukhov   76c4f49be8   Merge pull request #55 from mailgun/sergey/lxml                                2015-09-10 10:44:59 -07:00
                              (unpin lxml version)
Sergey Obukhov   d9d89dc250   unpin lxml version                                                             2015-09-10 10:44:05 -07:00
Easy-D           390b0a6dc9   preserve regular blockquotes                                                   2015-07-16 21:31:41 +02:00
Easy-D           ed6b861a47   add failing test that shows how regular blockquotes are removed                2015-07-16 21:24:49 +02:00
8 changed files with 137 additions and 27 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 setup(name='talon',
-      version='1.0.5',
+      version='1.0.7',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
@@ -14,12 +14,14 @@ setup(name='talon',
       include_package_data=True,
       zip_safe=True,
       install_requires=[
-          "lxml==2.3.3",
+          "lxml>=2.3.3",
           "regex>=1",
           "html2text",
           "numpy",
           "scipy",
           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+          'chardet>=1.0.1',
+          'cchardet>=0.3.5',
           ],
       tests_require=[
           "mock",

View File

@@ -138,9 +138,10 @@ def cut_by_id(html_message):
 def cut_blockquote(html_message):
-    ''' Cuts blockquote with wrapping elements. '''
-    quote = html_message.find('.//blockquote')
-    if quote is not None:
+    ''' Cuts the last non-nested blockquote with wrapping elements. '''
+    quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
+    if quote:
+        quote = quote[0]
         quote.getparent().remove(quote)
         return True
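For context, a minimal sketch (not part of the diff) of what the new XPath selects, assuming lxml is available:

import lxml.html

html = lxml.html.fromstring(
    '<div><blockquote>first</blockquote>'
    '<blockquote>reply<blockquote>nested</blockquote></blockquote></div>')

# [not(ancestor::blockquote)] drops nested quotes; [last()] keeps only the
# final top-level blockquote -- here the one whose text is 'reply'.
quote = html.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
print(quote[0].text)  # -> 'reply'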

View File

@@ -32,7 +32,9 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # Polish
             'W dniu',
             # Dutch
-            'Op'
+            'Op',
+            # German
+            'Am'
         )),
         # Date and sender separator
         u'|'.join((
@@ -50,18 +52,26 @@ RE_ON_DATE_SMB_WROTE = re.compile(
             # Polish
             u'napisał',
             # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
         ))
     ))

 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
         # Beginning of the line
         u'|'.join((
             'Op',
+            #German
+            'Am'
         )),
         # Ending of the line
         u'|'.join((
             # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
         ))
     )
 )
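To make the new alternatives concrete, here is a rough sketch (illustrative strings, not taken from the test suite) of the kind of German and Dutch reply headers these patterns are meant to flag as quotation splitters:

# Hypothetical reply headers; with 'Am'/'schrieb' and 'Op'/'schreef' in the
# alternatives above, lines like these should now be treated as splitters.
german_header = u'Am 11.09.2015 um 10:40 schrieb Bob Schmidt <bob@example.com>:'
dutch_header = u'Op 11 september 2015 10:40 schreef Bob Jansen <bob@example.com>:'

# e.g. quotations.extract_from_plain(u'Danke!\n\n' + german_header + u'\n> Zitat')
# should now return just u'Danke!'.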
@@ -181,6 +191,7 @@ def mark_message_lines(lines):
         else:
             # in case splitter is spread across several lines
             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
+
             if splitter:
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
@@ -293,12 +304,8 @@ def extract_from_plain(msg_body):
     delimiter = get_delimiter(msg_body)
     msg_body = preprocess(msg_body, delimiter)
-    lines = msg_body.splitlines()
-
-    # don't process too long messages
-    if len(lines) > MAX_LINES_COUNT:
-        return stripped_text
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
     lines = process_marked_lines(lines, markers)
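The behavioural shift here: a very long message is no longer returned untouched; only its first MAX_LINES_COUNT lines (1000, per the commit message) are scanned and returned. A small hedged sketch of the expected effect:

from talon import quotations

# A short reply followed by a huge quoted tail.
msg_body = u'Test reply\n\n-----Original Message-----\n\n' + u'quoted line\n' * 5000

# Previously this exceeded MAX_LINES_COUNT and came back unchanged; with the
# slice above, the splitter near the top is still found and only the reply
# should be returned.
print(quotations.extract_from_plain(msg_body))  # expected: u'Test reply'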

View File

@@ -134,7 +134,7 @@ def extract_names(sender):
     >>> extract_names('')
     []
     """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
     # Remove non-alphabetical characters
     sender = "".join([char if char.isalpha() else ' ' for char in sender])
     # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
     50.0
     '''
     count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     for c in s:
         if unicodedata.category(c) in categories:
             count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
     '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
     words = re.split('\s', s)
     words = [w for w in words if w.strip()]
     capitalized_words_counter = 0
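Why precise=True matters for these helpers: sender strings often arrive as Latin-1/Windows-1252 bytes, and decoding them as UTF-8 with 'replace' silently mangles accented characters. A minimal sketch (Python 2 semantics, matching the code above):

raw = 'Versi\xf3n'  # Windows-1252 / Latin-1 bytes for u'Versión'

# Old behaviour: assume UTF-8; the 0xf3 byte is invalid UTF-8 and becomes U+FFFD.
print(repr(unicode(raw, 'utf-8', 'replace')))         # u'Versi\ufffdn'

# With precise=True the encoding is detected first, so the character survives.
print(repr(unicode(raw, 'windows-1252', 'replace')))  # u'Versi\xf3n'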

View File

@@ -2,13 +2,12 @@
 import logging
 from random import shuffle

 import chardet
+import cchardet

 from talon.constants import RE_DELIMITER

 log = logging.getLogger(__name__)


 def safe_format(format_string, *args, **kwargs):
     """
     Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +41,44 @@ def to_unicode(str_or_unicode, precise=False):
         u'привет'

     If `precise` flag is True, tries to guess the correct encoding first.
     """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
     if isinstance(str_or_unicode, str):
         return unicode(str_or_unicode, encoding, 'replace')
     return str_or_unicode


+def detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Defaults to UTF-8.
+    """
+    try:
+        detected = chardet.detect(string)
+        if detected:
+            return detected.get('encoding') or 'utf-8'
+    except Exception, e:
+        print 11111111111, e
+        pass
+    return 'utf-8'
+
+
+def quick_detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Uses cchardet. Fallbacks to detect_encoding.
+    """
+    try:
+        detected = cchardet.detect(string)
+        if detected:
+            return detected.get('encoding') or detect_encoding(string)
+    except Exception, e:
+        print 222222222222, e
+        pass
+    return detect_encoding(string)
+
+
 def to_utf8(str_or_unicode):
     """
     Safely returns a UTF-8 version of a given string
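A short usage sketch of the new detection chain (cchardet first, chardet as fallback, UTF-8 as the last resort), mirroring the tests further down; exact encoding names can vary in case:

from talon import utils as u

print(u.quick_detect_encoding('qwe'))                  # 'ascii' (case may vary)
print(u.quick_detect_encoding('Versi\xf3n'))           # a Latin-1 family answer, e.g. 'WINDOWS-1252'
print(repr(u.to_unicode('Versi\xf3n', precise=True)))  # u'Versi\xf3n', i.e. u'Versión'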

View File

@@ -49,6 +49,24 @@ def test_quotation_splitter_outside_blockquote():
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


+def test_regular_blockquote():
+    msg_body = """Reply
+<blockquote>Regular</blockquote>
+
+<div>
+On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<blockquote>
+<div>
+<blockquote>Nested</blockquote>
+</div>
+</blockquote>
+"""
+    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def test_no_blockquote():
     msg_body = """
 <html>

View File

@@ -12,11 +12,11 @@ from talon import quotations
 @patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
     msg_body = """Test reply

 Hi
 -----Original Message-----

 Test"""
-    eq_(msg_body, quotations.extract_from_plain(msg_body))
+    eq_("Test reply", quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():

View File

@@ -1,9 +1,60 @@
 # coding:utf-8

 from . import *

-from talon import utils
+from talon import utils as u
+import cchardet


 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc'))
+
+
+def test_unicode():
+    eq_ (u'hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), unicode)
+    eq_ (type(u.to_unicode(u'hi')), unicode)
+    eq_ (type(u.to_unicode('привет')), unicode)
+    eq_ (type(u.to_unicode(u'привет')), unicode)
+    eq_ (u"привет", u.to_unicode('привет'))
+    eq_ (u"привет", u.to_unicode(u'привет'))
+    # some latin1 stuff
+    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
+
+
+def test_detect_encoding():
+    eq_ ('ascii', u.detect_encoding('qwe').lower())
+    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.detect_encoding('привет').lower())
+    # fallback to utf-8
+    with patch.object(u.chardet, 'detect') as detect:
+        detect.side_effect = Exception
+        eq_ ('utf-8', u.detect_encoding('qwe').lower())
+
+
+def test_quick_detect_encoding():
+    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
+    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
+
+
+@patch.object(cchardet, 'detect')
+@patch.object(u, 'detect_encoding')
+def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
+    cchardet_detect.return_value = {'encoding': 'ascii'}
+    eq_('ascii', u.quick_detect_encoding("qwe"))
+    cchardet_detect.assert_called_once_with("qwe")
+
+    # fallback to detect_encoding
+    cchardet_detect.return_value = {}
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+
+    # exception
+    detect_encoding.reset_mock()
+    cchardet_detect.side_effect = Exception()
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+    ok_(detect_encoding.called)