Actually bump talon's version up to 1.0.5 to match the tag.

2015-09-09 22:46:18 +02:00
8 changed files with 27 additions and 137 deletions
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.0.7',
+      version='1.0.5',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -14,14 +14,12 @@ setup(name='talon',
      include_package_data=True,
      zip_safe=True,
      install_requires=[
-          "lxml>=2.3.3",
+          "lxml==2.3.3",
          "regex>=1",
          "html2text",
          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
-          'chardet>=1.0.1',
-          'cchardet>=0.3.5',
          ],
      tests_require=[
          "mock",
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -138,10 +138,9 @@ def cut_by_id(html_message):


 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements. '''
-    quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
-    if quote:
-        quote = quote[0]
+    ''' Cuts blockquote with wrapping elements. '''
+    quote = html_message.find('.//blockquote')
+    if quote is not None:
        quote.getparent().remove(quote)
        return True

--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -32,9 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            'W dniu',
            # Dutch
-            'Op',
-            # German
-            'Am'
+            'Op'
        )),
        # Date and sender separator
        u'|'.join((
@@ -52,26 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            u'napisał',
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb'
+            'schreef','verzond','geschreven'
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
        # Beginning of the line
-        u'|'.join((
        	'Op',
-        	#German
-        	'Am'
-        )),
        # Ending of the line
        u'|'.join((
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb'
+            'schreef','verzond','geschreven'
        ))
    )
    )
@@ -191,7 +181,6 @@ def mark_message_lines(lines):
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
-
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
@@ -304,8 +293,12 @@ def extract_from_plain(msg_body):

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
+    lines = msg_body.splitlines()
+
    # don't process too long messages
-    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    if len(lines) > MAX_LINES_COUNT:
+        return stripped_text
+
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -134,7 +134,7 @@ def extract_names(sender):
    >>> extract_names('')
    []
    """
-    sender = to_unicode(sender, precise=True)
+    sender = to_unicode(sender)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
    50.0
    '''
    count = 0
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,12 +2,13 @@

 import logging
 from random import shuffle
-import chardet
-import cchardet

 from talon.constants import RE_DELIMITER


+log = logging.getLogger(__name__)
+
+
 def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
-    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode


-def detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Defaults to UTF-8.
-    """
-    try:
-        detected = chardet.detect(string)
-        if detected:
-            return detected.get('encoding') or 'utf-8'
-    except Exception, e:
-        print 11111111111, e
-        pass
-    return 'utf-8'
-
-
-def quick_detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Uses cchardet. Fallbacks to detect_encoding.
-    """
-    try:
-        detected = cchardet.detect(string)
-        if detected:
-            return detected.get('encoding') or detect_encoding(string)
-    except Exception, e:
-        print 222222222222, e
-        pass
-    return detect_encoding(string)
-
-
 def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


-def test_regular_blockquote():
-    msg_body = """Reply
-<blockquote>Regular</blockquote>
-
-<div>
-  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
-</div>
-
-<blockquote>
-  <div>
-    <blockquote>Nested</blockquote>
-  </div>
-</blockquote>
-"""
-    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_no_blockquote():
    msg_body = """
 <html>
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
    msg_body = """Test reply
-Hi
+
 -----Original Message-----

 Test"""
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+    eq_(msg_body, quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,60 +1,9 @@
-# coding:utf-8
-
 from . import *

-from talon import utils as u
-import cchardet
+from talon import utils


 def test_get_delimiter():
-    eq_('\r\n', u.get_delimiter('abc\r\n123'))
-    eq_('\n', u.get_delimiter('abc\n123'))
-    eq_('\n', u.get_delimiter('abc'))
-
-
-def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), unicode )
-    eq_ (type(u.to_unicode(u'hi')), unicode )
-    eq_ (type(u.to_unicode('привет')), unicode )
-    eq_ (type(u.to_unicode(u'привет')), unicode )
-    eq_ (u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
-    # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
-
-
-def test_detect_encoding():
-    eq_ ('ascii', u.detect_encoding('qwe').lower())
-    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.detect_encoding('привет').lower())
-    # fallback to utf-8
-    with patch.object(u.chardet, 'detect') as detect:
-        detect.side_effect = Exception
-        eq_ ('utf-8', u.detect_encoding('qwe').lower())
-
-
-def test_quick_detect_encoding():
-    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
-    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
-
-
-@patch.object(cchardet, 'detect')
-@patch.object(u, 'detect_encoding')
-def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
-    cchardet_detect.return_value = {'encoding': 'ascii'}
-    eq_('ascii', u.quick_detect_encoding("qwe"))
-    cchardet_detect.assert_called_once_with("qwe")
-
-    # fallback to detect_encoding
-    cchardet_detect.return_value = {}
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-
-    # exception
-    detect_encoding.reset_mock()
-    cchardet_detect.side_effect = Exception()
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-    ok_(detect_encoding.called)
+    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
+    eq_('\n', utils.get_delimiter('abc\n123'))
+    eq_('\n', utils.get_delimiter('abc'))