Actually bump talon's version to 1.0.5 to match the tag.

2015-09-09 22:46:18 +02:00
15 changed files with 47 additions and 283 deletions
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.0.9',
+      version='1.0.5',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -14,15 +14,12 @@ setup(name='talon',
      include_package_data=True,
      zip_safe=True,
      install_requires=[
-          "lxml>=2.3.3",
+          "lxml==2.3.3",
          "regex>=1",
          "html2text",
          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
-          'chardet>=1.0.1',
-          'cchardet>=0.3.5',
-          'cssselect'
          ],
      tests_require=[
          "mock",
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -138,10 +138,9 @@ def cut_by_id(html_message):


 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements. '''
-    quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
-    if quote:
-        quote = quote[0]
+    ''' Cuts blockquote with wrapping elements. '''
+    quote = html_message.find('.//blockquote')
+    if quote is not None:
        quote.getparent().remove(quote)
        return True

--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
        # Beginning of the line
        u'|'.join((
            # English
@@ -32,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            'W dniu',
            # Dutch
-            'Op',
-            # German
-            'Am',
-            # Norwegian
-            u'På',
-            # Swedish, Danish
-            'Den',
+            'Op'
        )),
        # Date and sender separator
        u'|'.join((
@@ -56,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            u'napisał',
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb',
-            # Norwegian, Swedish
-            'skrev',
+            'schreef','verzond','geschreven'
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
        # Beginning of the line
-        u'|'.join((
        	'Op',
-        	#German
-        	'Am'
-        )),
        # Ending of the line
        u'|'.join((
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb'
+            'schreef','verzond','geschreven'
        ))
    )
    )
@@ -131,9 +115,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+        'From', 'Van', 'De', 'Von', 'Fra',
        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+        'Date', 'Datum', u'Envoyé'
    ))), re.I)

 SPLITTER_PATTERNS = [
@@ -197,7 +181,6 @@ def mark_message_lines(lines):
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
-
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
@@ -310,8 +293,12 @@ def extract_from_plain(msg_body):

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
+    lines = msg_body.splitlines()
+
    # don't process too long messages
-    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    if len(lines) > MAX_LINES_COUNT:
+        return stripped_text
+
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

@@ -321,7 +308,7 @@ def extract_from_plain(msg_body):
    return msg_body


-def extract_from_html(s):
+def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.
@@ -338,12 +325,8 @@ def extract_from_html(s):
    then deleting necessary tags.
    """

-    if s.strip() == '':
-        return s
-
-    # replace CRLF with LF temporaraly otherwise CR will be converted to '&#13;'
-    # when doing deepcopy on html tree
-    msg_body, replaced = _CRLF_to_LF(s)
+    if msg_body.strip() == '':
+        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
@@ -374,12 +357,15 @@ def extract_from_html(s):
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')
-    plain_text = preprocess(plain_text, '\n', content_type='text/html')
+
+    delimiter = get_delimiter(plain_text)
+
+    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
-        return s
+        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
@@ -404,9 +390,9 @@ def extract_from_html(s):
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
-            return _restore_CRLF(html.tostring(html_tree_copy), replaced)
+            return html.tostring(html_tree_copy)
        else:
-            return s
+            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
@@ -442,37 +428,3 @@ def register_xpath_extensions():
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
-
-
-def _restore_CRLF(s, replaced=True):
-    """Restore CRLF if previously CRLF was replaced with LF
-
-    >>> _restore_CRLF('a\nb')
-    'a\r\nb'
-    >>> _restore_CRLF('a\nb', replaced=False)
-    'a\nb'
-    """
-    if replaced:
-        return s.replace('\n', '\r\n')
-    return s
-
-
-def _CRLF_to_LF(s):
-    """Replace CRLF with LF
-
-    >>> s, changed = _CRLF_to_LF('a\r\n'b)
-    >>> s
-    'a\nb'
-    >>> changed
-    True
-
-    >>> s, changed = _CRLF_to_LF('a\n'b)
-    >>> s
-    'a\nb'
-    >>> changed
-    False
-    """
-    delimiter = get_delimiter(s)
-    if delimiter == '\r\n':
-        return s.replace(delimiter, '\n'), True
-    return s, False
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES

 rc = re.compile

-RE_EMAIL = rc('\S@\S')
+RE_EMAIL = rc('@')
 RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

@@ -120,7 +120,7 @@ def contains_sender_names(sender):
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
-    return lambda s: 0
+    return lambda s: False


 def extract_names(sender):
@@ -134,7 +134,7 @@ def extract_names(sender):
    >>> extract_names('')
    []
    """
-    sender = to_unicode(sender, precise=True)
+    sender = to_unicode(sender)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
    50.0
    '''
    count = 0
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,12 +2,13 @@

 import logging
 from random import shuffle
-import chardet
-import cchardet

 from talon.constants import RE_DELIMITER


+log = logging.getLogger(__name__)
+
+
 def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
@@ -41,44 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
-    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode


-def detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Defaults to UTF-8.
-    """
-    try:
-        detected = chardet.detect(string)
-        if detected:
-            return detected.get('encoding') or 'utf-8'
-    except Exception, e:
-        print 11111111111, e
-        pass
-    return 'utf-8'
-
-
-def quick_detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Uses cchardet. Fallbacks to detect_encoding.
-    """
-    try:
-        detected = cchardet.detect(string)
-        if detected:
-            return detected.get('encoding') or detect_encoding(string)
-    except Exception, e:
-        print 222222222222, e
-        pass
-    return detect_encoding(string)
-
-
 def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
--- a/tests/fixtures/html_replies/hotmail.html
+++ b/tests/fixtures/html_replies/hotmail.html
@@ -1,4 +1,3 @@
-<?xml version="1.0" encoding="UTF-8"?>
 <html>
 <head>
 <style><!--
--- a/tests/fixtures/standard_replies/apple_mail_2.eml
+++ b/tests/fixtures/standard_replies/apple_mail_2.eml
@@ -1,19 +0,0 @@
-Content-Type: text/plain;
-	charset=us-ascii
-Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
-Subject: Re: Hello there
-X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
-From: Adam Renberg <adam@tictail.com>
-In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
-Date: Sat, 22 Aug 2015 19:22:20 +0200
-Content-Transfer-Encoding: 7bit
-X-Smtp-Server: smtp.gmail.com:adam@tictail.com
-Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
-References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
-To: Adam Renberg <tgwizard@gmail.com>
-
-Hello
-> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
->
-> Hi there!
-
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():

 </blockquote>"""

-    eq_("<html><body><p>Reply\n</p></body></html>",
-        quotations.extract_from_html(msg_body))
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


 def test_quotation_splitter_outside_blockquote():
@@ -49,24 +49,6 @@ def test_quotation_splitter_outside_blockquote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


-def test_regular_blockquote():
-    msg_body = """Reply
-<blockquote>Regular</blockquote>
-
-<div>
-  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
-</div>
-
-<blockquote>
-  <div>
-    <blockquote>Nested</blockquote>
-  </div>
-</blockquote>
-"""
-    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_no_blockquote():
    msg_body = """
 <html>
@@ -264,7 +246,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
 def extract_reply_and_check(filename):
    f = open(filename)

-    msg_body = f.read()
+    msg_body = f.read().decode("utf-8")
    reply = quotations.extract_from_html(msg_body)

    h = html2text.HTML2Text()
@@ -310,25 +292,3 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
-
-
-def test_CRLF():
-    """CR is not converted to '&#13;'
-    """
-    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
-
-    msg_body = """Reply
-<blockquote>
-
-  <div>
-    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
-  </div>
-
-  <div>
-    Test
-  </div>
-
-</blockquote>"""
-    msg_body = msg_body.replace('\n', '\r\n')
-    eq_("<html><body><p>Reply\r\n</p></body></html>",
-        quotations.extract_from_html(msg_body))
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():

 def test_empty_body():
    eq_('', quotations.extract_from_plain(''))
-
-
-def test__CRLF_to_LF():
-    eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
-    eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
-
-
-def test__restore_CRLF():
-    eq_('\n', quotations._restore_CRLF('\n', replaced=False))
-    eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))    
-    # default
-    eq_('\r\n', quotations._restore_CRLF('\n'))
--- a/tests/signature/learning/featurespace_test.py
+++ b/tests/signature/learning/featurespace_test.py
@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs


 def test_apply_features():
-    s = '''This is John Doe
-
-Tuesday @3pm suits. I'll chat to you then.
+    s = '''John Doe

 VP Research and Development, Xxxx Xxxx Xxxxx

@@ -21,12 +19,11 @@ john@example.com'''
    # note that we don't consider the first line because signatures don't
    # usually take all the text, empty lines are not considered
    eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

-    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
+    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
        features = fs.features(sender)
        new_result = fs.apply_features(s, features)
        # result remains the same because we don't consider empty lines
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
    msg_body = """Test reply
-Hi
+
 -----Original Message-----

 Test"""
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+    eq_(msg_body, quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():
@@ -311,33 +311,6 @@ Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))

-def test_swedish_from_block():
-    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
-    u"""Allo! Follow up MIME!
-Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
-Skickat: den 26 augusti 2015 14:45
-Till: Isacson Leiff
-Ämne: RE: Week 36
-
-Blah-blah-blah
-"""))
-
-def test_swedish_from_line():
-    eq_('Lorem', quotations.extract_from_plain(
-    """Lorem
-Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
-
-Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
-"""))
-
-def test_norwegian_from_line():
-    eq_('Lorem', quotations.extract_from_plain(
-    u"""Lorem
-På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
-
-Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
-"""))
-
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud. 
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,60 +1,9 @@
-# coding:utf-8
-
 from . import *

-from talon import utils as u
-import cchardet
+from talon import utils


 def test_get_delimiter():
-    eq_('\r\n', u.get_delimiter('abc\r\n123'))
-    eq_('\n', u.get_delimiter('abc\n123'))
-    eq_('\n', u.get_delimiter('abc'))
-
-
-def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), unicode )
-    eq_ (type(u.to_unicode(u'hi')), unicode )
-    eq_ (type(u.to_unicode('привет')), unicode )
-    eq_ (type(u.to_unicode(u'привет')), unicode )
-    eq_ (u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
-    # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
-
-
-def test_detect_encoding():
-    eq_ ('ascii', u.detect_encoding('qwe').lower())
-    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.detect_encoding('привет').lower())
-    # fallback to utf-8
-    with patch.object(u.chardet, 'detect') as detect:
-        detect.side_effect = Exception
-        eq_ ('utf-8', u.detect_encoding('qwe').lower())
-
-
-def test_quick_detect_encoding():
-    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
-    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
-
-
-@patch.object(cchardet, 'detect')
-@patch.object(u, 'detect_encoding')
-def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
-    cchardet_detect.return_value = {'encoding': 'ascii'}
-    eq_('ascii', u.quick_detect_encoding("qwe"))
-    cchardet_detect.assert_called_once_with("qwe")
-
-    # fallback to detect_encoding
-    cchardet_detect.return_value = {}
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-
-    # exception
-    detect_encoding.reset_mock()
-    cchardet_detect.side_effect = Exception()
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-    ok_(detect_encoding.called)
+    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
+    eq_('\n', utils.get_delimiter('abc\n123'))
+    eq_('\n', utils.get_delimiter('abc'))