Updated talon for Python 3

2017-05-23 15:39:50 -07:00
parent f16ae5110b
commit 086f5ba43b
10 changed files with 75 additions and 61 deletions
@@ -39,6 +39,8 @@ nosetests.xml
 /.emacs.desktop
 /.emacs.desktop.lock
 .elc
 .idea
 .cache
 auto-save-list
 tramp
 .\#*
@@ -51,4 +53,4 @@ tramp
 _trial_temp
 # OSX
-.DS_Store
+.DS_Store
@@ -1,15 +1,15 @@
 from __future__ import absolute_import
 import logging
 import regex as re
 from talon.utils import get_delimiter
 from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)
 from talon.utils import get_delimiter
 log = logging.getLogger(__name__)
 # regex to fetch signature based on common signature words
 RE_SIGNATURE = re.compile(r'''
               (
@@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r'''
               )
               ''', re.I | re.X | re.M | re.S)
 # signatures appended by phone email clients
 RE_PHONE_SIGNATURE = re.compile(r'''
               (
@@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r'''
               )
               ''', re.I | re.X | re.M | re.S)
 # see _mark_candidate_indexes() for details
 # c - could be signature line
 # d - line starts with dashes (could be signature or list item)
@@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate):
    'cdc'
    """
    # at first consider everything to be potential signature lines
-    markers = bytearray('c'*len(candidate))
+    markers = list('c' * len(candidate))
    # mark lines starting from bottom up
    for i, line_idx in reversed(list(enumerate(candidate))):
@@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate):
            if line.startswith('-') and line.strip("-"):
                markers[i] = 'd'
-    return markers
+    return "".join(markers)
 def _process_marked_candidate_indexes(candidate, markers):
@@ -1,16 +1,15 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 import logging
 import regex as re
 import numpy
-
+import regex as re
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
 from talon.signature.bruteforce import get_signature_candidate
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.signature.learning.helpers import has_signature
-
+from talon.utils import get_delimiter
 log = logging.getLogger(__name__)
@@ -58,7 +57,7 @@ def extract(body, sender):
                text = delimiter.join(text)
                if text.strip():
                    return (text, delimiter.join(signature))
-    except Exception:
+    except Exception as e:
        log.exception('ERROR when extracting signature with classifiers')
    return (body, None)
@@ -81,7 +80,7 @@ def _mark_lines(lines, sender):
    candidate = get_signature_candidate(lines)
    # at first consider everything to be text no signature
-    markers = bytearray('t'*len(lines))
+    markers = list('t' * len(lines))
    # mark lines starting from bottom up
    # mark only lines that belong to candidate
@@ -96,7 +95,7 @@ def _mark_lines(lines, sender):
        elif is_signature_line(line, sender, EXTRACTOR):
            markers[j] = 's'
-    return markers
+    return "".join(markers)
 def _process_marked_lines(lines, markers):
@@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers):
        return (lines[:-signature.end()], lines[-signature.end():])
    return (lines, None)
@@ -6,6 +6,9 @@ body belongs to the signature.
 """
 from __future__ import absolute_import
 import pickle
 from numpy import genfromtxt
 from sklearn.svm import LinearSVC
 from sklearn.externals import joblib
@@ -29,4 +32,10 @@ def train(classifier, train_data_filename, save_classifier_filename=None):
 def load(saved_classifier_filename, train_data_filename):
    """Loads saved classifier. """
-    return joblib.load(saved_classifier_filename)
+    try:
        return joblib.load(saved_classifier_filename)
    except ValueError:
        loaded = pickle.load(open(saved_classifier_filename, 'rb'), encoding='latin1')
        joblib.dump(loaded, saved_classifier_filename, compress=True)
        return loaded
@@ -17,13 +17,14 @@ suffix which should be `_sender`.
 """
 from __future__ import absolute_import
 import os
 import regex as re
 from six.moves import range
 from talon.signature.constants import SIGNATURE_MAX_LINES
 from talon.signature.learning.featurespace import build_pattern, features
 from six.moves import range
 SENDER_SUFFIX = '_sender'
 BODY_SUFFIX = '_body'
@@ -59,7 +60,7 @@ def parse_msg_sender(filename, sender_known=True):
    """
    sender, msg = None, None
    if os.path.isfile(filename) and not is_sender_filename(filename):
-        with open(filename) as f:
+        with open(filename, encoding='utf-8') as f:
            msg = f.read()
            sender = u''
            if sender_known:
@@ -147,7 +148,7 @@ def build_extraction_dataset(folder, dataset_filename,
                continue
            lines = msg.splitlines()
            for i in range(1, min(SIGNATURE_MAX_LINES,
-                                   len(lines)) + 1):
+                                  len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line[:len(SIGNATURE_ANNOTATION)] == \
@@ -1,19 +1,18 @@
 # coding:utf-8
 from __future__ import absolute_import
-import logging
+
 from random import shuffle
-import chardet
+
 import cchardet
-import regex as re
+import chardet
 from lxml.html import html5parser
 from lxml.cssselect import CSSSelector
 import html5lib
 import regex as re
 import six
 from lxml.cssselect import CSSSelector
 from lxml.html import html5parser
 from talon.constants import RE_DELIMITER
 import six
 def safe_format(format_string, *args, **kwargs):
@@ -128,7 +127,7 @@ def html_tree_to_text(tree):
        parent.remove(c)
-    text   = ""
+    text = ""
    for el in tree.iter():
        el_text = (el.text or '') + (el.tail or '')
        if len(el_text) > 1:
@@ -177,6 +176,8 @@ def html_to_text(string):
 def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        if html_too_big(s):
            return None
@@ -189,6 +190,8 @@ def html_fromstring(s):
 def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
        if html_too_big(s):
            return None
@@ -203,7 +206,9 @@ def cssselect(expr, tree):
 def html_too_big(s):
-    return s.count('<') > _MAX_TAGS_COUNT
+    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    return s.count(b'<') > _MAX_TAGS_COUNT
 def _contains_charset_spec(s):
@@ -248,8 +253,7 @@ def _html5lib_parser():
 _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
                     b'charset=utf-8">')
-
+_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
@@ -1,13 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from . import *
 from . fixtures import *
 import regex as re
 from talon import quotations, utils as u
-
+from . import *
 from .fixtures import *
 RE_WHITESPACE = re.compile("\s")
 RE_DOUBLE_WHITESPACE = re.compile("\s")
@@ -303,7 +302,7 @@ Reply
 def extract_reply_and_check(filename):
-    f = open(filename)
+    f = open(filename, encoding='utf8')
    msg_body = f.read()
    reply = quotations.extract_from_html(msg_body)
@@ -373,7 +372,7 @@ reply
 </blockquote>"""
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
-    assert_false(symbol in extracted)    
+    assert_false(symbol in extracted)
    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
    eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
@@ -31,7 +31,7 @@ def test_messages_longer_SIGNATURE_MAX_LINES():
        sender, body = dataset.parse_msg_sender(filename)
        text, extracted_signature = signature.extract(body, sender)
        extracted_signature = extracted_signature or ''
-        with open(filename[:-len('body')] + 'signature') as ms:
+        with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms:
            msg_signature = ms.read()
            eq_(msg_signature.strip(), extracted_signature.strip())
            stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)]
@@ -1,12 +1,12 @@
 # coding:utf-8
 from __future__ import absolute_import
 from . import *
 from talon import utils as u
 import cchardet
 import six
-from lxml import html
+
 from talon import utils as u
 from . import *
 def test_get_delimiter():
@@ -16,35 +16,35 @@ def test_get_delimiter():
 def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
+    eq_(u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), six.text_type )
+    eq_(type(u.to_unicode('hi')), six.text_type)
-    eq_ (type(u.to_unicode(u'hi')), six.text_type )
+    eq_(type(u.to_unicode(u'hi')), six.text_type)
-    eq_ (type(u.to_unicode('привет')), six.text_type )
+    eq_(type(u.to_unicode('привет')), six.text_type)
-    eq_ (type(u.to_unicode(u'привет')), six.text_type )
+    eq_(type(u.to_unicode(u'привет')), six.text_type)
-    eq_ (u"привет", u.to_unicode('привет'))
+    eq_(u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
+    eq_(u"привет", u.to_unicode(u'привет'))
    # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
+    eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
 def test_detect_encoding():
-    eq_ ('ascii', u.detect_encoding(b'qwe').lower())
+    eq_('ascii', u.detect_encoding(b'qwe').lower())
-    ok_ (u.detect_encoding(
+    ok_(u.detect_encoding(
        u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
            'iso-8859-1', 'iso-8859-2'])
-    eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
+    eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
    # fallback to utf-8
    with patch.object(u.chardet, 'detect') as detect:
        detect.side_effect = Exception
-        eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
+        eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
 def test_quick_detect_encoding():
-    eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
+    eq_('ascii', u.quick_detect_encoding(b'qwe').lower())
-    ok_ (u.quick_detect_encoding(
+    ok_(u.quick_detect_encoding(
        u'Versi\xf3n'.encode('windows-1252')).lower() in [
            'windows-1252', 'windows-1250'])
-    eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
+    eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
@patch.object(cchardet, 'detect')
@@ -84,7 +84,7 @@ Haha
    eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
    html = '<body><br/><br/>Hi</body>'
-    eq_ (b'Hi', u.html_to_text(html))
+    eq_(b'Hi', u.html_to_text(html))
    html = """Hi
 <style type="text/css">
@@ -104,7 +104,7 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
 }
 </style>"""
-    eq_ (b'Hi', u.html_to_text(html))
+    eq_(b'Hi', u.html_to_text(html))
    html = """<div>
 <!-- COMMENT 1 -->
@@ -115,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
 def test_comment_no_parent():
-    s = "<!-- COMMENT 1 --> no comment"
+    s = b'<!-- COMMENT 1 --> no comment'
    d = u.html_document_fromstring(s)
-    eq_("no comment", u.html_tree_to_text(d))
+    eq_(b"no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
 def test_html_fromstring_exception():
    eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
 def test_html_fromstring_too_big(fromstring):
@@ -158,5 +159,5 @@ def test_html_too_big():
@patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_to_text():
-    eq_("Hello", u.html_to_text("<div>Hello</div>"))
+    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))