diff --git a/.gitignore b/.gitignore index 002f03e..d1a3778 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,8 @@ nosetests.xml /.emacs.desktop /.emacs.desktop.lock .elc +.idea +.cache auto-save-list tramp .\#* @@ -51,4 +53,4 @@ tramp _trial_temp # OSX -.DS_Store \ No newline at end of file +.DS_Store diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index 7f666bd..85e25ce 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -1,15 +1,15 @@ from __future__ import absolute_import + import logging import regex as re -from talon.utils import get_delimiter from talon.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) +from talon.utils import get_delimiter log = logging.getLogger(__name__) - # regex to fetch signature based on common signature words RE_SIGNATURE = re.compile(r''' ( @@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r''' ) ''', re.I | re.X | re.M | re.S) - # signatures appended by phone email clients RE_PHONE_SIGNATURE = re.compile(r''' ( @@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r''' ) ''', re.I | re.X | re.M | re.S) - # see _mark_candidate_indexes() for details # c - could be signature line # d - line starts with dashes (could be signature or list item) @@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate): 'cdc' """ # at first consider everything to be potential signature lines - markers = bytearray('c'*len(candidate)) + markers = list('c' * len(candidate)) # mark lines starting from bottom up for i, line_idx in reversed(list(enumerate(candidate))): @@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate): if line.startswith('-') and line.strip("-"): markers[i] = 'd' - return markers + return "".join(markers) def _process_marked_candidate_indexes(candidate, markers): diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 1c3a4b0..5caaf86 100644 Binary files a/talon/signature/data/classifier and b/talon/signature/data/classifier differ diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py index 3259171..fb88b32 100644 --- a/talon/signature/extraction.py +++ b/talon/signature/extraction.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import + import logging -import regex as re import numpy - -from talon.signature.learning.featurespace import features, build_pattern -from talon.utils import get_delimiter +import regex as re from talon.signature.bruteforce import get_signature_candidate +from talon.signature.learning.featurespace import features, build_pattern from talon.signature.learning.helpers import has_signature - +from talon.utils import get_delimiter log = logging.getLogger(__name__) @@ -58,7 +57,7 @@ def extract(body, sender): text = delimiter.join(text) if text.strip(): return (text, delimiter.join(signature)) - except Exception: + except Exception as e: log.exception('ERROR when extracting signature with classifiers') return (body, None) @@ -81,7 +80,7 @@ def _mark_lines(lines, sender): candidate = get_signature_candidate(lines) # at first consider everything to be text no signature - markers = bytearray('t'*len(lines)) + markers = list('t' * len(lines)) # mark lines starting from bottom up # mark only lines that belong to candidate @@ -96,7 +95,7 @@ def _mark_lines(lines, sender): elif is_signature_line(line, sender, EXTRACTOR): markers[j] = 's' - return markers + return "".join(markers) def _process_marked_lines(lines, markers): @@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers): return (lines[:-signature.end()], lines[-signature.end():]) return (lines, None) + diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 8ec3228..ce062bc 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -6,6 +6,9 @@ body belongs to the signature. """ from __future__ import absolute_import + +import pickle + from numpy import genfromtxt from sklearn.svm import LinearSVC from sklearn.externals import joblib @@ -29,4 +32,10 @@ def train(classifier, train_data_filename, save_classifier_filename=None): def load(saved_classifier_filename, train_data_filename): """Loads saved classifier. """ - return joblib.load(saved_classifier_filename) + try: + return joblib.load(saved_classifier_filename) + except ValueError: + loaded = pickle.load(open(saved_classifier_filename, 'rb'), encoding='latin1') + joblib.dump(loaded, saved_classifier_filename, compress=True) + return loaded + diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py index 308995b..fbb37db 100644 --- a/talon/signature/learning/dataset.py +++ b/talon/signature/learning/dataset.py @@ -17,13 +17,14 @@ suffix which should be `_sender`. """ from __future__ import absolute_import + import os + import regex as re +from six.moves import range from talon.signature.constants import SIGNATURE_MAX_LINES from talon.signature.learning.featurespace import build_pattern, features -from six.moves import range - SENDER_SUFFIX = '_sender' BODY_SUFFIX = '_body' @@ -59,7 +60,7 @@ def parse_msg_sender(filename, sender_known=True): """ sender, msg = None, None if os.path.isfile(filename) and not is_sender_filename(filename): - with open(filename) as f: + with open(filename, encoding='utf-8') as f: msg = f.read() sender = u'' if sender_known: @@ -147,7 +148,7 @@ def build_extraction_dataset(folder, dataset_filename, continue lines = msg.splitlines() for i in range(1, min(SIGNATURE_MAX_LINES, - len(lines)) + 1): + len(lines)) + 1): line = lines[-i] label = -1 if line[:len(SIGNATURE_ANNOTATION)] == \ diff --git a/talon/utils.py b/talon/utils.py index e6c884b..34a21c6 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -1,19 +1,18 @@ # coding:utf-8 from __future__ import absolute_import -import logging + from random import shuffle -import chardet + import cchardet -import regex as re - -from lxml.html import html5parser -from lxml.cssselect import CSSSelector - +import chardet import html5lib +import regex as re +import six +from lxml.cssselect import CSSSelector +from lxml.html import html5parser from talon.constants import RE_DELIMITER -import six def safe_format(format_string, *args, **kwargs): @@ -128,7 +127,7 @@ def html_tree_to_text(tree): parent.remove(c) - text = "" + text = "" for el in tree.iter(): el_text = (el.text or '') + (el.tail or '') if len(el_text) > 1: @@ -177,6 +176,8 @@ def html_to_text(string): def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ + if isinstance(s, six.text_type): + s = s.encode('utf8') try: if html_too_big(s): return None @@ -189,6 +190,8 @@ def html_fromstring(s): def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ + if isinstance(s, six.text_type): + s = s.encode('utf8') try: if html_too_big(s): return None @@ -203,7 +206,9 @@ def cssselect(expr, tree): def html_too_big(s): - return s.count('<') > _MAX_TAGS_COUNT + if isinstance(s, six.text_type): + s = s.encode('utf8') + return s.count(b'<') > _MAX_TAGS_COUNT def _contains_charset_spec(s): @@ -248,8 +253,7 @@ def _html5lib_parser(): _UTF8_DECLARATION = (b'') - -_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] +_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index b78409b..7e3591f 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from . import * -from . fixtures import * import regex as re from talon import quotations, utils as u - +from . import * +from .fixtures import * RE_WHITESPACE = re.compile("\s") RE_DOUBLE_WHITESPACE = re.compile("\s") @@ -303,7 +302,7 @@ Reply def extract_reply_and_check(filename): - f = open(filename) + f = open(filename, encoding='utf8') msg_body = f.read() reply = quotations.extract_from_html(msg_body) @@ -373,7 +372,7 @@ reply """ msg_body = msg_body.replace('\n', '\r\n') extracted = quotations.extract_from_html(msg_body) - assert_false(symbol in extracted) + assert_false(symbol in extracted) # Keep new lines otherwise "My reply" becomes one word - "Myreply" eq_("
My\nreply\n", extracted) diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index b570303..0bce56d 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -31,7 +31,7 @@ def test_messages_longer_SIGNATURE_MAX_LINES(): sender, body = dataset.parse_msg_sender(filename) text, extracted_signature = signature.extract(body, sender) extracted_signature = extracted_signature or '' - with open(filename[:-len('body')] + 'signature') as ms: + with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms: msg_signature = ms.read() eq_(msg_signature.strip(), extracted_signature.strip()) stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] diff --git a/tests/utils_test.py b/tests/utils_test.py index 8ddebdc..7ba4b52 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -1,12 +1,12 @@ # coding:utf-8 from __future__ import absolute_import -from . import * -from talon import utils as u import cchardet import six -from lxml import html + +from talon import utils as u +from . import * def test_get_delimiter(): @@ -16,35 +16,35 @@ def test_get_delimiter(): def test_unicode(): - eq_ (u'hi', u.to_unicode('hi')) - eq_ (type(u.to_unicode('hi')), six.text_type ) - eq_ (type(u.to_unicode(u'hi')), six.text_type ) - eq_ (type(u.to_unicode('привет')), six.text_type ) - eq_ (type(u.to_unicode(u'привет')), six.text_type ) - eq_ (u"привет", u.to_unicode('привет')) - eq_ (u"привет", u.to_unicode(u'привет')) + eq_(u'hi', u.to_unicode('hi')) + eq_(type(u.to_unicode('hi')), six.text_type) + eq_(type(u.to_unicode(u'hi')), six.text_type) + eq_(type(u.to_unicode('привет')), six.text_type) + eq_(type(u.to_unicode(u'привет')), six.text_type) + eq_(u"привет", u.to_unicode('привет')) + eq_(u"привет", u.to_unicode(u'привет')) # some latin1 stuff - eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) + eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) def test_detect_encoding(): - eq_ ('ascii', u.detect_encoding(b'qwe').lower()) - ok_ (u.detect_encoding( + eq_('ascii', u.detect_encoding(b'qwe').lower()) + ok_(u.detect_encoding( u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ 'iso-8859-1', 'iso-8859-2']) - eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) + eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) # fallback to utf-8 with patch.object(u.chardet, 'detect') as detect: detect.side_effect = Exception - eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) + eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) def test_quick_detect_encoding(): - eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) - ok_ (u.quick_detect_encoding( + eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) + ok_(u.quick_detect_encoding( u'Versi\xf3n'.encode('windows-1252')).lower() in [ 'windows-1252', 'windows-1250']) - eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) + eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) @patch.object(cchardet, 'detect') @@ -84,7 +84,7 @@ Haha eq_(u"привет!", u.html_to_text("привет!").decode('utf8')) html = '