From 086f5ba43b1d657ae548b3607d43e2ed02bf89c2 Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Tue, 23 May 2017 15:39:50 -0700 Subject: [PATCH 1/7] Updated talon for Python 3 --- .gitignore | 4 ++- talon/signature/bruteforce.py | 10 +++--- talon/signature/data/classifier | Bin 608 -> 505 bytes talon/signature/extraction.py | 16 ++++----- talon/signature/learning/classifier.py | 11 +++++- talon/signature/learning/dataset.py | 9 ++--- talon/utils.py | 28 ++++++++------- tests/html_quotations_test.py | 9 +++-- tests/signature/extraction_test.py | 2 +- tests/utils_test.py | 47 +++++++++++++------------ 10 files changed, 75 insertions(+), 61 deletions(-) diff --git a/.gitignore b/.gitignore index 002f03e..d1a3778 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,8 @@ nosetests.xml /.emacs.desktop /.emacs.desktop.lock .elc +.idea +.cache auto-save-list tramp .\#* @@ -51,4 +53,4 @@ tramp _trial_temp # OSX -.DS_Store \ No newline at end of file +.DS_Store diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index 7f666bd..85e25ce 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -1,15 +1,15 @@ from __future__ import absolute_import + import logging import regex as re -from talon.utils import get_delimiter from talon.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) +from talon.utils import get_delimiter log = logging.getLogger(__name__) - # regex to fetch signature based on common signature words RE_SIGNATURE = re.compile(r''' ( @@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r''' ) ''', re.I | re.X | re.M | re.S) - # signatures appended by phone email clients RE_PHONE_SIGNATURE = re.compile(r''' ( @@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r''' ) ''', re.I | re.X | re.M | re.S) - # see _mark_candidate_indexes() for details # c - could be signature line # d - line starts with dashes (could be signature or list item) @@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate): 'cdc' """ # at first consider everything to be potential signature lines - markers = bytearray('c'*len(candidate)) + markers = list('c' * len(candidate)) # mark lines starting from bottom up for i, line_idx in reversed(list(enumerate(candidate))): @@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate): if line.startswith('-') and line.strip("-"): markers[i] = 'd' - return markers + return "".join(markers) def _process_marked_candidate_indexes(candidate, markers): diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b..5caaf86e63a0e614ce42c48aa82942589a51890d 100644 GIT binary patch literal 505 zcmV!}AP7`pEi~y8h88MrM4x3+9|~yE2IhJ&yZd4T13YvAujxPrb0>Sm zkrPI9Wy$*@Q8FRA%F7WeWr{>tTI3~nVf^Z)F?@7jcv%V)!aTBA7{jTKpK7bb4~8)H z;86E@-5x^Q*9TGw&2s^Zbmb2TIgrAz?Jm*f2$rgZ&SPaLOj1|@r=!4kN{!r7Q95Q* zu22}9i5gybzv2p(kG=ANbnK)PR*wD7oF6fXXgK@kN7E12{+q$M2}zO`Mv%NxPEN?y zD>!O9KNZn;qf&%$fzH%S9>c{bh#Ftv(mq`7L{0DSGq@7R@#|lB{1pWbR+C=qTQdmY z>K9nshiiU!iTafr!F4}O?Z6G%u1VJ0*~Gh>6hT2TG_Xz=F*02Di=0`uu55+RQ=>7;I$qi`*H*-A)!}?d)Hpcf8T}(CF|lW1-`rn`5~DKjFs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H4 1: @@ -177,6 +176,8 @@ def html_to_text(string): def html_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ + if isinstance(s, six.text_type): + s = s.encode('utf8') try: if html_too_big(s): return None @@ -189,6 +190,8 @@ def html_fromstring(s): def html_document_fromstring(s): """Parse html tree from string. Return None if the string can't be parsed. """ + if isinstance(s, six.text_type): + s = s.encode('utf8') try: if html_too_big(s): return None @@ -203,7 +206,9 @@ def cssselect(expr, tree): def html_too_big(s): - return s.count('<') > _MAX_TAGS_COUNT + if isinstance(s, six.text_type): + s = s.encode('utf8') + return s.count(b'<') > _MAX_TAGS_COUNT def _contains_charset_spec(s): @@ -248,8 +253,7 @@ def _html5lib_parser(): _UTF8_DECLARATION = (b'') - -_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] +_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _HARDBREAKS = ['br', 'hr', 'tr'] _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index b78409b..7e3591f 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -1,13 +1,12 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from . import * -from . fixtures import * import regex as re from talon import quotations, utils as u - +from . import * +from .fixtures import * RE_WHITESPACE = re.compile("\s") RE_DOUBLE_WHITESPACE = re.compile("\s") @@ -303,7 +302,7 @@ Reply def extract_reply_and_check(filename): - f = open(filename) + f = open(filename, encoding='utf8') msg_body = f.read() reply = quotations.extract_from_html(msg_body) @@ -373,7 +372,7 @@ reply """ msg_body = msg_body.replace('\n', '\r\n') extracted = quotations.extract_from_html(msg_body) - assert_false(symbol in extracted) + assert_false(symbol in extracted) # Keep new lines otherwise "My reply" becomes one word - "Myreply" eq_("My\nreply\n", extracted) diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index b570303..0bce56d 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -31,7 +31,7 @@ def test_messages_longer_SIGNATURE_MAX_LINES(): sender, body = dataset.parse_msg_sender(filename) text, extracted_signature = signature.extract(body, sender) extracted_signature = extracted_signature or '' - with open(filename[:-len('body')] + 'signature') as ms: + with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms: msg_signature = ms.read() eq_(msg_signature.strip(), extracted_signature.strip()) stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] diff --git a/tests/utils_test.py b/tests/utils_test.py index 8ddebdc..7ba4b52 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -1,12 +1,12 @@ # coding:utf-8 from __future__ import absolute_import -from . import * -from talon import utils as u import cchardet import six -from lxml import html + +from talon import utils as u +from . import * def test_get_delimiter(): @@ -16,35 +16,35 @@ def test_get_delimiter(): def test_unicode(): - eq_ (u'hi', u.to_unicode('hi')) - eq_ (type(u.to_unicode('hi')), six.text_type ) - eq_ (type(u.to_unicode(u'hi')), six.text_type ) - eq_ (type(u.to_unicode('привет')), six.text_type ) - eq_ (type(u.to_unicode(u'привет')), six.text_type ) - eq_ (u"привет", u.to_unicode('привет')) - eq_ (u"привет", u.to_unicode(u'привет')) + eq_(u'hi', u.to_unicode('hi')) + eq_(type(u.to_unicode('hi')), six.text_type) + eq_(type(u.to_unicode(u'hi')), six.text_type) + eq_(type(u.to_unicode('привет')), six.text_type) + eq_(type(u.to_unicode(u'привет')), six.text_type) + eq_(u"привет", u.to_unicode('привет')) + eq_(u"привет", u.to_unicode(u'привет')) # some latin1 stuff - eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) + eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) def test_detect_encoding(): - eq_ ('ascii', u.detect_encoding(b'qwe').lower()) - ok_ (u.detect_encoding( + eq_('ascii', u.detect_encoding(b'qwe').lower()) + ok_(u.detect_encoding( u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ 'iso-8859-1', 'iso-8859-2']) - eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) + eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) # fallback to utf-8 with patch.object(u.chardet, 'detect') as detect: detect.side_effect = Exception - eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) + eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) def test_quick_detect_encoding(): - eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) - ok_ (u.quick_detect_encoding( + eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) + ok_(u.quick_detect_encoding( u'Versi\xf3n'.encode('windows-1252')).lower() in [ 'windows-1252', 'windows-1250']) - eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) + eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) @patch.object(cchardet, 'detect') @@ -84,7 +84,7 @@ Haha eq_(u"привет!", u.html_to_text("привет!").decode('utf8')) html = '

Hi' - eq_ (b'Hi', u.html_to_text(html)) + eq_(b'Hi', u.html_to_text(html)) html = """Hi """ - eq_ (b'Hi', u.html_to_text(html)) + eq_(b'Hi', u.html_to_text(html)) html = """
@@ -115,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif; def test_comment_no_parent(): - s = " no comment" + s = b' no comment' d = u.html_document_fromstring(s) - eq_("no comment", u.html_tree_to_text(d)) + eq_(b"no comment", u.html_tree_to_text(d)) @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) def test_html_fromstring_exception(): eq_(None, u.html_fromstring("")) + @patch.object(u, 'html_too_big', Mock()) @patch.object(u.html5parser, 'fromstring') def test_html_fromstring_too_big(fromstring): @@ -158,5 +159,5 @@ def test_html_too_big(): @patch.object(u, '_MAX_TAGS_COUNT', 3) def test_html_to_text(): - eq_("Hello", u.html_to_text("
Hello
")) + eq_(b"Hello", u.html_to_text("
Hello
")) eq_(None, u.html_to_text("
Hi
")) From dd0a0f5c4da5f3ffc40b5eae5715dd5c5ad16dcb Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Tue, 23 May 2017 16:10:13 -0700 Subject: [PATCH 2/7] Python 2.7 backward compat --- talon/signature/data/classifier | Bin 505 -> 608 bytes talon/signature/learning/classifier.py | 10 +++-- talon/signature/learning/dataset.py | 7 ++- tests/html_quotations_test.py | 10 ++++- tests/signature/extraction_test.py | 57 ++++++++++++++----------- 5 files changed, 52 insertions(+), 32 deletions(-) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 5caaf86e63a0e614ce42c48aa82942589a51890d..1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b 100644 GIT binary patch literal 608 zcmZut%Z}496iu5B4QUIMH^cBApu;1A0kJ{?RHTszlf|G5OUp^!iNSGlZKq5Ym0$z& zReTI9Bt8OMchm*qE-d-#yzcqZ&h#h2sFIPM6;URr)=YFs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H4!}AP7`pEi~y8h88MrM4x3+9|~yE2IhJ&yZd4T13YvAujxPrb0>Sm zkrPI9Wy$*@Q8FRA%F7WeWr{>tTI3~nVf^Z)F?@7jcv%V)!aTBA7{jTKpK7bb4~8)H z;86E@-5x^Q*9TGw&2s^Zbmb2TIgrAz?Jm*f2$rgZ&SPaLOj1|@r=!4kN{!r7Q95Q* zu22}9i5gybzv2p(kG=ANbnK)PR*wD7oF6fXXgK@kN7E12{+q$M2}zO`Mv%NxPEN?y zD>!O9KNZn;qf&%$fzH%S9>c{bh#Ftv(mq`7L{0DSGq@7R@#|lB{1pWbR+C=qTQdmY z>K9nshiiU!iTafr!F4}O?Z6G%u1VJ0*~Gh>6hT2TG_Xz=F*02Di=0`uu55+RQ=>7;I$qi`*H*-A)!}?d)Hpcf8T}(CF|lW1-`rn`5~DKj (3, 0): + pickle_options["encoding"] = "bytes" + + loaded = pickle.load(open(saved_classifier_filename, 'rb'), **pickle_options) joblib.dump(loaded, saved_classifier_filename, compress=True) return loaded - diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py index fbb37db..c7a88d4 100644 --- a/talon/signature/learning/dataset.py +++ b/talon/signature/learning/dataset.py @@ -58,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True): algorithm: >>> parse_msg_sender(filename, False) """ + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + sender, msg = None, None if os.path.isfile(filename) and not is_sender_filename(filename): - with open(filename, encoding='utf-8') as f: + with open(filename, **kwargs) as f: msg = f.read() sender = u'' if sender_known: diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 7e3591f..f26148c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -2,7 +2,8 @@ from __future__ import absolute_import -import regex as re +# noinspection PyUnresolvedReferences +import re from talon import quotations, utils as u from . import * @@ -302,7 +303,12 @@ Reply def extract_reply_and_check(filename): - f = open(filename, encoding='utf8') + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + + f = open(filename, **kwargs) msg_body = f.read() reply = quotations.extract_from_html(msg_body) diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index 0bce56d..86b8705 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from .. import * import os -from talon.signature.learning import dataset -from talon import signature -from talon.signature import extraction as e -from talon.signature import bruteforce from six.moves import range +from talon.signature import bruteforce, extraction, extract +from talon.signature import extraction as e +from talon.signature.learning import dataset +from .. import * + def test_message_shorter_SIGNATURE_MAX_LINES(): sender = "bob@foo.bar" @@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES(): Thanks in advance, Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) def test_messages_longer_SIGNATURE_MAX_LINES(): + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + for filename in os.listdir(STRIPPED): filename = os.path.join(STRIPPED, filename) if not filename.endswith('_body'): continue sender, body = dataset.parse_msg_sender(filename) - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) extracted_signature = extracted_signature or '' - with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms: + with open(filename[:-len('body')] + 'signature', **kwargs) as ms: msg_signature = ms.read() eq_(msg_signature.strip(), extracted_signature.strip()) - stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] + stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] eq_(stripped_msg.strip(), text.strip()) @@ -47,7 +52,7 @@ Thanks in advance, some text which doesn't seem to be a signature at all Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) @@ -60,7 +65,7 @@ Thanks in advance, some long text here which doesn't seem to be a signature at all Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:-1]), text) eq_('Bob', extracted_signature) @@ -68,13 +73,13 @@ Bob""" some *long* text here which doesn't seem to be a signature at all """ - ((body, None), signature.extract(body, "david@example.com")) + ((body, None), extract(body, "david@example.com")) def test_basic(): msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' eq_(('Blah', '--\r\n\r\nSergey Obukhov'), - signature.extract(msg_body, 'Sergey')) + extract(msg_body, 'Sergey')) def test_capitalized(): @@ -99,7 +104,7 @@ Doe Inc Doe Inc 555-531-7967""" - eq_(sig, signature.extract(msg_body, 'Doe')[1]) + eq_(sig, extract(msg_body, 'Doe')[1]) def test_over_2_text_lines_after_signature(): @@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature(): 2 non signature lines in the end It's not signature """ - text, extracted_signature = signature.extract(body, "Bob") + text, extracted_signature = extract(body, "Bob") eq_(extracted_signature, None) def test_no_signature(): sender, body = "bob@foo.bar", "Hello" - eq_((body, None), signature.extract(body, sender)) + eq_((body, None), extract(body, sender)) def test_handles_unicode(): sender, body = dataset.parse_msg_sender(UNICODE_MSG) - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) -@patch.object(signature.extraction, 'has_signature') +@patch.object(extraction, 'has_signature') def test_signature_extract_crash(has_signature): has_signature.side_effect = Exception('Bam!') msg_body = u'Blah\r\n--\r\n\r\nСергей' - eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) + eq_((msg_body, None), extract(msg_body, 'Сергей')) def test_mark_lines(): @@ -137,19 +142,19 @@ def test_mark_lines(): # (starting from the bottom) because we don't count empty line eq_('ttset', e._mark_lines(['Bob Smith', - 'Bob Smith', - 'Bob Smith', - '', - 'some text'], 'Bob Smith')) + 'Bob Smith', + 'Bob Smith', + '', + 'some text'], 'Bob Smith')) with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3): # we don't analyse the 1st line because # signature cant start from the 1st line eq_('tset', e._mark_lines(['Bob Smith', - 'Bob Smith', - '', - 'some text'], 'Bob Smith')) + 'Bob Smith', + '', + 'some text'], 'Bob Smith')) def test_process_marked_lines(): From 15e61768f2b7e0ba746b1d8c188933f414eaf2fe Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Tue, 23 May 2017 16:17:39 -0700 Subject: [PATCH 3/7] Encoding fixes --- talon/signature/learning/classifier.py | 8 ++++---- talon/signature/learning/dataset.py | 2 +- tests/html_quotations_test.py | 2 +- tests/signature/extraction_test.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 2ca1a21..f775413 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -36,10 +36,10 @@ def load(saved_classifier_filename, train_data_filename): return joblib.load(saved_classifier_filename) except ValueError: import sys - pickle_options = {} + kwargs = {} if sys.version_info > (3, 0): - pickle_options["encoding"] = "bytes" + kwargs["encoding"] = "latin1" - loaded = pickle.load(open(saved_classifier_filename, 'rb'), **pickle_options) + loaded = pickle.load(open(saved_classifier_filename, 'rb'), **kwargs) joblib.dump(loaded, saved_classifier_filename, compress=True) - return loaded + return joblib.load(saved_classifier_filename) diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py index c7a88d4..5026c0b 100644 --- a/talon/signature/learning/dataset.py +++ b/talon/signature/learning/dataset.py @@ -61,7 +61,7 @@ def parse_msg_sender(filename, sender_known=True): import sys kwargs = {} if sys.version_info > (3, 0): - kwargs["encoding"] = "bytes" + kwargs["encoding"] = "utf8" sender, msg = None, None if os.path.isfile(filename) and not is_sender_filename(filename): diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index f26148c..2c5c2e5 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -306,7 +306,7 @@ def extract_reply_and_check(filename): import sys kwargs = {} if sys.version_info > (3, 0): - kwargs["encoding"] = "bytes" + kwargs["encoding"] = "utf8" f = open(filename, **kwargs) diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index 86b8705..b942674 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -27,7 +27,7 @@ def test_messages_longer_SIGNATURE_MAX_LINES(): import sys kwargs = {} if sys.version_info > (3, 0): - kwargs["encoding"] = "bytes" + kwargs["encoding"] = "utf8" for filename in os.listdir(STRIPPED): filename = os.path.join(STRIPPED, filename) From 4364bebf3889de6fd58acf3c344a847c771a1008 Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Wed, 24 May 2017 10:26:33 -0700 Subject: [PATCH 4/7] Added exception checking for pickle format conversion --- talon/signature/bruteforce.py | 2 +- talon/signature/learning/classifier.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index 85e25ce..e502bab 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -110,7 +110,7 @@ def extract_signature(msg_body): return (stripped_body.strip(), signature.strip()) - except Exception as e: + except Exception: log.exception('ERROR extracting signature') return (msg_body, None) diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index f775413..9267db0 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -35,11 +35,16 @@ def load(saved_classifier_filename, train_data_filename): try: return joblib.load(saved_classifier_filename) except ValueError: + # load python 2 pickle format with python 3, and save it permissions allowing import sys kwargs = {} if sys.version_info > (3, 0): kwargs["encoding"] = "latin1" loaded = pickle.load(open(saved_classifier_filename, 'rb'), **kwargs) - joblib.dump(loaded, saved_classifier_filename, compress=True) + try: + joblib.dump(loaded, saved_classifier_filename, compress=True) + except Exception: + pass + return joblib.load(saved_classifier_filename) From f5f726407717b6f7c56536c3f75638df0d28f5b6 Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Wed, 24 May 2017 13:22:24 -0700 Subject: [PATCH 5/7] Can now handle read only classifier data as well --- talon/signature/learning/classifier.py | 46 +++++++++++++++++--------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 9267db0..4e1e886 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -7,8 +7,6 @@ body belongs to the signature. from __future__ import absolute_import -import pickle - from numpy import genfromtxt from sklearn.externals import joblib from sklearn.svm import LinearSVC @@ -32,19 +30,37 @@ def train(classifier, train_data_filename, save_classifier_filename=None): def load(saved_classifier_filename, train_data_filename): """Loads saved classifier. """ + import sys + if sys.version_info > (3, 0): + return load_compat(saved_classifier_filename) + + return joblib.load(saved_classifier_filename) + + +def load_compat(saved_classifier_filename): + import os + import pickle + import tempfile + + # we need to switch to the data path to properly load the related _xx.npy files + cwd = os.getcwd() + os.chdir(os.path.dirname(saved_classifier_filename)) + + # convert encoding using pick.load and write to temp file which we'll tell joblib to use + pickle_file = open(saved_classifier_filename, 'rb') + classifier = pickle.load(pickle_file, encoding='latin1') + try: - return joblib.load(saved_classifier_filename) - except ValueError: - # load python 2 pickle format with python 3, and save it permissions allowing - import sys - kwargs = {} - if sys.version_info > (3, 0): - kwargs["encoding"] = "latin1" + # save our conversion if permissions allow + joblib.dump(classifier, saved_classifier_filename) + except Exception: + # can't write to classifier, use a temp file + tmp = tempfile.SpooledTemporaryFile() + joblib.dump(classifier, tmp) + saved_classifier_filename = tmp - loaded = pickle.load(open(saved_classifier_filename, 'rb'), **kwargs) - try: - joblib.dump(loaded, saved_classifier_filename, compress=True) - except Exception: - pass + # important, use joblib.load before switching back to original cwd + jb_classifier = joblib.load(saved_classifier_filename) + os.chdir(cwd) - return joblib.load(saved_classifier_filename) + return jb_classifier From 4acf05cf2875e31e3e63db99a269ffca73c85ccd Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Wed, 24 May 2017 13:29:59 -0700 Subject: [PATCH 6/7] Only use load compat if we can't load the classifier --- talon/signature/learning/classifier.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 4e1e886..0519713 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -30,11 +30,14 @@ def train(classifier, train_data_filename, save_classifier_filename=None): def load(saved_classifier_filename, train_data_filename): """Loads saved classifier. """ - import sys - if sys.version_info > (3, 0): - return load_compat(saved_classifier_filename) + try: + return joblib.load(saved_classifier_filename) + except Exception: + import sys + if sys.version_info > (3, 0): + return load_compat(saved_classifier_filename) - return joblib.load(saved_classifier_filename) + raise def load_compat(saved_classifier_filename): From 743c76f159b54c46742537be46478fe238c7b607 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Sun, 18 Jun 2017 22:48:12 -0700 Subject: [PATCH 7/7] bump version after merging python 3 support PR --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 80d9d5f..a9d98d6 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.3.7', + version='1.4.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(),