From dd0a0f5c4da5f3ffc40b5eae5715dd5c5ad16dcb Mon Sep 17 00:00:00 2001 From: Yacine Filali Date: Tue, 23 May 2017 16:10:13 -0700 Subject: [PATCH] Python 2.7 backward compat --- talon/signature/data/classifier | Bin 505 -> 608 bytes talon/signature/learning/classifier.py | 10 +++-- talon/signature/learning/dataset.py | 7 ++- tests/html_quotations_test.py | 10 ++++- tests/signature/extraction_test.py | 57 ++++++++++++++----------- 5 files changed, 52 insertions(+), 32 deletions(-) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 5caaf86e63a0e614ce42c48aa82942589a51890d..1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b 100644 GIT binary patch literal 608 zcmZut%Z}496iu5B4QUIMH^cBApu;1A0kJ{?RHTszlf|G5OUp^!iNSGlZKq5Ym0$z& zReTI9Bt8OMchm*qE-d-#yzcqZ&h#h2sFIPM6;URr)=YFs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H4!}AP7`pEi~y8h88MrM4x3+9|~yE2IhJ&yZd4T13YvAujxPrb0>Sm zkrPI9Wy$*@Q8FRA%F7WeWr{>tTI3~nVf^Z)F?@7jcv%V)!aTBA7{jTKpK7bb4~8)H z;86E@-5x^Q*9TGw&2s^Zbmb2TIgrAz?Jm*f2$rgZ&SPaLOj1|@r=!4kN{!r7Q95Q* zu22}9i5gybzv2p(kG=ANbnK)PR*wD7oF6fXXgK@kN7E12{+q$M2}zO`Mv%NxPEN?y zD>!O9KNZn;qf&%$fzH%S9>c{bh#Ftv(mq`7L{0DSGq@7R@#|lB{1pWbR+C=qTQdmY z>K9nshiiU!iTafr!F4}O?Z6G%u1VJ0*~Gh>6hT2TG_Xz=F*02Di=0`uu55+RQ=>7;I$qi`*H*-A)!}?d)Hpcf8T}(CF|lW1-`rn`5~DKj (3, 0): + pickle_options["encoding"] = "bytes" + + loaded = pickle.load(open(saved_classifier_filename, 'rb'), **pickle_options) joblib.dump(loaded, saved_classifier_filename, compress=True) return loaded - diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py index fbb37db..c7a88d4 100644 --- a/talon/signature/learning/dataset.py +++ b/talon/signature/learning/dataset.py @@ -58,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True): algorithm: >>> parse_msg_sender(filename, False) """ + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + sender, msg = None, None if os.path.isfile(filename) and not is_sender_filename(filename): - with open(filename, encoding='utf-8') as f: + with open(filename, **kwargs) as f: msg = f.read() sender = u'' if sender_known: diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 7e3591f..f26148c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -2,7 +2,8 @@ from __future__ import absolute_import -import regex as re +# noinspection PyUnresolvedReferences +import re from talon import quotations, utils as u from . import * @@ -302,7 +303,12 @@ Reply def extract_reply_and_check(filename): - f = open(filename, encoding='utf8') + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + + f = open(filename, **kwargs) msg_body = f.read() reply = quotations.extract_from_html(msg_body) diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index 0bce56d..86b8705 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -1,16 +1,16 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import -from .. import * import os -from talon.signature.learning import dataset -from talon import signature -from talon.signature import extraction as e -from talon.signature import bruteforce from six.moves import range +from talon.signature import bruteforce, extraction, extract +from talon.signature import extraction as e +from talon.signature.learning import dataset +from .. import * + def test_message_shorter_SIGNATURE_MAX_LINES(): sender = "bob@foo.bar" @@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES(): Thanks in advance, Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) def test_messages_longer_SIGNATURE_MAX_LINES(): + import sys + kwargs = {} + if sys.version_info > (3, 0): + kwargs["encoding"] = "bytes" + for filename in os.listdir(STRIPPED): filename = os.path.join(STRIPPED, filename) if not filename.endswith('_body'): continue sender, body = dataset.parse_msg_sender(filename) - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) extracted_signature = extracted_signature or '' - with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms: + with open(filename[:-len('body')] + 'signature', **kwargs) as ms: msg_signature = ms.read() eq_(msg_signature.strip(), extracted_signature.strip()) - stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] + stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] eq_(stripped_msg.strip(), text.strip()) @@ -47,7 +52,7 @@ Thanks in advance, some text which doesn't seem to be a signature at all Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) @@ -60,7 +65,7 @@ Thanks in advance, some long text here which doesn't seem to be a signature at all Bob""" - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) eq_('\n'.join(body.splitlines()[:-1]), text) eq_('Bob', extracted_signature) @@ -68,13 +73,13 @@ Bob""" some *long* text here which doesn't seem to be a signature at all """ - ((body, None), signature.extract(body, "david@example.com")) + ((body, None), extract(body, "david@example.com")) def test_basic(): msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' eq_(('Blah', '--\r\n\r\nSergey Obukhov'), - signature.extract(msg_body, 'Sergey')) + extract(msg_body, 'Sergey')) def test_capitalized(): @@ -99,7 +104,7 @@ Doe Inc Doe Inc 555-531-7967""" - eq_(sig, signature.extract(msg_body, 'Doe')[1]) + eq_(sig, extract(msg_body, 'Doe')[1]) def test_over_2_text_lines_after_signature(): @@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature(): 2 non signature lines in the end It's not signature """ - text, extracted_signature = signature.extract(body, "Bob") + text, extracted_signature = extract(body, "Bob") eq_(extracted_signature, None) def test_no_signature(): sender, body = "bob@foo.bar", "Hello" - eq_((body, None), signature.extract(body, sender)) + eq_((body, None), extract(body, sender)) def test_handles_unicode(): sender, body = dataset.parse_msg_sender(UNICODE_MSG) - text, extracted_signature = signature.extract(body, sender) + text, extracted_signature = extract(body, sender) -@patch.object(signature.extraction, 'has_signature') +@patch.object(extraction, 'has_signature') def test_signature_extract_crash(has_signature): has_signature.side_effect = Exception('Bam!') msg_body = u'Blah\r\n--\r\n\r\nСергей' - eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) + eq_((msg_body, None), extract(msg_body, 'Сергей')) def test_mark_lines(): @@ -137,19 +142,19 @@ def test_mark_lines(): # (starting from the bottom) because we don't count empty line eq_('ttset', e._mark_lines(['Bob Smith', - 'Bob Smith', - 'Bob Smith', - '', - 'some text'], 'Bob Smith')) + 'Bob Smith', + 'Bob Smith', + '', + 'some text'], 'Bob Smith')) with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3): # we don't analyse the 1st line because # signature cant start from the 1st line eq_('tset', e._mark_lines(['Bob Smith', - 'Bob Smith', - '', - 'some text'], 'Bob Smith')) + 'Bob Smith', + '', + 'some text'], 'Bob Smith')) def test_process_marked_lines():