Python 2.7 backward compatibility

This commit is contained in:
Yacine Filali
2017-05-23 16:10:13 -07:00
parent 086f5ba43b
commit dd0a0f5c4d
5 changed files with 52 additions and 32 deletions

Binary file not shown.

View File

@@ -10,8 +10,8 @@ from __future__ import absolute_import
import pickle import pickle
from numpy import genfromtxt from numpy import genfromtxt
from sklearn.svm import LinearSVC
from sklearn.externals import joblib from sklearn.externals import joblib
from sklearn.svm import LinearSVC
def init(): def init():
@@ -35,7 +35,11 @@ def load(saved_classifier_filename, train_data_filename):
try: try:
return joblib.load(saved_classifier_filename) return joblib.load(saved_classifier_filename)
except ValueError: except ValueError:
loaded = pickle.load(open(saved_classifier_filename, 'rb'), encoding='latin1') import sys
pickle_options = {}
if sys.version_info > (3, 0):
pickle_options["encoding"] = "bytes"
loaded = pickle.load(open(saved_classifier_filename, 'rb'), **pickle_options)
joblib.dump(loaded, saved_classifier_filename, compress=True) joblib.dump(loaded, saved_classifier_filename, compress=True)
return loaded return loaded

View File

@@ -58,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True):
algorithm: algorithm:
>>> parse_msg_sender(filename, False) >>> parse_msg_sender(filename, False)
""" """
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "bytes"
sender, msg = None, None sender, msg = None, None
if os.path.isfile(filename) and not is_sender_filename(filename): if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename, encoding='utf-8') as f: with open(filename, **kwargs) as f:
msg = f.read() msg = f.read()
sender = u'' sender = u''
if sender_known: if sender_known:

View File

@@ -2,7 +2,8 @@
from __future__ import absolute_import from __future__ import absolute_import
import regex as re # noinspection PyUnresolvedReferences
import re
from talon import quotations, utils as u from talon import quotations, utils as u
from . import * from . import *
@@ -302,7 +303,12 @@ Reply
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename, encoding='utf8') import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "bytes"
f = open(filename, **kwargs)
msg_body = f.read() msg_body = f.read()
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)

View File

@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import from __future__ import absolute_import
from .. import *
import os import os
from talon.signature.learning import dataset
from talon import signature
from talon.signature import extraction as e
from talon.signature import bruteforce
from six.moves import range from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from .. import *
def test_message_shorter_SIGNATURE_MAX_LINES(): def test_message_shorter_SIGNATURE_MAX_LINES():
sender = "bob@foo.bar" sender = "bob@foo.bar"
@@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES():
Thanks in advance, Thanks in advance,
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) eq_('\n'.join(body.splitlines()[-2:]), extracted_signature)
def test_messages_longer_SIGNATURE_MAX_LINES(): def test_messages_longer_SIGNATURE_MAX_LINES():
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "bytes"
for filename in os.listdir(STRIPPED): for filename in os.listdir(STRIPPED):
filename = os.path.join(STRIPPED, filename) filename = os.path.join(STRIPPED, filename)
if not filename.endswith('_body'): if not filename.endswith('_body'):
continue continue
sender, body = dataset.parse_msg_sender(filename) sender, body = dataset.parse_msg_sender(filename)
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
extracted_signature = extracted_signature or '' extracted_signature = extracted_signature or ''
with open(filename[:-len('body')] + 'signature', encoding='utf8') as ms: with open(filename[:-len('body')] + 'signature', **kwargs) as ms:
msg_signature = ms.read() msg_signature = ms.read()
eq_(msg_signature.strip(), extracted_signature.strip()) eq_(msg_signature.strip(), extracted_signature.strip())
stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)]
eq_(stripped_msg.strip(), text.strip()) eq_(stripped_msg.strip(), text.strip())
@@ -47,7 +52,7 @@ Thanks in advance,
some text which doesn't seem to be a signature at all some text which doesn't seem to be a signature at all
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) eq_('\n'.join(body.splitlines()[-3:]), extracted_signature)
@@ -60,7 +65,7 @@ Thanks in advance,
some long text here which doesn't seem to be a signature at all some long text here which doesn't seem to be a signature at all
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:-1]), text) eq_('\n'.join(body.splitlines()[:-1]), text)
eq_('Bob', extracted_signature) eq_('Bob', extracted_signature)
@@ -68,13 +73,13 @@ Bob"""
some *long* text here which doesn't seem to be a signature at all some *long* text here which doesn't seem to be a signature at all
""" """
((body, None), signature.extract(body, "david@example.com")) ((body, None), extract(body, "david@example.com"))
def test_basic(): def test_basic():
msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov'
eq_(('Blah', '--\r\n\r\nSergey Obukhov'), eq_(('Blah', '--\r\n\r\nSergey Obukhov'),
signature.extract(msg_body, 'Sergey')) extract(msg_body, 'Sergey'))
def test_capitalized(): def test_capitalized():
@@ -99,7 +104,7 @@ Doe Inc
Doe Inc Doe Inc
555-531-7967""" 555-531-7967"""
eq_(sig, signature.extract(msg_body, 'Doe')[1]) eq_(sig, extract(msg_body, 'Doe')[1])
def test_over_2_text_lines_after_signature(): def test_over_2_text_lines_after_signature():
@@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature():
2 non signature lines in the end 2 non signature lines in the end
It's not signature It's not signature
""" """
text, extracted_signature = signature.extract(body, "Bob") text, extracted_signature = extract(body, "Bob")
eq_(extracted_signature, None) eq_(extracted_signature, None)
def test_no_signature(): def test_no_signature():
sender, body = "bob@foo.bar", "Hello" sender, body = "bob@foo.bar", "Hello"
eq_((body, None), signature.extract(body, sender)) eq_((body, None), extract(body, sender))
def test_handles_unicode(): def test_handles_unicode():
sender, body = dataset.parse_msg_sender(UNICODE_MSG) sender, body = dataset.parse_msg_sender(UNICODE_MSG)
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
@patch.object(signature.extraction, 'has_signature') @patch.object(extraction, 'has_signature')
def test_signature_extract_crash(has_signature): def test_signature_extract_crash(has_signature):
has_signature.side_effect = Exception('Bam!') has_signature.side_effect = Exception('Bam!')
msg_body = u'Blah\r\n--\r\n\r\nСергей' msg_body = u'Blah\r\n--\r\n\r\nСергей'
eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) eq_((msg_body, None), extract(msg_body, 'Сергей'))
def test_mark_lines(): def test_mark_lines():
@@ -137,19 +142,19 @@ def test_mark_lines():
# (starting from the bottom) because we don't count empty line # (starting from the bottom) because we don't count empty line
eq_('ttset', eq_('ttset',
e._mark_lines(['Bob Smith', e._mark_lines(['Bob Smith',
'Bob Smith', 'Bob Smith',
'Bob Smith', 'Bob Smith',
'', '',
'some text'], 'Bob Smith')) 'some text'], 'Bob Smith'))
with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3): with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
# we don't analyse the 1st line because # we don't analyse the 1st line because
# signature cant start from the 1st line # signature cant start from the 1st line
eq_('tset', eq_('tset',
e._mark_lines(['Bob Smith', e._mark_lines(['Bob Smith',
'Bob Smith', 'Bob Smith',
'', '',
'some text'], 'Bob Smith')) 'some text'], 'Bob Smith'))
def test_process_marked_lines(): def test_process_marked_lines():