Remove flanker and replace PyML with scikit-learn
I was never able to install PyML successfully, and its SourceForge distribution and lack of Python 3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult for me to install and seemed to be used only in the tests, so I removed it as well to get into a position where I could run the tests. As of this commit, only one test is not passing (test_standard_replies with android.eml), though I'm not familiar with the `email` library yet.
This commit is contained in:
85
setup.py
85
setup.py
@@ -1,8 +1,3 @@
|
|||||||
import os
|
|
||||||
import sys
|
|
||||||
import contextlib
|
|
||||||
|
|
||||||
from distutils.spawn import find_executable
|
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
|
|
||||||
|
|
||||||
@@ -20,87 +15,11 @@ setup(name='talon',
|
|||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"lxml==2.3.3",
|
"lxml==2.3.3",
|
||||||
"regex==0.1.20110315",
|
"regex==0.1.20110315", # handling of .* changes from version 0 to 1
|
||||||
"chardet==1.0.1",
|
|
||||||
"dnspython==1.11.1",
|
|
||||||
"html2text",
|
"html2text",
|
||||||
"nose==1.2.1",
|
"nose==1.2.1",
|
||||||
"mock",
|
"mock",
|
||||||
"coverage",
|
"coverage",
|
||||||
"flanker"
|
"scikit-learn",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def install_pyml():
|
|
||||||
'''
|
|
||||||
Downloads and installs PyML
|
|
||||||
'''
|
|
||||||
try:
|
|
||||||
import PyML
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
|
|
||||||
# install numpy first
|
|
||||||
pip('install numpy==1.6.1 --upgrade')
|
|
||||||
|
|
||||||
pyml_tarball = (
|
|
||||||
'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
|
|
||||||
'.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
|
|
||||||
pyml_srcidr = 'PyML-0.7.9'
|
|
||||||
|
|
||||||
# see if PyML tarball needs to be fetched:
|
|
||||||
if not dir_exists(pyml_srcidr):
|
|
||||||
run("curl %s | tar -xz" % pyml_tarball)
|
|
||||||
|
|
||||||
# compile&install:
|
|
||||||
with cd(pyml_srcidr):
|
|
||||||
python('setup.py build')
|
|
||||||
python('setup.py install')
|
|
||||||
|
|
||||||
|
|
||||||
def run(command):
|
|
||||||
if os.system(command) != 0:
|
|
||||||
raise Exception("Failed '{}'".format(command))
|
|
||||||
else:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
def python(command):
|
|
||||||
command = '{} {}'.format(sys.executable, command)
|
|
||||||
run(command)
|
|
||||||
|
|
||||||
|
|
||||||
def enforce_executable(name, install_info):
|
|
||||||
if os.system("which {}".format(name)) != 0:
|
|
||||||
raise Exception(
|
|
||||||
'{} utility is missing.\nTo install, run:\n\n{}\n'.format(
|
|
||||||
name, install_info))
|
|
||||||
|
|
||||||
|
|
||||||
def pip(command):
|
|
||||||
command = '{} {}'.format(find_executable('pip'), command)
|
|
||||||
run(command)
|
|
||||||
|
|
||||||
|
|
||||||
def dir_exists(path):
|
|
||||||
return os.path.isdir(path)
|
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
|
||||||
def cd(directory):
|
|
||||||
curdir = os.getcwd()
|
|
||||||
try:
|
|
||||||
os.chdir(directory)
|
|
||||||
yield {}
|
|
||||||
finally:
|
|
||||||
os.chdir(curdir)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
|
|
||||||
enforce_executable('curl', 'sudo aptitude install curl')
|
|
||||||
|
|
||||||
install_pyml()
|
|
||||||
|
|||||||
Binary file not shown.
BIN
talon/signature/data/classifier_01.npy
Normal file
BIN
talon/signature/data/classifier_01.npy
Normal file
Binary file not shown.
BIN
talon/signature/data/classifier_02.npy
Normal file
BIN
talon/signature/data/classifier_02.npy
Normal file
Binary file not shown.
BIN
talon/signature/data/classifier_03.npy
Normal file
BIN
talon/signature/data/classifier_03.npy
Normal file
Binary file not shown.
BIN
talon/signature/data/classifier_04.npy
Normal file
BIN
talon/signature/data/classifier_04.npy
Normal file
Binary file not shown.
BIN
talon/signature/data/classifier_05.npy
Normal file
BIN
talon/signature/data/classifier_05.npy
Normal file
Binary file not shown.
@@ -3,7 +3,7 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
from PyML import SparseDataSet
|
import numpy
|
||||||
|
|
||||||
from talon.signature.learning.featurespace import features, build_pattern
|
from talon.signature.learning.featurespace import features, build_pattern
|
||||||
from talon.utils import get_delimiter
|
from talon.utils import get_delimiter
|
||||||
@@ -32,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
|
|||||||
|
|
||||||
def is_signature_line(line, sender, classifier):
|
def is_signature_line(line, sender, classifier):
|
||||||
'''Checks if the line belongs to signature. Returns True or False.'''
|
'''Checks if the line belongs to signature. Returns True or False.'''
|
||||||
data = SparseDataSet([build_pattern(line, features(sender))])
|
data = numpy.array(build_pattern(line, features(sender)))
|
||||||
return classifier.decisionFunc(data, 0) > 0
|
return classifier.predict(data) > 0
|
||||||
|
|
||||||
|
|
||||||
def extract(body, sender):
|
def extract(body, sender):
|
||||||
|
|||||||
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
|
|||||||
body belongs to the signature.
|
body belongs to the signature.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
from numpy import genfromtxt
|
||||||
import sys
|
from sklearn.svm import LinearSVC
|
||||||
|
from sklearn.externals import joblib
|
||||||
from PyML import SparseDataSet, SVM
|
|
||||||
|
|
||||||
|
|
||||||
def init():
|
def init():
|
||||||
'''Inits classifier with optimal options.'''
|
"""Inits classifier with optimal options."""
|
||||||
return SVM(C=10, optimization='liblinear')
|
return LinearSVC(C=10.0)
|
||||||
|
|
||||||
|
|
||||||
def train(classifier, train_data_filename, save_classifier_filename=None):
|
def train(classifier, train_data_filename, save_classifier_filename=None):
|
||||||
'''Trains and saves classifier so that it could be easily loaded later.'''
|
"""Trains and saves classifier so that it could be easily loaded later."""
|
||||||
data = SparseDataSet(train_data_filename, labelsColumn=-1)
|
file_data = genfromtxt(train_data_filename, delimiter=",")
|
||||||
classifier.train(data)
|
train_data, labels = file_data[:, :-1], file_data[:, -1]
|
||||||
|
classifier.fit(train_data, labels)
|
||||||
|
|
||||||
if save_classifier_filename:
|
if save_classifier_filename:
|
||||||
classifier.save(save_classifier_filename)
|
joblib.dump(classifier, save_classifier_filename)
|
||||||
return classifier
|
return classifier
|
||||||
|
|
||||||
|
|
||||||
def load(saved_classifier_filename, train_data_filename):
|
def load(saved_classifier_filename, train_data_filename):
|
||||||
"""Loads saved classifier.
|
"""Loads saved classifier. """
|
||||||
|
return joblib.load(saved_classifier_filename)
|
||||||
Classifier should be loaded with the same data it was trained against
|
|
||||||
"""
|
|
||||||
train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
|
|
||||||
classifier = init()
|
|
||||||
classifier.load(saved_classifier_filename, train_data)
|
|
||||||
return classifier
|
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ from . import *
|
|||||||
from . fixtures import *
|
from . fixtures import *
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
from flanker import mime
|
|
||||||
|
|
||||||
from talon import quotations
|
from talon import quotations
|
||||||
|
|
||||||
@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():
|
|||||||
|
|
||||||
|
|
||||||
def test_reply_quotations_share_block():
|
def test_reply_quotations_share_block():
|
||||||
msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
|
stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
|
||||||
html_part = list(msg.walk())[1]
|
|
||||||
assert html_part.content_type == 'text/html'
|
|
||||||
stripped_html = quotations.extract_from_html(html_part.body)
|
|
||||||
ok_(stripped_html)
|
ok_(stripped_html)
|
||||||
ok_('From' not in stripped_html)
|
ok_('From' not in stripped_html)
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
from ... import *
|
from ... import *
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from PyML import SparseDataSet
|
from numpy import genfromtxt
|
||||||
|
|
||||||
from talon.signature.learning import dataset as d
|
from talon.signature.learning import dataset as d
|
||||||
|
|
||||||
@@ -41,10 +41,13 @@ def test_build_extraction_dataset():
|
|||||||
d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
|
d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
|
||||||
os.path.join(TMP_DIR,
|
os.path.join(TMP_DIR,
|
||||||
'extraction.data'), 1)
|
'extraction.data'), 1)
|
||||||
test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
|
|
||||||
labelsColumn=-1)
|
filename = os.path.join(TMP_DIR, 'extraction.data')
|
||||||
|
file_data = genfromtxt(filename, delimiter=",")
|
||||||
|
test_data = file_data[:, :-1]
|
||||||
|
|
||||||
# the result is a loadable signature extraction dataset
|
# the result is a loadable signature extraction dataset
|
||||||
# 32 comes from 3 emails in emails/P folder, 11 lines checked to be
|
# 32 comes from 3 emails in emails/P folder, 11 lines checked to be
|
||||||
# a signature, one email has only 10 lines
|
# a signature, one email has only 10 lines
|
||||||
eq_(test_data.size(), 32)
|
eq_(test_data.shape[0], 32)
|
||||||
eq_(len(features('')), test_data.numFeatures)
|
eq_(len(features('')), test_data.shape[1])
|
||||||
|
|||||||
@@ -5,8 +5,7 @@ from . fixtures import *
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from flanker import mime
|
import email.iterators
|
||||||
|
|
||||||
from talon import quotations
|
from talon import quotations
|
||||||
|
|
||||||
|
|
||||||
@@ -614,14 +613,13 @@ def test_preprocess_postprocess_2_links():
|
|||||||
def test_standard_replies():
|
def test_standard_replies():
|
||||||
for filename in os.listdir(STANDARD_REPLIES):
|
for filename in os.listdir(STANDARD_REPLIES):
|
||||||
filename = os.path.join(STANDARD_REPLIES, filename)
|
filename = os.path.join(STANDARD_REPLIES, filename)
|
||||||
if os.path.isdir(filename):
|
if not filename.endswith('.eml') or os.path.isdir(filename):
|
||||||
continue
|
continue
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
msg = f.read()
|
message = email.message_from_file(f)
|
||||||
m = mime.from_string(msg)
|
body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
|
||||||
for part in m.walk():
|
text = ''.join(email.iterators.body_line_iterator(body))
|
||||||
if part.content_type == 'text/plain':
|
|
||||||
text = part.body
|
|
||||||
stripped_text = quotations.extract_from_plain(text)
|
stripped_text = quotations.extract_from_plain(text)
|
||||||
reply_text_fn = filename[:-4] + '_reply_text'
|
reply_text_fn = filename[:-4] + '_reply_text'
|
||||||
if os.path.isfile(reply_text_fn):
|
if os.path.isfile(reply_text_fn):
|
||||||
@@ -629,7 +627,7 @@ def test_standard_replies():
|
|||||||
reply_text = f.read()
|
reply_text = f.read()
|
||||||
else:
|
else:
|
||||||
reply_text = 'Hello'
|
reply_text = 'Hello'
|
||||||
eq_(reply_text, stripped_text,
|
yield eq_, reply_text, stripped_text, \
|
||||||
"'%(reply)s' != %(stripped)s for %(fn)s" %
|
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
||||||
{'reply': reply_text, 'stripped': stripped_text,
|
{'reply': reply_text, 'stripped': stripped_text,
|
||||||
'fn': filename})
|
'fn': filename}
|
||||||
|
|||||||
Reference in New Issue
Block a user