Remove flanker and replace PyML with scikit-learn

I was never actually able to install PyML, and its SourceForge
distribution and lack of Python 3 support convinced me that scikit-learn would
be a fine substitute. Flanker was also difficult for me to install and seemed
to be used only in the tests, so I removed it as well to get into a position
where I could run the tests. As of this commit, only one test is still failing
(test_standard_replies with android.eml), though I'm not yet familiar enough
with the `email` library to diagnose it.
This commit is contained in:
Alex Riina
2015-03-08 00:06:01 -05:00
committed by Alex Riina
parent b36287e573
commit f16760c466
12 changed files with 44 additions and 133 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -3,7 +3,7 @@
import logging
import regex as re
from PyML import SparseDataSet
import numpy
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
@@ -32,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
def is_signature_line(line, sender, classifier):
'''Checks if the line belongs to signature. Returns True or False.'''
data = SparseDataSet([build_pattern(line, features(sender))])
return classifier.decisionFunc(data, 0) > 0
data = numpy.array(build_pattern(line, features(sender)))
return classifier.predict(data) > 0
def extract(body, sender):

View File

@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
body belongs to the signature.
"""
import os
import sys
from PyML import SparseDataSet, SVM
from numpy import genfromtxt
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
def init():
'''Inits classifier with optimal options.'''
return SVM(C=10, optimization='liblinear')
"""Inits classifier with optimal options."""
return LinearSVC(C=10.0)
def train(classifier, train_data_filename, save_classifier_filename=None):
'''Trains and saves classifier so that it could be easily loaded later.'''
data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier.train(data)
"""Trains and saves classifier so that it could be easily loaded later."""
file_data = genfromtxt(train_data_filename, delimiter=",")
train_data, labels = file_data[:, :-1], file_data[:, -1]
classifier.fit(train_data, labels)
if save_classifier_filename:
classifier.save(save_classifier_filename)
joblib.dump(classifier, save_classifier_filename)
return classifier
def load(saved_classifier_filename, train_data_filename):
"""Loads saved classifier.
Classifier should be loaded with the same data it was trained against
"""
train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier = init()
classifier.load(saved_classifier_filename, train_data)
return classifier
"""Loads saved classifier. """
return joblib.load(saved_classifier_filename)