Remove flanker and replace PyML with scikit-learn

I was never actually able to install PyML successfully, and its SourceForge distribution and lack of Python 3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult for me to install and seemed to be used only in the tests, so I removed it as well in order to get to a point where I could run the tests. As of this commit, only one test is not passing (test_standard_replies with android.eml), though I'm not familiar with the `email` library yet.
2015-03-08 00:06:01 -05:00
parent b36287e573
commit f16760c466
12 changed files with 44 additions and 133 deletions
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,3 @@
 import os
 import sys
 import contextlib
 from distutils.spawn import find_executable
 from setuptools import setup, find_packages
@@ -20,87 +15,11 @@ setup(name='talon',
      zip_safe=True,
      install_requires=[
          "lxml==2.3.3",
-          "regex==0.1.20110315",
+          "regex==0.1.20110315",  # handling of .* changes from version 0 to 1
          "chardet==1.0.1",
          "dnspython==1.11.1",
          "html2text",
          "nose==1.2.1",
          "mock",
          "coverage",
-          "flanker"
+          "scikit-learn",
          ]
      )
 def install_pyml():
    '''
    Downloads and installs PyML
    '''
    try:
        import PyML
    except:
        pass
    else:
        return
    # install numpy first
    pip('install numpy==1.6.1 --upgrade')
    pyml_tarball = (
        'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
        '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
    pyml_srcidr = 'PyML-0.7.9'
    # see if PyML tarball needs to be fetched:
    if not dir_exists(pyml_srcidr):
        run("curl %s | tar -xz" % pyml_tarball)
    # compile&install:
    with cd(pyml_srcidr):
        python('setup.py build')
        python('setup.py install')
 def run(command):
    if os.system(command) != 0:
        raise Exception("Failed '{}'".format(command))
    else:
        return 0
 def python(command):
    command = '{} {}'.format(sys.executable, command)
    run(command)
 def enforce_executable(name, install_info):
    if os.system("which {}".format(name)) != 0:
        raise Exception(
            '{} utility is missing.\nTo install, run:\n\n{}\n'.format(
                name, install_info))
 def pip(command):
    command = '{} {}'.format(find_executable('pip'), command)
    run(command)
 def dir_exists(path):
    return os.path.isdir(path)
@contextlib.contextmanager
 def cd(directory):
    curdir = os.getcwd()
    try:
        os.chdir(directory)
        yield {}
    finally:
        os.chdir(curdir)
 if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
        enforce_executable('curl', 'sudo aptitude install curl')
        install_pyml()
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_01.npy
+++ b/talon/signature/data/classifier_01.npy
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/data/classifier_04.npy
+++ b/talon/signature/data/classifier_04.npy
--- a/talon/signature/data/classifier_05.npy
+++ b/talon/signature/data/classifier_05.npy
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -3,7 +3,7 @@
 import logging
 import regex as re
-from PyML import SparseDataSet
+import numpy
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
@@ -32,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
 def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
-    data = SparseDataSet([build_pattern(line, features(sender))])
+    data = numpy.array(build_pattern(line, features(sender)))
-    return classifier.decisionFunc(data, 0) > 0
+    return classifier.predict(data) > 0
 def extract(body, sender):
--- a/talon/signature/learning/classifier.py
+++ b/talon/signature/learning/classifier.py
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """
-import os
+from numpy import genfromtxt
-import sys
+from sklearn.svm import LinearSVC
-
+from sklearn.externals import joblib
 from PyML import SparseDataSet, SVM
 def init():
-    '''Inits classifier with optimal options.'''
+    """Inits classifier with optimal options."""
-    return SVM(C=10, optimization='liblinear')
+    return LinearSVC(C=10.0)
 def train(classifier, train_data_filename, save_classifier_filename=None):
-    '''Trains and saves classifier so that it could be easily loaded later.'''
+    """Trains and saves classifier so that it could be easily loaded later."""
-    data = SparseDataSet(train_data_filename, labelsColumn=-1)
+    file_data = genfromtxt(train_data_filename, delimiter=",")
-    classifier.train(data)
+    train_data, labels = file_data[:, :-1], file_data[:, -1]
    classifier.fit(train_data, labels)
    if save_classifier_filename:
-        classifier.save(save_classifier_filename)
+        joblib.dump(classifier, save_classifier_filename)
    return classifier
 def load(saved_classifier_filename, train_data_filename):
-    """Loads saved classifier.
+    """Loads saved classifier. """
-
+    return joblib.load(saved_classifier_filename)
    Classifier should be loaded with the same data it was trained against
    """
    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier = init()
    classifier.load(saved_classifier_filename, train_data)
    return classifier
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -4,7 +4,6 @@ from . import *
 from . fixtures import *
 import regex as re
 from flanker import mime
 from talon import quotations
@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():
 def test_reply_quotations_share_block():
-    msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
+    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    html_part = list(msg.walk())[1]
    assert html_part.content_type == 'text/html'
    stripped_html = quotations.extract_from_html(html_part.body)
    ok_(stripped_html)
    ok_('From' not in stripped_html)
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -3,7 +3,7 @@
 from ... import *
 import os
-from PyML import SparseDataSet
+from numpy import genfromtxt
 from talon.signature.learning import dataset as d
@@ -41,10 +41,13 @@ def test_build_extraction_dataset():
    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
                               os.path.join(TMP_DIR,
                                            'extraction.data'), 1)
-    test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
+
-                              labelsColumn=-1)
+    filename = os.path.join(TMP_DIR, 'extraction.data')
    file_data = genfromtxt(filename, delimiter=",")
    test_data = file_data[:, :-1]
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
-    eq_(test_data.size(), 32)
+    eq_(test_data.shape[0], 32)
-    eq_(len(features('')), test_data.numFeatures)
+    eq_(len(features('')), test_data.shape[1])
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -5,8 +5,7 @@ from . fixtures import *
 import os
-from flanker import mime
+import email.iterators
 from talon import quotations
@@ -614,14 +613,13 @@ def test_preprocess_postprocess_2_links():
 def test_standard_replies():
    for filename in os.listdir(STANDARD_REPLIES):
        filename = os.path.join(STANDARD_REPLIES, filename)
-        if os.path.isdir(filename):
+        if not filename.endswith('.eml') or os.path.isdir(filename):
            continue
        with open(filename) as f:
-            msg = f.read()
+            message = email.message_from_file(f)
-            m = mime.from_string(msg)
+            body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
-            for part in m.walk():
+            text = ''.join(email.iterators.body_line_iterator(body))
-                if part.content_type == 'text/plain':
+
                    text = part.body
            stripped_text = quotations.extract_from_plain(text)
            reply_text_fn = filename[:-4] + '_reply_text'
            if os.path.isfile(reply_text_fn):
@@ -629,7 +627,7 @@ def test_standard_replies():
                    reply_text = f.read()
            else:
                reply_text = 'Hello'
-                    eq_(reply_text, stripped_text,
+            yield eq_, reply_text, stripped_text, \
-                        "'%(reply)s' != %(stripped)s for %(fn)s" %
+                "'%(reply)s' != %(stripped)s for %(fn)s" % \
                {'reply': reply_text, 'stripped': stripped_text,
-                         'fn': filename})
+                 'fn': filename}