Merge pull request #45 from AlexRiina/master

Replace PyML with sklearn and clean up dependencies
2015-09-03 10:56:18 -07:00
parent 3a37d8b649 85c7ee980c
commit 08c9d7db03
24 changed files with 94 additions and 204 deletions
@@ -48,4 +48,7 @@ tramp
 *_archive

 # Trial temp
-_trial_temp
+_trial_temp
+
+# OSX
+.DS_Store
@@ -89,7 +89,7 @@ the power of machine learning algorithms:
    # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage."
    # signature == "John Doe\nvia mobile"

-For machine learning talon currently uses `PyML`_ library to build SVM
+For machine learning talon currently uses the `scikit-learn`_ library to build SVM
 classifiers. The core of machine learning algorithm lays in
 ``talon.signature.learning package``. It defines a set of features to
 apply to a message (``featurespace.py``), how data sets are built
@@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and
 used to load trained classifier. Those files should be regenerated every
 time the feature/data set is changed.

-.. _PyML: http://pyml.sourceforge.net/
+To regenerate the model files, you can run
+
+.. code:: sh
+
+    python train.py
+
+or
+
+.. code:: python
+    
+    from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+    from talon.signature.learning.classifier import train, init
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+.. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/

 Research
@@ -1,8 +1,3 @@
-import os
-import sys
-import contextlib
-
-from distutils.spawn import find_executable
 from setuptools import setup, find_packages


@@ -20,87 +15,15 @@ setup(name='talon',
      zip_safe=True,
      install_requires=[
          "lxml==2.3.3",
-          "regex==0.1.20110315",
-          "chardet==1.0.1",
-          "dnspython==1.11.1",
+          "regex>=1",
          "html2text",
-          "nose==1.2.1",
+          "numpy",
+          "scipy",
+          "scikit-learn==0.16.1", # pinned: the bundled pickled classifier was built with this version; retrain the model if the version changes
+          ],
+      tests_require=[
          "mock",
-          "coverage",
-          "flanker"
+          "nose>=1.2.1",
+          "coverage"
          ]
      )
-
-
-def install_pyml():
-    '''
-    Downloads and installs PyML
-    '''
-    try:
-        import PyML
-    except:
-        pass
-    else:
-        return
-
-    # install numpy first
-    pip('install numpy==1.6.1 --upgrade')
-
-    pyml_tarball = (
-        'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
-        '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
-    pyml_srcidr = 'PyML-0.7.9'
-
-    # see if PyML tarball needs to be fetched:
-    if not dir_exists(pyml_srcidr):
-        run("curl %s | tar -xz" % pyml_tarball)
-
-    # compile&install:
-    with cd(pyml_srcidr):
-        python('setup.py build')
-        python('setup.py install')
-
-
-def run(command):
-    if os.system(command) != 0:
-        raise Exception("Failed '{}'".format(command))
-    else:
-        return 0
-
-
-def python(command):
-    command = '{} {}'.format(sys.executable, command)
-    run(command)
-
-
-def enforce_executable(name, install_info):
-    if os.system("which {}".format(name)) != 0:
-        raise Exception(
-            '{} utility is missing.\nTo install, run:\n\n{}\n'.format(
-                name, install_info))
-
-
-def pip(command):
-    command = '{} {}'.format(find_executable('pip'), command)
-    run(command)
-
-
-def dir_exists(path):
-    return os.path.isdir(path)
-
-
-@contextlib.contextmanager
-def cd(directory):
-    curdir = os.getcwd()
-    try:
-        os.chdir(directory)
-        yield {}
-    finally:
-        os.chdir(curdir)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
-        enforce_executable('curl', 'sudo aptitude install curl')
-
-        install_pyml()
@@ -12,8 +12,7 @@ from copy import deepcopy
 from lxml import html, etree
 import html2text

-from talon.constants import RE_DELIMITER
-from talon.utils import random_token, get_delimiter
+from talon.utils import get_delimiter
 from talon import html_quotations


@@ -151,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'):
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
-    except Exception, e:
+    except Exception:
        log.exception('ERROR extracting message')

    return msg_body
@@ -344,7 +343,7 @@ def extract_from_html(msg_body):
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
-    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
+    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
@@ -21,11 +21,9 @@ trained against, don't forget to regenerate:
 """

 import os
-import sys
-from cStringIO import StringIO

 from . import extraction
-from . extraction import extract
+from . extraction import extract  #noqa
 from . learning import classifier


@@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')


 def initialize():
-    try:
-        # redirect output
-        so, sys.stdout = sys.stdout, StringIO()
-
-        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-                                               EXTRACTOR_DATA)
-        sys.stdout = so
-    except Exception, e:
-        raise Exception(
-            "Failed initializing signature parsing with classifiers", e)
+    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
+                                           EXTRACTOR_DATA)
@@ -1,14 +1,10 @@
 # -*- coding: utf-8 -*-

-import os
 import logging

 import regex as re
-from PyML import SparseDataSet
+import numpy

-from talon.constants import RE_DELIMITER
-from talon.signature.constants import (SIGNATURE_MAX_LINES,
-                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
 from talon.signature.bruteforce import get_signature_candidate
@@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''

 def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
-    data = SparseDataSet([build_pattern(line, features(sender))])
-    return classifier.decisionFunc(data, 0) > 0
+    data = numpy.array(build_pattern(line, features(sender)))
+    return classifier.predict(data) > 0


 def extract(body, sender):
@@ -61,7 +57,7 @@ def extract(body, sender):
                text = delimiter.join(text)
                if text.strip():
                    return (text, delimiter.join(signature))
-    except Exception, e:
+    except Exception:
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """

-import os
-import sys
-
-from PyML import SparseDataSet, SVM
+from numpy import genfromtxt
+from sklearn.svm import LinearSVC
+from sklearn.externals import joblib


 def init():
-    '''Inits classifier with optimal options.'''
-    return SVM(C=10, optimization='liblinear')
+    """Returns a new, untrained LinearSVC classifier configured with C=10.0."""
+    return LinearSVC(C=10.0)


 def train(classifier, train_data_filename, save_classifier_filename=None):
-    '''Trains and saves classifier so that it could be easily loaded later.'''
-    data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier.train(data)
+    """Fits classifier on comma-separated data (last column is the label); saves it if a filename is given."""
+    file_data = genfromtxt(train_data_filename, delimiter=",")
+    train_data, labels = file_data[:, :-1], file_data[:, -1]
+    classifier.fit(train_data, labels)
+
    if save_classifier_filename:
-        classifier.save(save_classifier_filename)
+        joblib.dump(classifier, save_classifier_filename)
    return classifier


 def load(saved_classifier_filename, train_data_filename):
-    """Loads saved classifier.
-
-    Classifier should be loaded with the same data it was trained against
-    """
-    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier = init()
-    classifier.load(saved_classifier_filename, train_data)
-    return classifier
+    """Loads the saved classifier; train_data_filename is accepted but not used."""
+    return joblib.load(saved_classifier_filename)
@@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
 rc = re.compile

 RE_EMAIL = rc('@')
-RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
+RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

 # Taken from:
@@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
 # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
 RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

-# Pattern to match if e.g. 'Sender:' header field has sender names.
-SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
-RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
-
-# Reply line clue line endings, as in regular expression:
-# " wrote:$" or " writes:$"
-RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')
-
 INVALID_WORD_START = rc('\(|\+|[\d]')

 BAD_SENDER_NAMES = [
@@ -9,11 +9,11 @@ To: bob <bob@example.com>
 Content-Transfer-Encoding: quoted-printable
 Mime-Version: 1.0 (1.0)

-hello
+Hello

 Sent from my iPhone

 On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr=
 ote:

-> Hi
+> Hi
@@ -0,0 +1,3 @@
+Hello
+
+Sent from my iPhone
@@ -4,7 +4,6 @@ from . import *
 from . fixtures import *

 import regex as re
-from flanker import mime

 from talon import quotations

@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():


 def test_reply_quotations_share_block():
-    msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
-    html_part = list(msg.walk())[1]
-    assert html_part.content_type == 'text/html'
-    stripped_html = quotations.extract_from_html(html_part.body)
+    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    ok_(stripped_html)
    ok_('From' not in stripped_html)

@@ -3,8 +3,6 @@
 from . import *
 from . fixtures import *

-from flanker import mime
-
 from talon import quotations


@@ -2,10 +2,6 @@

 from .. import *

-import os
-
-from flanker import mime
-
 from talon.signature import bruteforce


@@ -4,8 +4,6 @@ from .. import *

 import os

-from PyML import SparseDataSet
-
 from talon.signature.learning import dataset
 from talon import signature
 from talon.signature import extraction as e
@@ -3,9 +3,8 @@
 from ... import *
 import os

-from PyML import SparseDataSet
+from numpy import genfromtxt

-from talon.utils import to_unicode
 from talon.signature.learning import dataset as d

 from talon.signature.learning.featurespace import features
@@ -42,10 +41,13 @@ def test_build_extraction_dataset():
    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
                               os.path.join(TMP_DIR,
                                            'extraction.data'), 1)
-    test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
-                              labelsColumn=-1)
+
+    filename = os.path.join(TMP_DIR, 'extraction.data')
+    file_data = genfromtxt(filename, delimiter=",")
+    test_data = file_data[:, :-1]
+
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
-    eq_(test_data.size(), 32)
-    eq_(len(features('')), test_data.numFeatures)
+    eq_(test_data.shape[0], 32)
+    eq_(len(features('')), test_data.shape[1])
@@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]

 def test_match_phone_numbers():
    for phone in VALID_PHONE_NUMBERS:
-        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))
+        ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone))


 def test_match_names():
@@ -52,29 +52,6 @@ def test_match_names():
        ok_(RE_NAME.match(name), "{} should be matched".format(name))


-def test_sender_with_name():
-    ok_lines = ['Sergey Obukhov <serobnic@example.com>',
-                '\tSergey  <serobnic@example.com>',
-                ('"Doe, John (TX)"'
-                 '<DowJ@example.com>@EXAMPLE'
-                 '<IMCEANOTES-+22Doe+2C+20John+20'
-                 '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'
-                 '+40EXAMPLE@EXAMPLE.com>'),
-                ('Company Sleuth <csleuth@email.xxx.com>'
-                 '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'
-                 '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),
-                ('Doe III, John '
-                 '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]
-    for line in ok_lines:
-        ok_(RE_SENDER_WITH_NAME.match(line),
-            '{} should be matched'.format(line))
-
-    nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']
-    for line in nok_lines:
-        assert_false(RE_SENDER_WITH_NAME.match(line),
-                     '{} should not be matched'.format(line))
-
-
 # Now test helpers functions
 def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
@@ -5,8 +5,7 @@ from . fixtures import *

 import os

-from flanker import mime
-
+import email.iterators
 from talon import quotations


@@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links():
 def test_standard_replies():
    for filename in os.listdir(STANDARD_REPLIES):
        filename = os.path.join(STANDARD_REPLIES, filename)
-        if os.path.isdir(filename):
+        if not filename.endswith('.eml') or os.path.isdir(filename):
            continue
        with open(filename) as f:
-            msg = f.read()
-            m = mime.from_string(msg)
-            for part in m.walk():
-                if part.content_type == 'text/plain':
-                    text = part.body
-                    stripped_text = quotations.extract_from_plain(text)
-                    reply_text_fn = filename[:-4] + '_reply_text'
-                    if os.path.isfile(reply_text_fn):
-                        with open(reply_text_fn) as f:
-                            reply_text = f.read()
-                    else:
-                        reply_text = 'Hello'
-                    eq_(reply_text, stripped_text,
-                        "'%(reply)s' != %(stripped)s for %(fn)s" %
-                        {'reply': reply_text, 'stripped': stripped_text,
-                         'fn': filename})
+            message = email.message_from_file(f)
+            body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
+            text = ''.join(email.iterators.body_line_iterator(body, True))
+
+            stripped_text = quotations.extract_from_plain(text)
+            reply_text_fn = filename[:-4] + '_reply_text'
+            if os.path.isfile(reply_text_fn):
+                with open(reply_text_fn) as f:
+                    reply_text = f.read().strip()
+            else:
+                reply_text = 'Hello'
+            yield eq_, reply_text, stripped_text, \
+                "'%(reply)s' != %(stripped)s for %(fn)s" % \
+                {'reply': reply_text, 'stripped': stripped_text,
+                 'fn': filename}
@@ -0,0 +1,10 @@
+from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+from talon.signature.learning.classifier import train, init
+
+
+def train_model():
+    """Retrain the classifier on EXTRACTOR_DATA and save it to EXTRACTOR_FILENAME."""
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+if __name__ == "__main__":
+    train_model()