diff --git a/setup.py b/setup.py index e8bd3c1..626c378 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,3 @@ -import os -import sys -import contextlib - -from distutils.spawn import find_executable from setuptools import setup, find_packages @@ -20,87 +15,11 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", - "chardet==1.0.1", - "dnspython==1.11.1", + "regex==0.1.20110315", # handling of .* changes from version 0 to 1 "html2text", "nose==1.2.1", "mock", "coverage", - "flanker" + "scikit-learn", ] ) - - -def install_pyml(): - ''' - Downloads and installs PyML - ''' - try: - import PyML - except: - pass - else: - return - - # install numpy first - pip('install numpy==1.6.1 --upgrade') - - pyml_tarball = ( - 'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' - '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') - pyml_srcidr = 'PyML-0.7.9' - - # see if PyML tarball needs to be fetched: - if not dir_exists(pyml_srcidr): - run("curl %s | tar -xz" % pyml_tarball) - - # compile&install: - with cd(pyml_srcidr): - python('setup.py build') - python('setup.py install') - - -def run(command): - if os.system(command) != 0: - raise Exception("Failed '{}'".format(command)) - else: - return 0 - - -def python(command): - command = '{} {}'.format(sys.executable, command) - run(command) - - -def enforce_executable(name, install_info): - if os.system("which {}".format(name)) != 0: - raise Exception( - '{} utility is missing.\nTo install, run:\n\n{}\n'.format( - name, install_info)) - - -def pip(command): - command = '{} {}'.format(find_executable('pip'), command) - run(command) - - -def dir_exists(path): - return os.path.isdir(path) - - -@contextlib.contextmanager -def cd(directory): - curdir = os.getcwd() - try: - os.chdir(directory) - yield {} - finally: - os.chdir(curdir) - - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: - enforce_executable('curl', 'sudo aptitude install curl') - - install_pyml() diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index fe717d5..405e6cd 100644 Binary files a/talon/signature/data/classifier and b/talon/signature/data/classifier differ diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy new file mode 100644 index 0000000..2934424 Binary files /dev/null and b/talon/signature/data/classifier_01.npy differ diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy new file mode 100644 index 0000000..7c6997d Binary files /dev/null and b/talon/signature/data/classifier_02.npy differ diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy new file mode 100644 index 0000000..97d9aa3 Binary files /dev/null and b/talon/signature/data/classifier_03.npy differ diff --git a/talon/signature/data/classifier_04.npy b/talon/signature/data/classifier_04.npy new file mode 100644 index 0000000..11d1302 Binary files /dev/null and b/talon/signature/data/classifier_04.npy differ diff --git a/talon/signature/data/classifier_05.npy b/talon/signature/data/classifier_05.npy new file mode 100644 index 0000000..4ff7714 Binary files /dev/null and b/talon/signature/data/classifier_05.npy differ diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py index 58df68d..995ad27 100644 --- a/talon/signature/extraction.py +++ b/talon/signature/extraction.py @@ -3,7 +3,7 @@ import logging import regex as re -from PyML import SparseDataSet +import numpy from talon.signature.learning.featurespace import features, build_pattern from talon.utils import get_delimiter @@ -32,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r''' def is_signature_line(line, sender, classifier): '''Checks if the line belongs to signature. Returns True or False.''' - data = SparseDataSet([build_pattern(line, features(sender))]) - return classifier.decisionFunc(data, 0) > 0 + data = numpy.array(build_pattern(line, features(sender))) + return classifier.predict(data) > 0 def extract(body, sender): diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 476fdb6..9ce5e75 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message body belongs to the signature. """ -import os -import sys - -from PyML import SparseDataSet, SVM +from numpy import genfromtxt +from sklearn.svm import LinearSVC +from sklearn.externals import joblib def init(): - '''Inits classifier with optimal options.''' - return SVM(C=10, optimization='liblinear') + """Inits classifier with optimal options.""" + return LinearSVC(C=10.0) def train(classifier, train_data_filename, save_classifier_filename=None): - '''Trains and saves classifier so that it could be easily loaded later.''' - data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier.train(data) + """Trains and saves classifier so that it could be easily loaded later.""" + file_data = genfromtxt(train_data_filename, delimiter=",") + train_data, labels = file_data[:, :-1], file_data[:, -1] + classifier.fit(train_data, labels) + if save_classifier_filename: - classifier.save(save_classifier_filename) + joblib.dump(classifier, save_classifier_filename) return classifier def load(saved_classifier_filename, train_data_filename): - """Loads saved classifier. - - Classifier should be loaded with the same data it was trained against - """ - train_data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier = init() - classifier.load(saved_classifier_filename, train_data) - return classifier + """Loads saved classifier. """ + return joblib.load(saved_classifier_filename) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 27fec9e..5dc400c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -4,7 +4,6 @@ from . import * from . fixtures import * import regex as re -from flanker import mime from talon import quotations @@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block(): def test_reply_quotations_share_block(): - msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) - html_part = list(msg.walk())[1] - assert html_part.content_type == 'text/html' - stripped_html = quotations.extract_from_html(html_part.body) + stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) ok_(stripped_html) ok_('From' not in stripped_html) diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 5eeff36..42d8ae6 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -3,7 +3,7 @@ from ... import * import os -from PyML import SparseDataSet +from numpy import genfromtxt from talon.signature.learning import dataset as d @@ -41,10 +41,13 @@ def test_build_extraction_dataset(): d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), os.path.join(TMP_DIR, 'extraction.data'), 1) - test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), - labelsColumn=-1) + + filename = os.path.join(TMP_DIR, 'extraction.data') + file_data = genfromtxt(filename, delimiter=",") + test_data = file_data[:, :-1] + # the result is a loadable signature extraction dataset # 32 comes from 3 emails in emails/P folder, 11 lines checked to be # a signature, one email has only 10 lines - eq_(test_data.size(), 32) - eq_(len(features('')), test_data.numFeatures) + eq_(test_data.shape[0], 32) + eq_(len(features('')), test_data.shape[1]) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 918ed29..0a87e56 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -5,8 +5,7 @@ from . fixtures import * import os -from flanker import mime - +import email.iterators from talon import quotations @@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links(): def test_standard_replies(): for filename in os.listdir(STANDARD_REPLIES): filename = os.path.join(STANDARD_REPLIES, filename) - if os.path.isdir(filename): + if not filename.endswith('.eml') or os.path.isdir(filename): continue with open(filename) as f: - msg = f.read() - m = mime.from_string(msg) - for part in m.walk(): - if part.content_type == 'text/plain': - text = part.body - stripped_text = quotations.extract_from_plain(text) - reply_text_fn = filename[:-4] + '_reply_text' - if os.path.isfile(reply_text_fn): - with open(reply_text_fn) as f: - reply_text = f.read() - else: - reply_text = 'Hello' - eq_(reply_text, stripped_text, - "'%(reply)s' != %(stripped)s for %(fn)s" % - {'reply': reply_text, 'stripped': stripped_text, - 'fn': filename}) + message = email.message_from_file(f) + body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() + text = ''.join(email.iterators.body_line_iterator(body)) + + stripped_text = quotations.extract_from_plain(text) + reply_text_fn = filename[:-4] + '_reply_text' + if os.path.isfile(reply_text_fn): + with open(reply_text_fn) as f: + reply_text = f.read() + else: + reply_text = 'Hello' + yield eq_, reply_text, stripped_text, \ + "'%(reply)s' != %(stripped)s for %(fn)s" % \ + {'reply': reply_text, 'stripped': stripped_text, + 'fn': filename}