diff --git a/.gitignore b/.gitignore
index af985ab..002f03e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,7 @@ tramp
 *_archive
 
 # Trial temp
-_trial_temp
\ No newline at end of file
+_trial_temp
+
+# OSX
+.DS_Store
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 2b5966f..2517450 100644
--- a/README.rst
+++ b/README.rst
@@ -89,7 +89,7 @@ the power of machine learning algorithms:
     # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage."
     # signature == "John Doe\nvia mobile"
 
-For machine learning talon currently uses `PyML`_ library to build SVM
+For machine learning talon currently uses the `scikit-learn`_ library to build SVM
 classifiers. The core of machine learning algorithm lays in
 ``talon.signature.learning package``. It defines a set of features to
 apply to a message (``featurespace.py``), how data sets are built
@@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and
 used to load trained classifier. Those files should be regenerated every
 time the feature/data set is changed.
 
-.. _PyML: http://pyml.sourceforge.net/
+To regenerate the model files, you can run
+
+.. code:: sh
+
+    python train.py
+
+or
+
+.. code:: python
+
+    from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+    from talon.signature.learning.classifier import train, init
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+.. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/
 
 Research
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index e8bd3c1..320520a
--- a/setup.py
+++ b/setup.py
@@ -1,8 +1,3 @@
-import os
-import sys
-import contextlib
-
-from distutils.spawn import find_executable
 from setuptools import setup, find_packages
 
 
@@ -20,87 +15,15 @@ setup(name='talon',
       zip_safe=True,
       install_requires=[
           "lxml==2.3.3",
-          "regex==0.1.20110315",
-          "chardet==1.0.1",
-          "dnspython==1.11.1",
+          "regex>=1",
           "html2text",
-          "nose==1.2.1",
+          "numpy",
+          "scipy",
+          "scikit-learn==0.16.1",  # pickled versions of classifier, else rebuild
+          ],
+      tests_require=[
           "mock",
-          "coverage",
-          "flanker"
+          "nose>=1.2.1",
+          "coverage"
           ]
       )
-
-
-def install_pyml():
-    '''
-    Downloads and installs PyML
-    '''
-    try:
-        import PyML
-    except:
-        pass
-    else:
-        return
-
-    # install numpy first
-    pip('install numpy==1.6.1 --upgrade')
-
-    pyml_tarball = (
-        'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
-        '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
-    pyml_srcidr = 'PyML-0.7.9'
-
-    # see if PyML tarball needs to be fetched:
-    if not dir_exists(pyml_srcidr):
-        run("curl %s | tar -xz" % pyml_tarball)
-
-    # compile&install:
-    with cd(pyml_srcidr):
-        python('setup.py build')
-        python('setup.py install')
-
-
-def run(command):
-    if os.system(command) != 0:
-        raise Exception("Failed '{}'".format(command))
-    else:
-        return 0
-
-
-def python(command):
-    command = '{} {}'.format(sys.executable, command)
-    run(command)
-
-
-def enforce_executable(name, install_info):
-    if os.system("which {}".format(name)) != 0:
-        raise Exception(
-            '{} utility is missing.\nTo install, run:\n\n{}\n'.format(
-                name, install_info))
-
-
-def pip(command):
-    command = '{} {}'.format(find_executable('pip'), command)
-    run(command)
-
-
-def dir_exists(path):
-    return os.path.isdir(path)
-
-
-@contextlib.contextmanager
-def cd(directory):
-    curdir = os.getcwd()
-    try:
-        os.chdir(directory)
-        yield {}
-    finally:
-        os.chdir(curdir)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
-        enforce_executable('curl', 'sudo aptitude install curl')
-
-        install_pyml()
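Note on the pinned ``scikit-learn==0.16.1``: the comment in ``install_requires`` refers to the fact that the files under ``talon/signature/data`` are joblib pickles written with that exact release. A minimal sketch of the kind of guard a downstream project might add, assuming the ``EXTRACTOR_*`` names and the ``train``/``init`` helpers introduced later in this patch:

.. code:: python

    # Illustrative only: retrain the bundled model when the installed
    # scikit-learn does not match the version the pickles were written with.
    import sklearn

    from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
    from talon.signature.learning.classifier import init, train

    if sklearn.__version__ != "0.16.1":
        train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)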
diff --git a/talon/quotations.py b/talon/quotations.py
index dc77fd4..cdd22b1 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,8 +12,7 @@ from copy import deepcopy
 from lxml import html, etree
 import html2text
 
-from talon.constants import RE_DELIMITER
-from talon.utils import random_token, get_delimiter
+from talon.utils import get_delimiter
 from talon import html_quotations
 
 
@@ -151,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'):
             return extract_from_plain(msg_body)
         elif content_type == 'text/html':
             return extract_from_html(msg_body)
-    except Exception, e:
+    except Exception:
         log.exception('ERROR extracting message')
 
     return msg_body
@@ -344,7 +343,7 @@ def extract_from_html(msg_body):
     html_tree_copy = deepcopy(html_tree)
 
     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
-    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
+    quotation_checkpoints = [False] * number_of_checkpoints
     msg_with_checkpoints = html.tostring(html_tree)
 
     h = html2text.HTML2Text()
diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py
index d1962f3..a871447 100644
--- a/talon/signature/__init__.py
+++ b/talon/signature/__init__.py
@@ -21,11 +21,9 @@ trained against, don't forget to regenerate:
 """
 
 import os
-import sys
-from cStringIO import StringIO
 
 from . import extraction
-from . extraction import extract
+from . extraction import extract  #noqa
 from . learning import classifier
 
 
@@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
 
 
 def initialize():
-    try:
-        # redirect output
-        so, sys.stdout = sys.stdout, StringIO()
-
-        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-                                               EXTRACTOR_DATA)
-        sys.stdout = so
-    except Exception, e:
-        raise Exception(
-            "Failed initializing signature parsing with classifiers", e)
+    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
+                                           EXTRACTOR_DATA)
diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier
index fe717d5..b7d72eb 100644
Binary files a/talon/signature/data/classifier and b/talon/signature/data/classifier differ
diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy
new file mode 100644
index 0000000..11d1302
Binary files /dev/null and b/talon/signature/data/classifier_01.npy differ
diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy
new file mode 100644
index 0000000..0f965ba
Binary files /dev/null and b/talon/signature/data/classifier_02.npy differ
diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy
new file mode 100644
index 0000000..5a35962
Binary files /dev/null and b/talon/signature/data/classifier_03.npy differ
diff --git a/talon/signature/data/classifier_04.npy b/talon/signature/data/classifier_04.npy
new file mode 100644
index 0000000..11d1302
Binary files /dev/null and b/talon/signature/data/classifier_04.npy differ
diff --git a/talon/signature/data/classifier_05.npy b/talon/signature/data/classifier_05.npy
new file mode 100644
index 0000000..4ff7714
Binary files /dev/null and b/talon/signature/data/classifier_05.npy differ
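``initialize()`` above now just delegates to ``classifier.load()``; the stdout redirection and the blanket ``except`` are gone, so load failures propagate to the caller. A rough sketch of the guard an application could add if it wants the old soft-fail behaviour (the logger name is illustrative):

.. code:: python

    import logging

    from talon import signature

    log = logging.getLogger("talon.startup")

    try:
        # unpickles the LinearSVC shipped in talon/signature/data
        signature.initialize()
    except Exception:
        # e.g. the data files were written by an incompatible scikit-learn
        log.exception("failed to load the signature classifier")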
diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py
index 8c7b74e..995ad27 100644
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -1,14 +1,10 @@
 # -*- coding: utf-8 -*-
 
-import os
 import logging
 
 import regex as re
-from PyML import SparseDataSet
+import numpy
 
-from talon.constants import RE_DELIMITER
-from talon.signature.constants import (SIGNATURE_MAX_LINES,
-                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
 from talon.signature.bruteforce import get_signature_candidate
@@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
 
 def is_signature_line(line, sender, classifier):
     '''Checks if the line belongs to signature. Returns True or False.'''
-    data = SparseDataSet([build_pattern(line, features(sender))])
-    return classifier.decisionFunc(data, 0) > 0
+    data = numpy.array(build_pattern(line, features(sender)))
+    return classifier.predict(data) > 0
 
 
 def extract(body, sender):
@@ -61,7 +57,7 @@ def extract(body, sender):
             text = delimiter.join(text)
             if text.strip():
                 return (text, delimiter.join(signature))
-    except Exception, e:
+    except Exception:
         log.exception('ERROR when extracting signature with classifiers')
 
     return (body, None)
diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py
index 476fdb6..9ce5e75 100644
--- a/talon/signature/learning/classifier.py
+++ b/talon/signature/learning/classifier.py
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """
 
-import os
-import sys
-
-from PyML import SparseDataSet, SVM
+from numpy import genfromtxt
+from sklearn.svm import LinearSVC
+from sklearn.externals import joblib
 
 
 def init():
-    '''Inits classifier with optimal options.'''
-    return SVM(C=10, optimization='liblinear')
+    """Inits classifier with optimal options."""
+    return LinearSVC(C=10.0)
 
 
 def train(classifier, train_data_filename, save_classifier_filename=None):
-    '''Trains and saves classifier so that it could be easily loaded later.'''
-    data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier.train(data)
+    """Trains and saves classifier so that it could be easily loaded later."""
+    file_data = genfromtxt(train_data_filename, delimiter=",")
+    train_data, labels = file_data[:, :-1], file_data[:, -1]
+    classifier.fit(train_data, labels)
+
     if save_classifier_filename:
-        classifier.save(save_classifier_filename)
+        joblib.dump(classifier, save_classifier_filename)
     return classifier
 
 
 def load(saved_classifier_filename, train_data_filename):
-    """Loads saved classifier.
-
-    Classifier should be loaded with the same data it was trained against
-    """
-    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier = init()
-    classifier.load(saved_classifier_filename, train_data)
-    return classifier
+    """Loads saved classifier. """
+    return joblib.load(saved_classifier_filename)
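For clarity, the data format ``train()`` now expects is a comma-separated file with one row per message line: feature values first, class label in the last column. A self-contained round trip through the new module, with a toy file and temp paths invented for illustration:

.. code:: python

    import os
    import tempfile

    from talon.signature.learning.classifier import init, train, load

    # four labelled rows, three features each; the last column is the label
    rows = "1,0,0,1\n0,1,0,1\n0,0,1,-1\n1,1,0,-1\n"

    tmpdir = tempfile.mkdtemp()
    data_path = os.path.join(tmpdir, "toy.data")
    model_path = os.path.join(tmpdir, "toy_classifier")
    with open(data_path, "w") as f:
        f.write(rows)

    clf = train(init(), data_path, model_path)  # fits LinearSVC, then joblib.dump()s it
    same_clf = load(model_path, data_path)      # second argument kept only for API compatibility
    print(same_clf.predict([[1, 0, 0]]))

``joblib.dump`` also writes companion ``*_NN.npy`` arrays next to the main pickle, which is why ``classifier_01.npy`` through ``classifier_05.npy`` appear alongside ``talon/signature/data/classifier`` above.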
diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py
index 70a4820..51a9227 100644
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
 rc = re.compile
 
 RE_EMAIL = rc('@')
-RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
+RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
 
 # Taken from:
@@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
 # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
 RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')
 
-# Pattern to match if e.g. 'Sender:' header field has sender names.
-SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
-RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
-
-# Reply line clue line endings, as in regular expression:
-# " wrote:$" or " writes:$"
-RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')
-
 INVALID_WORD_START = rc('\(|\+|[\d]')
 
 BAD_SENDER_NAMES = [
diff --git a/tests/fixtures/standard_replies/iphone.eml b/tests/fixtures/standard_replies/iphone.eml
index 60622f1..320f8ac 100644
--- a/tests/fixtures/standard_replies/iphone.eml
+++ b/tests/fixtures/standard_replies/iphone.eml
@@ -9,11 +9,11 @@ To: bob
 Content-Transfer-Encoding: quoted-printable
 Mime-Version: 1.0 (1.0)
 
-hello
+Hello
 
 Sent from my iPhone
 
 On Apr 3, 2012, at 4:19 PM, bob wr=
 ote:
 
-> Hi
\ No newline at end of file
+> Hi
diff --git a/tests/fixtures/standard_replies/iphone_reply_text b/tests/fixtures/standard_replies/iphone_reply_text
new file mode 100644
index 0000000..460d6d7
--- /dev/null
+++ b/tests/fixtures/standard_replies/iphone_reply_text
@@ -0,0 +1,3 @@
+Hello
+
+Sent from my iPhone
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index 27fec9e..5dc400c 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -4,7 +4,6 @@ from . import *
 from . fixtures import *
 
 import regex as re
-from flanker import mime
 
 from talon import quotations
 
@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():
 
 
 def test_reply_quotations_share_block():
-    msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
-    html_part = list(msg.walk())[1]
-    assert html_part.content_type == 'text/html'
-    stripped_html = quotations.extract_from_html(html_part.body)
+    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
     ok_(stripped_html)
     ok_('From' not in stripped_html)
 
diff --git a/tests/quotations_test.py b/tests/quotations_test.py
index dcc723e..7184368 100644
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -3,8 +3,6 @@
 from . import *
 from . fixtures import *
 
-from flanker import mime
-
 from talon import quotations
 
 
diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py
index ecbd626..09665fe 100644
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -2,10 +2,6 @@
 
 from .. import *
 
-import os
-
-from flanker import mime
-
 from talon.signature import bruteforce
 
 
diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py
index 9cf76dd..a055064 100644
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -4,8 +4,6 @@ from .. import *
 
 import os
 
-from PyML import SparseDataSet
-
 from talon.signature.learning import dataset
 from talon import signature
 from talon.signature import extraction as e
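The fixture changes above pair ``iphone.eml`` with a new ``iphone_reply_text`` file, and the rewritten ``test_standard_replies`` further down in this patch reads such fixtures with the stdlib ``email`` package instead of flanker. Per fixture it does roughly the equivalent of:

.. code:: python

    import email
    import email.iterators

    with open("tests/fixtures/standard_replies/iphone.eml") as f:
        message = email.message_from_file(f)

    # first text/plain part, decoded (the fixture body is quoted-printable)
    plain = next(email.iterators.typed_subpart_iterator(message, subtype="plain"))
    text = "".join(email.iterators.body_line_iterator(plain, decode=True))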
diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py
index 062ff17..42d8ae6 100644
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -3,9 +3,8 @@
 from ... import *
 
 import os
-from PyML import SparseDataSet
+from numpy import genfromtxt
 
-from talon.utils import to_unicode
 from talon.signature.learning import dataset as d
 from talon.signature.learning.featurespace import features
 
@@ -42,10 +41,13 @@ def test_build_extraction_dataset():
     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
                                os.path.join(TMP_DIR, 'extraction.data'), 1)
 
-    test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
-                              labelsColumn=-1)
+
+    filename = os.path.join(TMP_DIR, 'extraction.data')
+    file_data = genfromtxt(filename, delimiter=",")
+    test_data = file_data[:, :-1]
+
     # the result is a loadable signature extraction dataset
     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
     # a signature, one email has only 10 lines
-    eq_(test_data.size(), 32)
-    eq_(len(features('')), test_data.numFeatures)
+    eq_(test_data.shape[0], 32)
+    eq_(len(features('')), test_data.shape[1])
diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py
index 29b6fca..704db4e 100644
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]
 
 def test_match_phone_numbers():
     for phone in VALID_PHONE_NUMBERS:
-        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))
+        ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone))
 
 
 def test_match_names():
@@ -52,29 +52,6 @@ def test_match_names():
         ok_(RE_NAME.match(name), "{} should be matched".format(name))
 
 
-def test_sender_with_name():
-    ok_lines = ['Sergey Obukhov ',
-                '\tSergey ',
-                ('"Doe, John (TX)"'
-                 '@EXAMPLE'
-                 ''),
-                ('Company Sleuth '
-                 '@EXAMPLE '),
-                ('Doe III, John '
-                 '')]
-    for line in ok_lines:
-        ok_(RE_SENDER_WITH_NAME.match(line),
-            '{} should be matched'.format(line))
-
-    nok_lines = ['', '', 'Sergey serobnic@xxx.ru']
-    for line in nok_lines:
-        assert_false(RE_SENDER_WITH_NAME.match(line),
-                     '{} should not be matched'.format(line))
-
-
 # Now test helpers functions
 def test_binary_regex_search():
     eq_(1, h.binary_regex_search(re.compile("12"))("12"))
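The ``.match()`` → ``.search()`` change in ``test_match_phone_numbers`` goes with the ``RE_RELAX_PHONE`` edit earlier in this patch: without the leading ``.*`` the pattern no longer matches from the start of an arbitrary line, so the test now scans the whole string. An illustrative check (the sample line is invented):

.. code:: python

    import regex as re

    RE_RELAX_PHONE = re.compile(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')

    line = "call me at 212 555 0100"
    assert RE_RELAX_PHONE.match(line) is None  # nothing phone-like at position 0
    assert RE_RELAX_PHONE.search(line)         # the number is still found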
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 918ed29..a56c48d 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -5,8 +5,7 @@ from . fixtures import *
 
 import os
 
-from flanker import mime
-
+import email.iterators
 from talon import quotations
 
 
@@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links():
 def test_standard_replies():
     for filename in os.listdir(STANDARD_REPLIES):
         filename = os.path.join(STANDARD_REPLIES, filename)
-        if os.path.isdir(filename):
+        if not filename.endswith('.eml') or os.path.isdir(filename):
             continue
         with open(filename) as f:
-            msg = f.read()
-            m = mime.from_string(msg)
-            for part in m.walk():
-                if part.content_type == 'text/plain':
-                    text = part.body
-                    stripped_text = quotations.extract_from_plain(text)
-                    reply_text_fn = filename[:-4] + '_reply_text'
-                    if os.path.isfile(reply_text_fn):
-                        with open(reply_text_fn) as f:
-                            reply_text = f.read()
-                    else:
-                        reply_text = 'Hello'
-                    eq_(reply_text, stripped_text,
-                        "'%(reply)s' != %(stripped)s for %(fn)s" %
-                        {'reply': reply_text, 'stripped': stripped_text,
-                         'fn': filename})
+            message = email.message_from_file(f)
+        body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
+        text = ''.join(email.iterators.body_line_iterator(body, True))
+
+        stripped_text = quotations.extract_from_plain(text)
+        reply_text_fn = filename[:-4] + '_reply_text'
+        if os.path.isfile(reply_text_fn):
+            with open(reply_text_fn) as f:
+                reply_text = f.read().strip()
+        else:
+            reply_text = 'Hello'
+        yield eq_, reply_text, stripped_text, \
+            "'%(reply)s' != %(stripped)s for %(fn)s" % \
+            {'reply': reply_text, 'stripped': stripped_text,
+             'fn': filename}
diff --git a/train.py b/train.py
new file mode 100644
index 0000000..54d04b5
--- /dev/null
+++ b/train.py
@@ -0,0 +1,10 @@
+from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+from talon.signature.learning.classifier import train, init
+
+
+def train_model():
+    """ retrain model and persist """
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+if __name__ == "__main__":
+    train_model()
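Taken together, ``train.py`` and the new ``classifier`` module give a short retrain-and-reload loop. A hedged end-to-end sketch (the message text and sender address are invented for illustration):

.. code:: python

    from train import train_model
    from talon import signature

    train_model()            # rewrites talon/signature/data/classifier and its .npy companions
    signature.initialize()   # reloads the freshly pickled LinearSVC
    text, sig = signature.extract("Thanks!\n\n--\nJohn Doe\nvia mobile",
                                  sender="john.doe@example.com")
    # `sig` holds whatever was classified as a signature, or None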