From 4df7aa284b4ab9baaaf4b957527c978e0b6a2027 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Fri, 6 Mar 2015 20:52:58 -0500 Subject: [PATCH 01/10] remove extra imports --- tests/signature/bruteforce_test.py | 4 ---- tests/signature/extraction_test.py | 2 -- tests/signature/learning/dataset_test.py | 1 - 3 files changed, 7 deletions(-) diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py index ecbd626..09665fe 100644 --- a/tests/signature/bruteforce_test.py +++ b/tests/signature/bruteforce_test.py @@ -2,10 +2,6 @@ from .. import * -import os - -from flanker import mime - from talon.signature import bruteforce diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index 9cf76dd..a055064 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -4,8 +4,6 @@ from .. import * import os -from PyML import SparseDataSet - from talon.signature.learning import dataset from talon import signature from talon.signature import extraction as e diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 062ff17..5eeff36 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -5,7 +5,6 @@ import os from PyML import SparseDataSet -from talon.utils import to_unicode from talon.signature.learning import dataset as d from talon.signature.learning.featurespace import features From b36287e573b97c065dee4d5f36a8757ed4f375d9 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:04:41 -0500 Subject: [PATCH 02/10] clean up style and extra imports --- talon/quotations.py | 7 +++---- talon/signature/__init__.py | 16 +++------------- talon/signature/extraction.py | 6 +----- tests/quotations_test.py | 2 -- 4 files changed, 7 insertions(+), 24 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index dc77fd4..cdd22b1 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,8 +12,7 @@ from copy import deepcopy from lxml import html, etree import html2text -from talon.constants import RE_DELIMITER -from talon.utils import random_token, get_delimiter +from talon.utils import get_delimiter from talon import html_quotations @@ -151,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'): return extract_from_plain(msg_body) elif content_type == 'text/html': return extract_from_html(msg_body) - except Exception, e: + except Exception: log.exception('ERROR extracting message') return msg_body @@ -344,7 +343,7 @@ def extract_from_html(msg_body): html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) - quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] + quotation_checkpoints = [False] * number_of_checkpoints msg_with_checkpoints = html.tostring(html_tree) h = html2text.HTML2Text() diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py index d1962f3..a871447 100644 --- a/talon/signature/__init__.py +++ b/talon/signature/__init__.py @@ -21,11 +21,9 @@ trained against, don't forget to regenerate: """ import os -import sys -from cStringIO import StringIO from . import extraction -from . extraction import extract +from . extraction import extract #noqa from . 
learning import classifier @@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') def initialize(): - try: - # redirect output - so, sys.stdout = sys.stdout, StringIO() - - extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, - EXTRACTOR_DATA) - sys.stdout = so - except Exception, e: - raise Exception( - "Failed initializing signature parsing with classifiers", e) + extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, + EXTRACTOR_DATA) diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py index 8c7b74e..58df68d 100644 --- a/talon/signature/extraction.py +++ b/talon/signature/extraction.py @@ -1,14 +1,10 @@ # -*- coding: utf-8 -*- -import os import logging import regex as re from PyML import SparseDataSet -from talon.constants import RE_DELIMITER -from talon.signature.constants import (SIGNATURE_MAX_LINES, - TOO_LONG_SIGNATURE_LINE) from talon.signature.learning.featurespace import features, build_pattern from talon.utils import get_delimiter from talon.signature.bruteforce import get_signature_candidate @@ -61,7 +57,7 @@ def extract(body, sender): text = delimiter.join(text) if text.strip(): return (text, delimiter.join(signature)) - except Exception, e: + except Exception: log.exception('ERROR when extracting signature with classifiers') return (body, None) diff --git a/tests/quotations_test.py b/tests/quotations_test.py index dcc723e..7184368 100644 --- a/tests/quotations_test.py +++ b/tests/quotations_test.py @@ -3,8 +3,6 @@ from . import * from . fixtures import * -from flanker import mime - from talon import quotations From f16760c466257df41c0c77c832792c9724349d66 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:06:01 -0500 Subject: [PATCH 03/10] Remove flanker and replace PyML with scikit-learn I was never actually able to install PyML, but the SourceForge distribution and the lack of Python 3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult to install and seemed to be used only in the tests, so I removed it as well to get to a point where I could run the tests. As of this commit, only one test is not passing (test_standard_replies with android.eml), though I'm not familiar with the `email` library yet.
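Roughly, the swap maps the old PyML calls onto scikit-learn as in the sketch below. This is only an illustration of the approach taken in the diff that follows; the tiny in-memory matrix and the /tmp/classifier path are placeholders standing in for the real train.data rows and the shipped classifier files.

    import numpy as np
    from sklearn.externals import joblib  # shipped with scikit-learn at this time
    from sklearn.svm import LinearSVC

    # Placeholder for the "<feature>,...,<label>" rows that train.data holds.
    rows = np.array([[1.0, 0.0, 1.0],
                     [0.0, 1.0, 0.0],
                     [1.0, 1.0, 1.0]])
    train_data, labels = rows[:, :-1], rows[:, -1]

    # PyML's SVM(C=10, optimization='liblinear') becomes LinearSVC(C=10.0),
    # and classifier.train(dataset) becomes classifier.fit(X, y).
    classifier = LinearSVC(C=10.0)
    classifier.fit(train_data, labels)

    # classifier.save()/classifier.load() become joblib.dump()/joblib.load();
    # joblib splits large arrays into *.npy side files (hence the new
    # classifier_0*.npy files), and reloading no longer needs the train data.
    joblib.dump(classifier, "/tmp/classifier")
    classifier = joblib.load("/tmp/classifier")

    # classifier.decisionFunc(data, 0) > 0 becomes classifier.predict(...) > 0.
    print(classifier.predict(np.array([[1.0, 0.0]])) > 0)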
--- setup.py | 85 +---------------------- talon/signature/data/classifier | Bin 10377 -> 632 bytes talon/signature/data/classifier_01.npy | Bin 0 -> 88 bytes talon/signature/data/classifier_02.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_03.npy | Bin 0 -> 184 bytes talon/signature/data/classifier_04.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_05.npy | Bin 0 -> 176 bytes talon/signature/extraction.py | 6 +- talon/signature/learning/classifier.py | 31 ++++----- tests/html_quotations_test.py | 6 +- tests/signature/learning/dataset_test.py | 13 ++-- tests/text_quotations_test.py | 36 +++++----- 12 files changed, 44 insertions(+), 133 deletions(-) create mode 100644 talon/signature/data/classifier_01.npy create mode 100644 talon/signature/data/classifier_02.npy create mode 100644 talon/signature/data/classifier_03.npy create mode 100644 talon/signature/data/classifier_04.npy create mode 100644 talon/signature/data/classifier_05.npy diff --git a/setup.py b/setup.py index e8bd3c1..626c378 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,3 @@ -import os -import sys -import contextlib - -from distutils.spawn import find_executable from setuptools import setup, find_packages @@ -20,87 +15,11 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", - "chardet==1.0.1", - "dnspython==1.11.1", + "regex==0.1.20110315", # handling of .* changes from version 0 to 1 "html2text", "nose==1.2.1", "mock", "coverage", - "flanker" + "scikit-learn", ] ) - - -def install_pyml(): - ''' - Downloads and installs PyML - ''' - try: - import PyML - except: - pass - else: - return - - # install numpy first - pip('install numpy==1.6.1 --upgrade') - - pyml_tarball = ( - 'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' - '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') - pyml_srcidr = 'PyML-0.7.9' - - # see if PyML tarball needs to be fetched: - if not dir_exists(pyml_srcidr): - run("curl %s | tar -xz" % pyml_tarball) - - # compile&install: - with cd(pyml_srcidr): - python('setup.py build') - python('setup.py install') - - -def run(command): - if os.system(command) != 0: - raise Exception("Failed '{}'".format(command)) - else: - return 0 - - -def python(command): - command = '{} {}'.format(sys.executable, command) - run(command) - - -def enforce_executable(name, install_info): - if os.system("which {}".format(name)) != 0: - raise Exception( - '{} utility is missing.\nTo install, run:\n\n{}\n'.format( - name, install_info)) - - -def pip(command): - command = '{} {}'.format(find_executable('pip'), command) - run(command) - - -def dir_exists(path): - return os.path.isdir(path) - - -@contextlib.contextmanager -def cd(directory): - curdir = os.getcwd() - try: - os.chdir(directory) - yield {} - finally: - os.chdir(curdir) - - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: - enforce_executable('curl', 'sudo aptitude install curl') - - install_pyml() diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index fe717d51de6928da1b32ee90e4d5eabd5e7bad1e..405e6cd13dec1aa226d471dfca371bc52b8949d5 100644 GIT binary patch literal 632 zcmZvYOK%e~6h`xy($q;Q;Y}%zLV*;-2+Cs%D4;G{Wm6fJmhIeh3?7fKJ(H5fNU-B~ zvgJ?Uais{Mc;Us>^>>cHA06qxY2ePJex9dNbML{EV`akZp_Q-YIVONJMgk?=hX4c9*XS_rK~YO35Wm{l?`$>ie)=lJ4_ z{|3HE7krZgCy0}kjgZmF39$6mZlja}eoCFuGb zoG3it#iJHYrWMz?^^0!KRLPc}cqvTcb9M&mGz;RxtWmgJcq?+YYrMUZ^D1(7 z0w-Z$B;Hx}X&qQsNbp{n(^PW-njAOxCnJk#)8XClfqOOHr@iO{p5bn!J*e^FO4{p4 
z+pB~Zv|AP$A058ie*62+&iFZ9JNJDY6Q0yKTAA=BChP~_!M}at(+M3e>td@3Vpy#LVO0f=-q7m8ROL~yf9$E-|us3HDm+q&dRFpu?x_JLhfBF5N|N7(iFTa*YD{+sQDc|$zkJ_`RUf86g=`+gBb`zf{ zVe|Zph#xk9Q=sqHZLFLeH`7f#d9J$6@rQ}eRB^ODDxVt88tEjlK1;n*j2<79xJFu~rvh!pYot}$OrH`zF49wJ&&DqlwYsj-Iq6w@jC53fR@@hoZYBMoNYA7> z+OEn~+J?4?t8ICVQr_;(+>rMzVe|Yvajl6w`MyifeT>q!hlJI1l|HY)ry}P^Z*!KK zbL4(3e+&E7Xv%1%4nmDB%;s1hW2Dh%`Q|g$nqrm8<4$Ad(QB{!v)O8SlD5(|@pC-a zsK=D=SFTCEUpi)7!)&ITxZ0kicRQYfo^;2AN97vzXu2t15!bArFTGOhQ@-yV3cu27 z$~pHM{ag2e_vZM1zo&-08NWNryB)UzHt`g8GyPi3xditBep08Oc60u^sz3jr-?T4{ z_7HbV^M4?I7*nS}@72A^Z@x>ObF^rC+pZP8n&(e%mVQ{Mbp$8rhn2J?I@Pr5Hplvi z7Tf|#pS|oex7i*8ZMHv4{J0uFF4D%u8u%n_j+^*?&n?g^;#$NjkzVm#tH_U0Q*Gmp z{H+I@r;@H&R_PpQwXKhm^SBdl=Gy)8Y^)6UOUkw~~*_qwO=ETcB6O{HUW3-W|45 zwpV7yijK-x3R(lN(kpS#zH6TEm!65b*?v@#E$mVJ<=_AL`)^<7mC{x3%a^468tIzt zN_=Wgsa&b_E5&`K8JC}*bGkVaN9HITm7{TV4r~%O8Jo;?o)goK(@A!Ad z(kt)1BVTc+EAWrZfo+VNqj1z~l+4ktLEORP`rw25WIVIPH96;rLTFVO0-W3=2D~;D zC*0f`r<-g137GH!AILD7atH)S3=mMkFjA&2UKv3!&o2V;az&df9>tQyj zOZ++nwDa9IvqYj2Brn!dVGY2fGAF4rA%KcZY1OJKgG{v&sv4l0s5a`WhN@5nsDP+c zCkm$`HG85lxycP8wMNj^h|-!``X{vsbd7r<0KbL=!uLiQoe59fj^ZmSt>wsr2}@zN zU@B8gisXS#csnD}klo?PhaEWz@rG`LCIQqsQNxKEGzj3Cl<-QP{=G4t2IVEcK+vi0 zl!Iwi({W74F&)R$cMeVDU{V@iRcQO$zQ1kwo4EvN@tXhu4S&NQpf&{76AQ;xLvp}% z!(doMu9o_?gjgRiE2w2008tXPu&6vU3ZoVj-{LXToaw=W&2OuZ_gtJYjR3xxX7wYA zKrNZU0BT*%#e&L>=&A%L!?l@7t#CJQfOH-HeF5|9RKs#HY%bz2!Mrv}0$;UH*9F5LDQ^UH0Zn9=f0Pk4kyy8NKoi*w*$vqZ*^M`LZP6jcZvOxtJd6q$4=Ls~ zgg}wQ7Udu11nuN>xXy>f4!d*6?vU8QZ5GhPJ{|VykZd0U0CPyT&jWzQcJZ)_dHo@1 z;sgvQV8}fm76h+JF>h-HJ?u$uK!Dj~(pwRrvHT6G8}_9CH;{+xoNv&A15x73FzZPqEBGo`=%7Yf4S AwEzGB diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy new file mode 100644 index 0000000000000000000000000000000000000000..29344244b01bdd6d427cfe78e1083bad25c74ff7 GIT binary patch literal 88 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= eXCxM+0{I$-I+{8PwF*dpYv;V@A{ElF_5%QG<{7#G literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy new file mode 100644 index 0000000000000000000000000000000000000000..7c6997dd69eef019d6745dee125e92434b7954e3 GIT binary patch literal 96 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= bXCxM+0{I$7I+{8PwF*dpivbKi*u!W54uThp literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy new file mode 100644 index 0000000000000000000000000000000000000000..97d9aa3ec06e932df59bcf4aa1d56cc0ad039210 GIT binary patch literal 184 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(4=E~51qv5u zBo?Fsxf+H#3Wmm-ItsN4aKQD!F#Mj*$+!054W;g94lx|qzD%uO@9r=Ab^Q`42G^e0 z@1CLJa%9y9`w5%Fz4$7B*r%9n`LSQ*js2q=8*({Ye%gPKjx4EY+HbF 0 + data = numpy.array(build_pattern(line, features(sender))) + return classifier.predict(data) > 0 def extract(body, sender): diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 476fdb6..9ce5e75 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message body belongs to the signature. 
""" -import os -import sys - -from PyML import SparseDataSet, SVM +from numpy import genfromtxt +from sklearn.svm import LinearSVC +from sklearn.externals import joblib def init(): - '''Inits classifier with optimal options.''' - return SVM(C=10, optimization='liblinear') + """Inits classifier with optimal options.""" + return LinearSVC(C=10.0) def train(classifier, train_data_filename, save_classifier_filename=None): - '''Trains and saves classifier so that it could be easily loaded later.''' - data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier.train(data) + """Trains and saves classifier so that it could be easily loaded later.""" + file_data = genfromtxt(train_data_filename, delimiter=",") + train_data, labels = file_data[:, :-1], file_data[:, -1] + classifier.fit(train_data, labels) + if save_classifier_filename: - classifier.save(save_classifier_filename) + joblib.dump(classifier, save_classifier_filename) return classifier def load(saved_classifier_filename, train_data_filename): - """Loads saved classifier. - - Classifier should be loaded with the same data it was trained against - """ - train_data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier = init() - classifier.load(saved_classifier_filename, train_data) - return classifier + """Loads saved classifier. """ + return joblib.load(saved_classifier_filename) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 27fec9e..5dc400c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -4,7 +4,6 @@ from . import * from . fixtures import * import regex as re -from flanker import mime from talon import quotations @@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block(): def test_reply_quotations_share_block(): - msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) - html_part = list(msg.walk())[1] - assert html_part.content_type == 'text/html' - stripped_html = quotations.extract_from_html(html_part.body) + stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) ok_(stripped_html) ok_('From' not in stripped_html) diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 5eeff36..42d8ae6 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -3,7 +3,7 @@ from ... import * import os -from PyML import SparseDataSet +from numpy import genfromtxt from talon.signature.learning import dataset as d @@ -41,10 +41,13 @@ def test_build_extraction_dataset(): d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), os.path.join(TMP_DIR, 'extraction.data'), 1) - test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), - labelsColumn=-1) + + filename = os.path.join(TMP_DIR, 'extraction.data') + file_data = genfromtxt(filename, delimiter=",") + test_data = file_data[:, :-1] + # the result is a loadable signature extraction dataset # 32 comes from 3 emails in emails/P folder, 11 lines checked to be # a signature, one email has only 10 lines - eq_(test_data.size(), 32) - eq_(len(features('')), test_data.numFeatures) + eq_(test_data.shape[0], 32) + eq_(len(features('')), test_data.shape[1]) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 918ed29..0a87e56 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -5,8 +5,7 @@ from . 
fixtures import * import os -from flanker import mime - +import email.iterators from talon import quotations @@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links(): def test_standard_replies(): for filename in os.listdir(STANDARD_REPLIES): filename = os.path.join(STANDARD_REPLIES, filename) - if os.path.isdir(filename): + if not filename.endswith('.eml') or os.path.isdir(filename): continue with open(filename) as f: - msg = f.read() - m = mime.from_string(msg) - for part in m.walk(): - if part.content_type == 'text/plain': - text = part.body - stripped_text = quotations.extract_from_plain(text) - reply_text_fn = filename[:-4] + '_reply_text' - if os.path.isfile(reply_text_fn): - with open(reply_text_fn) as f: - reply_text = f.read() - else: - reply_text = 'Hello' - eq_(reply_text, stripped_text, - "'%(reply)s' != %(stripped)s for %(fn)s" % - {'reply': reply_text, 'stripped': stripped_text, - 'fn': filename}) + message = email.message_from_file(f) + body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() + text = ''.join(email.iterators.body_line_iterator(body)) + + stripped_text = quotations.extract_from_plain(text) + reply_text_fn = filename[:-4] + '_reply_text' + if os.path.isfile(reply_text_fn): + with open(reply_text_fn) as f: + reply_text = f.read() + else: + reply_text = 'Hello' + yield eq_, reply_text, stripped_text, \ + "'%(reply)s' != %(stripped)s for %(fn)s" % \ + {'reply': reply_text, 'stripped': stripped_text, + 'fn': filename} From e3ef622031c65c694592ce039ee308b8351a89d7 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:33:03 -0500 Subject: [PATCH 04/10] remove unused regex --- talon/signature/learning/helpers.py | 8 -------- tests/signature/learning/helpers_test.py | 23 ----------------------- 2 files changed, 31 deletions(-) diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 70a4820..38259c3 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|' # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') -# Pattern to match if e.g. 'Sender:' header field has sender names. 
-SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*' -RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN) - -# Reply line clue line endings, as in regular expression: -# " wrote:$" or " writes:$" -RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$') - INVALID_WORD_START = rc('\(|\+|[\d]') BAD_SENDER_NAMES = [ diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py index 29b6fca..7a57f6c 100644 --- a/tests/signature/learning/helpers_test.py +++ b/tests/signature/learning/helpers_test.py @@ -52,29 +52,6 @@ def test_match_names(): ok_(RE_NAME.match(name), "{} should be matched".format(name)) -def test_sender_with_name(): - ok_lines = ['Sergey Obukhov ', - '\tSergey ', - ('"Doe, John (TX)"' - '@EXAMPLE' - ''), - ('Company Sleuth ' - '@EXAMPLE '), - ('Doe III, John ' - '')] - for line in ok_lines: - ok_(RE_SENDER_WITH_NAME.match(line), - '{} should be matched'.format(line)) - - nok_lines = ['', '', 'Sergey serobnic@xxx.ru'] - for line in nok_lines: - assert_false(RE_SENDER_WITH_NAME.match(line), - '{} should not be matched'.format(line)) - - # Now test helpers functions def test_binary_regex_search(): eq_(1, h.binary_regex_search(re.compile("12"))("12")) From 215e36e9ed2662741be17b8a6719cb3fa9f5715c Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:36:19 -0500 Subject: [PATCH 05/10] allow higher version of regex library --- setup.py | 2 +- talon/signature/learning/helpers.py | 2 +- tests/signature/learning/helpers_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 626c378..3776961 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", # handling of .* changes from version 0 to 1 + "regex>=1", "html2text", "nose==1.2.1", "mock", diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 38259c3..51a9227 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES rc = re.compile RE_EMAIL = rc('@') -RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}') +RE_RELAX_PHONE = rc('(\(? 
?[\d]{2,3} ?\)?.{,3}?){2,}') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') # Taken from: diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py index 7a57f6c..704db4e 100644 --- a/tests/signature/learning/helpers_test.py +++ b/tests/signature/learning/helpers_test.py @@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()] def test_match_phone_numbers(): for phone in VALID_PHONE_NUMBERS: - ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone)) + ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone)) def test_match_names(): From c5e4cd9ab461593f5ee31501c38e42e09baa12e0 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:40:32 -0500 Subject: [PATCH 06/10] dont be too restrictive on the test library version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3776961..8bd9591 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup(name='talon', "lxml==2.3.3", "regex>=1", "html2text", - "nose==1.2.1", + "nose>=1.2.1", "mock", "coverage", "scikit-learn", From 8b1f87b1c027672b46871a623c57f43110e9ed39 Mon Sep 17 00:00:00 2001 From: Scott MacVicar Date: Wed, 6 May 2015 14:16:11 -0700 Subject: [PATCH 07/10] Get this building and passing tests Changes: * add .DS_Store to .gitignore * Decode base64 encoded emails for tests * Pick a version of scikit since the pickled clasifiers are based on that * Add missing numpy and scipy dependencies --- .gitignore | 5 ++++- setup.py | 4 +++- talon/signature/data/classifier | Bin 632 -> 608 bytes talon/signature/data/classifier_01.npy | Bin 88 -> 96 bytes talon/signature/data/classifier_02.npy | Bin 96 -> 176 bytes talon/signature/data/classifier_03.npy | Bin 184 -> 88 bytes tests/text_quotations_test.py | 2 +- 7 files changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 setup.py diff --git a/.gitignore b/.gitignore index af985ab..002f03e 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,7 @@ tramp *_archive # Trial temp -_trial_temp \ No newline at end of file +_trial_temp + +# OSX +.DS_Store \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 8bd9591..fa94f58 --- a/setup.py +++ b/setup.py @@ -18,8 +18,10 @@ setup(name='talon', "regex>=1", "html2text", "nose>=1.2.1", + "numpy", "mock", "coverage", - "scikit-learn", + "scipy", + "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild ] ) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 405e6cd13dec1aa226d471dfca371bc52b8949d5..b7d72eb989596b74547efcbbcfbe280cc79f855d 100644 GIT binary patch delta 386 zcmX|+%}WAN6vbzp5oJ=$$}F|AveZ6Yh_ng{TnG)#Lb&V4bA0N+eBGHh)htBXl*^=l zFj@)y1MO;>z_Yk;Ilp@k=Ue`e-#YcIj{)`AkW37F%uoigi^<+?O^5|)-K4(H{6&8; z%NsUlp3YoOe4UyRk2zHYc^FU=*XNKO8Jw&b%+Dch(I;X=N4Rmnlptlt3&Bm@4lFlp z>5j)*R$y|2dL9jxzqiy=aO>Y!hCFncf(1wz9h_~ef-1gZv5z|qRRZxEcQsAB`-v0} zRp3|}fV`9h0n0B~>Ej-xTxL^~hqw<3vyBI^7Uz+;7gj-X9y~mRj7ROEV-cTI&hSVB zi9049_ijp8pI?>kB_#En!$t&alT`B=Po{YKJ`)SocFT4{tOo;7R?P`EVyHRAvj{2| U&m%h%2a-o-j3*o~v|!Ns1sX$zi2wiq delta 403 zcmZvYJx>Bb5QYVgs0T*zTR{9m1&tSD&{h*-C{((wEH-<~!d+bUZg%%16c=M>Dd7(q zTmOJ|o@+uZoMMW3GtWGkH|x`SZB)|y3@nstLbi2Poic@NUuY&N3-#!J)UY}c;V}P# z=DNPdlL!`)3FSm7JYas$(-;cgbp06?V^|u0rAl|pk&C87CJ5IqlJs*et;+ z$^?%R)=*v&(FyukX`&gdx6!O5155+f#js(B#TZIsCtZ3fCqxS~L1B}s2hNBTPAFL@ z1Im>0MbF{HW!%2~>DPh=9VTt4BvM;3Y@-=-nJElAhPxZXUd^+;oR>!hllfhrNqpLH zL^yW4#~!647Gm|=LS(vg!6x&1V?RDnEVhcN_e6T K$2Y0Sb-n=t*N8g+ 
diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy index 29344244b01bdd6d427cfe78e1083bad25c74ff7..11d130269642d0ec81c199f8cb370f603d1e5da9 100644 GIT binary patch delta 40 dcmaz@m|&}Hq@$^$P^*9hxER3T!+t1j4*;LS2VnpJ delta 32 ecmYd@m|!bysH3T)P^*9hxOUEaE>a==YCiydEeQw! diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy index 7c6997dd69eef019d6745dee125e92434b7954e3..0f965baa69b0df1b8f38acd7fa1e76f2c1a0a897 100644 GIT binary patch delta 139 zcmYe;z&JstKBTBLRYyU+I3uwjRozNK!%#=T&`489p;iG7xD2*3RF)rkYyU^w;J$6B|6?}aS@psG!J=o+r&av0Kj6H=S6$?d{f^D0?*m$X i+V}jb@KI^rZ(ruH=zF-w_Wdxr@yTJ<4{kLu= diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy index 97d9aa3ec06e932df59bcf4aa1d56cc0ad039210..5a35962eb9617631417f9bf564c051e71818867b 100644 GIT binary patch delta 50 wcmdnN7%@Rd$1O3ZI8{eMy*MMWAXVK;LBmi-Q%9jz0SR!GNzbm2lYX@y0AL6WxBvhE delta 147 zcma#p!8k#uKBTBLRYyU+I3uwjRozNK!%#=T&{$JPp;iG7xIP$$-_tqy);_$U)cwpM zh6CG|srBpK{bj$dUn0fe+7tWTGgMrTtomR-VRN__U*!+`6tgWq_KUo+e{^F*E@#V6 p`w!BQB^6Ek?Ui(mjOV*=-w&f_@N4mEo4)*RzjNMmkqYTo`vEfdJ$wKF diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 0a87e56..fcf5fcd 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -618,7 +618,7 @@ def test_standard_replies(): with open(filename) as f: message = email.message_from_file(f) body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() - text = ''.join(email.iterators.body_line_iterator(body)) + text = ''.join(email.iterators.body_line_iterator(body, True)) stripped_text = quotations.extract_from_plain(text) reply_text_fn = filename[:-4] + '_reply_text' From e3c4ff38fe7017dfed9f24f486e61916a7421c35 Mon Sep 17 00:00:00 2001 From: Scott MacVicar Date: Wed, 6 May 2015 15:19:50 -0700 Subject: [PATCH 08/10] move test stuff out to its own section --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index fa94f58..320520a 100755 --- a/setup.py +++ b/setup.py @@ -17,11 +17,13 @@ setup(name='talon', "lxml==2.3.3", "regex>=1", "html2text", - "nose>=1.2.1", "numpy", - "mock", - "coverage", "scipy", "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild + ], + tests_require=[ + "mock", + "nose>=1.2.1", + "coverage" ] ) From 7ea773e6a9820241d665f36448b621dd094a9361 Mon Sep 17 00:00:00 2001 From: Oliver Song Date: Thu, 2 Jul 2015 13:23:00 -0700 Subject: [PATCH 09/10] Fix iphone test --- tests/fixtures/standard_replies/iphone.eml | 4 ++-- tests/fixtures/standard_replies/iphone_reply_text | 3 +++ tests/text_quotations_test.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/standard_replies/iphone_reply_text diff --git a/tests/fixtures/standard_replies/iphone.eml b/tests/fixtures/standard_replies/iphone.eml index 60622f1..320f8ac 100644 --- a/tests/fixtures/standard_replies/iphone.eml +++ b/tests/fixtures/standard_replies/iphone.eml @@ -9,11 +9,11 @@ To: bob Content-Transfer-Encoding: quoted-printable Mime-Version: 1.0 (1.0) -hello +Hello Sent from my iPhone On Apr 3, 2012, at 4:19 PM, bob wr= ote: -> Hi \ No newline at end of file +> Hi diff --git a/tests/fixtures/standard_replies/iphone_reply_text b/tests/fixtures/standard_replies/iphone_reply_text new file mode 100644 index 0000000..460d6d7 --- /dev/null +++ b/tests/fixtures/standard_replies/iphone_reply_text @@ -0,0 +1,3 @@ +Hello + +Sent from my iPhone diff --git 
a/tests/text_quotations_test.py b/tests/text_quotations_test.py index fcf5fcd..a56c48d 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -624,7 +624,7 @@ def test_standard_replies(): reply_text_fn = filename[:-4] + '_reply_text' if os.path.isfile(reply_text_fn): with open(reply_text_fn) as f: - reply_text = f.read() + reply_text = f.read().strip() else: reply_text = 'Hello' yield eq_, reply_text, stripped_text, \ From 85c7ee980c668dea6da72def85975c4f965faa42 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Thu, 2 Jul 2015 21:46:27 -0400 Subject: [PATCH 10/10] add script to regenerate ml model --- README.rst | 18 ++++++++++++++++-- train.py | 10 ++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 train.py diff --git a/README.rst b/README.rst index 2b5966f..2517450 100644 --- a/README.rst +++ b/README.rst @@ -89,7 +89,7 @@ the power of machine learning algorithms: # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." # signature == "John Doe\nvia mobile" -For machine learning talon currently uses `PyML`_ library to build SVM +For machine learning talon currently uses the `scikit-learn`_ library to build SVM classifiers. The core of machine learning algorithm lays in ``talon.signature.learning package``. It defines a set of features to apply to a message (``featurespace.py``), how data sets are built @@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and used to load trained classifier. Those files should be regenerated every time the feature/data set is changed. -.. _PyML: http://pyml.sourceforge.net/ +To regenerate the model files, you can run + +.. code:: sh + + python train.py + +or + +.. code:: python + + from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA + from talon.signature.learning.classifier import train, init + train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + +.. _scikit-learn: http://scikit-learn.org .. _ENRON: https://www.cs.cmu.edu/~enron/ Research diff --git a/train.py b/train.py new file mode 100644 index 0000000..54d04b5 --- /dev/null +++ b/train.py @@ -0,0 +1,10 @@ +from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA +from talon.signature.learning.classifier import train, init + + +def train_model(): + """ retrain model and persist """ + train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + +if __name__ == "__main__": + train_model()