From f16760c466257df41c0c77c832792c9724349d66 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:06:01 -0500 Subject: [PATCH] Remove flanker and replace PyML with scikit-learn I never was actually able to successfully install PyML but the source-forge distribution and lack of python3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult for me to install and seemed only to be used in the tests, so I removed it as well to get into a position where I could run the tests. As of this commit, only one is not passing (test_standard_replies with android.eml) though I'm not familiar with the `email` library yet. --- setup.py | 85 +---------------------- talon/signature/data/classifier | Bin 10377 -> 632 bytes talon/signature/data/classifier_01.npy | Bin 0 -> 88 bytes talon/signature/data/classifier_02.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_03.npy | Bin 0 -> 184 bytes talon/signature/data/classifier_04.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_05.npy | Bin 0 -> 176 bytes talon/signature/extraction.py | 6 +- talon/signature/learning/classifier.py | 31 ++++----- tests/html_quotations_test.py | 6 +- tests/signature/learning/dataset_test.py | 13 ++-- tests/text_quotations_test.py | 36 +++++----- 12 files changed, 44 insertions(+), 133 deletions(-) create mode 100644 talon/signature/data/classifier_01.npy create mode 100644 talon/signature/data/classifier_02.npy create mode 100644 talon/signature/data/classifier_03.npy create mode 100644 talon/signature/data/classifier_04.npy create mode 100644 talon/signature/data/classifier_05.npy diff --git a/setup.py b/setup.py index e8bd3c1..626c378 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,3 @@ -import os -import sys -import contextlib - -from distutils.spawn import find_executable from setuptools import setup, find_packages @@ -20,87 +15,11 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", - "chardet==1.0.1", - "dnspython==1.11.1", + "regex==0.1.20110315", # handling of .* changes from version 0 to 1 "html2text", "nose==1.2.1", "mock", "coverage", - "flanker" + "scikit-learn", ] ) - - -def install_pyml(): - ''' - Downloads and installs PyML - ''' - try: - import PyML - except: - pass - else: - return - - # install numpy first - pip('install numpy==1.6.1 --upgrade') - - pyml_tarball = ( - 'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' - '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') - pyml_srcidr = 'PyML-0.7.9' - - # see if PyML tarball needs to be fetched: - if not dir_exists(pyml_srcidr): - run("curl %s | tar -xz" % pyml_tarball) - - # compile&install: - with cd(pyml_srcidr): - python('setup.py build') - python('setup.py install') - - -def run(command): - if os.system(command) != 0: - raise Exception("Failed '{}'".format(command)) - else: - return 0 - - -def python(command): - command = '{} {}'.format(sys.executable, command) - run(command) - - -def enforce_executable(name, install_info): - if os.system("which {}".format(name)) != 0: - raise Exception( - '{} utility is missing.\nTo install, run:\n\n{}\n'.format( - name, install_info)) - - -def pip(command): - command = '{} {}'.format(find_executable('pip'), command) - run(command) - - -def dir_exists(path): - return os.path.isdir(path) - - -@contextlib.contextmanager -def cd(directory): - curdir = os.getcwd() - try: - os.chdir(directory) - yield {} - finally: - os.chdir(curdir) - - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: - enforce_executable('curl', 'sudo aptitude install curl') - - install_pyml() diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index fe717d51de6928da1b32ee90e4d5eabd5e7bad1e..405e6cd13dec1aa226d471dfca371bc52b8949d5 100644 GIT binary patch literal 632 zcmZvYOK%e~6h`xy($q;Q;Y}%zLV*;-2+Cs%D4;G{Wm6fJmhIeh3?7fKJ(H5fNU-B~ zvgJ?Uais{Mc;Us>^>>cHA06qxY2ePJex9dNbML{EV`akZp_Q-YIVONJMgk?=hX4c9*XS_rK~YO35Wm{l?`$>ie)=lJ4_ z{|3HE7krZgCy0}kjgZmF39$6mZlja}eoCFuGb zoG3it#iJHYrWMz?^^0!KRLPc}cqvTcb9M&mGz;RxtWmgJcq?+YYrMUZ^D1(7 z0w-Z$B;Hx}X&qQsNbp{n(^PW-njAOxCnJk#)8XClfqOOHr@iO{p5bn!J*e^FO4{p4 z+pB~Zv|AP$A058ie*62+&iFZ9JNJDY6Q0yKTAA=BChP~_!M}at(+M3e>td@3Vpy#LVO0f=-q7m8ROL~yf9$E-|us3HDm+q&dRFpu?x_JLhfBF5N|N7(iFTa*YD{+sQDc|$zkJ_`RUf86g=`+gBb`zf{ zVe|Zph#xk9Q=sqHZLFLeH`7f#d9J$6@rQ}eRB^ODDxVt88tEjlK1;n*j2<79xJFu~rvh!pYot}$OrH`zF49wJ&&DqlwYsj-Iq6w@jC53fR@@hoZYBMoNYA7> z+OEn~+J?4?t8ICVQr_;(+>rMzVe|Yvajl6w`MyifeT>q!hlJI1l|HY)ry}P^Z*!KK zbL4(3e+&E7Xv%1%4nmDB%;s1hW2Dh%`Q|g$nqrm8<4$Ad(QB{!v)O8SlD5(|@pC-a zsK=D=SFTCEUpi)7!)&ITxZ0kicRQYfo^;2AN97vzXu2t15!bArFTGOhQ@-yV3cu27 z$~pHM{ag2e_vZM1zo&-08NWNryB)UzHt`g8GyPi3xditBep08Oc60u^sz3jr-?T4{ z_7HbV^M4?I7*nS}@72A^Z@x>ObF^rC+pZP8n&(e%mVQ{Mbp$8rhn2J?I@Pr5Hplvi z7Tf|#pS|oex7i*8ZMHv4{J0uFF4D%u8u%n_j+^*?&n?g^;#$NjkzVm#tH_U0Q*Gmp z{H+I@r;@H&R_PpQwXKhm^SBdl=Gy)8Y^)6UOUkw~~*_qwO=ETcB6O{HUW3-W|45 zwpV7yijK-x3R(lN(kpS#zH6TEm!65b*?v@#E$mVJ<=_AL`)^<7mC{x3%a^468tIzt zN_=Wgsa&b_E5&`K8JC}*bGkVaN9HITm7{TV4r~%O8Jo;?o)goK(@A!Ad z(kt)1BVTc+EAWrZfo+VNqj1z~l+4ktLEORP`rw25WIVIPH96;rLTFVO0-W3=2D~;D zC*0f`r<-g137GH!AILD7atH)S3=mMkFjA&2UKv3!&o2V;az&df9>tQyj zOZ++nwDa9IvqYj2Brn!dVGY2fGAF4rA%KcZY1OJKgG{v&sv4l0s5a`WhN@5nsDP+c zCkm$`HG85lxycP8wMNj^h|-!``X{vsbd7r<0KbL=!uLiQoe59fj^ZmSt>wsr2}@zN zU@B8gisXS#csnD}klo?PhaEWz@rG`LCIQqsQNxKEGzj3Cl<-QP{=G4t2IVEcK+vi0 zl!Iwi({W74F&)R$cMeVDU{V@iRcQO$zQ1kwo4EvN@tXhu4S&NQpf&{76AQ;xLvp}% z!(doMu9o_?gjgRiE2w2008tXPu&6vU3ZoVj-{LXToaw=W&2OuZ_gtJYjR3xxX7wYA zKrNZU0BT*%#e&L>=&A%L!?l@7t#CJQfOH-HeF5|9RKs#HY%bz2!Mrv}0$;UH*9F5LDQ^UH0Zn9=f0Pk4kyy8NKoi*w*$vqZ*^M`LZP6jcZvOxtJd6q$4=Ls~ zgg}wQ7Udu11nuN>xXy>f4!d*6?vU8QZ5GhPJ{|VykZd0U0CPyT&jWzQcJZ)_dHo@1 z;sgvQV8}fm76h+JF>h-HJ?u$uK!Dj~(pwRrvHT6G8}_9CH;{+xoNv&A15x73FzZPqEBGo`=%7Yf4S AwEzGB diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy new file mode 100644 index 0000000000000000000000000000000000000000..29344244b01bdd6d427cfe78e1083bad25c74ff7 GIT binary patch literal 88 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= eXCxM+0{I$-I+{8PwF*dpYv;V@A{ElF_5%QG<{7#G literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy new file mode 100644 index 0000000000000000000000000000000000000000..7c6997dd69eef019d6745dee125e92434b7954e3 GIT binary patch literal 96 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= bXCxM+0{I$7I+{8PwF*dpivbKi*u!W54uThp literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy new file mode 100644 index 0000000000000000000000000000000000000000..97d9aa3ec06e932df59bcf4aa1d56cc0ad039210 GIT binary patch literal 184 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(4=E~51qv5u zBo?Fsxf+H#3Wmm-ItsN4aKQD!F#Mj*$+!054W;g94lx|qzD%uO@9r=Ab^Q`42G^e0 z@1CLJa%9y9`w5%Fz4$7B*r%9n`LSQ*js2q=8*({Ye%gPKjx4EY+HbF 0 + data = numpy.array(build_pattern(line, features(sender))) + return classifier.predict(data) > 0 def extract(body, sender): diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 476fdb6..9ce5e75 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message body belongs to the signature. """ -import os -import sys - -from PyML import SparseDataSet, SVM +from numpy import genfromtxt +from sklearn.svm import LinearSVC +from sklearn.externals import joblib def init(): - '''Inits classifier with optimal options.''' - return SVM(C=10, optimization='liblinear') + """Inits classifier with optimal options.""" + return LinearSVC(C=10.0) def train(classifier, train_data_filename, save_classifier_filename=None): - '''Trains and saves classifier so that it could be easily loaded later.''' - data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier.train(data) + """Trains and saves classifier so that it could be easily loaded later.""" + file_data = genfromtxt(train_data_filename, delimiter=",") + train_data, labels = file_data[:, :-1], file_data[:, -1] + classifier.fit(train_data, labels) + if save_classifier_filename: - classifier.save(save_classifier_filename) + joblib.dump(classifier, save_classifier_filename) return classifier def load(saved_classifier_filename, train_data_filename): - """Loads saved classifier. - - Classifier should be loaded with the same data it was trained against - """ - train_data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier = init() - classifier.load(saved_classifier_filename, train_data) - return classifier + """Loads saved classifier. """ + return joblib.load(saved_classifier_filename) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 27fec9e..5dc400c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -4,7 +4,6 @@ from . import * from . fixtures import * import regex as re -from flanker import mime from talon import quotations @@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block(): def test_reply_quotations_share_block(): - msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) - html_part = list(msg.walk())[1] - assert html_part.content_type == 'text/html' - stripped_html = quotations.extract_from_html(html_part.body) + stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) ok_(stripped_html) ok_('From' not in stripped_html) diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 5eeff36..42d8ae6 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -3,7 +3,7 @@ from ... import * import os -from PyML import SparseDataSet +from numpy import genfromtxt from talon.signature.learning import dataset as d @@ -41,10 +41,13 @@ def test_build_extraction_dataset(): d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), os.path.join(TMP_DIR, 'extraction.data'), 1) - test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), - labelsColumn=-1) + + filename = os.path.join(TMP_DIR, 'extraction.data') + file_data = genfromtxt(filename, delimiter=",") + test_data = file_data[:, :-1] + # the result is a loadable signature extraction dataset # 32 comes from 3 emails in emails/P folder, 11 lines checked to be # a signature, one email has only 10 lines - eq_(test_data.size(), 32) - eq_(len(features('')), test_data.numFeatures) + eq_(test_data.shape[0], 32) + eq_(len(features('')), test_data.shape[1]) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 918ed29..0a87e56 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -5,8 +5,7 @@ from . fixtures import * import os -from flanker import mime - +import email.iterators from talon import quotations @@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links(): def test_standard_replies(): for filename in os.listdir(STANDARD_REPLIES): filename = os.path.join(STANDARD_REPLIES, filename) - if os.path.isdir(filename): + if not filename.endswith('.eml') or os.path.isdir(filename): continue with open(filename) as f: - msg = f.read() - m = mime.from_string(msg) - for part in m.walk(): - if part.content_type == 'text/plain': - text = part.body - stripped_text = quotations.extract_from_plain(text) - reply_text_fn = filename[:-4] + '_reply_text' - if os.path.isfile(reply_text_fn): - with open(reply_text_fn) as f: - reply_text = f.read() - else: - reply_text = 'Hello' - eq_(reply_text, stripped_text, - "'%(reply)s' != %(stripped)s for %(fn)s" % - {'reply': reply_text, 'stripped': stripped_text, - 'fn': filename}) + message = email.message_from_file(f) + body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() + text = ''.join(email.iterators.body_line_iterator(body)) + + stripped_text = quotations.extract_from_plain(text) + reply_text_fn = filename[:-4] + '_reply_text' + if os.path.isfile(reply_text_fn): + with open(reply_text_fn) as f: + reply_text = f.read() + else: + reply_text = 'Hello' + yield eq_, reply_text, stripped_text, \ + "'%(reply)s' != %(stripped)s for %(fn)s" % \ + {'reply': reply_text, 'stripped': stripped_text, + 'fn': filename}