From 4df7aa284b4ab9baaaf4b957527c978e0b6a2027 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Fri, 6 Mar 2015 20:52:58 -0500 Subject: [PATCH 01/10] remove extra imports --- tests/signature/bruteforce_test.py | 4 ---- tests/signature/extraction_test.py | 2 -- tests/signature/learning/dataset_test.py | 1 - 3 files changed, 7 deletions(-) diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py index ecbd626..09665fe 100644 --- a/tests/signature/bruteforce_test.py +++ b/tests/signature/bruteforce_test.py @@ -2,10 +2,6 @@ from .. import * -import os - -from flanker import mime - from talon.signature import bruteforce diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py index 9cf76dd..a055064 100644 --- a/tests/signature/extraction_test.py +++ b/tests/signature/extraction_test.py @@ -4,8 +4,6 @@ from .. import * import os -from PyML import SparseDataSet - from talon.signature.learning import dataset from talon import signature from talon.signature import extraction as e diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 062ff17..5eeff36 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -5,7 +5,6 @@ import os from PyML import SparseDataSet -from talon.utils import to_unicode from talon.signature.learning import dataset as d from talon.signature.learning.featurespace import features From b36287e573b97c065dee4d5f36a8757ed4f375d9 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:04:41 -0500 Subject: [PATCH 02/10] clean up style and extra imports --- talon/quotations.py | 7 +++---- talon/signature/__init__.py | 16 +++------------- talon/signature/extraction.py | 6 +----- tests/quotations_test.py | 2 -- 4 files changed, 7 insertions(+), 24 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index dc77fd4..cdd22b1 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,8 +12,7 @@ from copy import deepcopy from lxml import html, etree import html2text -from talon.constants import RE_DELIMITER -from talon.utils import random_token, get_delimiter +from talon.utils import get_delimiter from talon import html_quotations @@ -151,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'): return extract_from_plain(msg_body) elif content_type == 'text/html': return extract_from_html(msg_body) - except Exception, e: + except Exception: log.exception('ERROR extracting message') return msg_body @@ -344,7 +343,7 @@ def extract_from_html(msg_body): html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) - quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] + quotation_checkpoints = [False] * number_of_checkpoints msg_with_checkpoints = html.tostring(html_tree) h = html2text.HTML2Text() diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py index d1962f3..a871447 100644 --- a/talon/signature/__init__.py +++ b/talon/signature/__init__.py @@ -21,11 +21,9 @@ trained against, don't forget to regenerate: """ import os -import sys -from cStringIO import StringIO from . import extraction -from . extraction import extract +from . extraction import extract #noqa from . 
learning import classifier @@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') def initialize(): - try: - # redirect output - so, sys.stdout = sys.stdout, StringIO() - - extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, - EXTRACTOR_DATA) - sys.stdout = so - except Exception, e: - raise Exception( - "Failed initializing signature parsing with classifiers", e) + extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, + EXTRACTOR_DATA) diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py index 8c7b74e..58df68d 100644 --- a/talon/signature/extraction.py +++ b/talon/signature/extraction.py @@ -1,14 +1,10 @@ # -*- coding: utf-8 -*- -import os import logging import regex as re from PyML import SparseDataSet -from talon.constants import RE_DELIMITER -from talon.signature.constants import (SIGNATURE_MAX_LINES, - TOO_LONG_SIGNATURE_LINE) from talon.signature.learning.featurespace import features, build_pattern from talon.utils import get_delimiter from talon.signature.bruteforce import get_signature_candidate @@ -61,7 +57,7 @@ def extract(body, sender): text = delimiter.join(text) if text.strip(): return (text, delimiter.join(signature)) - except Exception, e: + except Exception: log.exception('ERROR when extracting signature with classifiers') return (body, None) diff --git a/tests/quotations_test.py b/tests/quotations_test.py index dcc723e..7184368 100644 --- a/tests/quotations_test.py +++ b/tests/quotations_test.py @@ -3,8 +3,6 @@ from . import * from . fixtures import * -from flanker import mime - from talon import quotations From f16760c466257df41c0c77c832792c9724349d66 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:06:01 -0500 Subject: [PATCH 03/10] Remove flanker and replace PyML with scikit-learn I was never actually able to install PyML, but the SourceForge distribution and the lack of Python 3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult to install and seemed to be used only in the tests, so I removed it as well to get to a point where I could run the tests. As of this commit, only one test is not passing (test_standard_replies with android.eml), though I'm not familiar with the `email` library yet.
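Roughly, the swap maps the old PyML calls onto scikit-learn as in the sketch below. This is only an illustration of the approach taken in the diff that follows; the tiny in-memory matrix and the /tmp/classifier path are placeholders standing in for the real train.data rows and the shipped classifier files.

    import numpy as np
    from sklearn.externals import joblib  # shipped with scikit-learn at this time
    from sklearn.svm import LinearSVC

    # Placeholder for the "<feature>,...,<label>" rows that train.data holds.
    rows = np.array([[1.0, 0.0, 1.0],
                     [0.0, 1.0, 0.0],
                     [1.0, 1.0, 1.0]])
    train_data, labels = rows[:, :-1], rows[:, -1]

    # PyML's SVM(C=10, optimization='liblinear') becomes LinearSVC(C=10.0),
    # and classifier.train(dataset) becomes classifier.fit(X, y).
    classifier = LinearSVC(C=10.0)
    classifier.fit(train_data, labels)

    # classifier.save()/classifier.load() become joblib.dump()/joblib.load();
    # joblib splits large arrays into *.npy side files (hence the new
    # classifier_0*.npy files), and reloading no longer needs the train data.
    joblib.dump(classifier, "/tmp/classifier")
    classifier = joblib.load("/tmp/classifier")

    # classifier.decisionFunc(data, 0) > 0 becomes classifier.predict(...) > 0.
    print(classifier.predict(np.array([[1.0, 0.0]])) > 0)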
--- setup.py | 85 +---------------------- talon/signature/data/classifier | Bin 10377 -> 632 bytes talon/signature/data/classifier_01.npy | Bin 0 -> 88 bytes talon/signature/data/classifier_02.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_03.npy | Bin 0 -> 184 bytes talon/signature/data/classifier_04.npy | Bin 0 -> 96 bytes talon/signature/data/classifier_05.npy | Bin 0 -> 176 bytes talon/signature/extraction.py | 6 +- talon/signature/learning/classifier.py | 31 ++++----- tests/html_quotations_test.py | 6 +- tests/signature/learning/dataset_test.py | 13 ++-- tests/text_quotations_test.py | 36 +++++----- 12 files changed, 44 insertions(+), 133 deletions(-) create mode 100644 talon/signature/data/classifier_01.npy create mode 100644 talon/signature/data/classifier_02.npy create mode 100644 talon/signature/data/classifier_03.npy create mode 100644 talon/signature/data/classifier_04.npy create mode 100644 talon/signature/data/classifier_05.npy diff --git a/setup.py b/setup.py index e8bd3c1..626c378 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,3 @@ -import os -import sys -import contextlib - -from distutils.spawn import find_executable from setuptools import setup, find_packages @@ -20,87 +15,11 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", - "chardet==1.0.1", - "dnspython==1.11.1", + "regex==0.1.20110315", # handling of .* changes from version 0 to 1 "html2text", "nose==1.2.1", "mock", "coverage", - "flanker" + "scikit-learn", ] ) - - -def install_pyml(): - ''' - Downloads and installs PyML - ''' - try: - import PyML - except: - pass - else: - return - - # install numpy first - pip('install numpy==1.6.1 --upgrade') - - pyml_tarball = ( - 'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' - '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') - pyml_srcidr = 'PyML-0.7.9' - - # see if PyML tarball needs to be fetched: - if not dir_exists(pyml_srcidr): - run("curl %s | tar -xz" % pyml_tarball) - - # compile&install: - with cd(pyml_srcidr): - python('setup.py build') - python('setup.py install') - - -def run(command): - if os.system(command) != 0: - raise Exception("Failed '{}'".format(command)) - else: - return 0 - - -def python(command): - command = '{} {}'.format(sys.executable, command) - run(command) - - -def enforce_executable(name, install_info): - if os.system("which {}".format(name)) != 0: - raise Exception( - '{} utility is missing.\nTo install, run:\n\n{}\n'.format( - name, install_info)) - - -def pip(command): - command = '{} {}'.format(find_executable('pip'), command) - run(command) - - -def dir_exists(path): - return os.path.isdir(path) - - -@contextlib.contextmanager -def cd(directory): - curdir = os.getcwd() - try: - os.chdir(directory) - yield {} - finally: - os.chdir(curdir) - - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: - enforce_executable('curl', 'sudo aptitude install curl') - - install_pyml() diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index fe717d51de6928da1b32ee90e4d5eabd5e7bad1e..405e6cd13dec1aa226d471dfca371bc52b8949d5 100644 GIT binary patch literal 632 zcmZvYOK%e~6h`xy($q;Q;Y}%zLV*;-2+Cs%D4;G{Wm6fJmhIeh3?7fKJ(H5fNU-B~ zvgJ?Uais{Mc;Us>^>>cHA06qxY2ePJex9dNbML{EV`akZp_Q-YIVONJMgk?=hX4c9*XS_rK~YO35Wm{l?`$>ie)=lJ4_ z{|3HE7krZgCy0}kjgZmF39$6mZlja}eoCFuGb zoG3it#iJHYrWMz?^^0!KRLPc}cqvTcb9M&mGz;RxtWmgJcq?+YYrMUZ^D1(7 z0w-Z$B;Hx}X&qQsNbp{n(^PW-njAOxCnJk#)8XClfqOOHr@iO{p5bn!J*e^FO4{p4 
z+pB~Zv|AP$A058ie*62+&iFZ9JNJDY6Q0yKTAA=BChP~_!M}at(+M3e>td@3Vpy#LVO0f=-q7m8ROL~yf9$E-|us3HDm+q&dRFpu?x_JLhfBF5N|N7(iFTa*YD{+sQDc|$zkJ_`RUf86g=`+gBb`zf{ zVe|Zph#xk9Q=sqHZLFLeH`7f#d9J$6@rQ}eRB^ODDxVt88tEjlK1;n*j2<79xJFu~rvh!pYot}$OrH`zF49wJ&&DqlwYsj-Iq6w@jC53fR@@hoZYBMoNYA7> z+OEn~+J?4?t8ICVQr_;(+>rMzVe|Yvajl6w`MyifeT>q!hlJI1l|HY)ry}P^Z*!KK zbL4(3e+&E7Xv%1%4nmDB%;s1hW2Dh%`Q|g$nqrm8<4$Ad(QB{!v)O8SlD5(|@pC-a zsK=D=SFTCEUpi)7!)&ITxZ0kicRQYfo^;2AN97vzXu2t15!bArFTGOhQ@-yV3cu27 z$~pHM{ag2e_vZM1zo&-08NWNryB)UzHt`g8GyPi3xditBep08Oc60u^sz3jr-?T4{ z_7HbV^M4?I7*nS}@72A^Z@x>ObF^rC+pZP8n&(e%mVQ{Mbp$8rhn2J?I@Pr5Hplvi z7Tf|#pS|oex7i*8ZMHv4{J0uFF4D%u8u%n_j+^*?&n?g^;#$NjkzVm#tH_U0Q*Gmp z{H+I@r;@H&R_PpQwXKhm^SBdl=Gy)8Y^)6UOUkw~~*_qwO=ETcB6O{HUW3-W|45 zwpV7yijK-x3R(lN(kpS#zH6TEm!65b*?v@#E$mVJ<=_AL`)^<7mC{x3%a^468tIzt zN_=Wgsa&b_E5&`K8JC}*bGkVaN9HITm7{TV4r~%O8Jo;?o)goK(@A!Ad z(kt)1BVTc+EAWrZfo+VNqj1z~l+4ktLEORP`rw25WIVIPH96;rLTFVO0-W3=2D~;D zC*0f`r<-g137GH!AILD7atH)S3=mMkFjA&2UKv3!&o2V;az&df9>tQyj zOZ++nwDa9IvqYj2Brn!dVGY2fGAF4rA%KcZY1OJKgG{v&sv4l0s5a`WhN@5nsDP+c zCkm$`HG85lxycP8wMNj^h|-!``X{vsbd7r<0KbL=!uLiQoe59fj^ZmSt>wsr2}@zN zU@B8gisXS#csnD}klo?PhaEWz@rG`LCIQqsQNxKEGzj3Cl<-QP{=G4t2IVEcK+vi0 zl!Iwi({W74F&)R$cMeVDU{V@iRcQO$zQ1kwo4EvN@tXhu4S&NQpf&{76AQ;xLvp}% z!(doMu9o_?gjgRiE2w2008tXPu&6vU3ZoVj-{LXToaw=W&2OuZ_gtJYjR3xxX7wYA zKrNZU0BT*%#e&L>=&A%L!?l@7t#CJQfOH-HeF5|9RKs#HY%bz2!Mrv}0$;UH*9F5LDQ^UH0Zn9=f0Pk4kyy8NKoi*w*$vqZ*^M`LZP6jcZvOxtJd6q$4=Ls~ zgg}wQ7Udu11nuN>xXy>f4!d*6?vU8QZ5GhPJ{|VykZd0U0CPyT&jWzQcJZ)_dHo@1 z;sgvQV8}fm76h+JF>h-HJ?u$uK!Dj~(pwRrvHT6G8}_9CH;{+xoNv&A15x73FzZPqEBGo`=%7Yf4S AwEzGB diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy new file mode 100644 index 0000000000000000000000000000000000000000..29344244b01bdd6d427cfe78e1083bad25c74ff7 GIT binary patch literal 88 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= eXCxM+0{I$-I+{8PwF*dpYv;V@A{ElF_5%QG<{7#G literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy new file mode 100644 index 0000000000000000000000000000000000000000..7c6997dd69eef019d6745dee125e92434b7954e3 GIT binary patch literal 96 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(cT3DEP6dh= bXCxM+0{I$7I+{8PwF*dpivbKi*u!W54uThp literal 0 HcmV?d00001 diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy new file mode 100644 index 0000000000000000000000000000000000000000..97d9aa3ec06e932df59bcf4aa1d56cc0ad039210 GIT binary patch literal 184 zcmbR27wQ`j$;jZwP_3SlTAW;@Zl$1ZlV+i=qoAIaUsO_*m=~X4l#&V(4=E~51qv5u zBo?Fsxf+H#3Wmm-ItsN4aKQD!F#Mj*$+!054W;g94lx|qzD%uO@9r=Ab^Q`42G^e0 z@1CLJa%9y9`w5%Fz4$7B*r%9n`LSQ*js2q=8*({Ye%gPKjx4EY+HbF 0 + data = numpy.array(build_pattern(line, features(sender))) + return classifier.predict(data) > 0 def extract(body, sender): diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 476fdb6..9ce5e75 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message body belongs to the signature. 
""" -import os -import sys - -from PyML import SparseDataSet, SVM +from numpy import genfromtxt +from sklearn.svm import LinearSVC +from sklearn.externals import joblib def init(): - '''Inits classifier with optimal options.''' - return SVM(C=10, optimization='liblinear') + """Inits classifier with optimal options.""" + return LinearSVC(C=10.0) def train(classifier, train_data_filename, save_classifier_filename=None): - '''Trains and saves classifier so that it could be easily loaded later.''' - data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier.train(data) + """Trains and saves classifier so that it could be easily loaded later.""" + file_data = genfromtxt(train_data_filename, delimiter=",") + train_data, labels = file_data[:, :-1], file_data[:, -1] + classifier.fit(train_data, labels) + if save_classifier_filename: - classifier.save(save_classifier_filename) + joblib.dump(classifier, save_classifier_filename) return classifier def load(saved_classifier_filename, train_data_filename): - """Loads saved classifier. - - Classifier should be loaded with the same data it was trained against - """ - train_data = SparseDataSet(train_data_filename, labelsColumn=-1) - classifier = init() - classifier.load(saved_classifier_filename, train_data) - return classifier + """Loads saved classifier. """ + return joblib.load(saved_classifier_filename) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 27fec9e..5dc400c 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -4,7 +4,6 @@ from . import * from . fixtures import * import regex as re -from flanker import mime from talon import quotations @@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block(): def test_reply_quotations_share_block(): - msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) - html_part = list(msg.walk())[1] - assert html_part.content_type == 'text/html' - stripped_html = quotations.extract_from_html(html_part.body) + stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) ok_(stripped_html) ok_('From' not in stripped_html) diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py index 5eeff36..42d8ae6 100644 --- a/tests/signature/learning/dataset_test.py +++ b/tests/signature/learning/dataset_test.py @@ -3,7 +3,7 @@ from ... import * import os -from PyML import SparseDataSet +from numpy import genfromtxt from talon.signature.learning import dataset as d @@ -41,10 +41,13 @@ def test_build_extraction_dataset(): d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), os.path.join(TMP_DIR, 'extraction.data'), 1) - test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), - labelsColumn=-1) + + filename = os.path.join(TMP_DIR, 'extraction.data') + file_data = genfromtxt(filename, delimiter=",") + test_data = file_data[:, :-1] + # the result is a loadable signature extraction dataset # 32 comes from 3 emails in emails/P folder, 11 lines checked to be # a signature, one email has only 10 lines - eq_(test_data.size(), 32) - eq_(len(features('')), test_data.numFeatures) + eq_(test_data.shape[0], 32) + eq_(len(features('')), test_data.shape[1]) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 918ed29..0a87e56 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -5,8 +5,7 @@ from . 
fixtures import * import os -from flanker import mime - +import email.iterators from talon import quotations @@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links(): def test_standard_replies(): for filename in os.listdir(STANDARD_REPLIES): filename = os.path.join(STANDARD_REPLIES, filename) - if os.path.isdir(filename): + if not filename.endswith('.eml') or os.path.isdir(filename): continue with open(filename) as f: - msg = f.read() - m = mime.from_string(msg) - for part in m.walk(): - if part.content_type == 'text/plain': - text = part.body - stripped_text = quotations.extract_from_plain(text) - reply_text_fn = filename[:-4] + '_reply_text' - if os.path.isfile(reply_text_fn): - with open(reply_text_fn) as f: - reply_text = f.read() - else: - reply_text = 'Hello' - eq_(reply_text, stripped_text, - "'%(reply)s' != %(stripped)s for %(fn)s" % - {'reply': reply_text, 'stripped': stripped_text, - 'fn': filename}) + message = email.message_from_file(f) + body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() + text = ''.join(email.iterators.body_line_iterator(body)) + + stripped_text = quotations.extract_from_plain(text) + reply_text_fn = filename[:-4] + '_reply_text' + if os.path.isfile(reply_text_fn): + with open(reply_text_fn) as f: + reply_text = f.read() + else: + reply_text = 'Hello' + yield eq_, reply_text, stripped_text, \ + "'%(reply)s' != %(stripped)s for %(fn)s" % \ + {'reply': reply_text, 'stripped': stripped_text, + 'fn': filename} From e3ef622031c65c694592ce039ee308b8351a89d7 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:33:03 -0500 Subject: [PATCH 04/10] remove unused regex --- talon/signature/learning/helpers.py | 8 -------- tests/signature/learning/helpers_test.py | 23 ----------------------- 2 files changed, 31 deletions(-) diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 70a4820..38259c3 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|' # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') -# Pattern to match if e.g. 'Sender:' header field has sender names. 
-SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*' -RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN) - -# Reply line clue line endings, as in regular expression: -# " wrote:$" or " writes:$" -RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$') - INVALID_WORD_START = rc('\(|\+|[\d]') BAD_SENDER_NAMES = [ diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py index 29b6fca..7a57f6c 100644 --- a/tests/signature/learning/helpers_test.py +++ b/tests/signature/learning/helpers_test.py @@ -52,29 +52,6 @@ def test_match_names(): ok_(RE_NAME.match(name), "{} should be matched".format(name)) -def test_sender_with_name(): - ok_lines = ['Sergey Obukhov ', - '\tSergey ', - ('"Doe, John (TX)"' - '@EXAMPLE' - ''), - ('Company Sleuth ' - '@EXAMPLE '), - ('Doe III, John ' - '')] - for line in ok_lines: - ok_(RE_SENDER_WITH_NAME.match(line), - '{} should be matched'.format(line)) - - nok_lines = ['', '', 'Sergey serobnic@xxx.ru'] - for line in nok_lines: - assert_false(RE_SENDER_WITH_NAME.match(line), - '{} should not be matched'.format(line)) - - # Now test helpers functions def test_binary_regex_search(): eq_(1, h.binary_regex_search(re.compile("12"))("12")) From 215e36e9ed2662741be17b8a6719cb3fa9f5715c Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:36:19 -0500 Subject: [PATCH 05/10] allow higher version of regex library --- setup.py | 2 +- talon/signature/learning/helpers.py | 2 +- tests/signature/learning/helpers_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 626c378..3776961 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ setup(name='talon', zip_safe=True, install_requires=[ "lxml==2.3.3", - "regex==0.1.20110315", # handling of .* changes from version 0 to 1 + "regex>=1", "html2text", "nose==1.2.1", "mock", diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 38259c3..51a9227 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES rc = re.compile RE_EMAIL = rc('@') -RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}') +RE_RELAX_PHONE = rc('(\(? 
?[\d]{2,3} ?\)?.{,3}?){2,}') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') # Taken from: diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py index 7a57f6c..704db4e 100644 --- a/tests/signature/learning/helpers_test.py +++ b/tests/signature/learning/helpers_test.py @@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()] def test_match_phone_numbers(): for phone in VALID_PHONE_NUMBERS: - ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone)) + ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone)) def test_match_names(): From c5e4cd9ab461593f5ee31501c38e42e09baa12e0 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Sun, 8 Mar 2015 00:40:32 -0500 Subject: [PATCH 06/10] dont be too restrictive on the test library version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3776961..8bd9591 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setup(name='talon', "lxml==2.3.3", "regex>=1", "html2text", - "nose==1.2.1", + "nose>=1.2.1", "mock", "coverage", "scikit-learn", From 8b1f87b1c027672b46871a623c57f43110e9ed39 Mon Sep 17 00:00:00 2001 From: Scott MacVicar Date: Wed, 6 May 2015 14:16:11 -0700 Subject: [PATCH 07/10] Get this building and passing tests Changes: * add .DS_Store to .gitignore * Decode base64 encoded emails for tests * Pick a version of scikit since the pickled clasifiers are based on that * Add missing numpy and scipy dependencies --- .gitignore | 5 ++++- setup.py | 4 +++- talon/signature/data/classifier | Bin 632 -> 608 bytes talon/signature/data/classifier_01.npy | Bin 88 -> 96 bytes talon/signature/data/classifier_02.npy | Bin 96 -> 176 bytes talon/signature/data/classifier_03.npy | Bin 184 -> 88 bytes tests/text_quotations_test.py | 2 +- 7 files changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 setup.py diff --git a/.gitignore b/.gitignore index af985ab..002f03e 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,7 @@ tramp *_archive # Trial temp -_trial_temp \ No newline at end of file +_trial_temp + +# OSX +.DS_Store \ No newline at end of file diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 8bd9591..fa94f58 --- a/setup.py +++ b/setup.py @@ -18,8 +18,10 @@ setup(name='talon', "regex>=1", "html2text", "nose>=1.2.1", + "numpy", "mock", "coverage", - "scikit-learn", + "scipy", + "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild ] ) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 405e6cd13dec1aa226d471dfca371bc52b8949d5..b7d72eb989596b74547efcbbcfbe280cc79f855d 100644 GIT binary patch delta 386 zcmX|+%}WAN6vbzp5oJ=$$}F|AveZ6Yh_ng{TnG)#Lb&V4bA0N+eBGHh)htBXl*^=l zFj@)y1MO;>z_Yk;Ilp@k=Ue`e-#YcIj{)`AkW37F%uoigi^<+?O^5|)-K4(H{6&8; z%NsUlp3YoOe4UyRk2zHYc^FU=*XNKO8Jw&b%+Dch(I;X=N4Rmnlptlt3&Bm@4lFlp z>5j)*R$y|2dL9jxzqiy=aO>Y!hCFncf(1wz9h_~ef-1gZv5z|qRRZxEcQsAB`-v0} zRp3|}fV`9h0n0B~>Ej-xTxL^~hqw<3vyBI^7Uz+;7gj-X9y~mRj7ROEV-cTI&hSVB zi9049_ijp8pI?>kB_#En!$t&alT`B=Po{YKJ`)SocFT4{tOo;7R?P`EVyHRAvj{2| U&m%h%2a-o-j3*o~v|!Ns1sX$zi2wiq delta 403 zcmZvYJx>Bb5QYVgs0T*zTR{9m1&tSD&{h*-C{((wEH-<~!d+bUZg%%16c=M>Dd7(q zTmOJ|o@+uZoMMW3GtWGkH|x`SZB)|y3@nstLbi2Poic@NUuY&N3-#!J)UY}c;V}P# z=DNPdlL!`)3FSm7JYas$(-;cgbp06?V^|u0rAl|pk&C87CJ5IqlJs*et;+ z$^?%R)=*v&(FyukX`&gdx6!O5155+f#js(B#TZIsCtZ3fCqxS~L1B}s2hNBTPAFL@ z1Im>0MbF{HW!%2~>DPh=9VTt4BvM;3Y@-=-nJElAhPxZXUd^+;oR>!hllfhrNqpLH zL^yW4#~!647Gm|=LS(vg!6x&1V?RDnEVhcN_e6T K$2Y0Sb-n=t*N8g+ 
diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy index 29344244b01bdd6d427cfe78e1083bad25c74ff7..11d130269642d0ec81c199f8cb370f603d1e5da9 100644 GIT binary patch delta 40 dcmaz@m|&}Hq@$^$P^*9hxER3T!+t1j4*;LS2VnpJ delta 32 ecmYd@m|!bysH3T)P^*9hxOUEaE>a==YCiydEeQw! diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy index 7c6997dd69eef019d6745dee125e92434b7954e3..0f965baa69b0df1b8f38acd7fa1e76f2c1a0a897 100644 GIT binary patch delta 139 zcmYe;z&JstKBTBLRYyU+I3uwjRozNK!%#=T&`489p;iG7xD2*3RF)rkYyU^w;J$6B|6?}aS@psG!J=o+r&av0Kj6H=S6$?d{f^D0?*m$X i+V}jb@KI^rZ(ruH=zF-w_Wdxr@yTJ<4{kLu= diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy index 97d9aa3ec06e932df59bcf4aa1d56cc0ad039210..5a35962eb9617631417f9bf564c051e71818867b 100644 GIT binary patch delta 50 wcmdnN7%@Rd$1O3ZI8{eMy*MMWAXVK;LBmi-Q%9jz0SR!GNzbm2lYX@y0AL6WxBvhE delta 147 zcma#p!8k#uKBTBLRYyU+I3uwjRozNK!%#=T&{$JPp;iG7xIP$$-_tqy);_$U)cwpM zh6CG|srBpK{bj$dUn0fe+7tWTGgMrTtomR-VRN__U*!+`6tgWq_KUo+e{^F*E@#V6 p`w!BQB^6Ek?Ui(mjOV*=-w&f_@N4mEo4)*RzjNMmkqYTo`vEfdJ$wKF diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 0a87e56..fcf5fcd 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -618,7 +618,7 @@ def test_standard_replies(): with open(filename) as f: message = email.message_from_file(f) body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() - text = ''.join(email.iterators.body_line_iterator(body)) + text = ''.join(email.iterators.body_line_iterator(body, True)) stripped_text = quotations.extract_from_plain(text) reply_text_fn = filename[:-4] + '_reply_text' From e3c4ff38fe7017dfed9f24f486e61916a7421c35 Mon Sep 17 00:00:00 2001 From: Scott MacVicar Date: Wed, 6 May 2015 15:19:50 -0700 Subject: [PATCH 08/10] move test stuff out to its own section --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index fa94f58..320520a 100755 --- a/setup.py +++ b/setup.py @@ -17,11 +17,13 @@ setup(name='talon', "lxml==2.3.3", "regex>=1", "html2text", - "nose>=1.2.1", "numpy", - "mock", - "coverage", "scipy", "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild + ], + tests_require=[ + "mock", + "nose>=1.2.1", + "coverage" ] ) From 7ea773e6a9820241d665f36448b621dd094a9361 Mon Sep 17 00:00:00 2001 From: Oliver Song Date: Thu, 2 Jul 2015 13:23:00 -0700 Subject: [PATCH 09/10] Fix iphone test --- tests/fixtures/standard_replies/iphone.eml | 4 ++-- tests/fixtures/standard_replies/iphone_reply_text | 3 +++ tests/text_quotations_test.py | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/standard_replies/iphone_reply_text diff --git a/tests/fixtures/standard_replies/iphone.eml b/tests/fixtures/standard_replies/iphone.eml index 60622f1..320f8ac 100644 --- a/tests/fixtures/standard_replies/iphone.eml +++ b/tests/fixtures/standard_replies/iphone.eml @@ -9,11 +9,11 @@ To: bob Content-Transfer-Encoding: quoted-printable Mime-Version: 1.0 (1.0) -hello +Hello Sent from my iPhone On Apr 3, 2012, at 4:19 PM, bob wr= ote: -> Hi \ No newline at end of file +> Hi diff --git a/tests/fixtures/standard_replies/iphone_reply_text b/tests/fixtures/standard_replies/iphone_reply_text new file mode 100644 index 0000000..460d6d7 --- /dev/null +++ b/tests/fixtures/standard_replies/iphone_reply_text @@ -0,0 +1,3 @@ +Hello + +Sent from my iPhone diff --git 
a/tests/text_quotations_test.py b/tests/text_quotations_test.py index fcf5fcd..a56c48d 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -624,7 +624,7 @@ def test_standard_replies(): reply_text_fn = filename[:-4] + '_reply_text' if os.path.isfile(reply_text_fn): with open(reply_text_fn) as f: - reply_text = f.read() + reply_text = f.read().strip() else: reply_text = 'Hello' yield eq_, reply_text, stripped_text, \ From 85c7ee980c668dea6da72def85975c4f965faa42 Mon Sep 17 00:00:00 2001 From: Alex Riina Date: Thu, 2 Jul 2015 21:46:27 -0400 Subject: [PATCH 10/10] add script to regenerate ml model --- README.rst | 18 ++++++++++++++++-- train.py | 10 ++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 train.py diff --git a/README.rst b/README.rst index 2b5966f..2517450 100644 --- a/README.rst +++ b/README.rst @@ -89,7 +89,7 @@ the power of machine learning algorithms: # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." # signature == "John Doe\nvia mobile" -For machine learning talon currently uses `PyML`_ library to build SVM +For machine learning talon currently uses the `scikit-learn`_ library to build SVM classifiers. The core of machine learning algorithm lays in ``talon.signature.learning package``. It defines a set of features to apply to a message (``featurespace.py``), how data sets are built @@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and used to load trained classifier. Those files should be regenerated every time the feature/data set is changed. -.. _PyML: http://pyml.sourceforge.net/ +To regenerate the model files, you can run + +.. code:: sh + + python train.py + +or + +.. code:: python + + from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA + from talon.signature.learning.classifier import train, init + train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + +.. _scikit-learn: http://scikit-learn.org .. _ENRON: https://www.cs.cmu.edu/~enron/ Research diff --git a/train.py b/train.py new file mode 100644 index 0000000..54d04b5 --- /dev/null +++ b/train.py @@ -0,0 +1,10 @@ +from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA +from talon.signature.learning.classifier import train, init + + +def train_model(): + """ retrain model and persist """ + train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) + +if __name__ == "__main__": + train_model()