13 Commits

Author SHA1 Message Date
Ralph Meijer
2377c387c7 Actually bump up talon's version up to 1.0.5 to match the tag. 2015-09-09 22:46:18 +02:00
Sergey Obukhov
9358db6cee bump up talon version 2015-09-03 11:03:01 -07:00
Sergey Obukhov
08c9d7db03 Merge pull request #45 from AlexRiina/master
Replace PyML with sklearn and clean up dependencies
2015-09-03 10:56:18 -07:00
Alex Riina
85c7ee980c add script to regenerate ml model 2015-07-02 21:49:09 -04:00
Oliver Song
7ea773e6a9 Fix iphone test 2015-07-02 21:49:09 -04:00
Scott MacVicar
e3c4ff38fe move test stuff out to its own section 2015-07-02 21:49:09 -04:00
Scott MacVicar
8b1f87b1c0 Get this building and passing tests
Changes:
* add .DS_Store to .gitignore
* Decode base64 encoded emails for tests
* Pick a version of scikit since the pickled classifiers are based on that
* Add missing numpy and scipy dependencies
2015-07-02 21:49:09 -04:00
Alex Riina
c5e4cd9ab4 don't be too restrictive on the test library version 2015-07-02 21:49:09 -04:00
Alex Riina
215e36e9ed allow higher version of regex library 2015-07-02 21:49:09 -04:00
Alex Riina
e3ef622031 remove unused regex 2015-07-02 21:49:09 -04:00
Alex Riina
f16760c466 Remove flanker and replace PyML with scikit-learn
I never was actually able to successfully install PyML but the source-forge
distribution and lack of python3 support convinced me that scikit-learn would
be a fine substitute. Flanker was also difficult for me to install and seemed
only to be used in the tests, so I removed it as well to get into a position
where I could run the tests. As of this commit, only one is not passing
(test_standard_replies with android.eml) though I'm not familiar with the `email`
library yet.
2015-07-02 21:49:09 -04:00
Alex Riina
b36287e573 clean up style and extra imports 2015-07-02 21:49:09 -04:00
Alex Riina
4df7aa284b remove extra imports 2015-07-02 21:49:09 -04:00
24 changed files with 95 additions and 205 deletions

5
.gitignore vendored
View File

@@ -48,4 +48,7 @@ tramp
*_archive *_archive
# Trial temp # Trial temp
_trial_temp _trial_temp
# OSX
.DS_Store

View File

@@ -89,7 +89,7 @@ the power of machine learning algorithms:
# text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage."
# signature == "John Doe\nvia mobile" # signature == "John Doe\nvia mobile"
For machine learning talon currently uses `PyML`_ library to build SVM For machine learning talon currently uses the `scikit-learn`_ library to build SVM
classifiers. The core of the machine learning algorithm lies in classifiers. The core of the machine learning algorithm lies in
``talon.signature.learning package``. It defines a set of features to ``talon.signature.learning package``. It defines a set of features to
apply to a message (``featurespace.py``), how data sets are built apply to a message (``featurespace.py``), how data sets are built
@@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and
used to load trained classifier. Those files should be regenerated every used to load trained classifier. Those files should be regenerated every
time the feature/data set is changed. time the feature/data set is changed.
.. _PyML: http://pyml.sourceforge.net/ To regenerate the model files, you can run
.. code:: sh
python train.py
or
.. code:: python
from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
from talon.signature.learning.classifier import train, init
train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
.. _scikit-learn: http://scikit-learn.org
.. _ENRON: https://www.cs.cmu.edu/~enron/ .. _ENRON: https://www.cs.cmu.edu/~enron/
Research Research

95
setup.py Normal file → Executable file
View File

@@ -1,13 +1,8 @@
import os
import sys
import contextlib
from distutils.spawn import find_executable
from setuptools import setup, find_packages from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.0.2', version='1.0.5',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -20,87 +15,15 @@ setup(name='talon',
zip_safe=True, zip_safe=True,
install_requires=[ install_requires=[
"lxml==2.3.3", "lxml==2.3.3",
"regex==0.1.20110315", "regex>=1",
"chardet==1.0.1",
"dnspython==1.11.1",
"html2text", "html2text",
"nose==1.2.1", "numpy",
"scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
],
tests_require=[
"mock", "mock",
"coverage", "nose>=1.2.1",
"flanker" "coverage"
] ]
) )
def install_pyml():
'''
Downloads and installs PyML
'''
try:
import PyML
except:
pass
else:
return
# install numpy first
pip('install numpy==1.6.1 --upgrade')
pyml_tarball = (
'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
'.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
pyml_srcidr = 'PyML-0.7.9'
# see if PyML tarball needs to be fetched:
if not dir_exists(pyml_srcidr):
run("curl %s | tar -xz" % pyml_tarball)
# compile&install:
with cd(pyml_srcidr):
python('setup.py build')
python('setup.py install')
def run(command):
if os.system(command) != 0:
raise Exception("Failed '{}'".format(command))
else:
return 0
def python(command):
command = '{} {}'.format(sys.executable, command)
run(command)
def enforce_executable(name, install_info):
if os.system("which {}".format(name)) != 0:
raise Exception(
'{} utility is missing.\nTo install, run:\n\n{}\n'.format(
name, install_info))
def pip(command):
command = '{} {}'.format(find_executable('pip'), command)
run(command)
def dir_exists(path):
return os.path.isdir(path)
@contextlib.contextmanager
def cd(directory):
curdir = os.getcwd()
try:
os.chdir(directory)
yield {}
finally:
os.chdir(curdir)
if __name__ == '__main__':
if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
enforce_executable('curl', 'sudo aptitude install curl')
install_pyml()

View File

@@ -12,8 +12,7 @@ from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import html2text import html2text
from talon.constants import RE_DELIMITER from talon.utils import get_delimiter
from talon.utils import random_token, get_delimiter
from talon import html_quotations from talon import html_quotations
@@ -151,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'):
return extract_from_plain(msg_body) return extract_from_plain(msg_body)
elif content_type == 'text/html': elif content_type == 'text/html':
return extract_from_html(msg_body) return extract_from_html(msg_body)
except Exception, e: except Exception:
log.exception('ERROR extracting message') log.exception('ERROR extracting message')
return msg_body return msg_body
@@ -344,7 +343,7 @@ def extract_from_html(msg_body):
html_tree_copy = deepcopy(html_tree) html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) msg_with_checkpoints = html.tostring(html_tree)
h = html2text.HTML2Text() h = html2text.HTML2Text()

View File

@@ -21,11 +21,9 @@ trained against, don't forget to regenerate:
""" """
import os import os
import sys
from cStringIO import StringIO
from . import extraction from . import extraction
from . extraction import extract from . extraction import extract #noqa
from . learning import classifier from . learning import classifier
@@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
def initialize(): def initialize():
try: extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
# redirect output EXTRACTOR_DATA)
so, sys.stdout = sys.stdout, StringIO()
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
EXTRACTOR_DATA)
sys.stdout = so
except Exception, e:
raise Exception(
"Failed initializing signature parsing with classifiers", e)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,14 +1,10 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os
import logging import logging
import regex as re import regex as re
from PyML import SparseDataSet import numpy
from talon.constants import RE_DELIMITER
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
from talon.signature.learning.featurespace import features, build_pattern from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter from talon.utils import get_delimiter
from talon.signature.bruteforce import get_signature_candidate from talon.signature.bruteforce import get_signature_candidate
@@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
def is_signature_line(line, sender, classifier): def is_signature_line(line, sender, classifier):
'''Checks if the line belongs to signature. Returns True or False.''' '''Checks if the line belongs to signature. Returns True or False.'''
data = SparseDataSet([build_pattern(line, features(sender))]) data = numpy.array(build_pattern(line, features(sender)))
return classifier.decisionFunc(data, 0) > 0 return classifier.predict(data) > 0
def extract(body, sender): def extract(body, sender):
@@ -61,7 +57,7 @@ def extract(body, sender):
text = delimiter.join(text) text = delimiter.join(text)
if text.strip(): if text.strip():
return (text, delimiter.join(signature)) return (text, delimiter.join(signature))
except Exception, e: except Exception:
log.exception('ERROR when extracting signature with classifiers') log.exception('ERROR when extracting signature with classifiers')
return (body, None) return (body, None)

View File

@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
body belongs to the signature. body belongs to the signature.
""" """
import os from numpy import genfromtxt
import sys from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from PyML import SparseDataSet, SVM
def init(): def init():
'''Inits classifier with optimal options.''' """Inits classifier with optimal options."""
return SVM(C=10, optimization='liblinear') return LinearSVC(C=10.0)
def train(classifier, train_data_filename, save_classifier_filename=None): def train(classifier, train_data_filename, save_classifier_filename=None):
'''Trains and saves classifier so that it could be easily loaded later.''' """Trains and saves classifier so that it could be easily loaded later."""
data = SparseDataSet(train_data_filename, labelsColumn=-1) file_data = genfromtxt(train_data_filename, delimiter=",")
classifier.train(data) train_data, labels = file_data[:, :-1], file_data[:, -1]
classifier.fit(train_data, labels)
if save_classifier_filename: if save_classifier_filename:
classifier.save(save_classifier_filename) joblib.dump(classifier, save_classifier_filename)
return classifier return classifier
def load(saved_classifier_filename, train_data_filename): def load(saved_classifier_filename, train_data_filename):
"""Loads saved classifier. """Loads saved classifier. """
return joblib.load(saved_classifier_filename)
Classifier should be loaded with the same data it was trained against
"""
train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier = init()
classifier.load(saved_classifier_filename, train_data)
return classifier

View File

@@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile rc = re.compile
RE_EMAIL = rc('@') RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
# Taken from: # Taken from:
@@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen. # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')
# Pattern to match if e.g. 'Sender:' header field has sender names.
SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
# Reply line clue line endings, as in regular expression:
# " wrote:$" or " writes:$"
RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')
INVALID_WORD_START = rc('\(|\+|[\d]') INVALID_WORD_START = rc('\(|\+|[\d]')
BAD_SENDER_NAMES = [ BAD_SENDER_NAMES = [

View File

@@ -9,11 +9,11 @@ To: bob <bob@example.com>
Content-Transfer-Encoding: quoted-printable Content-Transfer-Encoding: quoted-printable
Mime-Version: 1.0 (1.0) Mime-Version: 1.0 (1.0)
hello Hello
Sent from my iPhone Sent from my iPhone
On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr= On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr=
ote: ote:
> Hi > Hi

View File

@@ -0,0 +1,3 @@
Hello
Sent from my iPhone

View File

@@ -4,7 +4,6 @@ from . import *
from . fixtures import * from . fixtures import *
import regex as re import regex as re
from flanker import mime
from talon import quotations from talon import quotations
@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():
def test_reply_quotations_share_block(): def test_reply_quotations_share_block():
msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
html_part = list(msg.walk())[1]
assert html_part.content_type == 'text/html'
stripped_html = quotations.extract_from_html(html_part.body)
ok_(stripped_html) ok_(stripped_html)
ok_('From' not in stripped_html) ok_('From' not in stripped_html)

View File

@@ -3,8 +3,6 @@
from . import * from . import *
from . fixtures import * from . fixtures import *
from flanker import mime
from talon import quotations from talon import quotations

View File

@@ -2,10 +2,6 @@
from .. import * from .. import *
import os
from flanker import mime
from talon.signature import bruteforce from talon.signature import bruteforce

View File

@@ -4,8 +4,6 @@ from .. import *
import os import os
from PyML import SparseDataSet
from talon.signature.learning import dataset from talon.signature.learning import dataset
from talon import signature from talon import signature
from talon.signature import extraction as e from talon.signature import extraction as e

View File

@@ -3,9 +3,8 @@
from ... import * from ... import *
import os import os
from PyML import SparseDataSet from numpy import genfromtxt
from talon.utils import to_unicode
from talon.signature.learning import dataset as d from talon.signature.learning import dataset as d
from talon.signature.learning.featurespace import features from talon.signature.learning.featurespace import features
@@ -42,10 +41,13 @@ def test_build_extraction_dataset():
d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
os.path.join(TMP_DIR, os.path.join(TMP_DIR,
'extraction.data'), 1) 'extraction.data'), 1)
test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
labelsColumn=-1) filename = os.path.join(TMP_DIR, 'extraction.data')
file_data = genfromtxt(filename, delimiter=",")
test_data = file_data[:, :-1]
# the result is a loadable signature extraction dataset # the result is a loadable signature extraction dataset
# 32 comes from 3 emails in emails/P folder, 11 lines checked to be # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
# a signature, one email has only 10 lines # a signature, one email has only 10 lines
eq_(test_data.size(), 32) eq_(test_data.shape[0], 32)
eq_(len(features('')), test_data.numFeatures) eq_(len(features('')), test_data.shape[1])

View File

@@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]
def test_match_phone_numbers(): def test_match_phone_numbers():
for phone in VALID_PHONE_NUMBERS: for phone in VALID_PHONE_NUMBERS:
ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone)) ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone))
def test_match_names(): def test_match_names():
@@ -52,29 +52,6 @@ def test_match_names():
ok_(RE_NAME.match(name), "{} should be matched".format(name)) ok_(RE_NAME.match(name), "{} should be matched".format(name))
def test_sender_with_name():
ok_lines = ['Sergey Obukhov <serobnic@example.com>',
'\tSergey <serobnic@example.com>',
('"Doe, John (TX)"'
'<DowJ@example.com>@EXAMPLE'
'<IMCEANOTES-+22Doe+2C+20John+20'
'+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'
'+40EXAMPLE@EXAMPLE.com>'),
('Company Sleuth <csleuth@email.xxx.com>'
'@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'
'+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),
('Doe III, John '
'</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]
for line in ok_lines:
ok_(RE_SENDER_WITH_NAME.match(line),
'{} should be matched'.format(line))
nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']
for line in nok_lines:
assert_false(RE_SENDER_WITH_NAME.match(line),
'{} should not be matched'.format(line))
# Now test helpers functions # Now test helpers functions
def test_binary_regex_search(): def test_binary_regex_search():
eq_(1, h.binary_regex_search(re.compile("12"))("12")) eq_(1, h.binary_regex_search(re.compile("12"))("12"))

View File

@@ -5,8 +5,7 @@ from . fixtures import *
import os import os
from flanker import mime import email.iterators
from talon import quotations from talon import quotations
@@ -614,22 +613,21 @@ def test_preprocess_postprocess_2_links():
def test_standard_replies(): def test_standard_replies():
for filename in os.listdir(STANDARD_REPLIES): for filename in os.listdir(STANDARD_REPLIES):
filename = os.path.join(STANDARD_REPLIES, filename) filename = os.path.join(STANDARD_REPLIES, filename)
if os.path.isdir(filename): if not filename.endswith('.eml') or os.path.isdir(filename):
continue continue
with open(filename) as f: with open(filename) as f:
msg = f.read() message = email.message_from_file(f)
m = mime.from_string(msg) body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
for part in m.walk(): text = ''.join(email.iterators.body_line_iterator(body, True))
if part.content_type == 'text/plain':
text = part.body stripped_text = quotations.extract_from_plain(text)
stripped_text = quotations.extract_from_plain(text) reply_text_fn = filename[:-4] + '_reply_text'
reply_text_fn = filename[:-4] + '_reply_text' if os.path.isfile(reply_text_fn):
if os.path.isfile(reply_text_fn): with open(reply_text_fn) as f:
with open(reply_text_fn) as f: reply_text = f.read().strip()
reply_text = f.read() else:
else: reply_text = 'Hello'
reply_text = 'Hello' yield eq_, reply_text, stripped_text, \
eq_(reply_text, stripped_text, "'%(reply)s' != %(stripped)s for %(fn)s" % \
"'%(reply)s' != %(stripped)s for %(fn)s" % {'reply': reply_text, 'stripped': stripped_text,
{'reply': reply_text, 'stripped': stripped_text, 'fn': filename}
'fn': filename})

10
train.py Normal file
View File

@@ -0,0 +1,10 @@
from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
from talon.signature.learning.classifier import train, init
def train_model():
""" retrain model and persist """
train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
if __name__ == "__main__":
train_model()