Actually bump up talon's version up to 1.0.5 to match the tag.

bump up talon version
Merge pull request #45 from AlexRiina/master
2015-09-09 22:46:18 +02:00 · 2015-09-03 11:03:01 -07:00 · 2015-09-03 10:56:18 -07:00 · 2015-07-02 21:49:09 -04:00 · 2015-07-02 21:49:09 -04:00 · 2015-07-02 21:49:09 -04:00
25 changed files with 278 additions and 234 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,7 @@ tramp
 *_archive
 # Trial temp
-_trial_temp
+_trial_temp
 # OSX
 .DS_Store
--- a/README.rst
+++ b/README.rst
@@ -89,7 +89,7 @@ the power of machine learning algorithms:
    # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage."
    # signature == "John Doe\nvia mobile"
-For machine learning talon currently uses `PyML`_ library to build SVM
+For machine learning talon currently uses the `scikit-learn`_ library to build SVM
 classifiers. The core of machine learning algorithm lays in
 ``talon.signature.learning package``. It defines a set of features to
 apply to a message (``featurespace.py``), how data sets are built
@@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and
 used to load trained classifier. Those files should be regenerated every
 time the feature/data set is changed.
-.. _PyML: http://pyml.sourceforge.net/
+To regenerate the model files, you can run
 .. code:: sh
    python train.py
 or
 .. code:: python
    from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
    from talon.signature.learning.classifier import train, init
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
 .. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/
 Research
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,8 @@
 import os
 import sys
 import contextlib
 from distutils.spawn import find_executable
 from setuptools import setup, find_packages
 setup(name='talon',
-      version='1.0.2',
+      version='1.0.5',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -20,87 +15,15 @@ setup(name='talon',
      zip_safe=True,
      install_requires=[
          "lxml==2.3.3",
-          "regex==0.1.20110315",
+          "regex>=1",
          "chardet==1.0.1",
          "dnspython==1.11.1",
          "html2text",
-          "nose==1.2.1",
+          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
          ],
      tests_require=[
          "mock",
-          "coverage",
+          "nose>=1.2.1",
-          "flanker"
+          "coverage"
          ]
      )
 def install_pyml():
    '''
    Downloads and installs PyML
    '''
    try:
        import PyML
    except:
        pass
    else:
        return
    # install numpy first
    pip('install numpy==1.6.1 --upgrade')
    pyml_tarball = (
        'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
        '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
    pyml_srcidr = 'PyML-0.7.9'
    # see if PyML tarball needs to be fetched:
    if not dir_exists(pyml_srcidr):
        run("curl %s | tar -xz" % pyml_tarball)
    # compile&install:
    with cd(pyml_srcidr):
        python('setup.py build')
        python('setup.py install')
 def run(command):
    if os.system(command) != 0:
        raise Exception("Failed '{}'".format(command))
    else:
        return 0
 def python(command):
    command = '{} {}'.format(sys.executable, command)
    run(command)
 def enforce_executable(name, install_info):
    if os.system("which {}".format(name)) != 0:
        raise Exception(
            '{} utility is missing.\nTo install, run:\n\n{}\n'.format(
                name, install_info))
 def pip(command):
    command = '{} {}'.format(find_executable('pip'), command)
    run(command)
 def dir_exists(path):
    return os.path.isdir(path)
@contextlib.contextmanager
 def cd(directory):
    curdir = os.getcwd()
    try:
        os.chdir(directory)
        yield {}
    finally:
        os.chdir(curdir)
 if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
        enforce_executable('curl', 'sudo aptitude install curl')
        install_pyml()
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,8 +12,7 @@ from copy import deepcopy
 from lxml import html, etree
 import html2text
-from talon.constants import RE_DELIMITER
+from talon.utils import get_delimiter
 from talon.utils import random_token, get_delimiter
 from talon import html_quotations
@@ -23,14 +22,49 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 RE_ON_DATE_SMB_WROTE = re.compile(
-    r'''
+    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
-    (
+        # Beginning of the line
-        -*  # could include dashes
+        u'|'.join((
-        [ ]?On[ ].*,  # date part ends with comma
+            # English
-        (.*\n){0,2}  # splitter takes 4 lines at most
+            'On',
-        .*(wrote|sent):
+            # French
            'Le',
            # Polish
            'W dniu',
            # Dutch
            'Op'
        )),
        # Date and sender separator
        u'|'.join((
            # most languages separate date and sender address by comma
            ',',
            # polish date and sender address separator
            u'użytkownik'
        )),
        # Ending of the line
        u'|'.join((
            # English
            'wrote', 'sent',
            # French
            u'a écrit',
            # Polish
            u'napisał',
            # Dutch
            'schreef','verzond','geschreven'
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
        # Beginning of the line
        	'Op',
        # Ending of the line
        u'|'.join((
            # Dutch
            'schreef','verzond','geschreven'
        ))
    )
    )
    ''', re.VERBOSE)
 RE_QUOTATION = re.compile(
    r'''
@@ -66,16 +100,33 @@ RE_EMPTY_QUOTATION = re.compile(
    e*
    ''', re.VERBOSE)
 # ------Original Message------ or ---- Reply Message ----
 # With variations in other languages.
 RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        'Original Message', 'Reply Message',
        # German
        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
        # Danish
        'Oprindelig meddelelse',
    ))), re.I)
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
        'From', 'Van', 'De', 'Von', 'Fra',
        # "Date" in different languages.
        'Date', 'Datum', u'Envoyé'
    ))), re.I)
 SPLITTER_PATTERNS = [
-    # ------Original Message------ or ---- Reply Message ----
+    RE_ORIGINAL_MESSAGE,
    re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
    # <date> <person>
    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
-    re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
+    RE_ON_DATE_WROTE_SMB,
-    re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'),
+    RE_FROM_COLON_OR_DATE_COLON,
    re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'),
    re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'),
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]
@@ -99,7 +150,7 @@ def extract_from(msg_body, content_type='text/plain'):
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
-    except Exception, e:
+    except Exception:
        log.exception('ERROR extracting message')
    return msg_body
@@ -292,7 +343,7 @@ def extract_from_html(msg_body):
    html_tree_copy = deepcopy(html_tree)
    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
-    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
+    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
    h = html2text.HTML2Text()
--- a/talon/signature/init.py
+++ b/talon/signature/init.py
@@ -21,11 +21,9 @@ trained against, don't forget to regenerate:
 """
 import os
 import sys
 from cStringIO import StringIO
 from . import extraction
-from . extraction import extract
+from . extraction import extract  #noqa
 from . learning import classifier
@@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
 def initialize():
-    try:
+    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-        # redirect output
+                                           EXTRACTOR_DATA)
        so, sys.stdout = sys.stdout, StringIO()
        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
                                               EXTRACTOR_DATA)
        sys.stdout = so
    except Exception, e:
        raise Exception(
            "Failed initializing signature parsing with classifiers", e)
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_01.npy
+++ b/talon/signature/data/classifier_01.npy
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/data/classifier_04.npy
+++ b/talon/signature/data/classifier_04.npy
--- a/talon/signature/data/classifier_05.npy
+++ b/talon/signature/data/classifier_05.npy
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -1,14 +1,10 @@
 # -*- coding: utf-8 -*-
 import os
 import logging
 import regex as re
-from PyML import SparseDataSet
+import numpy
 from talon.constants import RE_DELIMITER
 from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
 from talon.signature.bruteforce import get_signature_candidate
@@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
 def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
-    data = SparseDataSet([build_pattern(line, features(sender))])
+    data = numpy.array(build_pattern(line, features(sender)))
-    return classifier.decisionFunc(data, 0) > 0
+    return classifier.predict(data) > 0
 def extract(body, sender):
@@ -61,7 +57,7 @@ def extract(body, sender):
                text = delimiter.join(text)
                if text.strip():
                    return (text, delimiter.join(signature))
-    except Exception, e:
+    except Exception:
        log.exception('ERROR when extracting signature with classifiers')
    return (body, None)
--- a/talon/signature/learning/classifier.py
+++ b/talon/signature/learning/classifier.py
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """
-import os
+from numpy import genfromtxt
-import sys
+from sklearn.svm import LinearSVC
-
+from sklearn.externals import joblib
 from PyML import SparseDataSet, SVM
 def init():
-    '''Inits classifier with optimal options.'''
+    """Inits classifier with optimal options."""
-    return SVM(C=10, optimization='liblinear')
+    return LinearSVC(C=10.0)
 def train(classifier, train_data_filename, save_classifier_filename=None):
-    '''Trains and saves classifier so that it could be easily loaded later.'''
+    """Trains and saves classifier so that it could be easily loaded later."""
-    data = SparseDataSet(train_data_filename, labelsColumn=-1)
+    file_data = genfromtxt(train_data_filename, delimiter=",")
-    classifier.train(data)
+    train_data, labels = file_data[:, :-1], file_data[:, -1]
    classifier.fit(train_data, labels)
    if save_classifier_filename:
-        classifier.save(save_classifier_filename)
+        joblib.dump(classifier, save_classifier_filename)
    return classifier
 def load(saved_classifier_filename, train_data_filename):
-    """Loads saved classifier.
+    """Loads saved classifier. """
-
+    return joblib.load(saved_classifier_filename)
    Classifier should be loaded with the same data it was trained against
    """
    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier = init()
    classifier.load(saved_classifier_filename, train_data)
    return classifier
--- a/talon/signature/learning/featurespace.py
+++ b/talon/signature/learning/featurespace.py
@@ -7,7 +7,8 @@ The body and the message sender string are converted into unicode before
 applying features to them.
 """
-from talon.signature.constants import SIGNATURE_MAX_LINES
+from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.helpers import *
@@ -20,7 +21,7 @@ def features(sender=''):
        # This one is not from paper.
        # Line is too long.
        # This one is less aggressive than `Line is too short`
-        lambda line: 1 if len(line) > 60 else 0,
+        lambda line: 1 if len(line) > TOO_LONG_SIGNATURE_LINE else 0,
        # Line contains email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains url.
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -17,7 +17,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
 rc = re.compile
 RE_EMAIL = rc('@')
-RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
+RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
 # Taken from:
@@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
 # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
 RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')
 # Pattern to match if e.g. 'Sender:' header field has sender names.
 SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
 RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
 # Reply line clue line endings, as in regular expression:
 # " wrote:$" or " writes:$"
 RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')
 INVALID_WORD_START = rc('\(|\+|[\d]')
 BAD_SENDER_NAMES = [
--- a/tests/fixtures/standard_replies/iphone.eml
+++ b/tests/fixtures/standard_replies/iphone.eml
@@ -9,11 +9,11 @@ To: bob <bob@example.com>
 Content-Transfer-Encoding: quoted-printable
 Mime-Version: 1.0 (1.0)
-hello
+Hello
 Sent from my iPhone
 On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr=
 ote:
-> Hi
+> Hi
--- a/tests/fixtures/standard_replies/iphone_reply_text
+++ b/tests/fixtures/standard_replies/iphone_reply_text
@@ -0,0 +1,3 @@
 Hello
 Sent from my iPhone
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -4,7 +4,6 @@ from . import *
 from . fixtures import *
 import regex as re
 from flanker import mime
 from talon import quotations
@@ -224,10 +223,7 @@ def test_reply_shares_div_with_from_block():
 def test_reply_quotations_share_block():
-    msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
+    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    html_part = list(msg.walk())[1]
    assert html_part.content_type == 'text/html'
    stripped_html = quotations.extract_from_html(html_part.body)
    ok_(stripped_html)
    ok_('From' not in stripped_html)
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -3,8 +3,6 @@
 from . import *
 from . fixtures import *
 from flanker import mime
 from talon import quotations
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -2,10 +2,6 @@
 from .. import *
 import os
 from flanker import mime
 from talon.signature import bruteforce
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -4,8 +4,6 @@ from .. import *
 import os
 from PyML import SparseDataSet
 from talon.signature.learning import dataset
 from talon import signature
 from talon.signature import extraction as e
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -3,9 +3,8 @@
 from ... import *
 import os
-from PyML import SparseDataSet
+from numpy import genfromtxt
 from talon.utils import to_unicode
 from talon.signature.learning import dataset as d
 from talon.signature.learning.featurespace import features
@@ -42,10 +41,13 @@ def test_build_extraction_dataset():
    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
                               os.path.join(TMP_DIR,
                                            'extraction.data'), 1)
-    test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
+
-                              labelsColumn=-1)
+    filename = os.path.join(TMP_DIR, 'extraction.data')
    file_data = genfromtxt(filename, delimiter=",")
    test_data = file_data[:, :-1]
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
-    eq_(test_data.size(), 32)
+    eq_(test_data.shape[0], 32)
-    eq_(len(features('')), test_data.numFeatures)
+    eq_(len(features('')), test_data.shape[1])
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]
 def test_match_phone_numbers():
    for phone in VALID_PHONE_NUMBERS:
-        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))
+        ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone))
 def test_match_names():
@@ -52,29 +52,6 @@ def test_match_names():
        ok_(RE_NAME.match(name), "{} should be matched".format(name))
 def test_sender_with_name():
    ok_lines = ['Sergey Obukhov <serobnic@example.com>',
                '\tSergey  <serobnic@example.com>',
                ('"Doe, John (TX)"'
                 '<DowJ@example.com>@EXAMPLE'
                 '<IMCEANOTES-+22Doe+2C+20John+20'
                 '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'
                 '+40EXAMPLE@EXAMPLE.com>'),
                ('Company Sleuth <csleuth@email.xxx.com>'
                 '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'
                 '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),
                ('Doe III, John '
                 '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]
    for line in ok_lines:
        ok_(RE_SENDER_WITH_NAME.match(line),
            '{} should be matched'.format(line))
    nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']
    for line in nok_lines:
        assert_false(RE_SENDER_WITH_NAME.match(line),
                     '{} should not be matched'.format(line))
 # Now test helpers functions
 def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -5,8 +5,7 @@ from . fixtures import *
 import os
-from flanker import mime
+import email.iterators
 from talon import quotations
@@ -33,6 +32,16 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))
 def test_pattern_on_date_wrote_somebody():
    eq_('Lorem', quotations.extract_from_plain(
    """Lorem
 Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
 Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
 """))
 def test_pattern_on_date_somebody_wrote_date_with_slashes():
    msg_body = """Test reply
@@ -98,22 +107,24 @@ bla-bla - bla"""
    eq_(reply, quotations.extract_from_plain(msg_body))
-def test_pattern_original_message():
+def _check_pattern_original_message(original_message_indicator):
-    msg_body = """Test reply
+    msg_body = u"""Test reply
-----Original Message-----
+-----{}-----
 Test"""
    eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+def test_english_original_message():
    _check_pattern_original_message('Original Message')
    _check_pattern_original_message('Reply Message')
-    msg_body = """Test reply
+def test_german_original_message():
    _check_pattern_original_message(u'Ursprüngliche Nachricht')
    _check_pattern_original_message('Antwort Nachricht')
- -----Original Message-----
+def test_danish_original_message():
-
+    _check_pattern_original_message('Oprindelig meddelelse')
 Test"""
    eq_("Test reply", quotations.extract_from_plain(msg_body))
 def test_reply_after_quotations():
@@ -199,6 +210,33 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
 > Hello"""
    eq_("Hi", quotations.extract_from_plain(msg_body))
 def test_with_indent():
    msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
 ------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
 Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. 
    """
    eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
 def test_short_quotation_with_newline():
    msg_body = """Btw blah blah...
 On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
 Hi Mark,
 Blah blah? 
 Thanks,Christine 
 On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
 Lorem ipsum?
 Mark
 Sent from Acompli"""
    eq_("Btw blah blah...", quotations.extract_from_plain(msg_body))
 def test_pattern_date_email_with_unicode():
    msg_body = """Replying ok
@@ -208,8 +246,8 @@ def test_pattern_date_email_with_unicode():
    eq_("Replying ok", quotations.extract_from_plain(msg_body))
-def test_pattern_from_block():
+def test_english_from_block():
-    msg_body = """Allo! Follow up MIME!
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain("""Allo! Follow up MIME!
 From: somebody@example.com
 Sent: March-19-11 5:42 PM
@@ -217,8 +255,70 @@ To: Somebody
 Subject: The manager has commented on your Loop
 Blah-blah-blah
-"""
+"""))
-    eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body))
+
 def test_german_from_block():
    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
    """Allo! Follow up MIME!
 Von: somebody@example.com
 Gesendet: Dienstag, 25. November 2014 14:59
 An: Somebody
 Betreff: The manager has commented on your Loop
 Blah-blah-blah
 """))
 def test_french_multiline_from_block():
    eq_('Lorem ipsum', quotations.extract_from_plain(
    u"""Lorem ipsum
 De : Brendan xxx [mailto:brendan.xxx@xxx.com]
 Envoyé : vendredi 23 janvier 2015 16:39
 À : Camille XXX
 Objet : Follow Up
 Blah-blah-blah
 """))
 def test_french_from_block():
    eq_('Lorem ipsum', quotations.extract_from_plain(
    u"""Lorem ipsum
 Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
 Bonjour!"""))
 def test_polish_from_block():
    eq_('Lorem ipsum', quotations.extract_from_plain(
    u"""Lorem ipsum
 W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
 napisał:
 Blah!
 """))
 def test_danish_from_block():
    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
    """Allo! Follow up MIME!
 Fra: somebody@example.com
 Sendt: 19. march 2011 12:10
 Til: Somebody
 Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud. 
 Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
 Small batch beard laboris tempor, non listicle hella Tumblr heirloom. 
 """))
 def test_quotation_marker_false_positive():
@@ -513,22 +613,21 @@ def test_preprocess_postprocess_2_links():
 def test_standard_replies():
    for filename in os.listdir(STANDARD_REPLIES):
        filename = os.path.join(STANDARD_REPLIES, filename)
-        if os.path.isdir(filename):
+        if not filename.endswith('.eml') or os.path.isdir(filename):
            continue
        with open(filename) as f:
-            msg = f.read()
+            message = email.message_from_file(f)
-            m = mime.from_string(msg)
+            body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
-            for part in m.walk():
+            text = ''.join(email.iterators.body_line_iterator(body, True))
-                if part.content_type == 'text/plain':
+
-                    text = part.body
+            stripped_text = quotations.extract_from_plain(text)
-                    stripped_text = quotations.extract_from_plain(text)
+            reply_text_fn = filename[:-4] + '_reply_text'
-                    reply_text_fn = filename[:-4] + '_reply_text'
+            if os.path.isfile(reply_text_fn):
-                    if os.path.isfile(reply_text_fn):
+                with open(reply_text_fn) as f:
-                        with open(reply_text_fn) as f:
+                    reply_text = f.read().strip()
-                            reply_text = f.read()
+            else:
-                    else:
+                reply_text = 'Hello'
-                        reply_text = 'Hello'
+            yield eq_, reply_text, stripped_text, \
-                    eq_(reply_text, stripped_text,
+                "'%(reply)s' != %(stripped)s for %(fn)s" % \
-                        "'%(reply)s' != %(stripped)s for %(fn)s" %
+                {'reply': reply_text, 'stripped': stripped_text,
-                        {'reply': reply_text, 'stripped': stripped_text,
+                 'fn': filename}
                         'fn': filename})
--- a/train.py
+++ b/train.py
@@ -0,0 +1,10 @@
 from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
 from talon.signature.learning.classifier import train, init
 def train_model():
    """ retrain model and persist """
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
 if __name__ == "__main__":
    train_model()
Author	SHA1	Message	Date
Ralph Meijer	2377c387c7	Actually bump up talon's version up to 1.0.5 to match the tag.	2015-09-09 22:46:18 +02:00
Sergey Obukhov	9358db6cee	bump up talon version	2015-09-03 11:03:01 -07:00
Sergey Obukhov	08c9d7db03	Merge pull request #45 from AlexRiina/master Replace PyML with sklearn and clean up dependencies	2015-09-03 10:56:18 -07:00
Alex Riina	85c7ee980c	add script to regenerate ml model	2015-07-02 21:49:09 -04:00
Oliver Song	7ea773e6a9	Fix iphone test	2015-07-02 21:49:09 -04:00
Scott MacVicar	e3c4ff38fe	move test stuff out to its own section	2015-07-02 21:49:09 -04:00
Scott MacVicar	8b1f87b1c0	Get this building and passing tests Changes: * add .DS_Store to .gitignore * Decode base64 encoded emails for tests * Pick a version of scikit since the pickled classifiers are based on that * Add missing numpy and scipy dependencies	2015-07-02 21:49:09 -04:00
Alex Riina	c5e4cd9ab4	dont be too restrictive on the test library version	2015-07-02 21:49:09 -04:00
Alex Riina	215e36e9ed	allow higher version of regex library	2015-07-02 21:49:09 -04:00
Alex Riina	e3ef622031	remove unused regex	2015-07-02 21:49:09 -04:00
Alex Riina	f16760c466	Remove flanker and replace PyML with scikit-learn I never was actually able to successfully install PyML but the source-forge distribution and lack of python3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult for me to install and seemed only to be used in the tests, so I removed it as well to get into a position where I could run the tests. As of this commit, only one is not passing (test_standard_replies with android.eml) though I'm not familiar with the `email` library yet.	2015-07-02 21:49:09 -04:00
Alex Riina	b36287e573	clean up style and extra imports	2015-07-02 21:49:09 -04:00
Alex Riina	4df7aa284b	remove extra imports	2015-07-02 21:49:09 -04:00
Jeremy Schlatter	3a37d8b649	Merge pull request #41 from simonflore/master New splitter pattern for Dutch mail replies	2015-04-22 12:17:39 -07:00
Simon	f9f428f4c3	Revert "Change of behavior when msg_body has more then 1000 lines" This reverts commit `84a83e865e`.	2015-04-16 13:26:17 +02:00
Simon	84a83e865e	Change of behavior when msg_body has more then 1000 lines	2015-04-16 13:22:18 +02:00
Simon	b4c180b9ff	Extra spaces check in RE_ON_DATE_WROTE_SMB regex	2015-04-15 13:55:59 +02:00
Simon	072a440837	Test cases for new patterns	2015-04-15 13:55:17 +02:00
Simon	105d16644d	For patterns like this '---- On {date} {name} {mail} wrote ---- '	2015-04-14 18:52:45 +02:00
Simon	df3338192a	Another submission to a dutch variation	2015-04-14 18:49:26 +02:00
Simon	f0ed5d6c07	New splitter pattern for Dutch mail replies	2015-04-14 18:22:48 +02:00
Sergey Obukhov	790463821f	Merge pull request #31 from tsheasha/patch-1 Utilising the Constants	2015-03-02 14:48:41 -08:00
Sergey Obukhov	763d3b308e	Merge pull request #35 from futuresimple/more_formats Support some polish and french formats	2015-03-02 14:25:26 -08:00
szymonsobczak	3c9ef4653f	some more french formats	2015-02-24 12:18:54 +01:00
szymonsobczak	b16060261a	support some polish and french formats	2015-02-24 11:39:12 +01:00
Tarek Sheasha	13dc43e960	Utilising the Constants Checking for the length of a line to determine if it is possibly a signature or not could be done in a more generic way by determining the maximum size of the line via a constant. Hence advocating the spirit of the modifying the code in only one place and propagating that change everywhere. This exact approach has already been used at:	2015-01-21 15:54:57 +01:00
Jeremy Schlatter	3768d7ba31	make a separate test function for each language	2014-12-30 14:41:20 -08:00
Jeremy Schlatter	613d1fc815	Add extra splitter expressions and tests for German and Danish. Also some refactoring to make it a bit easier to add more languages.	2014-12-23 15:44:04 -08:00