Merge pull request #77 from mailgun/sergey/fix-gmail-fwd

fixes mailgun/talon#18
2016-02-19 19:08:37 -08:00 · 2016-02-19 19:07:10 -08:00 · 2016-02-19 18:32:07 -08:00 · 2016-02-19 18:28:23 -08:00 · 2016-02-19 17:53:52 -08:00 · 2015-12-18 19:15:58 -08:00
31 changed files with 627 additions and 299 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,7 @@ tramp
 *_archive

 # Trial temp
-_trial_temp
+_trial_temp
+
+# OSX
+.DS_Store
--- a/README.rst
+++ b/README.rst
@@ -89,7 +89,7 @@ the power of machine learning algorithms:
    # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage."
    # signature == "John Doe\nvia mobile"

-For machine learning talon currently uses `PyML`_ library to build SVM
+For machine learning talon currently uses the `scikit-learn`_ library to build SVM
 classifiers. The core of machine learning algorithm lays in
 ``talon.signature.learning package``. It defines a set of features to
 apply to a message (``featurespace.py``), how data sets are built
@@ -102,7 +102,21 @@ of features to the dataset we provide files ``classifier`` and
 used to load trained classifier. Those files should be regenerated every
 time the feature/data set is changed.

-.. _PyML: http://pyml.sourceforge.net/
+To regenerate the model files, you can run
+
+.. code:: sh
+
+    python train.py
+
+or
+
+.. code:: python
+    
+    from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+    from talon.signature.learning.classifier import train, init
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+.. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/

 Research
--- a/setup.py
+++ b/setup.py
@@ -1,13 +1,8 @@
-import os
-import sys
-import contextlib
-
-from distutils.spawn import find_executable
 from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.0.2',
+      version='1.2.3',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -19,88 +14,18 @@ setup(name='talon',
      include_package_data=True,
      zip_safe=True,
      install_requires=[
-          "lxml==2.3.3",
-          "regex==0.1.20110315",
-          "chardet==1.0.1",
-          "dnspython==1.11.1",
-          "html2text",
-          "nose==1.2.1",
+          "lxml>=2.3.3",
+          "regex>=1",
+          "numpy",
+          "scipy",
+          "scikit-learn==0.16.1",  # pinned: bundled pickled classifier was built with it; rebuild the model otherwise
+          'chardet>=1.0.1',
+          'cchardet>=0.3.5',
+          'cssselect'
+          ],
+      tests_require=[
          "mock",
-          "coverage",
-          "flanker"
+          "nose>=1.2.1",
+          "coverage"
          ]
      )
-
-
-def install_pyml():
-    '''
-    Downloads and installs PyML
-    '''
-    try:
-        import PyML
-    except:
-        pass
-    else:
-        return
-
-    # install numpy first
-    pip('install numpy==1.6.1 --upgrade')
-
-    pyml_tarball = (
-        'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321'
-        '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz')
-    pyml_srcidr = 'PyML-0.7.9'
-
-    # see if PyML tarball needs to be fetched:
-    if not dir_exists(pyml_srcidr):
-        run("curl %s | tar -xz" % pyml_tarball)
-
-    # compile&install:
-    with cd(pyml_srcidr):
-        python('setup.py build')
-        python('setup.py install')
-
-
-def run(command):
-    if os.system(command) != 0:
-        raise Exception("Failed '{}'".format(command))
-    else:
-        return 0
-
-
-def python(command):
-    command = '{} {}'.format(sys.executable, command)
-    run(command)
-
-
-def enforce_executable(name, install_info):
-    if os.system("which {}".format(name)) != 0:
-        raise Exception(
-            '{} utility is missing.\nTo install, run:\n\n{}\n'.format(
-                name, install_info))
-
-
-def pip(command):
-    command = '{} {}'.format(find_executable('pip'), command)
-    run(command)
-
-
-def dir_exists(path):
-    return os.path.isdir(path)
-
-
-@contextlib.contextmanager
-def cd(directory):
-    curdir = os.getcwd()
-    try:
-        os.chdir(directory)
-        yield {}
-    finally:
-        os.chdir(curdir)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']:
-        enforce_executable('curl', 'sudo aptitude install curl')
-
-        install_pyml()
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)

 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
+RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)


 def add_checkpoint(html_note, counter):
@@ -76,8 +77,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):

 def cut_gmail_quote(html_message):
     ''' Cuts the outermost block element with class gmail_quote. '''
-    gmail_quote = html_message.cssselect('.gmail_quote')
-    if gmail_quote:
+    gmail_quote = html_message.cssselect('div.gmail_quote')
+    if gmail_quote and not RE_FWD.match(gmail_quote[0].text or ''):
         gmail_quote[0].getparent().remove(gmail_quote[0])
         return True

@@ -138,9 +139,14 @@ def cut_by_id(html_message):


 def cut_blockquote(html_message):
-    ''' Cuts blockquote with wrapping elements. '''
-    quote = html_message.find('.//blockquote')
-    if quote is not None:
+    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    quote = html_message.xpath(
+        '(.//blockquote)'
+        '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
+        '[last()]')
+
+    if quote:
+        quote = quote[0]
        quote.getparent().remove(quote)
        return True

@@ -154,21 +160,38 @@ def cut_from_block(html_message):

    if block:
        block = block[-1]
+        parent_div = None
        while block.getparent() is not None:
            if block.tag == 'div':
+                parent_div = block
+                break
+            block = block.getparent()
+        if parent_div is not None:
+            maybe_body = parent_div.getparent()
+            # In cases where removing this enclosing div will remove all
+            # content, we should assume the quote is not enclosed in a tag.
+            parent_div_is_all_content = (
+                maybe_body is not None and maybe_body.tag == 'body' and
+                len(maybe_body.getchildren()) == 1)
+
+            if not parent_div_is_all_content:
                block.getparent().remove(block)
                return True
-            else:
-                block = block.getparent()
-    else:
-        # handle the case when From: block goes right after e.g. <hr>
-        # and not enclosed in some tag
-        block = html_message.xpath(
-            ("//*[starts-with(mg:tail(), 'From:')]|"
-             "//*[starts-with(mg:tail(), 'Date:')]"))
-        if block:
-            block = block[0]
-            while(block.getnext() is not None):
-                block.getparent().remove(block.getnext())
-            block.getparent().remove(block)
-            return True
+        else:
+            return False
+
+    # handle the case when From: block goes right after e.g. <hr>
+    # and not enclosed in some tag
+    block = html_message.xpath(
+        ("//*[starts-with(mg:tail(), 'From:')]|"
+         "//*[starts-with(mg:tail(), 'Date:')]"))
+    if block:
+        block = block[0]
+
+        if RE_FWD.match(block.getparent().text or ''):
+            return False
+        
+        while(block.getnext() is not None):
+            block.getparent().remove(block.getnext())
+        block.getparent().remove(block)
+        return True
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -10,10 +10,8 @@ import logging
 from copy import deepcopy

 from lxml import html, etree
-import html2text

-from talon.constants import RE_DELIMITER
-from talon.utils import random_token, get_delimiter
+from talon.utils import get_delimiter, html_to_text
 from talon import html_quotations


@@ -23,7 +21,7 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
        # Beginning of the line
        u'|'.join((
            # English
@@ -33,7 +31,13 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            'W dniu',
            # Dutch
-            'Op'
+            'Op',
+            # German
+            'Am',
+            # Norwegian
+            u'På',
+            # Swedish, Danish
+            'Den',
        )),
        # Date and sender separator
        u'|'.join((
@@ -51,18 +55,28 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            u'napisał',
            # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb',
+            # Norwegian, Swedish
+            'skrev',
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
        # Beginning of the line
+        u'|'.join((
        	'Op',
+        	#German
+        	'Am'
+        )),
        # Ending of the line
        u'|'.join((
            # Dutch
-            'schreef','verzond','geschreven'
+            'schreef','verzond','geschreven',
+            # German
+            'schrieb'
        ))
    )
    )
@@ -93,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile(
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
-            s
+            (?:se*)+
            |
            (?:me*){2,}
        )
@@ -116,18 +130,23 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra',
+        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé'
+        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
    ))), re.I)

 SPLITTER_PATTERNS = [
    RE_ORIGINAL_MESSAGE,
-    # <date> <person>
-    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    RE_ON_DATE_WROTE_SMB,
    RE_FROM_COLON_OR_DATE_COLON,
+    # 02.04.2012 14:20 пользователь "bob@example.com" <
+    # bob@xxx.mailgun.org> написал:
+    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
+    # 2014-10-17 11:28 GMT+03:00 Bob <
+    # bob@example.com>:
+    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
+    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]
@@ -151,7 +170,7 @@ def extract_from(msg_body, content_type='text/plain'):
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
-    except Exception, e:
+    except Exception:
        log.exception('ERROR extracting message')

    return msg_body
@@ -182,6 +201,7 @@ def mark_message_lines(lines):
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
+
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
@@ -294,12 +314,8 @@ def extract_from_plain(msg_body):

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
-    lines = msg_body.splitlines()
-
    # don't process too long messages
-    if len(lines) > MAX_LINES_COUNT:
-        return stripped_text
-
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

@@ -325,43 +341,27 @@ def extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
-
    if msg_body.strip() == '':
        return msg_body

+    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )
-
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )
-
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
-    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
+    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
-
-    h = html2text.HTML2Text()
-    h.body_width = 0  # generate plain text without wrap
-
-    # html2text adds unnecessary star symbols. Remove them.
-    # Mask star symbols
-    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
-    plain_text = h.handle(msg_with_checkpoints)
-    # Remove created star symbols
-    plain_text = plain_text.replace('*', '')
-    # Unmask saved star symbols
-    plain_text = plain_text.replace('3423oorkg432', '*')
-
-    delimiter = get_delimiter(plain_text)
-
-    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
+    plain_text = html_to_text(msg_with_checkpoints)
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
@@ -383,7 +383,6 @@ def extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
-
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
--- a/talon/signature/init.py
+++ b/talon/signature/init.py
@@ -21,11 +21,9 @@ trained against, don't forget to regenerate:
 """

 import os
-import sys
-from cStringIO import StringIO

 from . import extraction
-from . extraction import extract
+from . extraction import extract  #noqa
 from . learning import classifier


@@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')


 def initialize():
-    try:
-        # redirect output
-        so, sys.stdout = sys.stdout, StringIO()
-
-        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-                                               EXTRACTOR_DATA)
-        sys.stdout = so
-    except Exception, e:
-        raise Exception(
-            "Failed initializing signature parsing with classifiers", e)
+    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
+                                           EXTRACTOR_DATA)
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_01.npy
+++ b/talon/signature/data/classifier_01.npy
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/data/classifier_04.npy
+++ b/talon/signature/data/classifier_04.npy
--- a/talon/signature/data/classifier_05.npy
+++ b/talon/signature/data/classifier_05.npy
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -1,14 +1,10 @@
 # -*- coding: utf-8 -*-

-import os
 import logging

 import regex as re
-from PyML import SparseDataSet
+import numpy

-from talon.constants import RE_DELIMITER
-from talon.signature.constants import (SIGNATURE_MAX_LINES,
-                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.featurespace import features, build_pattern
 from talon.utils import get_delimiter
 from talon.signature.bruteforce import get_signature_candidate
@@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r'''

 def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
-    data = SparseDataSet([build_pattern(line, features(sender))])
-    return classifier.decisionFunc(data, 0) > 0
+    data = numpy.array(build_pattern(line, features(sender)))
+    return classifier.predict(data) > 0


 def extract(body, sender):
@@ -61,7 +57,7 @@ def extract(body, sender):
                text = delimiter.join(text)
                if text.strip():
                    return (text, delimiter.join(signature))
-    except Exception, e:
+    except Exception:
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)
--- a/talon/signature/learning/classifier.py
+++ b/talon/signature/learning/classifier.py
@@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """

-import os
-import sys
-
-from PyML import SparseDataSet, SVM
+from numpy import genfromtxt
+from sklearn.svm import LinearSVC
+from sklearn.externals import joblib


 def init():
-    '''Inits classifier with optimal options.'''
-    return SVM(C=10, optimization='liblinear')
+    """Inits classifier with optimal options."""
+    return LinearSVC(C=10.0)


 def train(classifier, train_data_filename, save_classifier_filename=None):
-    '''Trains and saves classifier so that it could be easily loaded later.'''
-    data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier.train(data)
+    """Trains and saves classifier so that it could be easily loaded later."""
+    file_data = genfromtxt(train_data_filename, delimiter=",")
+    train_data, labels = file_data[:, :-1], file_data[:, -1]
+    classifier.fit(train_data, labels)
+
    if save_classifier_filename:
-        classifier.save(save_classifier_filename)
+        joblib.dump(classifier, save_classifier_filename)
    return classifier


 def load(saved_classifier_filename, train_data_filename):
-    """Loads saved classifier.
-
-    Classifier should be loaded with the same data it was trained against
-    """
-    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
-    classifier = init()
-    classifier.load(saved_classifier_filename, train_data)
-    return classifier
+    """Loads saved classifier; train_data_filename is no longer used."""
+    return joblib.load(saved_classifier_filename)
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -16,8 +16,8 @@ from talon.signature.constants import SIGNATURE_MAX_LINES

 rc = re.compile

-RE_EMAIL = rc('@')
-RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
+RE_EMAIL = rc('\S@\S')
+RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

 # Taken from:
@@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
 # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
 RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

-# Pattern to match if e.g. 'Sender:' header field has sender names.
-SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
-RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
-
-# Reply line clue line endings, as in regular expression:
-# " wrote:$" or " writes:$"
-RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')
-
 INVALID_WORD_START = rc('\(|\+|[\d]')

 BAD_SENDER_NAMES = [
@@ -128,7 +120,7 @@ def contains_sender_names(sender):
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
-    return lambda s: False
+    return lambda s: 0


 def extract_names(sender):
@@ -142,7 +134,7 @@ def extract_names(sender):
    >>> extract_names('')
    []
    """
-    sender = to_unicode(sender)
+    sender = to_unicode(sender, precise=True)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
@@ -169,7 +161,7 @@ def categories_percent(s, categories):
    50.0
    '''
    count = 0
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
@@ -189,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
-    s = to_unicode(s)
+    s = to_unicode(s, precise=True)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,13 +2,16 @@

 import logging
 from random import shuffle
+import chardet
+import cchardet
+import regex as re
+
+from lxml import html
+from lxml.cssselect import CSSSelector

 from talon.constants import RE_DELIMITER


-log = logging.getLogger(__name__)
-
-
 def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
@@ -42,12 +45,42 @@ def to_unicode(str_or_unicode, precise=False):
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
-    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode


+def detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string using chardet.
+
+    Defaults to UTF-8 when detection raises or yields no encoding.
+    """
+    try:
+        detected = chardet.detect(string)
+        if detected:
+            return detected.get('encoding') or 'utf-8'
+    except Exception:
+        pass  # best effort: fall through to the UTF-8 default
+    return 'utf-8'
+
+
+def quick_detect_encoding(string):
+    """
+    Tries to detect the encoding of the passed string.
+
+    Uses cchardet. Falls back to detect_encoding.
+    """
+    try:
+        detected = cchardet.detect(string)
+        if detected:
+            return detected.get('encoding') or detect_encoding(string)
+    except Exception:
+        pass  # best effort: fall through to detect_encoding below
+    return detect_encoding(string)
+
+
 def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
@@ -74,3 +107,81 @@ def get_delimiter(msg_body):
        delimiter = '\n'

    return delimiter
+
+
+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    s = _prepend_utf8_declaration(string)
+    s = s.replace("\n", "")
+
+    tree = html.fromstring(s)
+
+    for style in CSSSelector('style')(tree):
+        style.getparent().remove(style)
+
+    for c in tree.xpath('//comment()'):
+        c.getparent().remove(c)
+
+    text   = ""
+    for el in tree.iter():
+        el_text = (el.text or '') + (el.tail or '')
+        if len(el_text) > 1:
+            if el.tag in _BLOCKTAGS:
+                text += "\n"
+            if el.tag == 'li':
+                text += "  * "
+            text += el_text.strip() + " "
+
+            # add href to the output
+            href = el.attrib.get('href')
+            if href:
+                text += "(%s) " % href
+
+        if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
+            text += "\n"
+
+    retval = _rm_excessive_newlines(text)
+    return _encode_utf8(retval)
+
+
+def _contains_charset_spec(s):
+    """Return True if the first 4KB contain charset spec
+    """
+    return s.lower().find('html; charset=', 0, 4096) != -1
+
+
+def _prepend_utf8_declaration(s):
+    """Prepend 'utf-8' encoding declaration if the first 4KB don't have any
+    """
+    return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s
+
+
+def _rm_excessive_newlines(s):
+    """Remove excessive newlines that often happen due to tons of divs
+    """
+    return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
+
+
+def _encode_utf8(s):
+    """Encode in 'utf-8' if unicode
+    """
+    return s.encode('utf-8') if isinstance(s, unicode) else s
+
+
+_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
+                     'charset=utf-8">')
+
+
+_BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
+_HARDBREAKS = ['br', 'hr', 'tr']
+
+
+_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
--- a/tests/fixtures/html_replies/hotmail.html
+++ b/tests/fixtures/html_replies/hotmail.html
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <html>
 <head>
 <style><!--
--- a/tests/fixtures/html_replies/ms_outlook_2010.html
+++ b/tests/fixtures/html_replies/ms_outlook_2010.html
@@ -0,0 +1,87 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
+<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
+<style><!--
+/* Font Definitions */
+@font-face
+	{font-family:Calibri;
+	panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+/* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0in;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+h3
+	{mso-style-priority:9;
+	mso-style-link:"Heading 3 Char";
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:13.5pt;
+	font-family:"Times New Roman","serif";
+	font-weight:bold;}
+a:link, span.MsoHyperlink
+	{mso-style-priority:99;
+	color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{mso-style-priority:99;
+	color:purple;
+	text-decoration:underline;}
+p
+	{mso-style-priority:99;
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+span.Heading3Char
+	{mso-style-name:"Heading 3 Char";
+	mso-style-priority:9;
+	mso-style-link:"Heading 3";
+	font-family:"Cambria","serif";
+	color:#4F81BD;
+	font-weight:bold;}
+span.EmailStyle19
+	{mso-style-type:personal-reply;
+	font-family:"Calibri","sans-serif";
+	color:#1F497D;}
+.MsoChpDefault
+	{mso-style-type:export-only;
+	font-family:"Calibri","sans-serif";}
+@page WordSection1
+	{size:8.5in 11.0in;
+	margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+	{page:WordSection1;}
+--></style><!--[if gte mso 9]><xml>
+<o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<o:shapelayout v:ext="edit">
+<o:idmap v:ext="edit" data="1" />
+</o:shapelayout></xml><![endif]-->
+</head>
+<body lang="EN-US" link="blue" vlink="purple">
+<div class="WordSection1">
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
+<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
+<b>On Behalf Of </b>baz@bar.com<br>
+<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
+<b>To:</b> john@bar.com<br>
+<b>Cc:</b> jane@bar.io<br>
+<b>Subject:</b> Conversation<o:p></o:p></span></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+<p>Hello! How are you?<o:p></o:p></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+</div>
+</body>
+</html>
--- a/tests/fixtures/standard_replies/apple_mail_2.eml
+++ b/tests/fixtures/standard_replies/apple_mail_2.eml
@@ -0,0 +1,19 @@
+Content-Type: text/plain;
+	charset=us-ascii
+Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
+Subject: Re: Hello there
+X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
+From: Adam Renberg <adam@tictail.com>
+In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+Date: Sat, 22 Aug 2015 19:22:20 +0200
+Content-Transfer-Encoding: 7bit
+X-Smtp-Server: smtp.gmail.com:adam@tictail.com
+Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
+References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+To: Adam Renberg <tgwizard@gmail.com>
+
+Hello
+> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
+>
+> Hi there!
+
--- a/tests/fixtures/standard_replies/iphone.eml
+++ b/tests/fixtures/standard_replies/iphone.eml
@@ -9,11 +9,11 @@ To: bob <bob@example.com>
 Content-Transfer-Encoding: quoted-printable
 Mime-Version: 1.0 (1.0)

-hello
+Hello

 Sent from my iPhone

 On Apr 3, 2012, at 4:19 PM, bob <bob@example.com> wr=
 ote:

-> Hi
+> Hi
--- a/tests/fixtures/standard_replies/iphone_reply_text
+++ b/tests/fixtures/standard_replies/iphone_reply_text
@@ -0,0 +1,3 @@
+Hello
+
+Sent from my iPhone
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -4,11 +4,8 @@ from . import *
 from . fixtures import *

 import regex as re
-from flanker import mime

-from talon import quotations
-
-import html2text
+from talon import quotations, utils as u


 RE_WHITESPACE = re.compile("\s")
@@ -46,7 +43,25 @@ def test_quotation_splitter_outside_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p><div></div></body></html>",
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_regular_blockquote():
+    msg_body = """Reply
+<blockquote>Regular</blockquote>
+
+<div>
+  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+</div>
+
+<blockquote>
+  <div>
+    <blockquote>Nested</blockquote>
+  </div>
+</blockquote>
+"""
+    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


@@ -116,6 +131,18 @@ def test_gmail_quote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


+def test_gmail_quote_blockquote():
+    msg_body = """Message
+<blockquote class="gmail_quote">
+  <div class="gmail_default">
+    My name is William Shakespeare.
+    <br/>
+  </div>
+</blockquote>"""
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def test_unicode_in_reply():
    msg_body = u"""Reply \xa0 \xa0 Text<br>

@@ -123,7 +150,7 @@ def test_unicode_in_reply():
  <br>
 </div>

-<blockquote class="gmail_quote">
+<blockquote>
  Quote
 </blockquote>""".encode("utf-8")

@@ -224,10 +251,7 @@ def test_reply_shares_div_with_from_block():


 def test_reply_quotations_share_block():
-    msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK)
-    html_part = list(msg.walk())[1]
-    assert html_part.content_type == 'text/html'
-    stripped_html = quotations.extract_from_html(html_part.body)
+    stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK)
    ok_(stripped_html)
    ok_('From' not in stripped_html)

@@ -244,26 +268,15 @@ def test_reply_separated_by_hr():
            '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))


-RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
-
-
 def extract_reply_and_check(filename):
    f = open(filename)

-    msg_body = f.read().decode("utf-8")
+    msg_body = f.read()
    reply = quotations.extract_from_html(msg_body)
+    plain_reply = u.html_to_text(reply)

-    h = html2text.HTML2Text()
-    h.body_width = 0
-    plain_reply = h.handle(reply)
-
-    #remove &nbsp; spaces
-    plain_reply = plain_reply.replace(u'\xa0', u' ')
-
-    if RE_REPLY.match(plain_reply):
-        eq_(1, 1)
-    else:
-        eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
+    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
+        RE_WHITESPACE.sub('', plain_reply))


 def test_gmail_reply():
@@ -286,6 +299,10 @@ def test_ms_outlook_2007_reply():
    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")


+def test_ms_outlook_2010_reply():
+    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
+
+
 def test_thunderbird_reply():
    extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")

@@ -296,3 +313,37 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
+
+
+def test_CRLF():
+    """CR is not converted to '&#13;'
+    """
+    symbol = '&#13;'
+    extracted = quotations.extract_from_html('<html>\r\n</html>')
+    assert_false(symbol in extracted)
+    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
+
+    msg_body = """Reply
+<blockquote>
+
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+    msg_body = msg_body.replace('\n', '\r\n')
+    extracted = quotations.extract_from_html(msg_body)
+    assert_false(symbol in extracted)    
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', extracted))
+
+
+def test_gmail_forwarded_msg():
+    msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
+</div><br></div>"""
+    extracted = quotations.extract_from_html(msg_body)
+    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -3,8 +3,6 @@
 from . import *
 from . fixtures import *

-from flanker import mime
-
 from talon import quotations


--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -2,10 +2,6 @@

 from .. import *

-import os
-
-from flanker import mime
-
 from talon.signature import bruteforce


--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -4,8 +4,6 @@ from .. import *

 import os

-from PyML import SparseDataSet
-
 from talon.signature.learning import dataset
 from talon import signature
 from talon.signature import extraction as e
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -3,9 +3,8 @@
 from ... import *
 import os

-from PyML import SparseDataSet
+from numpy import genfromtxt

-from talon.utils import to_unicode
 from talon.signature.learning import dataset as d

 from talon.signature.learning.featurespace import features
@@ -42,10 +41,13 @@ def test_build_extraction_dataset():
    d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'),
                               os.path.join(TMP_DIR,
                                            'extraction.data'), 1)
-    test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'),
-                              labelsColumn=-1)
+
+    filename = os.path.join(TMP_DIR, 'extraction.data')
+    file_data = genfromtxt(filename, delimiter=",")
+    test_data = file_data[:, :-1]
+
    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
-    eq_(test_data.size(), 32)
-    eq_(len(features('')), test_data.numFeatures)
+    eq_(test_data.shape[0], 32)
+    eq_(len(features('')), test_data.shape[1])
--- a/tests/signature/learning/featurespace_test.py
+++ b/tests/signature/learning/featurespace_test.py
@@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs


 def test_apply_features():
-    s = '''John Doe
+    s = '''This is John Doe
+
+Tuesday @3pm suits. I'll chat to you then.

 VP Research and Development, Xxxx Xxxx Xxxxx

@@ -19,11 +21,12 @@ john@example.com'''
    # note that we don't consider the first line because signatures don't
    # usually take all the text, empty lines are not considered
    eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

-    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
+    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
        features = fs.features(sender)
        new_result = fs.apply_features(s, features)
        # result remains the same because we don't consider empty lines
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]

 def test_match_phone_numbers():
    for phone in VALID_PHONE_NUMBERS:
-        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))
+        ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone))


 def test_match_names():
@@ -52,29 +52,6 @@ def test_match_names():
        ok_(RE_NAME.match(name), "{} should be matched".format(name))


-def test_sender_with_name():
-    ok_lines = ['Sergey Obukhov <serobnic@example.com>',
-                '\tSergey  <serobnic@example.com>',
-                ('"Doe, John (TX)"'
-                 '<DowJ@example.com>@EXAMPLE'
-                 '<IMCEANOTES-+22Doe+2C+20John+20'
-                 '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'
-                 '+40EXAMPLE@EXAMPLE.com>'),
-                ('Company Sleuth <csleuth@email.xxx.com>'
-                 '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'
-                 '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),
-                ('Doe III, John '
-                 '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]
-    for line in ok_lines:
-        ok_(RE_SENDER_WITH_NAME.match(line),
-            '{} should be matched'.format(line))
-
-    nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']
-    for line in nok_lines:
-        assert_false(RE_SENDER_WITH_NAME.match(line),
-                     '{} should not be matched'.format(line))
-
-
 # Now test helpers functions
 def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -5,19 +5,18 @@ from . fixtures import *

 import os

-from flanker import mime
-
+import email.iterators
 from talon import quotations


@patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
    msg_body = """Test reply
-
+Hi
 -----Original Message-----

 Test"""
-    eq_(msg_body, quotations.extract_from_plain(msg_body))
+    eq_("Test reply", quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():
@@ -55,6 +54,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_date_time_email_splitter():
+    msg_body = """Test reply
+
+2014-10-17 11:28 GMT+03:00 Postmaster <
+postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
+
+> First from site
+>
+    """
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_pattern_on_date_somebody_wrote_allows_space_in_front():
    msg_body = """Thanks Thanmai
 On Mar 8, 2012 9:59 AM, "Example.com" <
@@ -312,6 +323,33 @@ Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))

+def test_swedish_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
+    u"""Allo! Follow up MIME!
+Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
+Skickat: den 26 augusti 2015 14:45
+Till: Isacson Leiff
+Ämne: RE: Week 36
+
+Blah-blah-blah
+"""))
+
+def test_swedish_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    """Lorem
+Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
+def test_norwegian_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    u"""Lorem
+På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud. 
@@ -614,22 +652,21 @@ def test_preprocess_postprocess_2_links():
 def test_standard_replies():
    for filename in os.listdir(STANDARD_REPLIES):
        filename = os.path.join(STANDARD_REPLIES, filename)
-        if os.path.isdir(filename):
+        if not filename.endswith('.eml') or os.path.isdir(filename):
            continue
        with open(filename) as f:
-            msg = f.read()
-            m = mime.from_string(msg)
-            for part in m.walk():
-                if part.content_type == 'text/plain':
-                    text = part.body
-                    stripped_text = quotations.extract_from_plain(text)
-                    reply_text_fn = filename[:-4] + '_reply_text'
-                    if os.path.isfile(reply_text_fn):
-                        with open(reply_text_fn) as f:
-                            reply_text = f.read()
-                    else:
-                        reply_text = 'Hello'
-                    eq_(reply_text, stripped_text,
-                        "'%(reply)s' != %(stripped)s for %(fn)s" %
-                        {'reply': reply_text, 'stripped': stripped_text,
-                         'fn': filename})
+            message = email.message_from_file(f)
+            body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
+            text = ''.join(email.iterators.body_line_iterator(body, True))
+
+            stripped_text = quotations.extract_from_plain(text)
+            reply_text_fn = filename[:-4] + '_reply_text'
+            if os.path.isfile(reply_text_fn):
+                with open(reply_text_fn) as f:
+                    reply_text = f.read().strip()
+            else:
+                reply_text = 'Hello'
+            yield eq_, reply_text, stripped_text, \
+                "'%(reply)s' != %(stripped)s for %(fn)s" % \
+                {'reply': reply_text, 'stripped': stripped_text,
+                 'fn': filename}
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,9 +1,107 @@
+# coding:utf-8
+
 from . import *

-from talon import utils
+from talon import utils as u
+import cchardet


 def test_get_delimiter():
-    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
-    eq_('\n', utils.get_delimiter('abc\n123'))
-    eq_('\n', utils.get_delimiter('abc'))
+    eq_('\r\n', u.get_delimiter('abc\r\n123'))
+    eq_('\n', u.get_delimiter('abc\n123'))
+    eq_('\n', u.get_delimiter('abc'))
+
+
+def test_unicode():
+    eq_ (u'hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), unicode )
+    eq_ (type(u.to_unicode(u'hi')), unicode )
+    eq_ (type(u.to_unicode('привет')), unicode )
+    eq_ (type(u.to_unicode(u'привет')), unicode )
+    eq_ (u"привет", u.to_unicode('привет'))
+    eq_ (u"привет", u.to_unicode(u'привет'))
+    # a non-UTF-8 byte string (0xF3 is o-acute in Latin-1) decodes with precise=True
+    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
+
+
+def test_detect_encoding():
+    eq_ ('ascii', u.detect_encoding('qwe').lower())
+    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.detect_encoding('привет').lower())
+    # fallback to utf-8
+    with patch.object(u.chardet, 'detect') as detect:
+        detect.side_effect = Exception
+        eq_ ('utf-8', u.detect_encoding('qwe').lower())
+
+
+def test_quick_detect_encoding():
+    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
+    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
+    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
+
+
+@patch.object(cchardet, 'detect')
+@patch.object(u, 'detect_encoding')
+def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
+    cchardet_detect.return_value = {'encoding': 'ascii'}
+    eq_('ascii', u.quick_detect_encoding("qwe"))
+    cchardet_detect.assert_called_once_with("qwe")
+
+    # fallback to detect_encoding
+    cchardet_detect.return_value = {}
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+
+    # cchardet raising an exception also falls back to detect_encoding
+    detect_encoding.reset_mock()
+    cchardet_detect.side_effect = Exception()
+    detect_encoding.return_value = 'utf-8'
+    eq_('utf-8', u.quick_detect_encoding("qwe"))
+    ok_(detect_encoding.called)
+
+
+def test_html_to_text():
+    html = """<body>
+<p>Hello world!</p>
+<br>
+<ul>
+<li>One!</li>
+<li>Two</li>
+</ul>
+<p>
+Haha
+</p>
+</body>"""
+    text = u.html_to_text(html)
+    eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text)
+    eq_("привет!", u.html_to_text("<b>привет!</b>"))
+
+    html = '<body><br/><br/>Hi</body>'
+    eq_ ('Hi', u.html_to_text(html))
+
+    html = """Hi
+<style type="text/css">
+
+div, p, li {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>
+
+<style type="text/css">
+
+h1 {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>"""
+    eq_ ('Hi', u.html_to_text(html))
+
+    html = """<div>
+<!-- COMMENT 1 -->
+<span>TEXT 1</span>
+<p>TEXT 2 <!-- COMMENT 2 --></p>
+</div>"""
+    eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))
--- a/train.py
+++ b/train.py
@@ -0,0 +1,10 @@
+from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
+from talon.signature.learning.classifier import train, init
+
+
+def train_model():
+    """ retrain model and persist """
+    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
+
+if __name__ == "__main__":
+    train_model()
Author	SHA1	Message	Date
Sergey Obukhov	c762f3c337	Merge pull request #77 from mailgun/sergey/fix-gmail-fwd fixes mailgun/talon#18	2016-02-19 19:08:37 -08:00
Sergey Obukhov	31803d41bc	fixes mailgun/talon#18	2016-02-19 19:07:10 -08:00
Sergey Obukhov	2ecd9779fc	bump up version	2016-02-19 18:32:07 -08:00
Sergey Obukhov	5a7047233e	Merge pull request #76 from mailgun/sergey/fix-date-splitter fixes mailgun/talon#19	2016-02-19 18:28:23 -08:00
Sergey Obukhov	999e9c3725	fixes mailgun/talon#19	2016-02-19 17:53:52 -08:00
Sergey Obukhov	f6940fe878	bump up version	2015-12-18 19:15:58 -08:00
Sergey Obukhov	ce65ff8fc8	Merge pull request #71 from clara-labs/ms-2010-issue First pass at handling issue with ms outlook 2010 with unenclosed quo…	2015-12-18 19:14:13 -08:00
Sergey Obukhov	eed6784f25	Merge pull request #70 from mailgun/sergey/gmail fixes mailgun/talon#38 mailgun/talon#20	2015-12-18 19:00:13 -08:00
Sergey Obukhov	3d9ae356ea	add more tests, make standard reply tests more relaxed	2015-12-18 18:56:41 -08:00
Carlos Correa	f688d074b5	First pass at handling issue with ms outlook 2010 with unenclosed quoted text.	2015-12-10 19:16:13 -08:00
Sergey Obukhov	41457d8fbd	fixes mailgun/talon#38 mailgun/talon#20	2015-12-05 00:37:02 -08:00
Sergey Obukhov	2c416ecc0e	Merge pull request #62 from tgwizard/better-support-for-scandinavian-languages Add better support for Scandinavian languages	2015-10-14 21:48:10 -07:00
Sergey Obukhov	3ab33c557b	Merge pull request #65 from mailgun/sergey/cssselect add cssselect to dependencies	2015-10-14 20:34:02 -07:00
Sergey Obukhov	8db05f4950	add cssselect to dependencies	2015-10-14 20:31:26 -07:00
Sergey Obukhov	3d5bc82a03	Merge pull request #61 from tgwizard/fix-for-apple-mail Add fix for Apple Mail email format	2015-10-14 12:38:06 -07:00
Adam Renberg	14e3a0d80b	Add better support for Scandinavian languages This is a port of https://github.com/tictail/claw/pull/6 by @simonflore.	2015-09-21 21:42:01 +02:00
Adam Renberg	fcd9e2716a	Add fix for Apple Mail email format Where they have an initial > on the "date line".	2015-09-21 21:33:57 +02:00
Sergey Obukhov	d62d633215	bump up version	2015-09-21 09:55:51 -07:00
Sergey Obukhov	3b0c9273c1	Merge pull request #60 from mailgun/sergey/26 fixes mailgun/talon#26	2015-09-21 09:54:35 -07:00
Sergey Obukhov	e4c1c11845	remove print	2015-09-21 09:52:47 -07:00
Sergey Obukhov	ae508fe0e5	fixes mailgun/talon#26	2015-09-21 09:51:26 -07:00
Sergey Obukhov	2cb9b5399c	bump up version	2015-09-18 05:23:29 -07:00
Sergey Obukhov	134c47f515	Merge pull request #59 from mailgun/sergey/43 fixes mailgun/talon#43	2015-09-18 05:20:51 -07:00
Sergey Obukhov	d328c9d128	fixes mailgun/talon#43	2015-09-18 05:19:59 -07:00
Sergey Obukhov	77b62b0fef	Merge pull request #58 from mailgun/sergey/52 fixes mailgun/talon#52	2015-09-18 04:48:50 -07:00
Sergey Obukhov	ad09b18f3f	fixes mailgun/talon#52	2015-09-18 04:47:23 -07:00
Sergey Obukhov	b5af9c03a5	bump up version	2015-09-11 10:42:26 -07:00
Sergey Obukhov	176c7e7532	Merge pull request #57 from mailgun/sergey/to_unicode use precise encoding when converting to unicode	2015-09-11 10:40:52 -07:00
Sergey Obukhov	15976888a0	use precise encoding when converting to unicode	2015-09-11 10:38:28 -07:00
Sergey Obukhov	9bee502903	bump up version	2015-09-11 06:27:12 -07:00
Sergey Obukhov	e3cb8dc3e6	Merge pull request #56 from mailgun/sergey/1000+German+NL process first 1000 lines for long messages, support for German and Dutch	2015-09-11 06:20:34 -07:00
Sergey Obukhov	385285e5de	process first 1000 lines for long messages, support for German and Dutch	2015-09-11 06:17:14 -07:00
Sergey Obukhov	127771dac9	bump up version	2015-09-11 04:51:39 -07:00
Sergey Obukhov	cc98befba5	Merge pull request #50 from Easy-D/preserve-regular-blockquotes Preserve regular blockquotes	2015-09-11 04:49:36 -07:00
Sergey Obukhov	567549cba4	bump up talon version	2015-09-10 10:47:16 -07:00
Sergey Obukhov	76c4f49be8	Merge pull request #55 from mailgun/sergey/lxml unpin lxml version	2015-09-10 10:44:59 -07:00
Sergey Obukhov	d9d89dc250	unpin lxml version	2015-09-10 10:44:05 -07:00
Sergey Obukhov	9358db6cee	bump up talon version	2015-09-03 11:03:01 -07:00
Sergey Obukhov	08c9d7db03	Merge pull request #45 from AlexRiina/master Replace PyML with sklearn and clean up dependencies	2015-09-03 10:56:18 -07:00
Easy-D	390b0a6dc9	preserve regular blockquotes	2015-07-16 21:31:41 +02:00
Easy-D	ed6b861a47	add failing test that shows how regular blockquotes are removed	2015-07-16 21:24:49 +02:00
Alex Riina	85c7ee980c	add script to regenerate ml model	2015-07-02 21:49:09 -04:00
Oliver Song	7ea773e6a9	Fix iphone test	2015-07-02 21:49:09 -04:00
Scott MacVicar	e3c4ff38fe	move test stuff out to its own section	2015-07-02 21:49:09 -04:00
Scott MacVicar	8b1f87b1c0	Get this building and passing tests Changes: * add .DS_Store to .gitignore * Decode base64 encoded emails for tests * Pick a version of scikit since the pickled clasifiers are based on that * Add missing numpy and scipy dependencies	2015-07-02 21:49:09 -04:00
Alex Riina	c5e4cd9ab4	dont be too restrictive on the test library version	2015-07-02 21:49:09 -04:00
Alex Riina	215e36e9ed	allow higher version of regex library	2015-07-02 21:49:09 -04:00
Alex Riina	e3ef622031	remove unused regex	2015-07-02 21:49:09 -04:00
Alex Riina	f16760c466	Remove flanker and replace PyML with scikit-learn I never was actually able to successfully install PyML but the source-forge distribution and lack of python3 support convinced me that scikit-learn would be a fine substitute. Flanker was also difficult for me to install and seemed only to be used in the tests, so I removed it as well to get into a position where I could run the tests. As of this commit, only one is not passing (test_standard_replies with android.eml) though I'm not familiar with the `email` library yet.	2015-07-02 21:49:09 -04:00
Alex Riina	b36287e573	clean up style and extra imports	2015-07-02 21:49:09 -04:00
Alex Riina	4df7aa284b	remove extra imports	2015-07-02 21:49:09 -04:00