Actually bump talon's version to 1.0.5 to match the tag.

2015-09-09 22:46:18 +02:00
16 changed files with 98 additions and 617 deletions
--- a/README.rst
+++ b/README.rst
@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
 apply to a message (``featurespace.py``), how data sets are built
 (``dataset.py``), classifier’s interface (``classifier.py``).

-Currently the data used for training is taken from our personal email
+The data used for training is taken from our personal email
 conversations and from `ENRON`_ dataset. As a result of applying our set
 of features to the dataset we provide files ``classifier`` and
 ``train.data`` that don’t have any personal information but could be
@@ -116,19 +116,8 @@ or
    from talon.signature.learning.classifier import train, init
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)

-Open-source Dataset
-------------------
-
-Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we
-used a subset of `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190
-emails are annotated. Any contribution and collaboration on the project are welcome. Once the dataset is ready we plan to
-start using it for talon.
-
 .. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/
-.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
-.. _forge: https://github.com/mailgun/forge
-

 Research
 --------
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.2.10',
+      version='1.0.5',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -14,14 +14,12 @@ setup(name='talon',
      include_package_data=True,
      zip_safe=True,
      install_requires=[
-          "lxml>=2.3.3",
+          "lxml==2.3.3",
          "regex>=1",
+          "html2text",
          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
-          'chardet>=1.0.1',
-          'cchardet>=0.3.5',
-          'cssselect'
          ],
      tests_require=[
          "mock",
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -12,7 +12,6 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)

 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
-RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)


 def add_checkpoint(html_note, counter):
@@ -77,8 +76,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):

 def cut_gmail_quote(html_message):
    ''' Cuts the outermost block element with class gmail_quote. '''
-    gmail_quote = html_message.cssselect('div.gmail_quote')
-    if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
+    gmail_quote = html_message.cssselect('.gmail_quote')
+    if gmail_quote:
        gmail_quote[0].getparent().remove(gmail_quote[0])
        return True

@@ -86,12 +85,9 @@ def cut_gmail_quote(html_message):
 def cut_microsoft_quote(html_message):
    ''' Cuts splitter block and all following blocks. '''
    splitter = html_message.xpath(
-        #outlook 2007, 2010 (international)
+        #outlook 2007, 2010
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
-        #outlook 2007, 2010 (american)
-        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
-        "padding:3.0pt 0in 0in 0in']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
@@ -142,14 +138,9 @@ def cut_by_id(html_message):


 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements.'''
-    quote = html_message.xpath(
-        '(.//blockquote)'
-        '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
-        '[last()]')
-
-    if quote:
-        quote = quote[0]
+    ''' Cuts blockquote with wrapping elements. '''
+    quote = html_message.find('.//blockquote')
+    if quote is not None:
        quote.getparent().remove(quote)
        return True

@@ -163,58 +154,21 @@ def cut_from_block(html_message):

    if block:
        block = block[-1]
-        parent_div = None
        while block.getparent() is not None:
            if block.tag == 'div':
-                parent_div = block
-                break
-            block = block.getparent()
-        if parent_div is not None:
-            maybe_body = parent_div.getparent()
-            # In cases where removing this enclosing div will remove all
-            # content, we should assume the quote is not enclosed in a tag.
-            parent_div_is_all_content = (
-                maybe_body is not None and maybe_body.tag == 'body' and
-                len(maybe_body.getchildren()) == 1)
-
-            if not parent_div_is_all_content:
-                parent = block.getparent()
-                next_sibling = block.getnext()
-
-                # remove all tags after found From block
-                # (From block and quoted message are in separate divs)
-                while next_sibling is not None:
-                    parent.remove(block)
-                    block = next_sibling
-                    next_sibling = block.getnext()
-
-                # remove the last sibling (or the
-                # From block if no siblings)
-                if block is not None:
-                    parent.remove(block)
-
+                block.getparent().remove(block)
                return True
-        else:
-            return False
-
-    # handle the case when From: block goes right after e.g. <hr>
-    # and not enclosed in some tag
-    block = html_message.xpath(
-        ("//*[starts-with(mg:tail(), 'From:')]|"
-         "//*[starts-with(mg:tail(), 'Date:')]"))
-    if block:
-        block = block[0]
-
-        if RE_FWD.match(block.getparent().text or ''):
-            return False
-        
-        while(block.getnext() is not None):
-            block.getparent().remove(block.getnext())
-        block.getparent().remove(block)
-        return True
-
-def cut_zimbra_quote(html_message):
-    zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
-    if zDivider:
-        zDivider[0].getparent().remove(zDivider[0])
-        return True
+            else:
+                block = block.getparent()
+    else:
+        # handle the case when From: block goes right after e.g. <hr>
+        # and not enclosed in some tag
+        block = html_message.xpath(
+            ("//*[starts-with(mg:tail(), 'From:')]|"
+             "//*[starts-with(mg:tail(), 'Date:')]"))
+        if block:
+            block = block[0]
+            while(block.getnext() is not None):
+                block.getparent().remove(block.getnext())
+            block.getparent().remove(block)
+            return True
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -10,8 +10,9 @@ import logging
 from copy import deepcopy

 from lxml import html, etree
+import html2text

-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter
 from talon import html_quotations


@@ -21,7 +22,7 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
        # Beginning of the line
        u'|'.join((
            # English
@@ -31,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            'W dniu',
            # Dutch
-            'Op',
-            # German
-            'Am',
-            # Norwegian
-            u'På',
-            # Swedish, Danish
-            'Den',
+            'Op'
        )),
        # Date and sender separator
        u'|'.join((
@@ -55,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Polish
            u'napisał',
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb',
-            # Norwegian, Swedish
-            'skrev',
+            'schreef','verzond','geschreven'
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
        # Beginning of the line
-        u'|'.join((
        	'Op',
-        	#German
-        	'Am'
-        )),
        # Ending of the line
        u'|'.join((
            # Dutch
-            'schreef','verzond','geschreven',
-            # German
-            'schrieb'
+            'schreef','verzond','geschreven'
        ))
    )
    )
@@ -107,7 +92,7 @@ RE_EMPTY_QUOTATION = re.compile(
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
-            (?:se*)+
+            s
            |
            (?:me*){2,}
        )
@@ -130,27 +115,20 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+        'From', 'Van', 'De', 'Von', 'Fra',
        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+        'Date', 'Datum', u'Envoyé'
    ))), re.I)

 SPLITTER_PATTERNS = [
    RE_ORIGINAL_MESSAGE,
+    # <date> <person>
+    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    RE_ON_DATE_WROTE_SMB,
    RE_FROM_COLON_OR_DATE_COLON,
-    # 02.04.2012 14:20 пользователь "bob@example.com" <
-    # bob@xxx.mailgun.org> написал:
-    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
-    # 2014-10-17 11:28 GMT+03:00 Bob <
-    # bob@example.com>:
-    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
-    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
-               '( \S+){3,6}@\S+:'),
-    # Sent from Samsung MobileName <address@example.com> wrote:
-    re.compile('Sent from Samsung .*@.*> wrote')
+               '( \S+){3,6}@\S+:')
    ]


@@ -203,7 +181,6 @@ def mark_message_lines(lines):
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
-
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
@@ -316,8 +293,12 @@ def extract_from_plain(msg_body):

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
+    lines = msg_body.splitlines()
+
    # don't process too long messages
-    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    if len(lines) > MAX_LINES_COUNT:
+        return stripped_text
+
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

@@ -343,28 +324,43 @@ def extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
+
    if msg_body.strip() == '':
        return msg_body

-    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )
+
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
-                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )
+
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
-    plain_text = html_to_text(msg_with_checkpoints)
-    plain_text = preprocess(plain_text, '\n', content_type='text/html')
+
+    h = html2text.HTML2Text()
+    h.body_width = 0  # generate plain text without wrap
+
+    # html2text adds unnecessary star symbols. Remove them.
+    # Mask star symbols
+    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
+    plain_text = h.handle(msg_with_checkpoints)
+    # Remove created star symbols
+    plain_text = plain_text.replace('*', '')
+    # Unmask saved star symbols
+    plain_text = plain_text.replace('3423oorkg432', '*')
+
+    delimiter = get_delimiter(plain_text)
+
+    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
@@ -386,6 +382,7 @@ def extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
+
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES

 rc = re.compile

-RE_EMAIL = rc('\S@\S')
+RE_EMAIL = rc('@')
 RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

@@ -120,7 +120,7 @@ def contains_sender_names(sender):
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
-    return lambda s: 0
+    return lambda s: False


 def extract_names(sender):
@@ -134,7 +134,7 @@ def extract_names(sender):
    >>> extract_names('')
    []
    """
-    sender = to_unicode(sender, precise=True)
+    sender = to_unicode(sender)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
    50.0
    '''
    count = 0
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):

 def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
-    s = to_unicode(s, precise=True)
+    s = to_unicode(s)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -2,16 +2,13 @@

 import logging
 from random import shuffle
-import chardet
-import cchardet
-import regex as re
-
-from lxml import html
-from lxml.cssselect import CSSSelector

 from talon.constants import RE_DELIMITER


+log = logging.getLogger(__name__)
+
+
 def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
@@ -45,42 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
-    encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
+    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode


-def detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Defaults to UTF-8.
-    """
-    try:
-        detected = chardet.detect(string)
-        if detected:
-            return detected.get('encoding') or 'utf-8'
-    except Exception, e:
-        pass
-    return 'utf-8'
-
-
-def quick_detect_encoding(string):
-    """
-    Tries to detect the encoding of the passed string.
-
-    Uses cchardet. Fallbacks to detect_encoding.
-    """
-    try:
-        detected = cchardet.detect(string)
-        if detected:
-            return detected.get('encoding') or detect_encoding(string)
-    except Exception, e:
-        pass
-    return detect_encoding(string)
-
-
 def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
@@ -107,81 +74,3 @@ def get_delimiter(msg_body):
        delimiter = '\n'

    return delimiter
-
-
-def html_to_text(string):
-    """
-    Dead-simple HTML-to-text converter:
-        >>> html_to_text("one<br>two<br>three")
-        >>> "one\ntwo\nthree"
-
-    NOTES:
-        1. the string is expected to contain UTF-8 encoded HTML!
-        2. returns utf-8 encoded str (not unicode)
-    """
-    s = _prepend_utf8_declaration(string)
-    s = s.replace("\n", "")
-
-    tree = html.fromstring(s)
-
-    for style in CSSSelector('style')(tree):
-        style.getparent().remove(style)
-
-    for c in tree.xpath('//comment()'):
-        c.getparent().remove(c)
-
-    text   = ""
-    for el in tree.iter():
-        el_text = (el.text or '') + (el.tail or '')
-        if len(el_text) > 1:
-            if el.tag in _BLOCKTAGS:
-                text += "\n"
-            if el.tag == 'li':
-                text += "  * "
-            text += el_text.strip() + " "
-
-            # add href to the output
-            href = el.attrib.get('href')
-            if href:
-                text += "(%s) " % href
-
-        if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
-            text += "\n"
-
-    retval = _rm_excessive_newlines(text)
-    return _encode_utf8(retval)
-
-
-def _contains_charset_spec(s):
-    """Return True if the first 4KB contain charset spec
-    """
-    return s.lower().find('html; charset=', 0, 4096) != -1
-
-
-def _prepend_utf8_declaration(s):
-    """Prepend 'utf-8' encoding declaration if the first 4KB don't have any
-    """
-    return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s
-
-
-def _rm_excessive_newlines(s):
-    """Remove excessive newlines that often happen due to tons of divs
-    """
-    return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
-
-
-def _encode_utf8(s):
-    """Encode in 'utf-8' if unicode
-    """
-    return s.encode('utf-8') if isinstance(s, unicode) else s
-
-
-_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
-                     'charset=utf-8">')
-
-
-_BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
-_HARDBREAKS = ['br', 'hr', 'tr']
-
-
-_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
--- a/tests/fixtures/html_replies/hotmail.html
+++ b/tests/fixtures/html_replies/hotmail.html
@@ -1,4 +1,3 @@
-<?xml version="1.0" encoding="UTF-8"?>
 <html>
 <head>
 <style><!--
--- a/tests/fixtures/html_replies/ms_outlook_2010.html
+++ b/tests/fixtures/html_replies/ms_outlook_2010.html
@@ -1,87 +0,0 @@
-<html>
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
-<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
-<style><!--
-/* Font Definitions */
-@font-face
-	{font-family:Calibri;
-	panose-1:2 15 5 2 2 2 4 3 2 4;}
-@font-face
-	{font-family:Tahoma;
-	panose-1:2 11 6 4 3 5 4 4 2 4;}
-/* Style Definitions */
-p.MsoNormal, li.MsoNormal, div.MsoNormal
-	{margin:0in;
-	margin-bottom:.0001pt;
-	font-size:12.0pt;
-	font-family:"Times New Roman","serif";}
-h3
-	{mso-style-priority:9;
-	mso-style-link:"Heading 3 Char";
-	mso-margin-top-alt:auto;
-	margin-right:0in;
-	mso-margin-bottom-alt:auto;
-	margin-left:0in;
-	font-size:13.5pt;
-	font-family:"Times New Roman","serif";
-	font-weight:bold;}
-a:link, span.MsoHyperlink
-	{mso-style-priority:99;
-	color:blue;
-	text-decoration:underline;}
-a:visited, span.MsoHyperlinkFollowed
-	{mso-style-priority:99;
-	color:purple;
-	text-decoration:underline;}
-p
-	{mso-style-priority:99;
-	mso-margin-top-alt:auto;
-	margin-right:0in;
-	mso-margin-bottom-alt:auto;
-	margin-left:0in;
-	font-size:12.0pt;
-	font-family:"Times New Roman","serif";}
-span.Heading3Char
-	{mso-style-name:"Heading 3 Char";
-	mso-style-priority:9;
-	mso-style-link:"Heading 3";
-	font-family:"Cambria","serif";
-	color:#4F81BD;
-	font-weight:bold;}
-span.EmailStyle19
-	{mso-style-type:personal-reply;
-	font-family:"Calibri","sans-serif";
-	color:#1F497D;}
-.MsoChpDefault
-	{mso-style-type:export-only;
-	font-family:"Calibri","sans-serif";}
-@page WordSection1
-	{size:8.5in 11.0in;
-	margin:1.0in 1.0in 1.0in 1.0in;}
-div.WordSection1
-	{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
-<o:shapedefaults v:ext="edit" spidmax="1026" />
-</xml><![endif]--><!--[if gte mso 9]><xml>
-<o:shapelayout v:ext="edit">
-<o:idmap v:ext="edit" data="1" />
-</o:shapelayout></xml><![endif]-->
-</head>
-<body lang="EN-US" link="blue" vlink="purple">
-<div class="WordSection1">
-<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
-<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
-<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
-<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
-<b>On Behalf Of </b>baz@bar.com<br>
-<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
-<b>To:</b> john@bar.com<br>
-<b>Cc:</b> jane@bar.io<br>
-<b>Subject:</b> Conversation<o:p></o:p></span></p>
-<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
-<p>Hello! How are you?<o:p></o:p></p>
-<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
-</div>
-</body>
-</html>
--- a/tests/fixtures/standard_replies/apple_mail_2.eml
+++ b/tests/fixtures/standard_replies/apple_mail_2.eml
@@ -1,19 +0,0 @@
-Content-Type: text/plain;
-	charset=us-ascii
-Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
-Subject: Re: Hello there
-X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
-From: Adam Renberg <adam@tictail.com>
-In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
-Date: Sat, 22 Aug 2015 19:22:20 +0200
-Content-Transfer-Encoding: 7bit
-X-Smtp-Server: smtp.gmail.com:adam@tictail.com
-Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
-References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
-To: Adam Renberg <tgwizard@gmail.com>
-
-Hello
-> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
->
-> Hi there!
-
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -5,7 +5,9 @@ from . fixtures import *

 import regex as re

-from talon import quotations, utils as u
+from talon import quotations
+
+import html2text


 RE_WHITESPACE = re.compile("\s")
@@ -43,25 +45,7 @@ def test_quotation_splitter_outside_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
-def test_regular_blockquote():
-    msg_body = """Reply
-<blockquote>Regular</blockquote>
-
-<div>
-  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
-</div>
-
-<blockquote>
-  <div>
-    <blockquote>Nested</blockquote>
-  </div>
-</blockquote>
-"""
-    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
+    eq_("<html><body><p>Reply</p><div></div></body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


@@ -131,29 +115,6 @@ def test_gmail_quote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


-def test_gmail_quote_compact():
-    msg_body = 'Reply' \
-               '<div class="gmail_quote">' \
-               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
-               '<div>Test</div>' \
-               '</div>' \
-               '</div>'
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
-def test_gmail_quote_blockquote():
-    msg_body = """Message
-<blockquote class="gmail_quote">
-  <div class="gmail_default">
-    My name is William Shakespeare.
-    <br/>
-  </div>
-</blockquote>"""
-    eq_(RE_WHITESPACE.sub('', msg_body),
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_unicode_in_reply():
    msg_body = u"""Reply \xa0 \xa0 Text<br>

@@ -161,7 +122,7 @@ def test_unicode_in_reply():
  <br>
 </div>

-<blockquote>
+<blockquote class="gmail_quote">
  Quote
 </blockquote>""".encode("utf-8")

@@ -279,35 +240,26 @@ def test_reply_separated_by_hr():
            '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))


-def test_from_block_and_quotations_in_separate_divs():
-    msg_body = '''
-Reply
-<div>
-  <hr/>
-  <div>
-    <font>
-      <b>From: bob@example.com</b>
-      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
-    </font>
-  </div>
-  <div>
-    Quoted message
-  </div>
-</div>
-'''
-    eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")


 def extract_reply_and_check(filename):
    f = open(filename)

-    msg_body = f.read()
+    msg_body = f.read().decode("utf-8")
    reply = quotations.extract_from_html(msg_body)
-    plain_reply = u.html_to_text(reply)

-    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
-        RE_WHITESPACE.sub('', plain_reply))
+    h = html2text.HTML2Text()
+    h.body_width = 0
+    plain_reply = h.handle(reply)
+
+    #remove &nbsp; spaces
+    plain_reply = plain_reply.replace(u'\xa0', u' ')
+
+    if RE_REPLY.match(plain_reply):
+        eq_(1, 1)
+    else:
+        eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)


 def test_gmail_reply():
@@ -330,10 +282,6 @@ def test_ms_outlook_2007_reply():
    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")


-def test_ms_outlook_2010_reply():
-    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
-
-
 def test_thunderbird_reply():
    extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")

@@ -344,37 +292,3 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
-
-
-def test_CRLF():
-    """CR is not converted to '&#13;'
-    """
-    symbol = '&#13;'
-    extracted = quotations.extract_from_html('<html>\r\n</html>')
-    assert_false(symbol in extracted)
-    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
-
-    msg_body = """Reply
-<blockquote>
-
-  <div>
-    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
-  </div>
-
-  <div>
-    Test
-  </div>
-
-</blockquote>"""
-    msg_body = msg_body.replace('\n', '\r\n')
-    extracted = quotations.extract_from_html(msg_body)
-    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', extracted))
-
-
-def test_gmail_forwarded_msg():
-    msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
-</div><br></div>"""
-    extracted = quotations.extract_from_html(msg_body)
-    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
--- a/tests/signature/learning/featurespace_test.py
+++ b/tests/signature/learning/featurespace_test.py
@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs


 def test_apply_features():
-    s = '''This is John Doe
-
-Tuesday @3pm suits. I'll chat to you then.
+    s = '''John Doe

 VP Research and Development, Xxxx Xxxx Xxxxx

@@ -21,12 +19,11 @@ john@example.com'''
    # note that we don't consider the first line because signatures don't
    # usually take all the text, empty lines are not considered
    eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
-                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

-    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
+    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
        features = fs.features(sender)
        new_result = fs.apply_features(s, features)
        # result remains the same because we don't consider empty lines
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
 def test_too_many_lines():
    msg_body = """Test reply
-Hi
+
 -----Original Message-----

 Test"""
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+    eq_(msg_body, quotations.extract_from_plain(msg_body))


 def test_pattern_on_date_somebody_wrote():
@@ -32,19 +32,6 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


-def test_pattern_sent_from_samsung_smb_wrote():
-    msg_body = """Test reply
-
-Sent from Samsung MobileName <address@example.com> wrote:
-
->
-> Test
->
-> Roman"""
-
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
-
-
 def test_pattern_on_date_wrote_somebody():
    eq_('Lorem', quotations.extract_from_plain(
    """Lorem
@@ -67,18 +54,6 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


-def test_date_time_email_splitter():
-    msg_body = """Test reply
-
-2014-10-17 11:28 GMT+03:00 Postmaster <
-postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
-
-> First from site
->
-    """
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
-
-
 def test_pattern_on_date_somebody_wrote_allows_space_in_front():
    msg_body = """Thanks Thanmai
 On Mar 8, 2012 9:59 AM, "Example.com" <
@@ -336,33 +311,6 @@ Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))

-def test_swedish_from_block():
-    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
-    u"""Allo! Follow up MIME!
-Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
-Skickat: den 26 augusti 2015 14:45
-Till: Isacson Leiff
-Ämne: RE: Week 36
-
-Blah-blah-blah
-"""))
-
-def test_swedish_from_line():
-    eq_('Lorem', quotations.extract_from_plain(
-    """Lorem
-Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
-
-Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
-"""))
-
-def test_norwegian_from_line():
-    eq_('Lorem', quotations.extract_from_plain(
-    u"""Lorem
-På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
-
-Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
-"""))
-
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud. 
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -1,107 +1,9 @@
-# coding:utf-8
-
 from . import *

-from talon import utils as u
-import cchardet
+from talon import utils


 def test_get_delimiter():
-    eq_('\r\n', u.get_delimiter('abc\r\n123'))
-    eq_('\n', u.get_delimiter('abc\n123'))
-    eq_('\n', u.get_delimiter('abc'))
-
-
-def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), unicode )
-    eq_ (type(u.to_unicode(u'hi')), unicode )
-    eq_ (type(u.to_unicode('привет')), unicode )
-    eq_ (type(u.to_unicode(u'привет')), unicode )
-    eq_ (u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
-    # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
-
-
-def test_detect_encoding():
-    eq_ ('ascii', u.detect_encoding('qwe').lower())
-    eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.detect_encoding('привет').lower())
-    # fallback to utf-8
-    with patch.object(u.chardet, 'detect') as detect:
-        detect.side_effect = Exception
-        eq_ ('utf-8', u.detect_encoding('qwe').lower())
-
-
-def test_quick_detect_encoding():
-    eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
-    eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
-    eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
-
-
-@patch.object(cchardet, 'detect')
-@patch.object(u, 'detect_encoding')
-def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
-    cchardet_detect.return_value = {'encoding': 'ascii'}
-    eq_('ascii', u.quick_detect_encoding("qwe"))
-    cchardet_detect.assert_called_once_with("qwe")
-
-    # fallback to detect_encoding
-    cchardet_detect.return_value = {}
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-
-    # exception
-    detect_encoding.reset_mock()
-    cchardet_detect.side_effect = Exception()
-    detect_encoding.return_value = 'utf-8'
-    eq_('utf-8', u.quick_detect_encoding("qwe"))
-    ok_(detect_encoding.called)
-
-
-def test_html_to_text():
-    html = """<body>
-<p>Hello world!</p>
-<br>
-<ul>
-<li>One!</li>
-<li>Two</li>
-</ul>
-<p>
-Haha
-</p>
-</body>"""
-    text = u.html_to_text(html)
-    eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text)
-    eq_("привет!", u.html_to_text("<b>привет!</b>"))
-
-    html = '<body><br/><br/>Hi</body>'
-    eq_ ('Hi', u.html_to_text(html))
-
-    html = """Hi
-<style type="text/css">
-
-div, p, li {
-
-font: 13px 'Lucida Grande', Arial, sans-serif;
-
-}
-</style>
-
-<style type="text/css">
-
-h1 {
-
-font: 13px 'Lucida Grande', Arial, sans-serif;
-
-}
-</style>"""
-    eq_ ('Hi', u.html_to_text(html))
-
-    html = """<div>
-<!-- COMMENT 1 -->
-<span>TEXT 1</span>
-<p>TEXT 2 <!-- COMMENT 2 --></p>
-</div>"""
-    eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))
+    eq_('\r\n', utils.get_delimiter('abc\r\n123'))
+    eq_('\n', utils.get_delimiter('abc\n123'))
+    eq_('\n', utils.get_delimiter('abc'))