Merge pull request #94 from mailgun/obukhov-sergey-patch-1

Update README.rst
bump version
2016-05-31 20:16:13 -07:00 · 2016-05-31 18:42:47 -07:00 · 2016-05-31 18:39:07 -07:00 · 2016-05-31 18:15:28 -07:00 · 2016-05-31 16:53:41 -07:00 · 2016-05-31 16:50:35 -07:00
16 changed files with 478 additions and 80 deletions
--- a/README.rst
+++ b/README.rst
@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
 apply to a message (``featurespace.py``), how data sets are built
 (``dataset.py``), classifier’s interface (``classifier.py``).

-The data used for training is taken from our personal email
+Currently the data used for training is taken from our personal email
 conversations and from `ENRON`_ dataset. As a result of applying our set
 of features to the dataset we provide files ``classifier`` and
 ``train.data`` that don’t have any personal information but could be
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.0.7',
+      version='1.2.9',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -16,12 +16,12 @@ setup(name='talon',
      install_requires=[
          "lxml>=2.3.3",
          "regex>=1",
-          "html2text",
          "numpy",
          "scipy",
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
+          'cssselect'
          ],
      tests_require=[
          "mock",
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)

 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
+RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)


 def add_checkpoint(html_note, counter):
@@ -76,8 +77,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):

 def cut_gmail_quote(html_message):
    ''' Cuts the outermost block element with class gmail_quote. '''
-    gmail_quote = html_message.cssselect('.gmail_quote')
-    if gmail_quote:
+    gmail_quote = html_message.cssselect('div.gmail_quote')
+    if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
        gmail_quote[0].getparent().remove(gmail_quote[0])
        return True

@@ -85,9 +86,12 @@ def cut_gmail_quote(html_message):
 def cut_microsoft_quote(html_message):
    ''' Cuts splitter block and all following blocks. '''
    splitter = html_message.xpath(
-        #outlook 2007, 2010
+        #outlook 2007, 2010 (international)
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
+        #outlook 2007, 2010 (american)
+        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
+        "padding:3.0pt 0in 0in 0in']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
@@ -138,8 +142,12 @@ def cut_by_id(html_message):


 def cut_blockquote(html_message):
-    ''' Cuts the last non-nested blockquote with wrapping elements. '''
-    quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]')
+    ''' Cuts the last non-nested blockquote with wrapping elements.'''
+    quote = html_message.xpath(
+        '(.//blockquote)'
+        '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
+        '[last()]')
+
    if quote:
        quote = quote[0]
        quote.getparent().remove(quote)
@@ -155,13 +163,40 @@ def cut_from_block(html_message):

    if block:
        block = block[-1]
+        parent_div = None
        while block.getparent() is not None:
            if block.tag == 'div':
-                block.getparent().remove(block)
+                parent_div = block
+                break
+            block = block.getparent()
+        if parent_div is not None:
+            maybe_body = parent_div.getparent()
+            # In cases where removing this enclosing div will remove all
+            # content, we should assume the quote is not enclosed in a tag.
+            parent_div_is_all_content = (
+                maybe_body is not None and maybe_body.tag == 'body' and
+                len(maybe_body.getchildren()) == 1)
+
+            if not parent_div_is_all_content:
+                parent = block.getparent()
+                next_sibling = block.getnext()
+
+                # remove all tags after found From block
+                # (From block and quoted message are in separate divs)
+                while next_sibling is not None:
+                    parent.remove(block)
+                    block = next_sibling
+                    next_sibling = block.getnext()
+
+                # remove the last sibling (or the
+                # From block if no siblings)
+                if block is not None:
+                    parent.remove(block)
+
                return True
        else:
-                block = block.getparent()
-    else:
+            return False
+
    # handle the case when From: block goes right after e.g. <hr>
    # and not enclosed in some tag
    block = html_message.xpath(
@@ -169,7 +204,17 @@ def cut_from_block(html_message):
         "//*[starts-with(mg:tail(), 'Date:')]"))
    if block:
        block = block[0]
+
+        if RE_FWD.match(block.getparent().text or ''):
+            return False
+        
        while(block.getnext() is not None):
            block.getparent().remove(block.getnext())
        block.getparent().remove(block)
        return True
+
+def cut_zimbra_quote(html_message):
+    zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
+    if zDivider:
+        zDivider[0].getparent().remove(zDivider[0])
+        return True
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -10,9 +10,8 @@ import logging
 from copy import deepcopy

 from lxml import html, etree
-import html2text

-from talon.utils import get_delimiter
+from talon.utils import get_delimiter, html_to_text
 from talon import html_quotations


@@ -22,7 +21,7 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
        # Beginning of the line
        u'|'.join((
            # English
@@ -34,7 +33,11 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Dutch
            'Op',
            # German
-            'Am'
+            'Am',
+            # Norwegian
+            u'På',
+            # Swedish, Danish
+            'Den',
        )),
        # Date and sender separator
        u'|'.join((
@@ -54,12 +57,14 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Dutch
            'schreef','verzond','geschreven',
            # German
-            'schrieb'
+            'schrieb',
+            # Norwegian, Swedish
+            'skrev',
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
        # Beginning of the line
        u'|'.join((
        	'Op',
@@ -102,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile(
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
-            s
+            (?:se*)+
            |
            (?:me*){2,}
        )
@@ -125,20 +130,27 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra',
+        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé'
+        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
    ))), re.I)

 SPLITTER_PATTERNS = [
    RE_ORIGINAL_MESSAGE,
-    # <date> <person>
-    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    RE_ON_DATE_WROTE_SMB,
    RE_FROM_COLON_OR_DATE_COLON,
+    # 02.04.2012 14:20 пользователь "bob@example.com" <
+    # bob@xxx.mailgun.org> написал:
+    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
+    # 2014-10-17 11:28 GMT+03:00 Bob <
+    # bob@example.com>:
+    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
+    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
-               '( \S+){3,6}@\S+:')
+               '( \S+){3,6}@\S+:'),
+    # Sent from Samsung MobileName <address@example.com> wrote:
+    re.compile('Sent from Samsung .*@.*> wrote')
    ]


@@ -331,43 +343,28 @@ def extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
-
    if msg_body.strip() == '':
        return msg_body

+    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )
-
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
+                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )
-
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)
-
-    h = html2text.HTML2Text()
-    h.body_width = 0  # generate plain text without wrap
-
-    # html2text adds unnecessary star symbols. Remove them.
-    # Mask star symbols
-    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
-    plain_text = h.handle(msg_with_checkpoints)
-    # Remove created star symbols
-    plain_text = plain_text.replace('*', '')
-    # Unmask saved star symbols
-    plain_text = plain_text.replace('3423oorkg432', '*')
-
-    delimiter = get_delimiter(plain_text)
-
-    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
+    plain_text = html_to_text(msg_with_checkpoints)
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
@@ -389,7 +386,6 @@ def extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
-
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES

 rc = re.compile

-RE_EMAIL = rc('@')
+RE_EMAIL = rc('\S@\S')
 RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

@@ -120,7 +120,7 @@ def contains_sender_names(sender):
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
-    return lambda s: False
+    return lambda s: 0


 def extract_names(sender):
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -4,6 +4,10 @@ import logging
 from random import shuffle
 import chardet
 import cchardet
+import regex as re
+
+from lxml import html
+from lxml.cssselect import CSSSelector

 from talon.constants import RE_DELIMITER

@@ -58,7 +62,6 @@ def detect_encoding(string):
        if detected:
            return detected.get('encoding') or 'utf-8'
    except Exception, e:
-        print 11111111111, e
        pass
    return 'utf-8'

@@ -74,7 +77,6 @@ def quick_detect_encoding(string):
        if detected:
            return detected.get('encoding') or detect_encoding(string)
    except Exception, e:
-        print 222222222222, e
        pass
    return detect_encoding(string)

@@ -105,3 +107,81 @@ def get_delimiter(msg_body):
        delimiter = '\n'

    return delimiter
+
+
+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    s = _prepend_utf8_declaration(string)
+    s = s.replace("\n", "")
+
+    tree = html.fromstring(s)
+
+    for style in CSSSelector('style')(tree):
+        style.getparent().remove(style)
+
+    for c in tree.xpath('//comment()'):
+        c.getparent().remove(c)
+
+    text   = ""
+    for el in tree.iter():
+        el_text = (el.text or '') + (el.tail or '')
+        if len(el_text) > 1:
+            if el.tag in _BLOCKTAGS:
+                text += "\n"
+            if el.tag == 'li':
+                text += "  * "
+            text += el_text.strip() + " "
+
+            # add href to the output
+            href = el.attrib.get('href')
+            if href:
+                text += "(%s) " % href
+
+        if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
+            text += "\n"
+
+    retval = _rm_excessive_newlines(text)
+    return _encode_utf8(retval)
+
+
+def _contains_charset_spec(s):
+    """Return True if the first 4KB contain charset spec
+    """
+    return s.lower().find('html; charset=', 0, 4096) != -1
+
+
+def _prepend_utf8_declaration(s):
+    """Prepend 'utf-8' encoding declaration if the first 4KB don't have any
+    """
+    return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s
+
+
+def _rm_excessive_newlines(s):
+    """Remove excessive newlines that often happen due to tons of divs
+    """
+    return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
+
+
+def _encode_utf8(s):
+    """Encode in 'utf-8' if unicode
+    """
+    return s.encode('utf-8') if isinstance(s, unicode) else s
+
+
+_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
+                     'charset=utf-8">')
+
+
+_BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
+_HARDBREAKS = ['br', 'hr', 'tr']
+
+
+_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
--- a/tests/fixtures/html_replies/hotmail.html
+++ b/tests/fixtures/html_replies/hotmail.html
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <html>
 <head>
 <style><!--
--- a/tests/fixtures/html_replies/ms_outlook_2010.html
+++ b/tests/fixtures/html_replies/ms_outlook_2010.html
@@ -0,0 +1,87 @@
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
+<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
+<style><!--
+/* Font Definitions */
+@font-face
+	{font-family:Calibri;
+	panose-1:2 15 5 2 2 2 4 3 2 4;}
+@font-face
+	{font-family:Tahoma;
+	panose-1:2 11 6 4 3 5 4 4 2 4;}
+/* Style Definitions */
+p.MsoNormal, li.MsoNormal, div.MsoNormal
+	{margin:0in;
+	margin-bottom:.0001pt;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+h3
+	{mso-style-priority:9;
+	mso-style-link:"Heading 3 Char";
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:13.5pt;
+	font-family:"Times New Roman","serif";
+	font-weight:bold;}
+a:link, span.MsoHyperlink
+	{mso-style-priority:99;
+	color:blue;
+	text-decoration:underline;}
+a:visited, span.MsoHyperlinkFollowed
+	{mso-style-priority:99;
+	color:purple;
+	text-decoration:underline;}
+p
+	{mso-style-priority:99;
+	mso-margin-top-alt:auto;
+	margin-right:0in;
+	mso-margin-bottom-alt:auto;
+	margin-left:0in;
+	font-size:12.0pt;
+	font-family:"Times New Roman","serif";}
+span.Heading3Char
+	{mso-style-name:"Heading 3 Char";
+	mso-style-priority:9;
+	mso-style-link:"Heading 3";
+	font-family:"Cambria","serif";
+	color:#4F81BD;
+	font-weight:bold;}
+span.EmailStyle19
+	{mso-style-type:personal-reply;
+	font-family:"Calibri","sans-serif";
+	color:#1F497D;}
+.MsoChpDefault
+	{mso-style-type:export-only;
+	font-family:"Calibri","sans-serif";}
+@page WordSection1
+	{size:8.5in 11.0in;
+	margin:1.0in 1.0in 1.0in 1.0in;}
+div.WordSection1
+	{page:WordSection1;}
+--></style><!--[if gte mso 9]><xml>
+<o:shapedefaults v:ext="edit" spidmax="1026" />
+</xml><![endif]--><!--[if gte mso 9]><xml>
+<o:shapelayout v:ext="edit">
+<o:idmap v:ext="edit" data="1" />
+</o:shapelayout></xml><![endif]-->
+</head>
+<body lang="EN-US" link="blue" vlink="purple">
+<div class="WordSection1">
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
+<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
+<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
+<b>On Behalf Of </b>baz@bar.com<br>
+<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
+<b>To:</b> john@bar.com<br>
+<b>Cc:</b> jane@bar.io<br>
+<b>Subject:</b> Conversation<o:p></o:p></span></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+<p>Hello! How are you?<o:p></o:p></p>
+<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
+</div>
+</body>
+</html>
--- a/tests/fixtures/standard_replies/apple_mail_2.eml
+++ b/tests/fixtures/standard_replies/apple_mail_2.eml
@@ -0,0 +1,19 @@
+Content-Type: text/plain;
+	charset=us-ascii
+Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
+Subject: Re: Hello there
+X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
+From: Adam Renberg <adam@tictail.com>
+In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+Date: Sat, 22 Aug 2015 19:22:20 +0200
+Content-Transfer-Encoding: 7bit
+X-Smtp-Server: smtp.gmail.com:adam@tictail.com
+Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
+References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+To: Adam Renberg <tgwizard@gmail.com>
+
+Hello
+> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
+>
+> Hi there!
+
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -5,9 +5,7 @@ from . fixtures import *

 import regex as re

-from talon import quotations
-
-import html2text
+from talon import quotations, utils as u


 RE_WHITESPACE = re.compile("\s")
@@ -45,7 +43,7 @@ def test_quotation_splitter_outside_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p><div></div></body></html>",
+    eq_("<html><body><p>Reply</p></body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


@@ -63,7 +61,7 @@ def test_regular_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>",
+    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


@@ -133,6 +131,29 @@ def test_gmail_quote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


+def test_gmail_quote_compact():
+    msg_body = 'Reply' \
+               '<div class="gmail_quote">' \
+               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
+               '<div>Test</div>' \
+               '</div>' \
+               '</div>'
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_gmail_quote_blockquote():
+    msg_body = """Message
+<blockquote class="gmail_quote">
+  <div class="gmail_default">
+    My name is William Shakespeare.
+    <br/>
+  </div>
+</blockquote>"""
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def test_unicode_in_reply():
    msg_body = u"""Reply \xa0 \xa0 Text<br>

@@ -140,7 +161,7 @@ def test_unicode_in_reply():
  <br>
 </div>

-<blockquote class="gmail_quote">
+<blockquote>
  Quote
 </blockquote>""".encode("utf-8")

@@ -258,26 +279,35 @@ def test_reply_separated_by_hr():
            '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))


-RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
+def test_from_block_and_quotations_in_separate_divs():
+    msg_body = '''
+Reply
+<div>
+  <hr/>
+  <div>
+    <font>
+      <b>From: bob@example.com</b>
+      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
+    </font>
+  </div>
+  <div>
+    Quoted message
+  </div>
+</div>
+'''
+    eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


 def extract_reply_and_check(filename):
    f = open(filename)

-    msg_body = f.read().decode("utf-8")
+    msg_body = f.read()
    reply = quotations.extract_from_html(msg_body)
+    plain_reply = u.html_to_text(reply)

-    h = html2text.HTML2Text()
-    h.body_width = 0
-    plain_reply = h.handle(reply)
-
-    #remove &nbsp; spaces
-    plain_reply = plain_reply.replace(u'\xa0', u' ')
-
-    if RE_REPLY.match(plain_reply):
-        eq_(1, 1)
-    else:
-        eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
+    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
+        RE_WHITESPACE.sub('', plain_reply))


 def test_gmail_reply():
@@ -300,6 +330,10 @@ def test_ms_outlook_2007_reply():
    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")


+def test_ms_outlook_2010_reply():
+    extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
+
+
 def test_thunderbird_reply():
    extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")

@@ -310,3 +344,37 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
+
+
+def test_CRLF():
+    """CR is not converted to '&#13;'
+    """
+    symbol = '&#13;'
+    extracted = quotations.extract_from_html('<html>\r\n</html>')
+    assert_false(symbol in extracted)
+    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
+
+    msg_body = """Reply
+<blockquote>
+
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+    msg_body = msg_body.replace('\n', '\r\n')
+    extracted = quotations.extract_from_html(msg_body)
+    assert_false(symbol in extracted)    
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', extracted))
+
+
+def test_gmail_forwarded_msg():
+    msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
+</div><br></div>"""
+    extracted = quotations.extract_from_html(msg_body)
+    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
--- a/tests/signature/learning/featurespace_test.py
+++ b/tests/signature/learning/featurespace_test.py
@@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs


 def test_apply_features():
-    s = '''John Doe
+    s = '''This is John Doe
+
+Tuesday @3pm suits. I'll chat to you then.

 VP Research and Development, Xxxx Xxxx Xxxxx

@@ -19,11 +21,12 @@ john@example.com'''
    # note that we don't consider the first line because signatures don't
    # usually take all the text, empty lines are not considered
    eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

-    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
+    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
        features = fs.features(sender)
        new_result = fs.apply_features(s, features)
        # result remains the same because we don't consider empty lines
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_pattern_sent_from_samsung_smb_wrote():
+    msg_body = """Test reply
+
+Sent from Samsung MobileName <address@example.com> wrote:
+
+>
+> Test
+>
+> Roman"""
+
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_pattern_on_date_wrote_somebody():
    eq_('Lorem', quotations.extract_from_plain(
    """Lorem
@@ -54,6 +67,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_date_time_email_splitter():
+    msg_body = """Test reply
+
+2014-10-17 11:28 GMT+03:00 Postmaster <
+postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
+
+> First from site
+>
+    """
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_pattern_on_date_somebody_wrote_allows_space_in_front():
    msg_body = """Thanks Thanmai
 On Mar 8, 2012 9:59 AM, "Example.com" <
@@ -311,6 +336,33 @@ Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))

+def test_swedish_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
+    u"""Allo! Follow up MIME!
+Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
+Skickat: den 26 augusti 2015 14:45
+Till: Isacson Leiff
+Ämne: RE: Week 36
+
+Blah-blah-blah
+"""))
+
+def test_swedish_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    """Lorem
+Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
+def test_norwegian_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    u"""Lorem
+På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud. 
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
    detect_encoding.return_value = 'utf-8'
    eq_('utf-8', u.quick_detect_encoding("qwe"))
    ok_(detect_encoding.called)
+
+
+def test_html_to_text():
+    html = """<body>
+<p>Hello world!</p>
+<br>
+<ul>
+<li>One!</li>
+<li>Two</li>
+</ul>
+<p>
+Haha
+</p>
+</body>"""
+    text = u.html_to_text(html)
+    eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text)
+    eq_("привет!", u.html_to_text("<b>привет!</b>"))
+
+    html = '<body><br/><br/>Hi</body>'
+    eq_ ('Hi', u.html_to_text(html))
+
+    html = """Hi
+<style type="text/css">
+
+div, p, li {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>
+
+<style type="text/css">
+
+h1 {
+
+font: 13px 'Lucida Grande', Arial, sans-serif;
+
+}
+</style>"""
+    eq_ ('Hi', u.html_to_text(html))
+
+    html = """<div>
+<!-- COMMENT 1 -->
+<span>TEXT 1</span>
+<p>TEXT 2 <!-- COMMENT 2 --></p>
+</div>"""
+    eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))
Author	SHA1	Message	Date
Sergey Obukhov	5bcf7403ad	Merge pull request #94 from mailgun/obukhov-sergey-patch-1 Update README.rst	2016-05-31 20:16:13 -07:00
Sergey Obukhov	2d6c092b65	bump version	2016-05-31 18:42:47 -07:00
Sergey Obukhov	6d0689cad6	Update README.rst	2016-05-31 18:39:07 -07:00
Sergey Obukhov	3f80e93ee0	Merge pull request #93 from mailgun/sergey/version-bump bump	2016-05-31 18:15:28 -07:00
Sergey Obukhov	1b18abab1d	bump	2016-05-31 16:53:41 -07:00
Sergey Obukhov	03dd5af5ab	Merge pull request #91 from KevinCathcart/patch-1 Support outlook 2007/2010 running in en-us locale	2016-05-31 16:50:35 -07:00
Sergey Obukhov	dfba82b07c	Merge pull request #92 from mailgun/obukhov-sergey-kuntzcamera Update README.rst	2016-05-31 15:42:34 -07:00
Sergey Obukhov	08ca02c87f	Update README.rst	2016-05-31 15:14:32 -07:00
Kevin Cathcart	b61f4ec095	Support outlook 2007/2010 running in en-us locale My American English copy of outlook 2007 is using inches in the reply separator rather than centimeters. The separator is otherwise Identical. What a strange thing to localize. I'm guessing it uses whatever it thinks the preferred units for page margins are.	2016-05-23 17:23:53 -04:00
Sergey Obukhov	9dbe6a494b	Merge pull request #90 from mailgun/sergey/89 fixes mailgun/talon#89	2016-05-17 16:01:56 -07:00
Sergey Obukhov	44e70939d6	fixes mailgun/talon#89	2016-05-17 15:31:01 -07:00
Sergey Obukhov	ab6066eafa	Merge pull request #87 from mailgun/sergey/1.2.6 bump up version	2016-04-07 17:54:12 -07:00
Sergey Obukhov	42258cdd36	bump up version	2016-04-07 17:51:48 -07:00
Sergey Obukhov	d3de9e6893	Merge pull request #86 from dougkeen/master Fix #85 (exception when stripping gmail quotes)	2016-04-07 17:47:38 -07:00
Doug Keen	333beb94af	Fix #85 (exception when stripping gmail quotes)	2016-04-04 14:22:50 -07:00
Sergey Obukhov	f3c0942c49	Merge pull request #80 from mailgun/sergey/12 fixes mailgun/talon#12	2016-03-04 13:33:46 -08:00
Sergey Obukhov	02adf53ab9	fixes mailgun/talon#12	2016-03-04 13:14:50 -08:00
Sergey Obukhov	3497b5cab4	Merge pull request #79 from mailgun/sergey/version bump version	2016-02-29 15:13:51 -08:00
Sergey Obukhov	9c17dca17c	bump version	2016-02-29 14:50:52 -08:00
Sergey Obukhov	de342d3177	Merge pull request #78 from defkev/master Added Zimbra HTML quotation extraction	2016-02-29 14:14:09 -08:00
defkev	743b452daf	Added Zimbra HTML quotation extraction	2016-02-21 16:56:52 +01:00
Sergey Obukhov	c762f3c337	Merge pull request #77 from mailgun/sergey/fix-gmail-fwd fixes mailgun/talon#18	2016-02-19 19:08:37 -08:00
Sergey Obukhov	31803d41bc	fixes mailgun/talon#18	2016-02-19 19:07:10 -08:00
Sergey Obukhov	2ecd9779fc	bump up version	2016-02-19 18:32:07 -08:00
Sergey Obukhov	5a7047233e	Merge pull request #76 from mailgun/sergey/fix-date-splitter fixes mailgun/talon#19	2016-02-19 18:28:23 -08:00
Sergey Obukhov	999e9c3725	fixes mailgun/talon#19	2016-02-19 17:53:52 -08:00
Sergey Obukhov	f6940fe878	bump up version	2015-12-18 19:15:58 -08:00
Sergey Obukhov	ce65ff8fc8	Merge pull request #71 from clara-labs/ms-2010-issue First pass at handling issue with ms outlook 2010 with unenclosed quo…	2015-12-18 19:14:13 -08:00
Sergey Obukhov	eed6784f25	Merge pull request #70 from mailgun/sergey/gmail fixes mailgun/talon#38 mailgun/talon#20	2015-12-18 19:00:13 -08:00
Sergey Obukhov	3d9ae356ea	add more tests, make standard reply tests more relaxed	2015-12-18 18:56:41 -08:00
Carlos Correa	f688d074b5	First pass at handling issue with ms outlook 2010 with unenclosed quoted text.	2015-12-10 19:16:13 -08:00
Sergey Obukhov	41457d8fbd	fixes mailgun/talon#38 mailgun/talon#20	2015-12-05 00:37:02 -08:00
Sergey Obukhov	2c416ecc0e	Merge pull request #62 from tgwizard/better-support-for-scandinavian-languages Add better support for Scandinavian languages	2015-10-14 21:48:10 -07:00
Sergey Obukhov	3ab33c557b	Merge pull request #65 from mailgun/sergey/cssselect add cssselect to dependencies	2015-10-14 20:34:02 -07:00
Sergey Obukhov	8db05f4950	add cssselect to dependencies	2015-10-14 20:31:26 -07:00
Sergey Obukhov	3d5bc82a03	Merge pull request #61 from tgwizard/fix-for-apple-mail Add fix for Apple Mail email format	2015-10-14 12:38:06 -07:00
Adam Renberg	14e3a0d80b	Add better support for Scandinavian languages This is a port of https://github.com/tictail/claw/pull/6 by @simonflore.	2015-09-21 21:42:01 +02:00
Adam Renberg	fcd9e2716a	Add fix for Apple Mail email format Where they have an initial > on the "date line".	2015-09-21 21:33:57 +02:00
Sergey Obukhov	d62d633215	bump up version	2015-09-21 09:55:51 -07:00
Sergey Obukhov	3b0c9273c1	Merge pull request #60 from mailgun/sergey/26 fixes mailgun/talon#26	2015-09-21 09:54:35 -07:00
Sergey Obukhov	e4c1c11845	remove print	2015-09-21 09:52:47 -07:00
Sergey Obukhov	ae508fe0e5	fixes mailgun/talon#26	2015-09-21 09:51:26 -07:00
Sergey Obukhov	2cb9b5399c	bump up version	2015-09-18 05:23:29 -07:00
Sergey Obukhov	134c47f515	Merge pull request #59 from mailgun/sergey/43 fixes mailgun/talon#43	2015-09-18 05:20:51 -07:00
Sergey Obukhov	d328c9d128	fixes mailgun/talon#43	2015-09-18 05:19:59 -07:00
Sergey Obukhov	77b62b0fef	Merge pull request #58 from mailgun/sergey/52 fixes mailgun/talon#52	2015-09-18 04:48:50 -07:00
Sergey Obukhov	ad09b18f3f	fixes mailgun/talon#52	2015-09-18 04:47:23 -07:00