Merge pull request #93 from mailgun/sergey/version-bump

bump
2016-05-31 18:15:28 -07:00 · 2016-05-31 16:53:41 -07:00 · 2016-05-31 16:50:35 -07:00 · 2016-05-31 15:42:34 -07:00 · 2016-05-31 15:14:32 -07:00 · 2016-05-23 17:23:53 -04:00
6 changed files with 99 additions and 6 deletions
--- a/README.rst
+++ b/README.rst
@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
 apply to a message (``featurespace.py``), how data sets are built
 (``dataset.py``), classifier’s interface (``classifier.py``).

-The data used for training is taken from our personal email
+Currently, the data used for training is taken from our personal email
 conversations and from `ENRON`_ dataset. As a result of applying our set
 of features to the dataset we provide files ``classifier`` and
 ``train.data`` that don’t have any personal information but could be
@@ -116,8 +116,18 @@ or
    from talon.signature.learning.classifier import train, init
    train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)

+Open-source Dataset
+-------------------------
+
+Recently we started the `kuntzcamera`_ project to create an open-source, annotated dataset of raw emails. The project
+uses a subset of the `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190
+emails are annotated. Contributions to and collaboration on the project are welcome. Once the dataset is ready we plan to
+start using it for talon.
+
 .. _scikit-learn: http://scikit-learn.org
 .. _ENRON: https://www.cs.cmu.edu/~enron/
+.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
+.. _kuntzcamera: https://github.com/mailgun/kuntzcamera

 Research
 --------
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.2.2',
+      version='1.2.8',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)

 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
+RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)


 def add_checkpoint(html_note, counter):
@@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
 def cut_gmail_quote(html_message):
    ''' Cuts the outermost block element with class gmail_quote. '''
    gmail_quote = html_message.cssselect('div.gmail_quote')
-    if gmail_quote:
+    if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
        gmail_quote[0].getparent().remove(gmail_quote[0])
        return True

@@ -85,9 +86,12 @@ def cut_gmail_quote(html_message):
 def cut_microsoft_quote(html_message):
    ''' Cuts splitter block and all following blocks. '''
    splitter = html_message.xpath(
-        #outlook 2007, 2010
+        #outlook 2007, 2010 (international)
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
+        #outlook 2007, 2010 (american)
+        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
+        "padding:3.0pt 0in 0in 0in']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
@@ -172,8 +176,23 @@ def cut_from_block(html_message):
            parent_div_is_all_content = (
                maybe_body is not None and maybe_body.tag == 'body' and
                len(maybe_body.getchildren()) == 1)
+
            if not parent_div_is_all_content:
-                block.getparent().remove(block)
+                parent = block.getparent()
+                next_sibling = block.getnext()
+
+                # remove all tags after found From block
+                # (From block and quoted message are in separate divs)
+                while next_sibling is not None:
+                    parent.remove(block)
+                    block = next_sibling
+                    next_sibling = block.getnext()
+
+                # remove the last sibling (or the
+                # From block if no siblings)
+                if block is not None:
+                    parent.remove(block)
+
                return True
        else:
            return False
@@ -185,7 +204,17 @@ def cut_from_block(html_message):
         "//*[starts-with(mg:tail(), 'Date:')]"))
    if block:
        block = block[0]
+
+        if RE_FWD.match(block.getparent().text or ''):
+            return False
+        
        while(block.getnext() is not None):
            block.getparent().remove(block.getnext())
        block.getparent().remove(block)
        return True
+
+def cut_zimbra_quote(html_message):
+    zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
+    if zDivider:
+        zDivider[0].getparent().remove(zDivider[0])
+        return True
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -148,7 +148,9 @@ SPLITTER_PATTERNS = [
    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
-               '( \S+){3,6}@\S+:')
+               '( \S+){3,6}@\S+:'),
+    # Sent from Samsung MobileName <address@example.com> wrote:
+    re.compile('Sent from Samsung .*@.*> wrote')
    ]


@@ -350,6 +352,7 @@ def extract_from_html(msg_body):
        parser=html.HTMLParser(encoding="utf-8")
    )
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
+                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -131,6 +131,17 @@ def test_gmail_quote():
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))


+def test_gmail_quote_compact():
+    msg_body = 'Reply' \
+               '<div class="gmail_quote">' \
+               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
+               '<div>Test</div>' \
+               '</div>' \
+               '</div>'
+    eq_("<html><body><p>Reply</p></body></html>",
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def test_gmail_quote_blockquote():
    msg_body = """Message
 <blockquote class="gmail_quote">
@@ -268,6 +279,26 @@ def test_reply_separated_by_hr():
            '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))


+def test_from_block_and_quotations_in_separate_divs():
+    msg_body = '''
+Reply
+<div>
+  <hr/>
+  <div>
+    <font>
+      <b>From: bob@example.com</b>
+      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
+    </font>
+  </div>
+  <div>
+    Quoted message
+  </div>
+</div>
+'''
+    eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
 def extract_reply_and_check(filename):
    f = open(filename)

@@ -340,3 +371,10 @@ def test_CRLF():
    assert_false(symbol in extracted)    
    eq_("<html><body><p>Reply</p></body></html>",
        RE_WHITESPACE.sub('', extracted))
+
+
+def test_gmail_forwarded_msg():
+    msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
+</div><br></div>"""
+    extracted = quotations.extract_from_html(msg_body)
+    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_pattern_sent_from_samsung_smb_wrote():
+    msg_body = """Test reply
+
+Sent from Samsung MobileName <address@example.com> wrote:
+
+>
+> Test
+>
+> Roman"""
+
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_pattern_on_date_wrote_somebody():
    eq_('Lorem', quotations.extract_from_plain(
    """Lorem
Author	SHA1	Message	Date
Sergey Obukhov	3f80e93ee0	Merge pull request #93 from mailgun/sergey/version-bump bump	2016-05-31 18:15:28 -07:00
Sergey Obukhov	1b18abab1d	bump	2016-05-31 16:53:41 -07:00
Sergey Obukhov	03dd5af5ab	Merge pull request #91 from KevinCathcart/patch-1 Support outlook 2007/2010 running in en-us locale	2016-05-31 16:50:35 -07:00
Sergey Obukhov	dfba82b07c	Merge pull request #92 from mailgun/obukhov-sergey-kuntzcamera Update README.rst	2016-05-31 15:42:34 -07:00
Sergey Obukhov	08ca02c87f	Update README.rst	2016-05-31 15:14:32 -07:00
Kevin Cathcart	b61f4ec095	Support outlook 2007/2010 running in en-us locale My American English copy of outlook 2007 is using inches in the reply separator rather than centimeters. The separator is otherwise identical. What a strange thing to localize. I'm guessing it uses whatever it thinks the preferred units for page margins are.	2016-05-23 17:23:53 -04:00
Sergey Obukhov	9dbe6a494b	Merge pull request #90 from mailgun/sergey/89 fixes mailgun/talon#89	2016-05-17 16:01:56 -07:00
Sergey Obukhov	44e70939d6	fixes mailgun/talon#89	2016-05-17 15:31:01 -07:00
Sergey Obukhov	ab6066eafa	Merge pull request #87 from mailgun/sergey/1.2.6 bump up version	2016-04-07 17:54:12 -07:00
Sergey Obukhov	42258cdd36	bump up version	2016-04-07 17:51:48 -07:00
Sergey Obukhov	d3de9e6893	Merge pull request #86 from dougkeen/master Fix #85 (exception when stripping gmail quotes)	2016-04-07 17:47:38 -07:00
Doug Keen	333beb94af	Fix #85 (exception when stripping gmail quotes)	2016-04-04 14:22:50 -07:00
Sergey Obukhov	f3c0942c49	Merge pull request #80 from mailgun/sergey/12 fixes mailgun/talon#12	2016-03-04 13:33:46 -08:00
Sergey Obukhov	02adf53ab9	fixes mailgun/talon#12	2016-03-04 13:14:50 -08:00
Sergey Obukhov	3497b5cab4	Merge pull request #79 from mailgun/sergey/version bump version	2016-02-29 15:13:51 -08:00
Sergey Obukhov	9c17dca17c	bump version	2016-02-29 14:50:52 -08:00
Sergey Obukhov	de342d3177	Merge pull request #78 from defkev/master Added Zimbra HTML quotation extraction	2016-02-29 14:14:09 -08:00
defkev	743b452daf	Added Zimbra HTML quotation extraction	2016-02-21 16:56:52 +01:00
Sergey Obukhov	c762f3c337	Merge pull request #77 from mailgun/sergey/fix-gmail-fwd fixes mailgun/talon#18	2016-02-19 19:08:37 -08:00
Sergey Obukhov	31803d41bc	fixes mailgun/talon#18	2016-02-19 19:07:10 -08:00