Merge pull request #138 from mailgun/sergey/v1.3.7

bumped talon version
2017-04-25 11:49:29 -07:00 · 2017-04-25 11:43:55 -07:00 · 2017-04-25 11:34:47 -07:00 · 2017-04-25 11:30:53 -07:00 · 2017-04-25 11:29:06 -07:00 · 2017-04-25 11:19:01 -07:00
4 changed files with 172 additions and 42 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):


 setup(name='talon',
-      version='1.3.3',
+      version='1.3.7',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
          "regex>=1",
          "numpy",
          "scipy",
-          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
+          "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
          'cssselect',
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -139,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
    ))), re.I)

+# ---- John Smith wrote ----
+RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
+    u'|'.join((
+        # English
+        'wrote'
+    ))), re.I)
+
+# Support polymail.io reply format
+# On Tue, Apr 11, 2017 at 10:07 PM John Smith
+#
+# <
+# mailto:John Smith <johnsmith@gmail.com>
+# > wrote:
+RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I)
+
 SPLITTER_PATTERNS = [
    RE_ORIGINAL_MESSAGE,
    RE_ON_DATE_SMB_WROTE,
@@ -154,16 +169,17 @@ SPLITTER_PATTERNS = [
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:'),
    # Sent from Samsung MobileName <address@example.com> wrote:
-    re.compile('Sent from Samsung .*@.*> wrote')
+    re.compile('Sent from Samsung .*@.*> wrote'),
+    RE_ANDROID_WROTE,
+    RE_POLYMAIL
    ]

-
 RE_LINK = re.compile('<(http://[^>]*)>')
 RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

 RE_PARENTHESIS_LINK = re.compile("\(https?://")

-SPLITTER_MAX_LINES = 4
+SPLITTER_MAX_LINES = 6
 MAX_LINES_COUNT = 1000
 # an extensive research shows that exceeding this limit
 # leads to excessive processing time
@@ -188,6 +204,19 @@ def extract_from(msg_body, content_type='text/plain'):
    return msg_body


+def remove_initial_spaces_and_mark_message_lines(lines):
+    """
+    Removes the initial spaces in each line before marking message lines.
+
+    This ensures headers can be identified if they are indented with spaces.
+    """
+    i = 0
+    while i < len(lines):
+        lines[i] = lines[i].lstrip(' ')
+        i += 1
+    return mark_message_lines(lines)
+
+
 def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

@@ -290,9 +319,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):

    Converts msg_body into a unicode.
    """
-    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
-    # so that '>' closing the link couldn't be mistakenly taken for quotation
-    # marker.
+    msg_body = _replace_link_brackets(msg_body)
+
+    msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
+
+    return msg_body
+
+
+def _replace_link_brackets(msg_body):
+    """
+    Normalize links i.e. replace '<', '>' wrapping the link with some symbols
+    so that '>' closing the link couldn't be mistakenly taken for quotation
+    marker.
+
+    Converts msg_body into a unicode string.
+    """
    if isinstance(msg_body, bytes):
        msg_body = msg_body.decode('utf8')

@@ -304,7 +345,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
+    return msg_body

+
+def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
+    """
+    Splits line in two if splitter pattern preceded by some text on the same
+    line (done only for 'On <date> <person> wrote:' pattern).
+    """
    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
@@ -455,19 +503,22 @@ def _extract_from_html(msg_body):

 def split_emails(msg):
    """
-    Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify
-     split lines, content lines and empty lines.
+    Given a message (which may consist of an email conversation thread with
+    multiple emails), mark the lines to identify split lines, content lines and
+    empty lines.

-    Correct the split line markers inside header blocks. Header blocks are identified by the regular expression
-    RE_HEADER.
+    Correct the split line markers inside header blocks. Header blocks are
+    identified by the regular expression RE_HEADER.

    Return the corrected markers
    """
-    delimiter = get_delimiter(msg)
-    msg_body = preprocess(msg, delimiter)
+    msg_body = _replace_link_brackets(msg)
+
    # don't process too long messages
    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
-    markers = mark_message_lines(lines)
+    markers = remove_initial_spaces_and_mark_message_lines(lines)
+
+    markers = _mark_quoted_email_splitlines(markers, lines)

    # we don't want splitlines in header blocks
    markers = _correct_splitlines_in_headers(markers, lines)
@@ -475,18 +526,43 @@ def split_emails(msg):
    return markers


+def _mark_quoted_email_splitlines(markers, lines):
+    """
+    When there are headers indented with '>' characters, this method will
+    attempt to identify if the header is a splitline header. If it is, then we
+    mark it with 's' instead of leaving it as 'm' and return the new markers.
+    """
+    # Create a list of markers to easily alter specific characters
+    markerlist = list(markers)
+    for i, line in enumerate(lines):
+        if markerlist[i] != 'm':
+            continue
+        for pattern in SPLITTER_PATTERNS:
+            matcher = re.search(pattern, line)
+            if matcher:
+                markerlist[i] = 's'
+                break
+
+    return "".join(markerlist)
+
+
 def _correct_splitlines_in_headers(markers, lines):
-    """Corrects markers by removing splitlines deemed to be inside header blocks"""
+    """
+    Corrects markers by removing splitlines deemed to be inside header blocks.
+    """
    updated_markers = ""
    i = 0
    in_header_block = False

    for m in markers:
-        # Only set in_header_block flag true when we hit an 's' and the line is a header.
+        # Only set in_header_block flag when we hit an 's' and line is a header
        if m == 's':
            if not in_header_block:
                if bool(re.search(RE_HEADER, lines[i])):
                    in_header_block = True
+            else:
+                if QUOT_PATTERN.match(lines[i]):
+                    m = 'm'
                else:
                    m = 't'

--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:

    eq_("Test reply", quotations.extract_from_plain(msg_body))

+def test_pattern_on_date_polymail():
+    msg_body = """Test reply
+
+On Tue, Apr 11, 2017 at 10:07 PM John Smith
+
+<
+mailto:John Smith <johnsmith@gmail.com>
+> wrote:
+Test quoted data
+"""
+
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+

 def test_pattern_sent_from_samsung_smb_wrote():
    msg_body = """Test reply
@@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator):
 -----{}-----

 Test"""
-    eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
+    eq_('Test reply', quotations.extract_from_plain(
+        msg_body.format(six.text_type(original_message_indicator))))

 def test_english_original_message():
    _check_pattern_original_message('Original Message')
@@ -165,6 +179,17 @@ Test reply"""
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_android_wrote():
+    msg_body = """Test reply
+
+---- John Smith wrote ----
+
+> quoted
+> text
+"""
+    eq_("Test reply", quotations.extract_from_plain(msg_body))
+
+
 def test_reply_wraps_quotations():
    msg_body = """Test reply

@@ -713,10 +738,35 @@ Attachments: none

    Hello.

-- Original Message --
-On 24th February 2016 at 09.32am Conal Wrote:
+        On 24th February 2016 at 09.32am, Conal wrote:
+
        Hey!
+
+        On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
+        > Mohan,
+        >
+        > We have not yet migrated the systems.
+        >
+        > Dan
+        >
+        > > -----Original Message-----
+        > > Date: Mon, 2 Apr 2012 17:44:22 +0400
+        > > Subject: Test
+        > > From: bob@xxx.mailgun.org
+        > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+        > >
+        > > Hi
+        > >
+        > > > From: bob@xxx.mailgun.org
+        > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+        > > > Date: Mon, 2 Apr 2012 17:44:22 +0400
+        > > > Subject: Test
+        > > > Hi
+        > > >
+        > >
+        >
+        >
 """
-    expected_markers = "stttttsttttetestt"
+    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -29,7 +29,9 @@ def test_unicode():

 def test_detect_encoding():
    eq_ ('ascii', u.detect_encoding(b'qwe').lower())
-    eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
+    ok_ (u.detect_encoding(
+        u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
+            'iso-8859-1', 'iso-8859-2'])
    eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
    # fallback to utf-8
    with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():

 def test_quick_detect_encoding():
    eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
-    eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
+    ok_ (u.quick_detect_encoding(
+        u'Versi\xf3n'.encode('windows-1252')).lower() in [
+            'windows-1252', 'windows-1250'])
    eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
Author	SHA1	Message	Date
Sergey Obukhov	f16ae5110b	Merge pull request #138 from mailgun/sergey/v1.3.7 bumped talon version	2017-04-25 11:49:29 -07:00
Sergey Obukhov	ab5cbe5ec3	bumped talon version	2017-04-25 11:43:55 -07:00
Sergey Obukhov	be5da92f16	Merge pull request #135 from esetnik/polymail_support Polymail Quote Support	2017-04-25 11:34:47 -07:00
Sergey Obukhov	95954a65a0	Merge branch 'master' into polymail_support	2017-04-25 11:30:53 -07:00
Sergey Obukhov	0b55e8fa77	Merge pull request #137 from mailgun/sergey/chardet loosen the encoding requirement for detect_encoding	2017-04-25 11:29:06 -07:00
Sergey Obukhov	6f159e8959	loosen the encoding requirement for detect_encoding	2017-04-25 11:19:01 -07:00
Ethan Setnik	5c413b4b00	allow more lines since polymail has extra whitespace	2017-04-12 00:07:29 -04:00
Ethan Setnik	cca64d3ed1	add test case	2017-04-11 23:36:36 -04:00
Ethan Setnik	e11eaf6ff8	add support for polymail reply format	2017-04-11 22:38:29 -04:00
Sergey Obukhov	85a4c1d855	Merge pull request #133 from mailgun/sergey/android add android quotation pattern	2017-04-10 16:37:17 -07:00
Sergey Obukhov	0f5e72623b	add android quotation pattern	2017-04-10 16:33:21 -07:00
Sergey Obukhov	061e549ad7	Merge pull request #128 from mailgun/sergey/1.3.4 bump version	2017-02-14 11:17:35 -08:00
Sergey Obukhov	49d1a5d248	bump version	2017-02-14 11:05:50 -08:00
Sergey Obukhov	03d6b00db8	Merge pull request #127 from conalsmith49/mark-splitlines-in-email-quotation-indents Split_Email(): Mark splitlines for headers indented with spaces or email quotation indents (">")	2017-02-14 11:03:51 -08:00
smitcona	a2eb0f7201	Creating new method which removes initial spaces and marks the message lines. Removing ambiguity introduced to mark_message_lines	2017-02-14 18:19:45 +00:00
smitcona	5c71a0ca07	Split the comment lines so that they are not over 80 characters	2017-02-13 16:45:26 +00:00
Sergey Obukhov	489d16fad9	Merge branch 'master' into mark-splitlines-in-email-quotation-indents	2017-02-09 21:10:16 -08:00
Sergey Obukhov	a458707777	Merge pull request #124 from phanindra-ramesh/issue_123 Fixes issue #123	2017-02-09 20:55:36 -08:00
smitcona	a1d0a86305	Pass ignore_initial_spaces=True as this has better clarity than separate boolean variable	2017-02-07 12:47:33 +00:00
smitcona	29f1d21be7	fixed expected markers and incorrect condensed header not matching regex	2017-02-06 15:03:22 +00:00
smitcona	34c5b526c3	Remove the whitespace before the line if the flag is set	2017-02-03 12:57:26 +00:00
smitcona	3edb6578ba	Dividing preprocess method into two methods, split_emails() now calls one without email content being altered.	2017-02-03 11:49:23 +00:00
smitcona	984c036b6e	Set the marker back to 'm' rather than 't' if it matches the QUOT_PATTERN. Updated test case.	2017-02-01 18:28:19 +00:00
smitcona	a403ecb5c9	Adding two level indentation test	2017-02-01 18:09:35 +00:00
smitcona	a44713409c	Added additional case for testing new functionality of split_emails()	2017-02-01 17:40:59 +00:00
smitcona	567467b8ed	Update comment	2017-02-01 17:29:05 +00:00
smitcona	139edd6104	Add new method which marks as splitlines, lines which are splitlines but start with email quotation indents ("> ")	2017-02-01 17:16:30 +00:00
Phanindra Ramesh Challa	e756d55abf	Fixes issue #123	2016-12-27 13:53:40 +05:30