Merge pull request #41 from simonflore/master

New splitter pattern for Dutch mail replies
Revert "Change of behavior when msg_body has more than 1000 lines"
2015-04-22 12:17:39 -07:00 · 2015-04-16 13:26:17 +02:00 · 2015-04-16 13:22:18 +02:00 · 2015-04-15 13:55:59 +02:00 · 2015-04-15 13:55:17 +02:00 · 2015-04-14 18:52:45 +02:00
7 changed files with 206 additions and 43 deletions
--- a/README.rst
+++ b/README.rst
@@ -3,7 +3,7 @@ talon

 Mailgun library to extract message quotations and signatures.

-If you ever tried to parse message quotations or signatures you know that absense of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile:
+If you ever tried to parse message quotations or signatures you know that absence of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile:

 Usage
 -----
@@ -71,6 +71,11 @@ the power of machine learning algorithms:

 .. code:: python

+    import talon
+    # don't forget to init the library first
+    # it loads machine learning classifiers
+    talon.init()
+
    from talon import signature


--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,8 @@ setup(name='talon',
          "html2text",
          "nose==1.2.1",
          "mock",
-          "coverage"
+          "coverage",
+          "flanker"
          ]
      )

--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -23,14 +23,49 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    r'''
-    (
-        -*  # could include dashes
-        [ ]?On[ ].*,  # date part ends with comma
-        (.*\n){0,2}  # splitter takes 4 lines at most
-        .*(wrote|sent):
+    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+        # Beginning of the line
+        u'|'.join((
+            # English
+            'On',
+            # French
+            'Le',
+            # Polish
+            'W dniu',
+            # Dutch
+            'Op'
+        )),
+        # Date and sender separator
+        u'|'.join((
+            # most languages separate date and sender address by comma
+            ',',
+            # Polish date and sender address separator
+            u'użytkownik'
+        )),
+        # Ending of the line
+        u'|'.join((
+            # English
+            'wrote', 'sent',
+            # French
+            u'a écrit',
+            # Polish
+            u'napisał',
+            # Dutch
+            'schreef','verzond','geschreven'
+        ))
+    ))
+# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
+RE_ON_DATE_WROTE_SMB = re.compile(
+    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
+        # Beginning of the line
+        	'Op',
+        # Ending of the line
+        u'|'.join((
+            # Dutch
+            'schreef','verzond','geschreven'
+        ))
+    )
    )
-    ''', re.VERBOSE)

 RE_QUOTATION = re.compile(
    r'''
@@ -66,13 +101,33 @@ RE_EMPTY_QUOTATION = re.compile(
    e*
    ''', re.VERBOSE)

+# ------Original Message------ or ---- Reply Message ----
+# With variations in other languages.
+RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
+    u'|'.join((
+        # English
+        'Original Message', 'Reply Message',
+        # German
+        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
+        # Danish
+        'Oprindelig meddelelse',
+    ))), re.I)
+
+RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
+    u'|'.join((
+        # "From" in different languages.
+        'From', 'Van', 'De', 'Von', 'Fra',
+        # "Date" in different languages.
+        'Date', 'Datum', u'Envoyé'
+    ))), re.I)
+
 SPLITTER_PATTERNS = [
-    # ------Original Message------ or ---- Reply Message ----
-    re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
+    RE_ORIGINAL_MESSAGE,
    # <date> <person>
    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
-    re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
+    RE_ON_DATE_WROTE_SMB,
+    RE_FROM_COLON_OR_DATE_COLON,
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]
@@ -81,7 +136,7 @@ SPLITTER_PATTERNS = [
 RE_LINK = re.compile('<(http://[^>]*)>')
 RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

-RE_PARANTHESIS_LINK = re.compile("\(https?://")
+RE_PARENTHESIS_LINK = re.compile("\(https?://")

 SPLITTER_MAX_LINES = 4
 MAX_LINES_COUNT = 1000
@@ -169,8 +224,8 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (
-            RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
-            RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip()))
+            RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
+            RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
        if not links:
            return_flags[:] = [False, -1, -1]
            return lines
@@ -197,7 +252,7 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they couldn't be taken for quotation marker.
-    Splits line in two if splitter pattern preceeded by some text on the same
+    Splits line in two if splitter pattern preceded by some text on the same
    line (done only for 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
@@ -213,7 +268,7 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
-        """Wrapps splitter with new line"""
+        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
@@ -268,7 +323,7 @@ def extract_from_html(msg_body):
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
-    then deleting neccessary tags.
+    then deleting necessary tags.
    """

    if msg_body.strip() == '':
--- a/talon/signature/bruteforce.py
+++ b/talon/signature/bruteforce.py
@@ -49,7 +49,7 @@ RE_PHONE_SIGNATURE = re.compile(r'''
 # c - could be signature line
 # d - line starts with dashes (could be signature or list item)
 # l - long line
-RE_SIGNATURE_CANDIDAATE = re.compile(r'''
+RE_SIGNATURE_CANDIDATE = re.compile(r'''
    (?P<candidate>c+d)[^d]
    |
    (?P<candidate>c+d)$
@@ -184,5 +184,5 @@ def _process_marked_candidate_indexes(candidate, markers):
    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
    [15, 17]
    """
-    match = RE_SIGNATURE_CANDIDAATE.match(markers[::-1])
+    match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
    return candidate[-match.end('candidate'):] if match else []
--- a/talon/signature/learning/featurespace.py
+++ b/talon/signature/learning/featurespace.py
@@ -1,13 +1,14 @@
 # -*- coding: utf-8 -*-

-""" The module provides functions for convertion of a message body/body lines
+""" The module provides functions for conversion of a message body/body lines
 into classifiers features space.

 The body and the message sender string are converted into unicode before
 applying features to them.
 """

-from talon.signature.constants import SIGNATURE_MAX_LINES
+from talon.signature.constants import (SIGNATURE_MAX_LINES,
+                                       TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.helpers import *


@@ -20,7 +21,7 @@ def features(sender=''):
        # This one is not from paper.
        # Line is too long.
        # This one is less aggressive than `Line is too short`
-        lambda line: 1 if len(line) > 60 else 0,
+        lambda line: 1 if len(line) > TOO_LONG_SIGNATURE_LINE else 0,
        # Line contains email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains url.
@@ -47,9 +48,9 @@ def apply_features(body, features):
    '''Applies features to message body lines.

    Returns list of lists. Each of the lists corresponds to the body line
-    and is constituted by the numbers of features occurances (0 or 1).
+    and is constituted by the numbers of features occurrences (0 or 1).
    E.g. if element j of list i equals 1 this means that
-    feature j occured in line i (counting from the last line of the body).
+    feature j occurred in line i (counting from the last line of the body).
    '''
    # collect all non empty lines
    lines = [line for line in body.splitlines() if line.strip()]
@@ -66,7 +67,7 @@ def build_pattern(body, features):
    '''Converts body into a pattern i.e. a point in the features space.

    Applies features to the body lines and sums up the results.
-    Elements of the pattern indicate how many times a certain feature occured
+    Elements of the pattern indicate how many times a certain feature occurred
    in the last lines of the body.
    '''
    line_patterns = apply_features(body, features)
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -94,7 +94,7 @@ def binary_regex_match(prog):


 def flatten_list(list_to_flatten):
-    """Simple list comprehesion to flatten list.
+    """Simple list comprehension to flatten list.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
@@ -155,7 +155,7 @@ def extract_names(sender):


 def categories_percent(s, categories):
-    '''Returns category characters persent.
+    '''Returns category characters percent.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
@@ -177,7 +177,7 @@ def categories_percent(s, categories):


 def punctuation_percent(s):
-    '''Returns punctuation persent.
+    '''Returns punctuation percent.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -33,6 +33,16 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
    eq_("Test reply", quotations.extract_from_plain(msg_body))


+def test_pattern_on_date_wrote_somebody():
+    eq_('Lorem', quotations.extract_from_plain(
+    """Lorem
+
+Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
+    
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
+
 def test_pattern_on_date_somebody_wrote_date_with_slashes():
    msg_body = """Test reply

@@ -98,22 +108,24 @@ bla-bla - bla"""
    eq_(reply, quotations.extract_from_plain(msg_body))


-def test_pattern_original_message():
-    msg_body = """Test reply
+def _check_pattern_original_message(original_message_indicator):
+    msg_body = u"""Test reply

-----Original Message-----
+-----{}-----

 Test"""
+    eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))

-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+def test_english_original_message():
+    _check_pattern_original_message('Original Message')
+    _check_pattern_original_message('Reply Message')

-    msg_body = """Test reply
+def test_german_original_message():
+    _check_pattern_original_message(u'Ursprüngliche Nachricht')
+    _check_pattern_original_message('Antwort Nachricht')

- -----Original Message-----
-
-Test"""
-
-    eq_("Test reply", quotations.extract_from_plain(msg_body))
+def test_danish_original_message():
+    _check_pattern_original_message('Oprindelig meddelelse')


 def test_reply_after_quotations():
@@ -199,6 +211,33 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
 > Hello"""
    eq_("Hi", quotations.extract_from_plain(msg_body))

+def test_with_indent():
+    msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
+
+------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
+
+Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. 
+    """
+    eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
+
+
+def test_short_quotation_with_newline():
+    msg_body = """Btw blah blah...
+
+On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
+
+Hi Mark,
+Blah blah? 
+Thanks,Christine 
+
+On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
+
+Lorem ipsum?
+Mark
+
+Sent from Acompli"""
+    eq_("Btw blah blah...", quotations.extract_from_plain(msg_body))
+

 def test_pattern_date_email_with_unicode():
    msg_body = """Replying ok
@@ -208,8 +247,8 @@ def test_pattern_date_email_with_unicode():
    eq_("Replying ok", quotations.extract_from_plain(msg_body))


-def test_pattern_from_block():
-    msg_body = """Allo! Follow up MIME!
+def test_english_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain("""Allo! Follow up MIME!

 From: somebody@example.com
 Sent: March-19-11 5:42 PM
@@ -217,8 +256,70 @@ To: Somebody
 Subject: The manager has commented on your Loop

 Blah-blah-blah
-"""
-    eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body))
+"""))
+
+def test_german_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
+    """Allo! Follow up MIME!
+
+Von: somebody@example.com
+Gesendet: Dienstag, 25. November 2014 14:59
+An: Somebody
+Betreff: The manager has commented on your Loop
+
+Blah-blah-blah
+"""))
+
+def test_french_multiline_from_block():
+    eq_('Lorem ipsum', quotations.extract_from_plain(
+    u"""Lorem ipsum
+
+De : Brendan xxx [mailto:brendan.xxx@xxx.com]
+Envoyé : vendredi 23 janvier 2015 16:39
+À : Camille XXX
+Objet : Follow Up
+
+Blah-blah-blah
+"""))
+
+def test_french_from_block():
+    eq_('Lorem ipsum', quotations.extract_from_plain(
+    u"""Lorem ipsum
+
+Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
+
+Bonjour!"""))
+
+def test_polish_from_block():
+    eq_('Lorem ipsum', quotations.extract_from_plain(
+    u"""Lorem ipsum
+
+W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
+napisał:
+
+Blah!
+"""))
+
+def test_danish_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
+    """Allo! Follow up MIME!
+
+Fra: somebody@example.com
+Sendt: 19. march 2011 12:10
+Til: Somebody
+Emne: The manager has commented on your Loop
+
+Blah-blah-blah
+"""))
+
+def test_dutch_from_block():
+    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
+    """Gluten-free culpa lo-fi et nesciunt nostrud. 
+
+Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
+    
+Small batch beard laboris tempor, non listicle hella Tumblr heirloom. 
+"""))


 def test_quotation_marker_false_positive():
Author	SHA1	Message	Date
Jeremy Schlatter	3a37d8b649	Merge pull request #41 from simonflore/master New splitter pattern for Dutch mail replies	2015-04-22 12:17:39 -07:00
Simon	f9f428f4c3	Revert "Change of behavior when msg_body has more than 1000 lines" This reverts commit `84a83e865e`.	2015-04-16 13:26:17 +02:00
Simon	84a83e865e	Change of behavior when msg_body has more than 1000 lines	2015-04-16 13:22:18 +02:00
Simon	b4c180b9ff	Extra spaces check in RE_ON_DATE_WROTE_SMB regex	2015-04-15 13:55:59 +02:00
Simon	072a440837	Test cases for new patterns	2015-04-15 13:55:17 +02:00
Simon	105d16644d	For patterns like this '---- On {date} {name} {mail} wrote ---- '	2015-04-14 18:52:45 +02:00
Simon	df3338192a	Another submission to a dutch variation	2015-04-14 18:49:26 +02:00
Simon	f0ed5d6c07	New splitter pattern for Dutch mail replies	2015-04-14 18:22:48 +02:00
Sergey Obukhov	790463821f	Merge pull request #31 from tsheasha/patch-1 Utilising the Constants	2015-03-02 14:48:41 -08:00
Sergey Obukhov	763d3b308e	Merge pull request #35 from futuresimple/more_formats Support some polish and french formats	2015-03-02 14:25:26 -08:00
szymonsobczak	3c9ef4653f	some more french formats	2015-02-24 12:18:54 +01:00
szymonsobczak	b16060261a	support some polish and french formats	2015-02-24 11:39:12 +01:00
Tarek Sheasha	13dc43e960	Utilising the Constants Checking for the length of a line to determine if it is possibly a signature or not could be done in a more generic way by determining the maximum size of the line via a constant. Hence advocating the spirit of modifying the code in only one place and propagating that change everywhere. This exact approach has already been used at:	2015-01-21 15:54:57 +01:00
Jeremy Schlatter	3768d7ba31	make a separate test function for each language	2014-12-30 14:41:20 -08:00
Jeremy Schlatter	613d1fc815	Add extra splitter expressions and tests for German and Danish. Also some refactoring to make it a bit easier to add more languages.	2014-12-23 15:44:04 -08:00
Sergey Obukhov	52505bba8a	Update README.rst Clarified that some signature extraction methods require initializing the lib first.	2014-09-14 09:03:10 -07:00
Sergey Obukhov	79cd4fcc52	Merge pull request #15 from willemdelbare/master added extra splitter expressions for Dutch, French, German	2014-09-14 08:38:39 -07:00
Willem Delbare	a4f156b174	added extra splitter expressions for Dutch, French, German	2014-09-13 15:33:08 +02:00
Sergey Obukhov	1789ccf3c8	Merge branch 'master' of github.com:mailgun/talon	2014-07-24 20:37:47 -07:00
Sergey Obukhov	7a42ab3b28	fix #4 add flanker to setup.py	2014-07-24 20:37:33 -07:00
Sergey Obukhov	12b0e88a01	Merge pull request #5 from pborreli/typos Fixed typos	2014-07-24 20:32:57 -07:00
Pascal Borreli	8b78da5977	Fixed typos	2014-07-25 02:40:37 +00:00