Merge pull request #62 from tgwizard/better-support-for-scandinavian-languages

Add better support for Scandinavian languages
Merge pull request #65 from mailgun/sergey/cssselect
2015-10-14 21:48:10 -07:00 · 2015-10-14 20:34:02 -07:00 · 2015-10-14 20:31:26 -07:00 · 2015-10-14 12:38:06 -07:00 · 2015-09-21 21:42:01 +02:00 · 2015-09-21 21:33:57 +02:00
12 changed files with 150 additions and 24 deletions
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages


 setup(name='talon',
-      version='1.0.7',
+      version='1.0.9',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -22,6 +22,7 @@ setup(name='talon',
          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
+          'cssselect'
          ],
      tests_require=[
          "mock",
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -22,7 +22,7 @@ log = logging.getLogger(__name__)
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
        # Beginning of the line
        u'|'.join((
            # English
@@ -34,7 +34,11 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Dutch
            'Op',
            # German
-            'Am'
+            'Am',
+            # Norwegian
+            u'På',
+            # Swedish, Danish
+            'Den',
        )),
        # Date and sender separator
        u'|'.join((
@@ -54,12 +58,14 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            # Dutch
            'schreef','verzond','geschreven',
            # German
-            'schrieb'
+            'schrieb',
+            # Norwegian, Swedish
+            'skrev',
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
        # Beginning of the line
        u'|'.join((
        	'Op',
@@ -125,9 +131,9 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
 RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
    u'|'.join((
        # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra',
+        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
        # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé'
+        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
    ))), re.I)

 SPLITTER_PATTERNS = [
@@ -315,7 +321,7 @@ def extract_from_plain(msg_body):
    return msg_body


-def extract_from_html(msg_body):
+def extract_from_html(s):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.
@@ -332,8 +338,12 @@ def extract_from_html(msg_body):
    then deleting necessary tags.
    """

-    if msg_body.strip() == '':
-        return msg_body
+    if s.strip() == '':
+        return s
+
+    # replace CRLF with LF temporarily, otherwise CR will be converted to '&#13;'
+    # when doing deepcopy on html tree
+    msg_body, replaced = _CRLF_to_LF(s)

    html_tree = html.document_fromstring(
        msg_body,
@@ -364,15 +374,12 @@ def extract_from_html(msg_body):
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')
-
-    delimiter = get_delimiter(plain_text)
-
-    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
+    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return s

    # Collect checkpoints on each line
    line_checkpoints = [
@@ -397,9 +404,9 @@ def extract_from_html(msg_body):
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
-            return html.tostring(html_tree_copy)
+            return _restore_CRLF(html.tostring(html_tree_copy), replaced)
        else:
-            return msg_body
+            return s

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
@@ -435,3 +442,37 @@ def register_xpath_extensions():
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
+
+
+def _restore_CRLF(s, replaced=True):
+    """Restore CRLF if previously CRLF was replaced with LF
+
+    >>> _restore_CRLF('a\nb')
+    'a\r\nb'
+    >>> _restore_CRLF('a\nb', replaced=False)
+    'a\nb'
+    """
+    if replaced:
+        return s.replace('\n', '\r\n')
+    return s
+
+
+def _CRLF_to_LF(s):
+    """Replace CRLF with LF
+
+    >>> s, changed = _CRLF_to_LF('a\r\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    True
+
+    >>> s, changed = _CRLF_to_LF('a\nb')
+    >>> s
+    'a\nb'
+    >>> changed
+    False
+    """
+    delimiter = get_delimiter(s)
+    if delimiter == '\r\n':
+        return s.replace(delimiter, '\n'), True
+    return s, False
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/data/classifier_02.npy
+++ b/talon/signature/data/classifier_02.npy
--- a/talon/signature/data/classifier_03.npy
+++ b/talon/signature/data/classifier_03.npy
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES

 rc = re.compile

-RE_EMAIL = rc('@')
+RE_EMAIL = rc('\S@\S')
 RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
 RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

@@ -120,7 +120,7 @@ def contains_sender_names(sender):
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
-    return lambda s: False
+    return lambda s: 0


 def extract_names(sender):
--- a/tests/fixtures/html_replies/hotmail.html
+++ b/tests/fixtures/html_replies/hotmail.html
@@ -1,3 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
 <html>
 <head>
 <style><!--
--- a/tests/fixtures/standard_replies/apple_mail_2.eml
+++ b/tests/fixtures/standard_replies/apple_mail_2.eml
@@ -0,0 +1,19 @@
+Content-Type: text/plain;
+	charset=us-ascii
+Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
+Subject: Re: Hello there
+X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
+From: Adam Renberg <adam@tictail.com>
+In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+Date: Sat, 22 Aug 2015 19:22:20 +0200
+Content-Transfer-Encoding: 7bit
+X-Smtp-Server: smtp.gmail.com:adam@tictail.com
+Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
+References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
+To: Adam Renberg <tgwizard@gmail.com>
+
+Hello
+> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
+>
+> Hi there!
+
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():

 </blockquote>"""

-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+    eq_("<html><body><p>Reply\n</p></body></html>",
+        quotations.extract_from_html(msg_body))


 def test_quotation_splitter_outside_blockquote():
@@ -264,7 +264,7 @@ RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
 def extract_reply_and_check(filename):
    f = open(filename)

-    msg_body = f.read().decode("utf-8")
+    msg_body = f.read()
    reply = quotations.extract_from_html(msg_body)

    h = html2text.HTML2Text()
@@ -310,3 +310,25 @@ def test_windows_mail_reply():

 def test_yandex_ru_reply():
    extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
+
+
+def test_CRLF():
+    """CR is not converted to '&#13;'
+    """
+    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))
+
+    msg_body = """Reply
+<blockquote>
+
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+    msg_body = msg_body.replace('\n', '\r\n')
+    eq_("<html><body><p>Reply\r\n</p></body></html>",
+        quotations.extract_from_html(msg_body))
--- a/tests/quotations_test.py
+++ b/tests/quotations_test.py
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():

 def test_empty_body():
    eq_('', quotations.extract_from_plain(''))
+
+
+def test__CRLF_to_LF():
+    eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
+    eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
+
+
+def test__restore_CRLF():
+    eq_('\n', quotations._restore_CRLF('\n', replaced=False))
+    eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))    
+    # default
+    eq_('\r\n', quotations._restore_CRLF('\n'))
--- a/tests/signature/learning/featurespace_test.py
+++ b/tests/signature/learning/featurespace_test.py
@@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs


 def test_apply_features():
-    s = '''John Doe
+    s = '''This is John Doe
+
+Tuesday @3pm suits. I'll chat to you then.

 VP Research and Development, Xxxx Xxxx Xxxxx

@@ -19,11 +21,12 @@ john@example.com'''
    # note that we don't consider the first line because signatures don't
    # usually take all the text, empty lines are not considered
    eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
                 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

-    with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
+    with patch.object(fs, 'SIGNATURE_MAX_LINES', 5):
        features = fs.features(sender)
        new_result = fs.apply_features(s, features)
        # result remains the same because we don't consider empty lines
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -311,6 +311,33 @@ Emne: The manager has commented on your Loop
 Blah-blah-blah
 """))

+def test_swedish_from_block():
+    eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
+    u"""Allo! Follow up MIME!
+Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
+Skickat: den 26 augusti 2015 14:45
+Till: Isacson Leiff
+Ämne: RE: Week 36
+
+Blah-blah-blah
+"""))
+
+def test_swedish_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    """Lorem
+Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
+def test_norwegian_from_line():
+    eq_('Lorem', quotations.extract_from_plain(
+    u"""Lorem
+På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
+
+Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
+"""))
+
 def test_dutch_from_block():
    eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
    """Gluten-free culpa lo-fi et nesciunt nostrud.
Author	SHA1	Message	Date
Sergey Obukhov	2c416ecc0e	Merge pull request #62 from tgwizard/better-support-for-scandinavian-languages Add better support for Scandinavian languages	2015-10-14 21:48:10 -07:00
Sergey Obukhov	3ab33c557b	Merge pull request #65 from mailgun/sergey/cssselect add cssselect to dependencies	2015-10-14 20:34:02 -07:00
Sergey Obukhov	8db05f4950	add cssselect to dependencies	2015-10-14 20:31:26 -07:00
Sergey Obukhov	3d5bc82a03	Merge pull request #61 from tgwizard/fix-for-apple-mail Add fix for Apple Mail email format	2015-10-14 12:38:06 -07:00
Adam Renberg	14e3a0d80b	Add better support for Scandinavian languages This is a port of https://github.com/tictail/claw/pull/6 by @simonflore.	2015-09-21 21:42:01 +02:00
Adam Renberg	fcd9e2716a	Add fix for Apple Mail email format Where they have an initial > on the "date line".	2015-09-21 21:33:57 +02:00
Sergey Obukhov	d62d633215	bump up version	2015-09-21 09:55:51 -07:00
Sergey Obukhov	3b0c9273c1	Merge pull request #60 from mailgun/sergey/26 fixes mailgun/talon#26	2015-09-21 09:54:35 -07:00
Sergey Obukhov	e4c1c11845	remove print	2015-09-21 09:52:47 -07:00
Sergey Obukhov	ae508fe0e5	fixes mailgun/talon#26	2015-09-21 09:51:26 -07:00
Sergey Obukhov	2cb9b5399c	bump up version	2015-09-18 05:23:29 -07:00
Sergey Obukhov	134c47f515	Merge pull request #59 from mailgun/sergey/43 fixes mailgun/talon#43	2015-09-18 05:20:51 -07:00
Sergey Obukhov	d328c9d128	fixes mailgun/talon#43	2015-09-18 05:19:59 -07:00
Sergey Obukhov	77b62b0fef	Merge pull request #58 from mailgun/sergey/52 fixes mailgun/talon#52	2015-09-18 04:48:50 -07:00
Sergey Obukhov	ad09b18f3f	fixes mailgun/talon#52	2015-09-18 04:47:23 -07:00