Merge pull request #177 from mailgun/obukhov-sergey-patch-1

Update Readme with how to retrain on your own data
2018-11-02 15:22:18 +03:00 · 2018-11-02 15:21:36 +03:00 · 2018-11-02 15:03:02 +03:00 · 2018-11-02 14:52:38 +03:00 · 2018-11-02 09:12:43 +03:00 · 2018-11-02 09:11:07 +03:00
6 changed files with 96 additions and 7 deletions
--- a/README.rst
+++ b/README.rst
@@ -129,6 +129,22 @@ start using it for talon.
 .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
 .. _forge: https://github.com/mailgun/forge
 Training on your dataset
 ------------------------
 talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do:
 .. code:: python
    from talon.signature.learning.dataset import build_extraction_dataset
    from talon.signature.learning import classifier as c 
    build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
    c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
 Note that for signature extraction you only need the folder of positive samples with annotated signature lines (the P folder).
 .. _forge: https://github.com/mailgun/forge
 Research
 --------
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.4.0',
+      version='1.4.5',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -94,6 +94,12 @@ def cut_microsoft_quote(html_message):
        #outlook 2007, 2010 (american)
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0in 0in 0in']|"
        #outlook 2013 (international)
        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
        #outlook 2013 (american)
        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
        "padding:3.0pt 0in 0in 0in']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -38,10 +38,14 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            'Op',
            # German
            'Am',
            # Portuguese
            'Em',
            # Norwegian
            u'På',
            # Swedish, Danish
            'Den',
            # Vietnamese
            u'Vào',
        )),
        # Date and sender separator
        u'|'.join((
@@ -62,8 +66,12 @@ RE_ON_DATE_SMB_WROTE = re.compile(
            'schreef','verzond','geschreven',
            # German
            'schrieb',
            # Portuguese
            'escreveu',
            # Norwegian, Swedish
            'skrev',
            # Vietnamese
            u'đã viết',
        ))
    ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
@@ -143,7 +151,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*
 RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
-        'wrote'
+        'wrote',
    ))), re.I)
 # Support polymail.io reply format
@@ -161,15 +169,15 @@ SPLITTER_PATTERNS = [
    RE_FROM_COLON_OR_DATE_COLON,
    # 02.04.2012 14:20 пользователь "bob@example.com" <
    # bob@xxx.mailgun.org> написал:
-    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
+    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S),
    # 2014-10-17 11:28 GMT+03:00 Bob <
    # bob@example.com>:
-    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
+    re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S),
    # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:'),
    # Sent from Samsung MobileName <address@example.com> wrote:
-    re.compile('Sent from Samsung .*@.*> wrote'),
+    re.compile('Sent from Samsung.* \S+@\S+> wrote'),
    RE_ANDROID_WROTE,
    RE_POLYMAIL
    ]
@@ -282,7 +290,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    # inlined reply
    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
    # both 't' entries should be found
-    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+    for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (
@@ -426,6 +434,9 @@ def _extract_from_html(msg_body):
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.
    First cut out encoding-related html tags such as xml and doctype
    to avoid conflicts with unicode decoding
    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.
@@ -441,6 +452,9 @@ def _extract_from_html(msg_body):
        return msg_body
    msg_body = msg_body.replace(b'\r\n', b'\n')
    msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
    html_tree = html_document_fromstring(msg_body)
    if html_tree is None:
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -32,7 +32,7 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
 def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
-    data = numpy.array(build_pattern(line, features(sender)))
+    data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1)
    return classifier.predict(data) > 0
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -119,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent:
    eq_("Test reply", quotations.extract_from_plain(msg_body))
 def test_appointment():
    msg_body = """Response
 10/19/2017 @ 9:30 am for physical therapy
 Bla
 1517 4th Avenue Ste 300
 London CA 19129, 555-421-6780
 John Doe, FCLS
 Mailgun Inc
 555-941-0697
 From: from@example.com [mailto:from@example.com]
 Sent: Wednesday, October 18, 2017 2:05 PM
 To: John Doer - SIU <jd@example.com>
 Subject: RE: Claim # 5551188-1
 Text"""
    expected = """Response
 10/19/2017 @ 9:30 am for physical therapy
 Bla
 1517 4th Avenue Ste 300
 London CA 19129, 555-421-6780
 John Doe, FCLS
 Mailgun Inc
 555-941-0697"""
    eq_(expected, quotations.extract_from_plain(msg_body))
 def test_line_starts_with_on():
    msg_body = """Blah-blah-blah
 On blah-blah-blah"""
@@ -401,6 +433,14 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g
 Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
 """))
 def test_vietnamese_from_block():
    eq_('Hello', quotations.extract_from_plain(
    u"""Hello
 Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết:
 > Xin chào
 """))
 def test_quotation_marker_false_positive():
    msg_body = """Visit us now for assistance...
@@ -770,3 +810,16 @@ def test_split_email():
    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)
 def test_feedback_below_left_unparsed():
    msg_body = """Please enter your feedback below. Thank you.
 ------------------------------------- Enter Feedback Below -------------------------------------
 The user experience was unparallelled. Please continue production. I'm sending payment to ensure
 that this line is intact."""
    parsed = quotations.extract_from_plain(msg_body)
    eq_(msg_body, parsed.decode('utf8'))
Author	SHA1	Message	Date
Sergey Obukhov	6a304215c3	Merge pull request #177 from mailgun/obukhov-sergey-patch-1 Update Readme with how to retrain on your own data	2018-11-02 15:22:18 +03:00
Sergey Obukhov	31714506bd	Update Readme with how to retrain on your own data	2018-11-02 15:21:36 +03:00
Sergey Obukhov	403d80cf3b	Merge pull request #161 from glaand/master Fix: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.	2018-11-02 15:03:02 +03:00
Sergey Obukhov	7cf20f2877	Merge branch 'master' into master	2018-11-02 14:52:38 +03:00
Sergey Obukhov	685abb1905	Merge pull request #171 from gabriellima95/Add-Portuguese-Language Add Portuguese language to quotations	2018-11-02 09:12:43 +03:00
Sergey Obukhov	41990727a3	Merge branch 'master' into Add-Portuguese-Language	2018-11-02 09:11:07 +03:00
Sergey Obukhov	b113d8ab33	Merge pull request #172 from ad-m/patch-1 Fix catastrophic backtracking in regexp	2018-11-02 09:09:49 +03:00
Adam Dobrawy	7bd0e9cc2f	Fix catastrophic backtracking in regexp Co-Author: @Nipsuli	2018-09-21 22:00:10 +02:00
gabriellima95	1e030a51d4	Add Portuguese language to quotations	2018-09-11 15:27:39 -03:00
André Glatzl	53b24ffb3d	Cut out first some encoding html tags such as xml and doctype for avoiding conflict with unicode decoding	2017-12-19 15:15:10 +01:00
Sergey Obukhov	a7404afbcb	Merge pull request #155 from mailgun/sergey/appointment fix appointments in text	2017-10-23 16:34:08 -07:00
Sergey Obukhov	0e6d5f993c	fix appointments in text	2017-10-23 16:32:42 -07:00
Sergey Obukhov	60637ff13a	Merge pull request #152 from mailgun/sergey/v1.4.4 bump version	2017-08-24 16:00:05 -07:00
Sergey Obukhov	df8259e3fe	bump version	2017-08-24 15:58:53 -07:00
Sergey Obukhov	aab3b1cc75	Merge pull request #150 from ezrapagel/fix_greedy_dash_regex android_wrote regex incorrectly matching	2017-08-24 15:52:29 -07:00
Sergey Obukhov	9492b39f2d	Merge branch 'master' into fix_greedy_dash_regex	2017-08-24 15:39:28 -07:00
Sergey Obukhov	b9ac866ea7	Merge pull request #151 from mailgun/sergey/reshape reshape data as suggested by sklearn	2017-08-24 12:04:58 -07:00
Sergey Obukhov	678517dd89	reshape data as suggested by sklearn	2017-08-24 12:03:47 -07:00
Ezra Pagel	221774c6f8	android_wrote regex was incorrectly iterating characters in 'wrote', resulting in greedy regex that matched many strings with dashes	2017-08-21 12:47:06 -05:00
Sergey Obukhov	a2aa345712	Merge pull request #148 from mailgun/sergey/v1.4.2 bump version after adding support for Vietnamese format	2017-07-10 11:44:46 -07:00
Sergey Obukhov	d998beaff3	bump version after adding support for Vietnamese format	2017-07-10 11:42:52 -07:00
Sergey Obukhov	a379bc4e7c	Merge pull request #147 from hnx116/master add support for Vietnamese reply format	2017-07-10 11:40:04 -07:00
Hung Nguyen	b8e1894f3b	add test case	2017-07-10 13:28:33 +07:00
Hung Nguyen	0b5a44090f	add support for Vietnamese reply format	2017-07-10 11:18:57 +07:00
Sergey Obukhov	b40835eca2	Merge pull request #145 from mailgun/sergey/outlook-2013-version-bump bump version after merging outlook 2013 support PR	2017-06-18 22:56:16 -07:00
Sergey Obukhov	b38562c7cc	bump version after merging outlook 2013 support PR	2017-06-18 22:55:15 -07:00
Sergey Obukhov	70e9fb415e	Merge pull request #139 from Savageman/patch-1 Added Outlook 2013 rules	2017-06-18 22:53:18 -07:00
Sergey Obukhov	64612099cd	Merge branch 'master' into patch-1	2017-06-18 22:51:46 -07:00
Esperat Julian	e16dcf629e	Added Outlook 2013 rules Only the border color changes (compared to Outlook 2007, 2010) from `#B5C4DF` to `#E1E1E1`.	2017-04-27 11:34:01 +02:00