Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6a304215c3 | ||
|
|
31714506bd | ||
|
|
403d80cf3b | ||
|
|
7cf20f2877 | ||
|
|
685abb1905 | ||
|
|
41990727a3 | ||
|
|
b113d8ab33 | ||
|
|
7bd0e9cc2f | ||
|
|
1e030a51d4 | ||
|
|
53b24ffb3d | ||
|
|
a7404afbcb | ||
|
|
0e6d5f993c |
16
README.rst
16
README.rst
@@ -129,6 +129,22 @@ start using it for talon.
|
|||||||
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
||||||
.. _forge: https://github.com/mailgun/forge
|
.. _forge: https://github.com/mailgun/forge
|
||||||
|
|
||||||
|
Training on your dataset
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them the same way the `forge`_ project does, then run:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from talon.signature.learning.dataset import build_extraction_dataset
|
||||||
|
from talon.signature.learning import classifier as c
|
||||||
|
|
||||||
|
build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
|
||||||
|
c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
|
||||||
|
|
||||||
|
Note that signature extraction only needs the folder of positive samples with annotated signature lines (the P folder).
|
||||||
|
|
||||||
|
.. _forge: https://github.com/mailgun/forge
|
||||||
|
|
||||||
Research
|
Research
|
||||||
--------
|
--------
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.4.4',
|
version='1.4.5',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -38,6 +38,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
'Op',
|
'Op',
|
||||||
# German
|
# German
|
||||||
'Am',
|
'Am',
|
||||||
|
# Portuguese
|
||||||
|
'Em',
|
||||||
# Norwegian
|
# Norwegian
|
||||||
u'På',
|
u'På',
|
||||||
# Swedish, Danish
|
# Swedish, Danish
|
||||||
@@ -64,6 +66,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
|||||||
'schreef','verzond','geschreven',
|
'schreef','verzond','geschreven',
|
||||||
# German
|
# German
|
||||||
'schrieb',
|
'schrieb',
|
||||||
|
# Portuguese
|
||||||
|
'escreveu',
|
||||||
# Norwegian, Swedish
|
# Norwegian, Swedish
|
||||||
'skrev',
|
'skrev',
|
||||||
# Vietnamese
|
# Vietnamese
|
||||||
@@ -165,15 +169,15 @@ SPLITTER_PATTERNS = [
|
|||||||
RE_FROM_COLON_OR_DATE_COLON,
|
RE_FROM_COLON_OR_DATE_COLON,
|
||||||
# 02.04.2012 14:20 пользователь "bob@example.com" <
|
# 02.04.2012 14:20 пользователь "bob@example.com" <
|
||||||
# bob@xxx.mailgun.org> написал:
|
# bob@xxx.mailgun.org> написал:
|
||||||
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
|
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S),
|
||||||
# 2014-10-17 11:28 GMT+03:00 Bob <
|
# 2014-10-17 11:28 GMT+03:00 Bob <
|
||||||
# bob@example.com>:
|
# bob@example.com>:
|
||||||
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
|
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S),
|
||||||
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
|
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
|
||||||
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
|
||||||
'( \S+){3,6}@\S+:'),
|
'( \S+){3,6}@\S+:'),
|
||||||
# Sent from Samsung MobileName <address@example.com> wrote:
|
# Sent from Samsung MobileName <address@example.com> wrote:
|
||||||
re.compile('Sent from Samsung .*@.*> wrote'),
|
re.compile('Sent from Samsung.* \S+@\S+> wrote'),
|
||||||
RE_ANDROID_WROTE,
|
RE_ANDROID_WROTE,
|
||||||
RE_POLYMAIL
|
RE_POLYMAIL
|
||||||
]
|
]
|
||||||
@@ -286,7 +290,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
|
|||||||
# inlined reply
|
# inlined reply
|
||||||
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
|
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
|
||||||
# both 't' entries should be found
|
# both 't' entries should be found
|
||||||
for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
|
for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers):
|
||||||
# long links could break sequence of quotation lines but they shouldn't
|
# long links could break sequence of quotation lines but they shouldn't
|
||||||
# be considered an inline reply
|
# be considered an inline reply
|
||||||
links = (
|
links = (
|
||||||
@@ -430,6 +434,9 @@ def _extract_from_html(msg_body):
|
|||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided html message body
|
||||||
using tags and plain text algorithm.
|
using tags and plain text algorithm.
|
||||||
|
|
||||||
|
First cut out encoding-related html tags such as the xml declaration and doctype
|
||||||
|
to avoid conflicts with unicode decoding.
|
||||||
|
|
||||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||||
Cut Microsoft quotations.
|
Cut Microsoft quotations.
|
||||||
|
|
||||||
@@ -445,6 +452,9 @@ def _extract_from_html(msg_body):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||||
|
|
||||||
|
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
html_tree = html_document_fromstring(msg_body)
|
||||||
|
|
||||||
if html_tree is None:
|
if html_tree is None:
|
||||||
|
|||||||
@@ -119,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent:
|
|||||||
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
eq_("Test reply", quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
|
def test_appointment():
|
||||||
|
msg_body = """Response
|
||||||
|
|
||||||
|
10/19/2017 @ 9:30 am for physical therapy
|
||||||
|
Bla
|
||||||
|
1517 4th Avenue Ste 300
|
||||||
|
London CA 19129, 555-421-6780
|
||||||
|
|
||||||
|
John Doe, FCLS
|
||||||
|
Mailgun Inc
|
||||||
|
555-941-0697
|
||||||
|
|
||||||
|
From: from@example.com [mailto:from@example.com]
|
||||||
|
Sent: Wednesday, October 18, 2017 2:05 PM
|
||||||
|
To: John Doer - SIU <jd@example.com>
|
||||||
|
Subject: RE: Claim # 5551188-1
|
||||||
|
|
||||||
|
Text"""
|
||||||
|
|
||||||
|
expected = """Response
|
||||||
|
|
||||||
|
10/19/2017 @ 9:30 am for physical therapy
|
||||||
|
Bla
|
||||||
|
1517 4th Avenue Ste 300
|
||||||
|
London CA 19129, 555-421-6780
|
||||||
|
|
||||||
|
John Doe, FCLS
|
||||||
|
Mailgun Inc
|
||||||
|
555-941-0697"""
|
||||||
|
eq_(expected, quotations.extract_from_plain(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_line_starts_with_on():
|
def test_line_starts_with_on():
|
||||||
msg_body = """Blah-blah-blah
|
msg_body = """Blah-blah-blah
|
||||||
On blah-blah-blah"""
|
On blah-blah-blah"""
|
||||||
|
|||||||
Reference in New Issue
Block a user