Compare commits
11 Commits
sergey/app
...
v1.4.6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6a304215c3 | ||
|
|
31714506bd | ||
|
|
403d80cf3b | ||
|
|
7cf20f2877 | ||
|
|
685abb1905 | ||
|
|
41990727a3 | ||
|
|
b113d8ab33 | ||
|
|
7bd0e9cc2f | ||
|
|
1e030a51d4 | ||
|
|
53b24ffb3d | ||
|
|
a7404afbcb |
16
README.rst
16
README.rst
@@ -129,6 +129,22 @@ start using it for talon.
|
||||
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
||||
.. _forge: https://github.com/mailgun/forge
|
||||
|
||||
Training on your dataset
|
||||
------------------------
|
||||
|
||||
talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from talon.signature.learning.dataset import build_extraction_dataset
|
||||
from talon.signature.learning import classifier as c
|
||||
|
||||
build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
|
||||
c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
|
||||
|
||||
Note that for signature extraction you only need the folder of positive samples with annotated signature lines (the P folder).
|
||||
|
||||
.. _forge: https://github.com/mailgun/forge
|
||||
|
||||
Research
|
||||
--------
|
||||
|
||||
@@ -38,6 +38,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
'Op',
|
||||
# German
|
||||
'Am',
|
||||
# Portuguese
|
||||
'Em',
|
||||
# Norwegian
|
||||
u'På',
|
||||
# Swedish, Danish
|
||||
@@ -64,6 +66,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
|
||||
'schreef','verzond','geschreven',
|
||||
# German
|
||||
'schrieb',
|
||||
# Portuguese
|
||||
'escreveu',
|
||||
# Norwegian, Swedish
|
||||
'skrev',
|
||||
# Vietnamese
|
||||
@@ -286,7 +290,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
|
||||
# inlined reply
|
||||
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
|
||||
# both 't' entries should be found
|
||||
for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
|
||||
for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers):
|
||||
# long links could break sequence of quotation lines but they shouldn't
|
||||
# be considered an inline reply
|
||||
links = (
|
||||
@@ -430,6 +434,9 @@ def _extract_from_html(msg_body):
|
||||
Extract not quoted message from provided html message body
|
||||
using tags and plain text algorithm.
|
||||
|
||||
First cut out encoding-related html tags such as xml and doctype
|
||||
to avoid conflicts with unicode decoding
|
||||
|
||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||
Cut Microsoft quotations.
|
||||
|
||||
@@ -445,6 +452,9 @@ def _extract_from_html(msg_body):
|
||||
return msg_body
|
||||
|
||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||
|
||||
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||
|
||||
html_tree = html_document_fromstring(msg_body)
|
||||
|
||||
if html_tree is None:
|
||||
|
||||
Reference in New Issue
Block a user