Merge branch 'master' into patch-2
This commit is contained in:
16
README.rst
16
README.rst
@@ -129,6 +129,22 @@ start using it for talon.
|
|||||||
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
||||||
.. _forge: https://github.com/mailgun/forge
|
.. _forge: https://github.com/mailgun/forge
|
||||||
|
|
||||||
|
Training on your dataset
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from talon.signature.learning.dataset import build_extraction_dataset
|
||||||
|
from talon.signature.learning import classifier as c
|
||||||
|
|
||||||
|
build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
|
||||||
|
c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
|
||||||
|
|
||||||
|
Note that signature extraction requires only the folder of positive samples with annotated signature lines (the P folder).
|
||||||
|
|
||||||
|
.. _forge: https://github.com/mailgun/forge
|
||||||
|
|
||||||
Research
|
Research
|
||||||
--------
|
--------
|
||||||
|
|||||||
@@ -434,6 +434,9 @@ def _extract_from_html(msg_body):
|
|||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided html message body
|
||||||
using tags and plain text algorithm.
|
using tags and plain text algorithm.
|
||||||
|
|
||||||
|
First cut out encoding-related html tags such as xml and doctype
|
||||||
|
to avoid conflicts with unicode decoding
|
||||||
|
|
||||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||||
Cut Microsoft quotations.
|
Cut Microsoft quotations.
|
||||||
|
|
||||||
@@ -449,6 +452,9 @@ def _extract_from_html(msg_body):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||||
|
|
||||||
|
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
html_tree = html_document_fromstring(msg_body)
|
||||||
|
|
||||||
if html_tree is None:
|
if html_tree is None:
|
||||||
|
|||||||
Reference in New Issue
Block a user