diff --git a/README.rst b/README.rst index 1f8d7e0..6ba6d73 100644 --- a/README.rst +++ b/README.rst @@ -129,6 +129,22 @@ start using it for talon. .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set .. _forge: https://github.com/mailgun/forge +Training on your dataset +------------------------ + +talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then run: + +.. code:: python + + from talon.signature.learning.dataset import build_extraction_dataset + from talon.signature.learning import classifier as c + + build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") + c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") + +Note that for signature extraction you only need the folder of positive samples with annotated signature lines (the P folder). + +.. _forge: https://github.com/mailgun/forge Research -------- diff --git a/talon/quotations.py b/talon/quotations.py index 0fecf8a..8b368e5 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -434,6 +434,9 @@ def _extract_from_html(msg_body): Extract not quoted message from provided html message body using tags and plain text algorithm. + First cut out encoding-related html tags such as xml and doctype + to avoid conflicts with unicode decoding. + Cut out the 'blockquote', 'gmail_quote' tags. Cut Microsoft quotations. @@ -449,6 +452,9 @@ def _extract_from_html(msg_body): return msg_body msg_body = msg_body.replace(b'\r\n', b'\n') + + msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) + html_tree = html_document_fromstring(msg_body) if html_tree is None: