From 53b24ffb3db43e4eb91836f055ca27c88bbf3f7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Glatzl?= <andreglatzl@gmail.com>
Date: Tue, 19 Dec 2017 15:15:10 +0100
Subject: [PATCH 1/2] Cut out encoding-related tags such as the XML declaration
 and DOCTYPE first to avoid conflicts with unicode decoding

---
 talon/quotations.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index 996f79d..b43e99a 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -430,6 +430,9 @@ def _extract_from_html(msg_body):
     Extract not quoted message from provided html message body
     using tags and plain text algorithm.
 
+    First cut out encoding-related tags such as the XML declaration and
+    DOCTYPE to avoid conflicts with unicode decoding.
+
     Cut out the 'blockquote', 'gmail_quote' tags.
     Cut Microsoft quotations.
 
@@ -445,6 +448,9 @@ def _extract_from_html(msg_body):
         return msg_body
 
     msg_body = msg_body.replace(b'\r\n', b'\n')
+
+    msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
+
     html_tree = html_document_fromstring(msg_body)
 
     if html_tree is None:

From 31714506bdadde704a74ecf4534daafbd03efc69 Mon Sep 17 00:00:00 2001
From: Sergey Obukhov <sergey.obykhov@mailgunhq.com>
Date: Fri, 2 Nov 2018 15:21:36 +0300
Subject: [PATCH 2/2] Update Readme with how to retrain on your own data

---
 README.rst | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.rst b/README.rst
index 1f8d7e0..6ba6d73 100644
--- a/README.rst
+++ b/README.rst
@@ -129,6 +129,22 @@ start using it for talon.
 .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
 .. _forge: https://github.com/mailgun/forge
 
+Training on your dataset
+------------------------
+
+talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do:
+
+.. code:: python
+
+    from talon.signature.learning.dataset import build_extraction_dataset
+    from talon.signature.learning import classifier as c 
+    
+    build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
+    c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
+
+Note that signature extraction needs only the folder of positive samples with annotated signature lines (the P folder).
+
+.. _forge: https://github.com/mailgun/forge
 
 Research
 --------