From dcc0d1de20146787586da86732333fa14d9cef08 Mon Sep 17 00:00:00 2001
From: Umair Khan <umair.waheed@gmail.com>
Date: Wed, 13 Jul 2016 10:32:27 +0500
Subject: [PATCH] Convert msg_body to bytes in extract_from_html

---
 talon/quotations.py | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 9d2a8d4..ae56cdd 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -15,6 +15,7 @@ from lxml import html, etree
 from talon.utils import get_delimiter, html_to_text
 from talon import html_quotations
 from six.moves import range
+import six
 
 
 log = logging.getLogger(__name__)
@@ -345,11 +346,41 @@ def extract_from_html(msg_body):
     then extracting quotations from text,
     then checking deleted checkpoints,
     then deleting necessary tags.
+
+    Returns a unicode string.
     """
-    if msg_body.strip() == '':
+    if isinstance(msg_body, six.text_type):
+        msg_body = msg_body.encode('utf8')
+    elif not isinstance(msg_body, bytes):
+        msg_body = msg_body.encode('ascii')
+
+    result = _extract_from_html(msg_body)
+    if isinstance(result, bytes):
+        result = result.decode('utf8')
+
+    return result
+
+
+def _extract_from_html(msg_body):
+    """
+    Extract not quoted message from provided html message body
+    using tags and plain text algorithm.
+
+    Cut out the 'blockquote', 'gmail_quote' tags.
+    Cut Microsoft quotations.
+
+    Then use plain text algorithm to cut out splitter or
+    leftover quotation.
+    This works by adding checkpoint text to all html tags,
+    then converting html to text,
+    then extracting quotations from text,
+    then checking deleted checkpoints,
+    then deleting necessary tags.
+    """
+    if msg_body.strip() == b'':
         return msg_body
 
-    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
+    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
     html_tree = html.document_fromstring(
         msg_body,
         parser=html.HTMLParser(encoding="utf-8")