Convert msg_body to bytes in extract_from_html

2016-07-13 10:32:27 +05:00
parent 7bdf4d622b
commit dcc0d1de20
1 changed file with 33 additions and 2 deletions
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -15,6 +15,7 @@ from lxml import html, etree
 from talon.utils import get_delimiter, html_to_text
 from talon import html_quotations
 from six.moves import range
+import six


 log = logging.getLogger(__name__)
@@ -345,11 +346,41 @@ def extract_from_html(msg_body):
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
+
+    Returns a unicode string.
    """
-    if msg_body.strip() == '':
+    if isinstance(msg_body, six.text_type):
+        msg_body = msg_body.encode('utf8')
+    elif not isinstance(msg_body, bytes):
+        msg_body = msg_body.encode('ascii')
+
+    result = _extract_from_html(msg_body)
+    if isinstance(result, bytes):
+        result = result.decode('utf8')
+
+    return result
+
+
+def _extract_from_html(msg_body):
+    """
+    Extract not quoted message from provided html message body
+    using tags and plain text algorithm.
+
+    Cut out the 'blockquote', 'gmail_quote' tags.
+    Cut Microsoft quotations.
+
+    Then use plain text algorithm to cut out splitter or
+    leftover quotation.
+    This works by adding checkpoint text to all html tags,
+    then converting html to text,
+    then extracting quotations from text,
+    then checking deleted checkpoints,
+    then deleting necessary tags.
+    """
+    if msg_body.strip() == b'':
        return msg_body

-    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
+    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")