From dcc0d1de20146787586da86732333fa14d9cef08 Mon Sep 17 00:00:00 2001 From: Umair Khan Date: Wed, 13 Jul 2016 10:32:27 +0500 Subject: [PATCH] Convert msg_body to bytes in extract_from_html --- talon/quotations.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 9d2a8d4..ae56cdd 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -15,6 +15,7 @@ from lxml import html, etree from talon.utils import get_delimiter, html_to_text from talon import html_quotations from six.moves import range +import six log = logging.getLogger(__name__) @@ -345,11 +346,41 @@ def extract_from_html(msg_body): then extracting quotations from text, then checking deleted checkpoints, then deleting necessary tags. + + Returns a unicode string. """ - if msg_body.strip() == '': + if isinstance(msg_body, six.text_type): + msg_body = msg_body.encode('utf8') + elif not isinstance(msg_body, bytes): + msg_body = msg_body.encode('ascii') + + result = _extract_from_html(msg_body) + if isinstance(result, bytes): + result = result.decode('utf8') + + return result + + +def _extract_from_html(msg_body): + """ + Extract not quoted message from provided html message body + using tags and plain text algorithm. + + Cut out the 'blockquote', 'gmail_quote' tags. + Cut Microsoft quotations. + + Then use plain text algorithm to cut out splitter or + leftover quotation. + This works by adding checkpoint text to all html tags, + then converting html to text, + then extracting quotations from text, + then checking deleted checkpoints, + then deleting necessary tags. + """ + if msg_body.strip() == b'': return msg_body - msg_body = msg_body.replace('\r\n', '').replace('\n', '') + msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'') html_tree = html.document_fromstring( msg_body, parser=html.HTMLParser(encoding="utf-8")