diff --git a/talon/utils.py b/talon/utils.py index 845b149..0e50c8f 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -121,8 +121,11 @@ def html_to_text(string): 1. the string is expected to contain UTF-8 encoded HTML! 2. returns utf-8 encoded str (not unicode) """ + if isinstance(string, six.text_type): + string = string.encode('utf8') + s = _prepend_utf8_declaration(string) - s = s.replace("\n", "") + s = s.replace(b"\n", b"") tree = html.fromstring(s) @@ -157,7 +160,7 @@ def html_to_text(string): def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ - return s.lower().find('html; charset=', 0, 4096) != -1 + return s.lower().find(b'html; charset=', 0, 4096) != -1 def _prepend_utf8_declaration(s): @@ -178,8 +181,8 @@ def _encode_utf8(s): return s.encode('utf-8') if isinstance(s, six.text_type) else s -_UTF8_DECLARATION = ('') +_UTF8_DECLARATION = (b'') _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 5794545..c155cbb 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -306,6 +306,7 @@ def extract_reply_and_check(filename): msg_body = f.read() reply = quotations.extract_from_html(msg_body) plain_reply = u.html_to_text(reply) + plain_reply = plain_reply.decode('utf8') eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), RE_WHITESPACE.sub('', plain_reply))