Merge pull request #161 from glaand/master

Fix: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
This commit is contained in:
Sergey Obukhov
2018-11-02 15:03:02 +03:00
committed by GitHub

View File

@@ -434,6 +434,9 @@ def _extract_from_html(msg_body):
Extract not quoted message from provided html message body
using tags and plain text algorithm.
First strip encoding-related declarations, such as the XML declaration
and DOCTYPE, to avoid conflicts with unicode decoding.
Cut out the 'blockquote', 'gmail_quote' tags.
Cut Microsoft quotations.
@@ -449,6 +452,9 @@ def _extract_from_html(msg_body):
return msg_body
msg_body = msg_body.replace(b'\r\n', b'\n')
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None: