First strip encoding-related HTML declarations (the XML declaration and DOCTYPE) to avoid conflicts with unicode decoding

This commit is contained in:
André Glatzl
2017-12-19 15:15:10 +01:00
parent a7404afbcb
commit 53b24ffb3d

View File

@@ -430,6 +430,9 @@ def _extract_from_html(msg_body):
Extract not quoted message from provided html message body
using tags and plain text algorithm.
First cut out encoding-related declarations, such as the XML declaration
and DOCTYPE, to avoid conflicts with unicode decoding.
Cut out the 'blockquote', 'gmail_quote' tags.
Cut Microsoft quotations.
@@ -445,6 +448,9 @@ def _extract_from_html(msg_body):
return msg_body
msg_body = msg_body.replace(b'\r\n', b'\n')
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None: