First strip encoding-related HTML markup, such as the XML declaration and DOCTYPE, to avoid conflicts with unicode decoding

2017-12-19 15:15:10 +01:00
parent a7404afbcb
commit 53b24ffb3d
1 changed files with 6 additions and 0 deletions
@@ -430,6 +430,9 @@ def _extract_from_html(msg_body):
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

+    First strip encoding-related markup such as the XML declaration
+    and DOCTYPE to avoid conflicts with unicode decoding.
+
    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

@@ -445,6 +448,9 @@ def _extract_from_html(msg_body):
        return msg_body

    msg_body = msg_body.replace(b'\r\n', b'\n')
+
+    msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
+
    html_tree = html_document_fromstring(msg_body)

    if html_tree is None: