First cut out encoding-related HTML declarations, such as the XML declaration and DOCTYPE, to avoid conflicts with Unicode decoding
This commit is contained in:
@@ -430,6 +430,9 @@ def _extract_from_html(msg_body):
|
|||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided html message body
|
||||||
using tags and plain text algorithm.
|
using tags and plain text algorithm.
|
||||||
|
|
||||||
|
Cut out first some encoding html tags such as xml and doctype
|
||||||
|
for avoiding conflict with unicode decoding
|
||||||
|
|
||||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||||
Cut Microsoft quotations.
|
Cut Microsoft quotations.
|
||||||
|
|
||||||
@@ -445,6 +448,9 @@ def _extract_from_html(msg_body):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||||
|
|
||||||
|
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
html_tree = html_document_fromstring(msg_body)
|
||||||
|
|
||||||
if html_tree is None:
|
if html_tree is None:
|
||||||
|
|||||||
Reference in New Issue
Block a user