Expose extract_from_html_tree
This commit is contained in:
@@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
|||||||
|
|
||||||
SPLITTER_MAX_LINES = 6
|
SPLITTER_MAX_LINES = 6
|
||||||
MAX_LINES_COUNT = 1000
|
MAX_LINES_COUNT = 1000
|
||||||
# an extensive research shows that exceeding this limit
|
|
||||||
# leads to excessive processing time
|
|
||||||
MAX_HTML_LEN = 2794202
|
|
||||||
|
|
||||||
QUOT_PATTERN = re.compile('^>+ ?')
|
QUOT_PATTERN = re.compile('^>+ ?')
|
||||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||||
@@ -421,25 +418,31 @@ def extract_from_html(msg_body):
|
|||||||
|
|
||||||
Returns a unicode string.
|
Returns a unicode string.
|
||||||
"""
|
"""
|
||||||
|
msg_body_bytes = msg_body
|
||||||
if isinstance(msg_body, six.text_type):
|
if isinstance(msg_body, six.text_type):
|
||||||
msg_body = msg_body.encode('utf8')
|
msg_body_bytes = msg_body.encode('utf8')
|
||||||
elif not isinstance(msg_body, bytes):
|
|
||||||
msg_body = msg_body.encode('ascii')
|
|
||||||
|
|
||||||
result = _extract_from_html(msg_body)
|
if msg_body_bytes.strip() == b'':
|
||||||
if isinstance(result, bytes):
|
return msg_body
|
||||||
result = result.decode('utf8')
|
|
||||||
|
msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
|
||||||
|
# Cut out xml and doctype tags to avoid conflict with unicode decoding.
|
||||||
|
msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
|
||||||
|
html_tree = html_document_fromstring(msg_body_bytes)
|
||||||
|
if html_tree is None:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
result = extract_from_html_tree(html_tree)
|
||||||
|
if not result:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _extract_from_html(msg_body):
|
def extract_from_html_tree(html_tree):
|
||||||
"""
|
"""
|
||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided parsed html tree using tags and
|
||||||
using tags and plain text algorithm.
|
plain text algorithm.
|
||||||
|
|
||||||
Cut out first some encoding html tags such as xml and doctype
|
|
||||||
for avoiding conflict with unicode decoding
|
|
||||||
|
|
||||||
Cut out the 'blockquote', 'gmail_quote' tags.
|
Cut out the 'blockquote', 'gmail_quote' tags.
|
||||||
Cut Microsoft quotations.
|
Cut Microsoft quotations.
|
||||||
@@ -452,18 +455,6 @@ def _extract_from_html(msg_body):
|
|||||||
then checking deleted checkpoints,
|
then checking deleted checkpoints,
|
||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
if msg_body.strip() == b'':
|
|
||||||
return msg_body
|
|
||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
|
||||||
|
|
||||||
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
|
||||||
|
|
||||||
if html_tree is None:
|
|
||||||
return msg_body
|
|
||||||
|
|
||||||
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
|
||||||
html_quotations.cut_zimbra_quote(html_tree) or
|
html_quotations.cut_zimbra_quote(html_tree) or
|
||||||
html_quotations.cut_blockquote(html_tree) or
|
html_quotations.cut_blockquote(html_tree) or
|
||||||
@@ -481,7 +472,7 @@ def _extract_from_html(msg_body):
|
|||||||
|
|
||||||
# Don't process too long messages
|
# Don't process too long messages
|
||||||
if len(lines) > MAX_LINES_COUNT:
|
if len(lines) > MAX_LINES_COUNT:
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
# Collect checkpoints on each line
|
# Collect checkpoints on each line
|
||||||
line_checkpoints = [
|
line_checkpoints = [
|
||||||
@@ -500,7 +491,7 @@ def _extract_from_html(msg_body):
|
|||||||
lines_were_deleted, first_deleted, last_deleted = return_flags
|
lines_were_deleted, first_deleted, last_deleted = return_flags
|
||||||
|
|
||||||
if not lines_were_deleted and not cut_quotations:
|
if not lines_were_deleted and not cut_quotations:
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
if lines_were_deleted:
|
if lines_were_deleted:
|
||||||
#collect checkpoints from deleted lines
|
#collect checkpoints from deleted lines
|
||||||
@@ -514,7 +505,7 @@ def _extract_from_html(msg_body):
|
|||||||
)
|
)
|
||||||
|
|
||||||
if _readable_text_empty(html_tree_copy):
|
if _readable_text_empty(html_tree_copy):
|
||||||
return msg_body
|
return None
|
||||||
|
|
||||||
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
|
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
|
||||||
# parsers do not recognize namespaces in HTML tags. As such the rendered
|
# parsers do not recognize namespaces in HTML tags. As such the rendered
|
||||||
@@ -540,7 +531,11 @@ def _extract_from_html(msg_body):
|
|||||||
# of replacing data outside the <tag> which might be essential to
|
# of replacing data outside the <tag> which might be essential to
|
||||||
# the customer.
|
# the customer.
|
||||||
remove_namespaces(html_tree_copy)
|
remove_namespaces(html_tree_copy)
|
||||||
return html.tostring(html_tree_copy)
|
s = html.tostring(html_tree_copy)
|
||||||
|
if not s:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return s.decode('utf-8')
|
||||||
|
|
||||||
|
|
||||||
def remove_namespaces(root):
|
def remove_namespaces(root):
|
||||||
|
|||||||
Reference in New Issue
Block a user