Now removing namespaces from parsed HTML

This commit is contained in:
Derrick J. Wippler
2019-05-08 11:01:04 -05:00
parent cdd84563dd
commit 1018e88ec1
3 changed files with 83 additions and 2 deletions

View File

@@ -8,6 +8,7 @@ import re
from talon import quotations, utils as u
from . import *
from .fixtures import *
from lxml import html
# Raw string literals keep the backslash in `\s` for the regex engine instead
# of relying on Python passing an unrecognized string escape through unchanged
# (which emits a warning on current interpreters). The compiled patterns are
# unchanged.
RE_WHITESPACE = re.compile(r"\s")
# NOTE(review): this pattern is identical to RE_WHITESPACE despite the name;
# confirm whether a two-character match was intended before changing it.
RE_DOUBLE_WHITESPACE = re.compile(r"\s")
@@ -424,3 +425,23 @@ def test_readable_html_empty():
def test_bad_html():
    """Check that an empty ``<html></html>`` document is returned unchanged."""
    bad_html = "<html></html>"
    # The extractor's output is expected to equal its input for this document.
    eq_(bad_html, quotations.extract_from_html(bad_html))
def test_remove_namespaces():
    """Check that ``extract_from_html`` drops the ``o:`` prefix from tags.

    The input declares an ``o`` XML namespace on ``<html>`` and uses
    ``<o:p>`` elements; the rendered output must contain plain ``<p>`` tags
    and no ``<o:p>`` tags.
    """
    # The fixture lines are kept flush-left so the string content is exactly
    # what the test feeds to the extractor.
    msg_body = """
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
<body>
<o:p>Dear Sir,</o:p>
<o:p>Thank you for the email.</o:p>
<blockquote>thing</blockquote>
</body>
</html>
"""
    rendered = quotations.extract_from_html(msg_body)

    # Prefixed tags are expected to come back as their unprefixed equivalents.
    assert_true("<p>" in rendered)
    # The substring "xmlns" is still expected somewhere in the output
    # (presumably the namespace attributes on <html> are kept -- confirm
    # against the extractor implementation).
    assert_true("xmlns" in rendered)

    # No namespaced tag forms should survive in the output.
    assert_true("<o:p>" not in rendered)
    assert_true("<xmlns:o>" not in rendered)