use html5lib to parse html

This commit is contained in:
Sergey Obukhov
2016-08-15 19:36:21 -07:00
parent f53b5cc7a6
commit bcf97eccfa
4 changed files with 44 additions and 25 deletions

View File

@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>"""
-eq_("<html><body><p>Reply</p></body></html>",
+eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
</div>
</blockquote>
"""
-eq_("<html><body><p>Reply</p></body></html>",
+eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -62,7 +62,7 @@ def test_regular_blockquote():
</div>
</blockquote>
"""
-eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
+eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -85,6 +85,7 @@ Reply
reply = """
<html>
+<head></head>
<body>
Reply
@@ -128,7 +129,7 @@ def test_gmail_quote():
</div>
</div>
</div>"""
-eq_("<html><body><p>Reply</p></body></html>",
+eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -139,7 +140,7 @@ def test_gmail_quote_compact():
'<div>Test</div>' \
'</div>' \
'</div>'
-eq_("<html><body><p>Reply</p></body></html>",
+eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -166,7 +167,7 @@ def test_unicode_in_reply():
Quote
</blockquote>""".encode("utf-8")
-eq_("<html><body><p>Reply&#160;&#160;Text<br></p><div><br></div>"
+eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
"</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -192,6 +193,7 @@ def test_blockquote_disclaimer():
stripped_html = """
<html>
+<head></head>
<body>
<div>
<div>
@@ -223,7 +225,7 @@ def test_date_block():
</div>
</div>
"""
-eq_('<html><body><div>message<br></div></body></html>',
+eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br>
text
</div></div>
"""
-eq_('<html><body><div>message<br></div></body></html>',
+eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block():
</div>
</body>'''
-eq_('<html><body><div>Blah<br><br></div></body></html>',
+eq_('<html><head></head><body><div>Blah<br><br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -269,13 +271,13 @@ def test_reply_quotations_share_block():
def test_OLK_SRC_BODY_SECTION_stripped():
-eq_('<html><body><div>Reply</div></body></html>',
+eq_('<html><head></head><body><div>Reply</div></body></html>',
RE_WHITESPACE.sub(
'', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
def test_reply_separated_by_hr():
-eq_('<html><body><div>Hi<div>there</div></div></body></html>',
+eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
RE_WHITESPACE.sub(
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
@@ -296,7 +298,7 @@ Reply
</div>
</div>
'''
-eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
+eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -373,7 +375,7 @@ reply
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
# Keep new lines otherwise "My reply" becomes one word - "Myreply"
-eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)
+eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
def test_gmail_forwarded_msg():