3 Commits

Author SHA1 Message Date
Sergey Obukhov
f53b5cc7a6 Merge pull request #105 from mailgun/sergey/fromstring
html with comment that has no parent crashes html_tree_to_text
2016-08-15 13:40:37 -07:00
Sergey Obukhov
27adde7aa7 bump version 2016-08-15 13:21:10 -07:00
Sergey Obukhov
a9719833e0 html with comment that has no parent crashes html_tree_to_text 2016-08-12 17:40:12 -07:00
3 changed files with 15 additions and 2 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.2.15', version='1.2.16',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -117,7 +117,13 @@ def html_tree_to_text(tree):
style.getparent().remove(style) style.getparent().remove(style)
for c in tree.xpath('//comment()'): for c in tree.xpath('//comment()'):
c.getparent().remove(c) parent = c.getparent()
# a comment without a parent cannot be detached via parent.remove() and does not affect the produced text, so skip it
if not parent:
continue
parent.remove(c)
text = "" text = ""
for el in tree.iter(): for el in tree.iter():

View File

@@ -6,6 +6,7 @@ from . import *
from talon import utils as u from talon import utils as u
import cchardet import cchardet
import six import six
from lxml import html
def test_get_delimiter(): def test_get_delimiter():
@@ -107,3 +108,9 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
<p>TEXT 2 <!-- COMMENT 2 --></p> <p>TEXT 2 <!-- COMMENT 2 --></p>
</div>""" </div>"""
eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment"
d = html.document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))