Remove max tags limit
This commit is contained in:
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.4.10',
|
||||
version='1.5.0',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
|
||||
@@ -180,9 +180,6 @@ def html_fromstring(s):
|
||||
if isinstance(s, six.text_type):
|
||||
s = s.encode('utf8')
|
||||
try:
|
||||
if html_too_big(s):
|
||||
return None
|
||||
|
||||
return html5parser.fromstring(s, parser=_html5lib_parser())
|
||||
except Exception:
|
||||
pass
|
||||
@@ -194,9 +191,6 @@ def html_document_fromstring(s):
|
||||
if isinstance(s, six.text_type):
|
||||
s = s.encode('utf8')
|
||||
try:
|
||||
if html_too_big(s):
|
||||
return None
|
||||
|
||||
return html5parser.document_fromstring(s, parser=_html5lib_parser())
|
||||
except Exception:
|
||||
pass
|
||||
@@ -206,12 +200,6 @@ def cssselect(expr, tree):
|
||||
return CSSSelector(expr)(tree)
|
||||
|
||||
|
||||
def html_too_big(s):
|
||||
if isinstance(s, six.text_type):
|
||||
s = s.encode('utf8')
|
||||
return s.count(b'<') > _MAX_TAGS_COUNT
|
||||
|
||||
|
||||
def _contains_charset_spec(s):
|
||||
"""Return True if the first 4KB contain a charset spec
|
||||
"""
|
||||
@@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
|
||||
_HARDBREAKS = ['br', 'hr', 'tr']
|
||||
|
||||
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
|
||||
|
||||
# extensive research shows that exceeding this limit
|
||||
# might lead to excessive processing time
|
||||
_MAX_TAGS_COUNT = 419
|
||||
|
||||
@@ -391,18 +391,6 @@ def test_gmail_forwarded_msg():
|
||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||
|
||||
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 4)
|
||||
def test_too_large_html():
|
||||
msg_body = 'Reply' \
|
||||
'<div class="gmail_quote">' \
|
||||
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \
|
||||
'<div>Test</div>' \
|
||||
'</div>' \
|
||||
'</div>'
|
||||
eq_(RE_WHITESPACE.sub('', msg_body),
|
||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||
|
||||
|
||||
def test_readable_html_empty():
|
||||
msg_body = """
|
||||
<blockquote>
|
||||
|
||||
@@ -125,39 +125,13 @@ def test_html_fromstring_exception():
|
||||
eq_(None, u.html_fromstring("<html></html>"))
|
||||
|
||||
|
||||
@patch.object(u, 'html_too_big', Mock())
|
||||
@patch.object(u.html5parser, 'fromstring')
|
||||
def test_html_fromstring_too_big(fromstring):
|
||||
eq_(None, u.html_fromstring("<html></html>"))
|
||||
assert_false(fromstring.called)
|
||||
|
||||
|
||||
@patch.object(u.html5parser, 'document_fromstring')
|
||||
def test_html_document_fromstring_exception(document_fromstring):
|
||||
document_fromstring.side_effect = Exception()
|
||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||
|
||||
|
||||
@patch.object(u, 'html_too_big', Mock())
|
||||
@patch.object(u.html5parser, 'document_fromstring')
|
||||
def test_html_document_fromstring_too_big(document_fromstring):
|
||||
eq_(None, u.html_document_fromstring("<html></html>"))
|
||||
assert_false(document_fromstring.called)
|
||||
|
||||
|
||||
@patch.object(u, 'html_fromstring', Mock(return_value=None))
|
||||
def test_bad_html_to_text():
|
||||
bad_html = "one<br>two<br>three"
|
||||
eq_(None, u.html_to_text(bad_html))
|
||||
|
||||
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||
def test_html_too_big():
|
||||
eq_(False, u.html_too_big("<div></div>"))
|
||||
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
|
||||
|
||||
|
||||
@patch.object(u, '_MAX_TAGS_COUNT', 3)
|
||||
def test_html_to_text():
|
||||
eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
|
||||
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
|
||||
|
||||
Reference in New Issue
Block a user