Protect html_to_text against HTML with too many tags as well

2016-09-14 09:58:41 -07:00
parent ea82a9730e
commit 534457e713
4 changed files with 40 additions and 13 deletions
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -386,9 +386,6 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
-    if _html_too_big(msg_body):
-        return msg_body
-
    if msg_body.strip() == b'':
        return msg_body

@@ -483,12 +480,3 @@ def register_xpath_extensions():
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
-
-
-def _html_too_big(msg_body):
-    return msg_body.count('<') > _MAX_TAGS_COUNT
-
-
-# an extensive research shows that exceeding this limit
-# might lead to excessive processing time
-_MAX_TAGS_COUNT = 419
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -178,6 +178,9 @@ def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
+        if html_too_big(s):
+            return None
+
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
+        if html_too_big(s):
+            return None
+
        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
    return CSSSelector(expr)(tree)


+def html_too_big(s):
+    return s.count('<') > _MAX_TAGS_COUNT
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -243,3 +253,7 @@ _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']

 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
+
+# Extensive research shows that exceeding this limit
+# might lead to excessive processing time.
+_MAX_TAGS_COUNT = 419