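Fix cssselect: replace element-level `.cssselect()` calls with a shared `cssselect(expr, tree)` helper built on `CSSSelector`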

This commit is contained in:
Sergey Obukhov
2016-08-16 17:09:07 -07:00
parent ec8e09b34e
commit 5b1ca33c57
2 changed files with 8 additions and 2 deletions

View File

@@ -6,6 +6,7 @@ messages (without quoted messages) from html
from __future__ import absolute_import
import regex as re
from talon.utils import cssselect
CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
@@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
gmail_quote = cssselect('div.gmail_quote', html_message)
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message):
def cut_by_id(html_message):
found = False
for quote_id in QUOTE_IDS:
quote = html_message.cssselect('#{}'.format(quote_id))
quote = cssselect('#{}'.format(quote_id), html_message)
if quote:
found = True
quote[0].getparent().remove(quote[0])

View File

@@ -114,6 +114,7 @@ def get_delimiter(msg_body):
return delimiter
def html_tree_to_text(tree):
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
@@ -176,6 +177,10 @@ def html_document_fromstring(s):
return html5parser.document_fromstring(s, parser=_HTML5LIB_PARSER)
def cssselect(expr, tree):
    """Apply the CSS selector expression `expr` to `tree`

    Builds a ``CSSSelector`` from ``expr``, calls it on ``tree`` and
    returns whatever the selector call returns.
    """
    selector = CSSSelector(expr)
    return selector(tree)
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""