Files
talon/talon/html_quotations.py
Esperat Julian e16dcf629e Added Outlook 2013 rules
Only the border color changes (compared to Outlook 2007, 2010) from `#B5C4DF` to `#E1E1E1`.
2017-04-27 11:34:01 +02:00

229 lines
7.3 KiB
Python

"""
The module's functions operate on message bodies trying to extract original
messages (without quoted messages) from html
"""
from __future__ import absolute_import
import regex as re
from talon.utils import cssselect
# Delimiters placed around the numeric counter of a checkpoint marker.
CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
# Matches one complete checkpoint marker.  The digit class is written as a raw
# string so that ``\d`` reaches the regex engine as a regex escape instead of
# being parsed as an (invalid) string escape sequence.
CHECKPOINT_PATTERN = re.compile(
    CHECKPOINT_PREFIX + r'\d+' + CHECKPOINT_SUFFIX)

# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']

# Matches a "---- Forwarded message ----" separator line (any case, any line).
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
def add_checkpoint(html_note, counter):
    """Walk the html tree depth-first and stamp every node with checkpoints.

    A marker (prefix + running number + suffix) is appended to the node's
    ``text`` before its children are visited, and another one is appended to
    its ``tail`` once all children have been processed.

    Returns the next unused checkpoint number.
    """
    opening_marker = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
    # A missing or empty text is simply replaced by the marker.
    html_note.text = (html_note.text or '') + opening_marker
    counter += 1

    for child_node in html_note.iterchildren():
        counter = add_checkpoint(child_node, counter)

    closing_marker = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
    html_note.tail = (html_note.tail or '') + closing_marker
    return counter + 1
def delete_quotation_tags(html_note, counter, quotation_checkpoints):
    """Strip the parts of the html tree flagged by quotation checkpoints.

    ``quotation_checkpoints`` is indexed by checkpoint number; each node
    consumes one entry for its ``text`` (before its children) and one for
    its ``tail`` (after them).  Flagged text/tail is blanked.  A node counts
    as "in quotation" only when both of its own checkpoints are flagged;
    otherwise its fully quoted children are removed from it.

    Returns a ``(next_counter, tag_in_quotation)`` tuple.
    """
    text_is_quoted = quotation_checkpoints[counter]
    if text_is_quoted:
        html_note.text = ''
    counter += 1

    # Children that turned out to be entirely inside the quotation.
    quoted_children = []
    for child_node in html_note.iterchildren():
        counter, child_is_quoted = delete_quotation_tags(
            child_node, counter, quotation_checkpoints)
        if child_is_quoted:
            quoted_children.append(child_node)

    tail_is_quoted = quotation_checkpoints[counter]
    if tail_is_quoted:
        html_note.tail = ''
    counter += 1

    whole_tag_quoted = bool(text_is_quoted and tail_is_quoted)
    if not whole_tag_quoted:
        # This node stays, so its quoted children have to be dropped here;
        # a fully quoted node is left for its own parent to remove.
        for quoted_child in quoted_children:
            html_note.remove(quoted_child)
    return counter, whole_tag_quoted
def cut_gmail_quote(html_message):
    """Cut the outermost block element with class gmail_quote.

    The first ``div.gmail_quote`` match is removed from its parent, unless
    its leading text is a "Forwarded message" separator, in which case the
    block is kept.

    Returns True if a block was removed, False otherwise.
    """
    candidates = cssselect('div.gmail_quote', html_message)
    if not candidates:
        return False

    quote = candidates[0]
    # A block introduced by the forwarded-message separator is left in place.
    if quote.text is not None and RE_FWD.match(quote.text):
        return False

    quote.getparent().remove(quote)
    return True
def cut_microsoft_quote(html_message):
    """Cut the Microsoft splitter block and all blocks following it.

    The splitter is located either by the inline style of the bordered div
    emitted by Outlook 2007/2010/2013 and Windows Mail, or, failing that, by
    the centered ``<hr>`` structure used by Outlook 2003.

    Returns True if a splitter was found and removed, False otherwise.
    """
    splitter = None

    styled_divs = html_message.xpath(
        #outlook 2007, 2010 (international)
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
        #outlook 2007, 2010 (american)
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0in 0in 0in']|"
        #outlook 2013 (international)
        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
        #outlook 2013 (american)
        "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
        "padding:3.0pt 0in 0in 0in']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
        "border-top-width: 1px; border-top-style: solid;']"
    )

    if styled_divs:
        splitter = styled_divs[0]
        #outlook 2010
        # When the styled div is the first child of its wrapper, the wrapper
        # itself is treated as the splitter.
        wrapper = splitter.getparent()
        if wrapper is not None and wrapper[0] == splitter:
            splitter = wrapper
    else:
        #outlook 2003
        rulers = html_message.xpath(
            "//div"
            "/div[@class='MsoNormal' and @align='center' "
            "and @style='text-align:center']"
            "/font"
            "/span"
            "/hr[@size='3' and @width='100%' and @align='center' "
            "and @tabindex='-1']"
        )
        if rulers:
            # Climb hr -> span -> font -> div.MsoNormal -> enclosing div.
            splitter = (rulers[0].getparent().getparent()
                        .getparent().getparent())

    # "Found" is tracked with None rather than len(): len() of an element is
    # its number of children, so a childless splitter would look "not found".
    if splitter is None:
        return False

    parent = splitter.getparent()
    if parent is None:
        # The splitter is the tree root; there is nothing to detach it from.
        return False

    following = splitter.getnext()
    while following is not None:
        parent.remove(following)
        following = splitter.getnext()
    parent.remove(splitter)
    return True
def cut_by_id(html_message):
    """Remove elements whose id is one of the known quote indicators.

    For every id in ``QUOTE_IDS`` the first matching element is detached
    from its parent.

    Returns True if at least one element was removed, False otherwise.
    """
    removed_any = False
    selectors = ('#{}'.format(quote_id) for quote_id in QUOTE_IDS)
    for selector in selectors:
        matches = cssselect(selector, html_message)
        if not matches:
            continue
        target = matches[0]
        target.getparent().remove(target)
        removed_any = True
    return removed_any
def cut_blockquote(html_message):
    """Cut the last non-nested blockquote with wrapping elements.

    Blockquotes carrying the ``gmail_quote`` class and blockquotes nested
    inside another blockquote are excluded from the search.

    Returns True if a blockquote was removed, False otherwise.
    """
    quotes = html_message.xpath(
        '(.//blockquote)'
        '[not(@class="gmail_quote") and not(ancestor::blockquote)]'
        '[last()]')

    if not quotes:
        return False

    quote = quotes[0]
    quote.getparent().remove(quote)
    return True
def cut_from_block(html_message):
    """Cut the block starting with "From:" (or "Date:") and what follows it.

    Two layouts are handled:

    * the "From:" text is enclosed in some tag: the nearest enclosing div
      and all of its following siblings are removed, unless that div is the
      only child of ``<body>``;
    * the "From:" text is the tail of an element (e.g. right after an
      ``<hr>``): that element and all of its following siblings are removed,
      unless the parent's text is a "Forwarded message" separator.

    Returns True if something was removed, False otherwise.
    """
    # handle the case when From: block is enclosed in some tag
    # NOTE(review): mg:text_content() / mg:tail() are XPath extension
    # functions that are presumably registered elsewhere in the package.
    enclosed = html_message.xpath(
        ("//*[starts-with(mg:text_content(), 'From:')]|"
         "//*[starts-with(mg:text_content(), 'Date:')]"))

    if enclosed:
        block = enclosed[-1]

        # Climb to the nearest div; the tree root itself is never accepted.
        parent_div = None
        while block.getparent() is not None:
            if block.tag == 'div':
                parent_div = block
                break
            block = block.getparent()

        if parent_div is None:
            return False

        maybe_body = parent_div.getparent()
        # In cases where removing this enclosing div will remove all
        # content, we should assume the quote is not enclosed in a tag.
        parent_div_is_all_content = (
            maybe_body is not None and maybe_body.tag == 'body' and
            len(maybe_body) == 1)

        if not parent_div_is_all_content:
            parent = parent_div.getparent()
            # remove all tags after found From block
            # (From block and quoted message are in separate divs)
            following = parent_div.getnext()
            while following is not None:
                parent.remove(following)
                following = parent_div.getnext()
            # finally remove the From block itself
            parent.remove(parent_div)
            return True

    # handle the case when From: block goes right after e.g. <hr>
    # and not enclosed in some tag
    tail_matches = html_message.xpath(
        ("//*[starts-with(mg:tail(), 'From:')]|"
         "//*[starts-with(mg:tail(), 'Date:')]"))
    if not tail_matches:
        return False

    block = tail_matches[0]
    parent = block.getparent()

    if RE_FWD.match(parent.text or ''):
        return False

    following = block.getnext()
    while following is not None:
        parent.remove(following)
        following = block.getnext()
    parent.remove(block)
    return True
def cut_zimbra_quote(html_message):
    """Cut the Zimbra divider element.

    The first ``<hr data-marker="__DIVIDER__">`` found in the tree is
    removed from its parent.

    Returns True if a divider was removed, False otherwise.
    """
    dividers = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
    if not dividers:
        return False

    divider = dividers[0]
    divider.getparent().remove(divider)
    return True