This commit is contained in:
Sergey Obukhov
2015-12-05 00:37:02 -08:00
parent 2c416ecc0e
commit 41457d8fbd
5 changed files with 93 additions and 81 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.0.9', version='1.2.0',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message): def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. ''' ''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('.gmail_quote') gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote: if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0]) gmail_quote[0].getparent().remove(gmail_quote[0])
return True return True
@@ -139,7 +139,11 @@ def cut_by_id(html_message):
def cut_blockquote(html_message): def cut_blockquote(html_message):
''' Cuts the last non-nested blockquote with wrapping elements.''' ''' Cuts the last non-nested blockquote with wrapping elements.'''
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]') quote = html_message.xpath(
'(.//blockquote)'
'[not(@class="gmail_quote") and not(ancestor::blockquote)]'
'[last()]')
if quote: if quote:
quote = quote[0] quote = quote[0]
quote.getparent().remove(quote) quote.getparent().remove(quote)

View File

@@ -10,6 +10,8 @@ import logging
from copy import deepcopy from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import lxml
from lxml.cssselect import CSSSelector
import html2text import html2text
from talon.utils import get_delimiter from talon.utils import get_delimiter
@@ -108,7 +110,7 @@ RE_EMPTY_QUOTATION = re.compile(
( (
# quotation border: splitter line or a number of quotation marker lines # quotation border: splitter line or a number of quotation marker lines
(?: (?:
s (?:se*)+
| |
(?:me*){2,} (?:me*){2,}
) )
@@ -139,7 +141,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
# <date> <person> # <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB, RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON, RE_FROM_COLON_OR_DATE_COLON,
@@ -321,7 +323,7 @@ def extract_from_plain(msg_body):
return msg_body return msg_body
def extract_from_html(s): def extract_from_html(msg_body):
""" """
Extract not quoted message from provided html message body Extract not quoted message from provided html message body
using tags and plain text algorithm. using tags and plain text algorithm.
@@ -337,49 +339,32 @@ def extract_from_html(s):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '':
return msg_body
if s.strip() == '': msg_body = msg_body.replace('\r\n', '').replace('\n', '')
return s
# replace CRLF with LF temporaraly otherwise CR will be converted to '&#13;'
# when doing deepcopy on html tree
msg_body, replaced = _CRLF_to_LF(s)
html_tree = html.document_fromstring( html_tree = html.document_fromstring(
msg_body, msg_body,
parser=html.HTMLParser(encoding="utf-8") parser=html.HTMLParser(encoding="utf-8")
) )
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree) html_quotations.cut_from_block(html_tree)
) )
html_tree_copy = deepcopy(html_tree) html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) msg_with_checkpoints = html.tostring(html_tree)
plain_text = html_to_text(msg_with_checkpoints)
h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
plain_text = preprocess(plain_text, '\n', content_type='text/html') plain_text = preprocess(plain_text, '\n', content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
# Don't process too long messages # Don't process too long messages
if len(lines) > MAX_LINES_COUNT: if len(lines) > MAX_LINES_COUNT:
return s return msg_body
# Collect checkpoints on each line # Collect checkpoints on each line
line_checkpoints = [ line_checkpoints = [
@@ -396,7 +381,6 @@ def extract_from_html(s):
return_flags = [] return_flags = []
process_marked_lines(lines, markers, return_flags) process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted: if lines_were_deleted:
#collect checkpoints from deleted lines #collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted): for i in xrange(first_deleted, last_deleted):
@@ -404,9 +388,9 @@ def extract_from_html(s):
quotation_checkpoints[checkpoint] = True quotation_checkpoints[checkpoint] = True
else: else:
if cut_quotations: if cut_quotations:
return _restore_CRLF(html.tostring(html_tree_copy), replaced) return html.tostring(html_tree_copy)
else: else:
return s return msg_body
# Remove tags with quotation checkpoints # Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags( html_quotations.delete_quotation_tags(
@@ -444,35 +428,54 @@ def register_xpath_extensions():
ns['tail'] = tail ns['tail'] = tail
def _restore_CRLF(s, replaced=True): def html_to_text(string):
"""Restore CRLF if previously CRLF was replaced with LF
>>> _restore_CRLF('a\nb')
'a\r\nb'
>>> _restore_CRLF('a\nb', replaced=False)
'a\nb'
""" """
if replaced: Dead-simple HTML-to-text converter:
return s.replace('\n', '\r\n') >>> html_to_text("one<br>two<br>three")
return s >>> u"one\ntwo\nthree"
NOTES:
def _CRLF_to_LF(s): 1. the string is expected to contain UTF-8 encoded HTML!
"""Replace CRLF with LF 2. returns utf-8 encoded str (not unicode)
>>> s, changed = _CRLF_to_LF('a\r\n'b)
>>> s
'a\nb'
>>> changed
True
>>> s, changed = _CRLF_to_LF('a\n'b)
>>> s
'a\nb'
>>> changed
False
""" """
delimiter = get_delimiter(s) retval = None
if delimiter == '\r\n': try:
return s.replace(delimiter, '\n'), True # append 'utf-8' encoding declaration to HTML string if the first 4KB of the message does not
return s, False # contain the charset spec:
if string.lower().find('html; charset=', 0, 4096) == -1:
string = '''<meta http-equiv="Content-Type" content="text/html; charset=utf-8">''' + string
tree = lxml.html.fromstring(string.replace("\n", ""))
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
for c in tree.xpath('//comment()'):
c.getparent().remove(c)
blocktags = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
hardbreaks = ['br', 'hr', 'tr']
text = ""
for el in tree.iter():
el_text = (el.text or '') + (el.tail or '')
if len(el_text) > 1:
if el.tag in blocktags:
text += "\n"
if el.tag == 'li':
text += " * "
text += el_text.strip() + " "
# add href to the output
# href = el.attrib.get('href', None)
# if href:
# text += "(%s) " % href
if el.tag in hardbreaks and len(text) > 0 and text[-1] != "\n":
text += "\n"
# remove excessive newlines that often happen due to tons of divs:
retval = re.sub("\n{2,10}", "\n\n", text).strip()
if isinstance(retval, unicode):
retval = retval.encode('utf-8')
except Exception as e:
pass
return retval

View File

@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>""" </blockquote>"""
eq_("<html><body><p>Reply\n</p></body></html>", eq_("<html><body><p>Reply</p></body></html>",
quotations.extract_from_html(msg_body)) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_quotation_splitter_outside_blockquote(): def test_quotation_splitter_outside_blockquote():
@@ -45,7 +45,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><div></div></body></html>", eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -63,7 +63,7 @@ def test_regular_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>", eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -133,6 +133,18 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
<div class="gmail_default">
My name is William Shakespeare.
<br/>
</div>
</blockquote>"""
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_unicode_in_reply(): def test_unicode_in_reply():
msg_body = u"""Reply \xa0 \xa0 Text<br> msg_body = u"""Reply \xa0 \xa0 Text<br>
@@ -140,7 +152,7 @@ def test_unicode_in_reply():
<br> <br>
</div> </div>
<blockquote class="gmail_quote"> <blockquote>
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
@@ -315,7 +327,10 @@ def test_yandex_ru_reply():
def test_CRLF(): def test_CRLF():
"""CR is not converted to '&#13;' """CR is not converted to '&#13;'
""" """
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>')) symbol = '&#13;'
extracted = quotations.extract_from_html('<html>\r\n</html>')
assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
msg_body = """Reply msg_body = """Reply
<blockquote> <blockquote>
@@ -330,5 +345,7 @@ def test_CRLF():
</blockquote>""" </blockquote>"""
msg_body = msg_body.replace('\n', '\r\n') msg_body = msg_body.replace('\n', '\r\n')
eq_("<html><body><p>Reply\r\n</p></body></html>", extracted = quotations.extract_from_html(msg_body)
quotations.extract_from_html(msg_body)) assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))

View File

@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():
def test_empty_body(): def test_empty_body():
eq_('', quotations.extract_from_plain('')) eq_('', quotations.extract_from_plain(''))
def test__CRLF_to_LF():
eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
def test__restore_CRLF():
eq_('\n', quotations._restore_CRLF('\n', replaced=False))
eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))
# default
eq_('\r\n', quotations._restore_CRLF('\n'))