Merge pull request #70 from mailgun/sergey/gmail

fixes mailgun/talon#38 mailgun/talon#20
This commit is contained in:
Sergey Obukhov
2015-12-18 19:00:13 -08:00
7 changed files with 177 additions and 108 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.0.9', version='1.2.0',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -16,7 +16,6 @@ setup(name='talon',
install_requires=[ install_requires=[
"lxml>=2.3.3", "lxml>=2.3.3",
"regex>=1", "regex>=1",
"html2text",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild

View File

@@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message): def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. ''' ''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('.gmail_quote') gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote: if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0]) gmail_quote[0].getparent().remove(gmail_quote[0])
return True return True
@@ -138,8 +138,12 @@ def cut_by_id(html_message):
def cut_blockquote(html_message): def cut_blockquote(html_message):
''' Cuts the last non-nested blockquote with wrapping elements. ''' ''' Cuts the last non-nested blockquote with wrapping elements.'''
quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]') quote = html_message.xpath(
'(.//blockquote)'
'[not(@class="gmail_quote") and not(ancestor::blockquote)]'
'[last()]')
if quote: if quote:
quote = quote[0] quote = quote[0]
quote.getparent().remove(quote) quote.getparent().remove(quote)

View File

@@ -10,9 +10,8 @@ import logging
from copy import deepcopy from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import html2text
from talon.utils import get_delimiter from talon.utils import get_delimiter, html_to_text
from talon import html_quotations from talon import html_quotations
@@ -108,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile(
( (
# quotation border: splitter line or a number of quotation marker lines # quotation border: splitter line or a number of quotation marker lines
(?: (?:
s (?:se*)+
| |
(?:me*){2,} (?:me*){2,}
) )
@@ -139,7 +138,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
# <date> <person> # <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB, RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON, RE_FROM_COLON_OR_DATE_COLON,
@@ -321,7 +320,7 @@ def extract_from_plain(msg_body):
return msg_body return msg_body
def extract_from_html(s): def extract_from_html(msg_body):
""" """
Extract not quoted message from provided html message body Extract not quoted message from provided html message body
using tags and plain text algorithm. using tags and plain text algorithm.
@@ -337,49 +336,32 @@ def extract_from_html(s):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '':
return msg_body
if s.strip() == '': msg_body = msg_body.replace('\r\n', '').replace('\n', '')
return s
# replace CRLF with LF temporarily, otherwise CR will be converted to '&#13;'
# when doing deepcopy on html tree
msg_body, replaced = _CRLF_to_LF(s)
html_tree = html.document_fromstring( html_tree = html.document_fromstring(
msg_body, msg_body,
parser=html.HTMLParser(encoding="utf-8") parser=html.HTMLParser(encoding="utf-8")
) )
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree) html_quotations.cut_from_block(html_tree)
) )
html_tree_copy = deepcopy(html_tree) html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) msg_with_checkpoints = html.tostring(html_tree)
plain_text = html_to_text(msg_with_checkpoints)
h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
plain_text = preprocess(plain_text, '\n', content_type='text/html') plain_text = preprocess(plain_text, '\n', content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
# Don't process too long messages # Don't process too long messages
if len(lines) > MAX_LINES_COUNT: if len(lines) > MAX_LINES_COUNT:
return s return msg_body
# Collect checkpoints on each line # Collect checkpoints on each line
line_checkpoints = [ line_checkpoints = [
@@ -396,7 +378,6 @@ def extract_from_html(s):
return_flags = [] return_flags = []
process_marked_lines(lines, markers, return_flags) process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted: if lines_were_deleted:
#collect checkpoints from deleted lines #collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted): for i in xrange(first_deleted, last_deleted):
@@ -404,9 +385,9 @@ def extract_from_html(s):
quotation_checkpoints[checkpoint] = True quotation_checkpoints[checkpoint] = True
else: else:
if cut_quotations: if cut_quotations:
return _restore_CRLF(html.tostring(html_tree_copy), replaced) return html.tostring(html_tree_copy)
else: else:
return s return msg_body
# Remove tags with quotation checkpoints # Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags( html_quotations.delete_quotation_tags(
@@ -442,37 +423,3 @@ def register_xpath_extensions():
ns.prefix = 'mg' ns.prefix = 'mg'
ns['text_content'] = text_content ns['text_content'] = text_content
ns['tail'] = tail ns['tail'] = tail
def _restore_CRLF(s, replaced=True):
"""Restore CRLF if previously CRLF was replaced with LF
>>> _restore_CRLF('a\nb')
'a\r\nb'
>>> _restore_CRLF('a\nb', replaced=False)
'a\nb'
"""
if replaced:
return s.replace('\n', '\r\n')
return s
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF and report whether a replacement happened.

    Raw docstring keeps the doctest escapes literal; the original examples
    also had stray quotes (`'a\r\n'b`) which are fixed here.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True
    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False

    :returns: tuple of (converted string, bool flag for "was converted")
    """
    # get_delimiter detects the dominant line ending of the message
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False

View File

@@ -4,6 +4,10 @@ import logging
from random import shuffle from random import shuffle
import chardet import chardet
import cchardet import cchardet
import regex as re
from lxml import html
from lxml.cssselect import CSSSelector
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
@@ -58,7 +62,6 @@ def detect_encoding(string):
if detected: if detected:
return detected.get('encoding') or 'utf-8' return detected.get('encoding') or 'utf-8'
except Exception, e: except Exception, e:
print 11111111111, e
pass pass
return 'utf-8' return 'utf-8'
@@ -74,7 +77,6 @@ def quick_detect_encoding(string):
if detected: if detected:
return detected.get('encoding') or detect_encoding(string) return detected.get('encoding') or detect_encoding(string)
except Exception, e: except Exception, e:
print 222222222222, e
pass pass
return detect_encoding(string) return detect_encoding(string)
@@ -105,3 +107,81 @@ def get_delimiter(msg_body):
delimiter = '\n' delimiter = '\n'
return delimiter return delimiter
def html_to_text(string):
    r"""Dead-simple HTML-to-text converter.

    >>> html_to_text("one<br>two<br>three")
    'one\ntwo\nthree'

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    # without an explicit charset declaration lxml may mis-decode the input
    s = _prepend_utf8_declaration(string)
    # literal newlines in markup carry no meaning; block tags decide breaks
    s = s.replace("\n", "")

    tree = html.fromstring(s)

    # drop content the browser would not render: <style> blocks and comments
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)
    for c in tree.xpath('//comment()'):
        c.getparent().remove(c)

    text = ""
    for el in tree.iter():
        el_text = (el.text or '') + (el.tail or '')
        if len(el_text) > 1:
            if el.tag in _BLOCKTAGS:
                text += "\n"
            if el.tag == 'li':
                text += " * "
            text += el_text.strip() + " "

            # add href to the output
            href = el.attrib.get('href')
            if href:
                text += "(%s) " % href

        if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
            text += "\n"

    retval = _rm_excessive_newlines(text)
    return _encode_utf8(retval)
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
return s.lower().find('html; charset=', 0, 4096) != -1
def _prepend_utf8_declaration(s):
    """Ensure the document carries a utf-8 declaration.

    When the first 4KB already contain a charset spec the string is
    returned untouched; otherwise a meta declaration is prepended.
    """
    if _contains_charset_spec(s):
        return s
    return _UTF8_DECLARATION + s
def _rm_excessive_newlines(s):
    """Collapse newline runs (common with div-heavy markup) into a single
    blank line and strip surrounding whitespace.
    """
    collapsed = _RE_EXCESSIVE_NEWLINES.sub("\n\n", s)
    return collapsed.strip()
def _encode_utf8(s):
    """Return a utf-8 encoded byte string; non-unicode input passes
    through unchanged (Python 2 semantics).
    """
    if isinstance(s, unicode):
        return s.encode('utf-8')
    return s
# Meta tag injected in front of documents that lack a charset declaration
# so that lxml decodes them as utf-8.
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
'charset=utf-8">')
# Tags treated as block-level: a newline is emitted before their text.
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
# Tags that force a line break after themselves.
_HARDBREAKS = ['br', 'hr', 'tr']
# Runs of 2..10 consecutive newlines collapse to one blank line.
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

View File

@@ -5,9 +5,7 @@ from . fixtures import *
import regex as re import regex as re
from talon import quotations from talon import quotations, utils as u
import html2text
RE_WHITESPACE = re.compile("\s") RE_WHITESPACE = re.compile("\s")
@@ -28,8 +26,8 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>""" </blockquote>"""
eq_("<html><body><p>Reply\n</p></body></html>", eq_("<html><body><p>Reply</p></body></html>",
quotations.extract_from_html(msg_body)) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_quotation_splitter_outside_blockquote(): def test_quotation_splitter_outside_blockquote():
@@ -45,7 +43,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><div></div></body></html>", eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -63,7 +61,7 @@ def test_regular_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>", eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -133,6 +131,18 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote():
    # a <blockquote class="gmail_quote"> must survive extraction:
    # the blockquote cutter explicitly skips gmail_quote elements
    msg_body = """Message
<blockquote class="gmail_quote">
<div class="gmail_default">
My name is William Shakespeare.
<br/>
</div>
</blockquote>"""
    expected = RE_WHITESPACE.sub('', msg_body)
    actual = RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))
    eq_(expected, actual)
def test_unicode_in_reply(): def test_unicode_in_reply():
msg_body = u"""Reply \xa0 \xa0 Text<br> msg_body = u"""Reply \xa0 \xa0 Text<br>
@@ -140,7 +150,7 @@ def test_unicode_in_reply():
<br> <br>
</div> </div>
<blockquote class="gmail_quote"> <blockquote>
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
@@ -258,26 +268,15 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
# Canonical expected reply text, tolerant of surrounding whitespace.
RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) f = open(filename)
msg_body = f.read() msg_body = f.read()
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)
plain_reply = u.html_to_text(reply)
h = html2text.HTML2Text() eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
h.body_width = 0 RE_WHITESPACE.sub('', plain_reply))
plain_reply = h.handle(reply)
#remove &nbsp; spaces
plain_reply = plain_reply.replace(u'\xa0', u' ')
if RE_REPLY.match(plain_reply):
eq_(1, 1)
else:
eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
def test_gmail_reply(): def test_gmail_reply():
@@ -315,7 +314,10 @@ def test_yandex_ru_reply():
def test_CRLF(): def test_CRLF():
"""CR is not converted to '&#13;' """CR is not converted to '&#13;'
""" """
eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>')) symbol = '&#13;'
extracted = quotations.extract_from_html('<html>\r\n</html>')
assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
msg_body = """Reply msg_body = """Reply
<blockquote> <blockquote>
@@ -330,5 +332,7 @@ def test_CRLF():
</blockquote>""" </blockquote>"""
msg_body = msg_body.replace('\n', '\r\n') msg_body = msg_body.replace('\n', '\r\n')
eq_("<html><body><p>Reply\r\n</p></body></html>", extracted = quotations.extract_from_html(msg_body)
quotations.extract_from_html(msg_body)) assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))

View File

@@ -29,15 +29,3 @@ def test_crash_inside_extract_from():
def test_empty_body(): def test_empty_body():
eq_('', quotations.extract_from_plain('')) eq_('', quotations.extract_from_plain(''))
def test__CRLF_to_LF():
    # already-LF input is reported as unchanged
    eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r'))
    # CRLF input is converted and flagged
    eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r'))
def test__restore_CRLF():
    eq_('\n', quotations._restore_CRLF('\n', replaced=False))
    eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))
    # `replaced` defaults to True
    eq_('\r\n', quotations._restore_CRLF('\n'))

View File

@@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
detect_encoding.return_value = 'utf-8' detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe")) eq_('utf-8', u.quick_detect_encoding("qwe"))
ok_(detect_encoding.called) ok_(detect_encoding.called)
def test_html_to_text():
    # block tags, <br> and list items produce the expected line breaks
    html = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
    text = u.html_to_text(html)
    eq_("Hello world! \n\n * One! \n * Two \nHaha", text)

    # non-ASCII content survives the conversion
    eq_("привет!", u.html_to_text("<b>привет!</b>"))

    # leading hard breaks do not produce empty lines
    html = '<body><br/><br/>Hi</body>'
    eq_('Hi', u.html_to_text(html))

    # <style> blocks are stripped entirely
    html = """Hi
<style type="text/css">

div, p, li {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>

<style type="text/css">
h1 {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
    eq_('Hi', u.html_to_text(html))

    # HTML comments are stripped, inline and standalone
    html = """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""
    eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))