add more tests, make standard reply tests more relaxed

This commit is contained in:
Sergey Obukhov
2015-12-18 18:56:41 -08:00
parent 41457d8fbd
commit 3d9ae356ea
5 changed files with 134 additions and 77 deletions

View File

@@ -16,7 +16,6 @@ setup(name='talon',
install_requires=[ install_requires=[
"lxml>=2.3.3", "lxml>=2.3.3",
"regex>=1", "regex>=1",
"html2text",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild

View File

@@ -10,11 +10,8 @@ import logging
from copy import deepcopy from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import lxml
from lxml.cssselect import CSSSelector
import html2text
from talon.utils import get_delimiter from talon.utils import get_delimiter, html_to_text
from talon import html_quotations from talon import html_quotations
@@ -426,56 +423,3 @@ def register_xpath_extensions():
ns.prefix = 'mg' ns.prefix = 'mg'
ns['text_content'] = text_content ns['text_content'] = text_content
ns['tail'] = tail ns['tail'] = tail
def html_to_text(string):
    """
    Minimal HTML-to-text conversion based on a walk over the lxml tree.

    Line breaks in the result are produced by block tags (a newline before
    their text) and by hard-break tags (a newline after them); list items
    get a " * " bullet.

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML
        2. a unicode result is encoded to a utf-8 str before it is returned
        3. any exception raised during the conversion is swallowed; the
           function then returns whatever result it had so far (None when
           the failure happened before the text was assembled)
    """
    result = None
    try:
        markup = string
        # No charset spec within the first 4KB of the message: declare
        # utf-8 in front of the markup.
        if markup.lower().find('html; charset=', 0, 4096) == -1:
            markup = '''<meta http-equiv="Content-Type" content="text/html; charset=utf-8">''' + markup

        root = lxml.html.fromstring(markup.replace("\n", ""))

        # Style sheets and comments must not end up in the text.
        for node in CSSSelector('style')(root):
            node.getparent().remove(node)
        for node in root.xpath('//comment()'):
            node.getparent().remove(node)

        block_tags = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
        hard_breaks = ['br', 'hr', 'tr']

        text = ""
        for el in root.iter():
            chunk = (el.text or '') + (el.tail or '')
            if len(chunk) > 1:
                if el.tag in block_tags:
                    text += "\n"
                if el.tag == 'li':
                    text += " * "
                text += chunk.strip() + " "

            # A hard break ends the current line unless it is already ended.
            if el.tag in hard_breaks and text and not text.endswith("\n"):
                text += "\n"

        # Lots of nested divs tend to produce piles of newlines.
        result = re.sub("\n{2,10}", "\n\n", text).strip()
        if isinstance(result, unicode):
            result = result.encode('utf-8')
    except Exception:
        pass

    return result

View File

@@ -4,6 +4,10 @@ import logging
from random import shuffle from random import shuffle
import chardet import chardet
import cchardet import cchardet
import regex as re
from lxml import html
from lxml.cssselect import CSSSelector
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
@@ -58,7 +62,6 @@ def detect_encoding(string):
if detected: if detected:
return detected.get('encoding') or 'utf-8' return detected.get('encoding') or 'utf-8'
except Exception, e: except Exception, e:
print 11111111111, e
pass pass
return 'utf-8' return 'utf-8'
@@ -74,7 +77,6 @@ def quick_detect_encoding(string):
if detected: if detected:
return detected.get('encoding') or detect_encoding(string) return detected.get('encoding') or detect_encoding(string)
except Exception, e: except Exception, e:
print 222222222222, e
pass pass
return detect_encoding(string) return detect_encoding(string)
@@ -105,3 +107,81 @@ def get_delimiter(msg_body):
delimiter = '\n' delimiter = '\n'
return delimiter return delimiter
def _drop_keeping_tail(node):
    """Detach ``node`` and its subtree from the tree but keep its tail text.

    ``parent.remove(node)`` would discard the text that follows the node
    (its tail) together with the node; ``drop_tree()`` merges that text
    into the previous sibling or the parent instead.  Nodes without a
    parent are left alone: there is nothing to detach them from.
    """
    if node.getparent() is not None:
        node.drop_tree()


def html_to_text(string):
    """Convert an HTML string to plain text with a dead-simple tree walk.

    For example ``html_to_text("one<br>two<br>three")`` puts the three
    words on separate lines.  Every text chunk is followed by a space,
    block tags start a new line, list items get a " * " bullet, hard-break
    tags end the current line and the href of an element is appended in
    parentheses after its text.

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    s = _prepend_utf8_declaration(string)
    # Newlines of the markup itself are dropped; line breaks in the output
    # come only from the tags handled below.
    s = s.replace("\n", "")
    tree = html.fromstring(s)

    # Style sheets and comments are visited by tree.iter() as well, so they
    # have to go before the walk or their content ends up in the text.
    for style in CSSSelector('style')(tree):
        _drop_keeping_tail(style)
    for comment in tree.xpath('//comment()'):
        _drop_keeping_tail(comment)

    # Every appended part is non-empty, so the last part always ends with
    # the last character of the text assembled so far.
    parts = []
    for el in tree.iter():
        el_text = (el.text or '') + (el.tail or '')
        # NOTE(review): an element whose text + tail is a single character
        # is skipped as well - confirm that this is intended.
        if len(el_text) > 1:
            if el.tag in _BLOCKTAGS:
                parts.append("\n")
            if el.tag == 'li':
                parts.append(" * ")
            parts.append(el_text.strip() + " ")

            # add href to the output
            href = el.attrib.get('href')
            if href:
                parts.append("(%s) " % href)

        if el.tag in _HARDBREAKS and parts and not parts[-1].endswith("\n"):
            parts.append("\n")

    retval = _rm_excessive_newlines("".join(parts))
    return _encode_utf8(retval)
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
return s.lower().find('html; charset=', 0, 4096) != -1
def _prepend_utf8_declaration(s):
    """Return ``s`` with a 'utf-8' encoding declaration in front if needed.

    ``s`` comes back unchanged when ``_contains_charset_spec`` already
    finds a charset spec in its first 4KB.
    """
    if _contains_charset_spec(s):
        return s
    return _UTF8_DECLARATION + s
def _rm_excessive_newlines(s):
    """Collapse excessive newlines and trim surrounding whitespace.

    Every run matched by ``_RE_EXCESSIVE_NEWLINES`` is replaced with two
    newlines (a single blank line); such runs often happen due to tons of
    divs.  The result is stripped on both ends.
    """
    collapsed = _RE_EXCESSIVE_NEWLINES.sub("\n\n", s)
    return collapsed.strip()
def _encode_utf8(s):
"""Encode in 'utf-8' if unicode
"""
return s.encode('utf-8') if isinstance(s, unicode) else s
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
'charset=utf-8">')
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

View File

@@ -5,9 +5,7 @@ from . fixtures import *
import regex as re import regex as re
from talon import quotations from talon import quotations, utils as u
import html2text
RE_WHITESPACE = re.compile("\s") RE_WHITESPACE = re.compile("\s")
@@ -270,26 +268,15 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) f = open(filename)
msg_body = f.read() msg_body = f.read()
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)
plain_reply = u.html_to_text(reply)
h = html2text.HTML2Text() eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
h.body_width = 0 RE_WHITESPACE.sub('', plain_reply))
plain_reply = h.handle(reply)
#remove &nbsp; spaces
plain_reply = plain_reply.replace(u'\xa0', u' ')
if RE_REPLY.match(plain_reply):
eq_(1, 1)
else:
eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
def test_gmail_reply(): def test_gmail_reply():

View File

@@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
detect_encoding.return_value = 'utf-8' detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe")) eq_('utf-8', u.quick_detect_encoding("qwe"))
ok_(detect_encoding.called) ok_(detect_encoding.called)
def test_html_to_text():
    # block tags, a hard break and list items
    markup = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
    eq_("Hello world! \n\n * One! \n * Two \nHaha", u.html_to_text(markup))

    # non-ASCII text survives the conversion
    eq_("привет!", u.html_to_text("<b>привет!</b>"))

    # leading hard breaks do not produce empty lines
    markup = '<body><br/><br/>Hi</body>'
    eq_('Hi', u.html_to_text(markup))

    # style sheets are left out of the text
    markup = """Hi
<style type="text/css">
div, p, li {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>
<style type="text/css">
h1 {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
    eq_('Hi', u.html_to_text(markup))

    # comments are left out of the text
    markup = """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""
    eq_('TEXT 1 \nTEXT 2', u.html_to_text(markup))