From 3d9ae356eaddf2a4e442c5e1e4ef9406d8e2d575 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Fri, 18 Dec 2015 18:56:41 -0800 Subject: [PATCH] add more tests, make standard reply tests more relaxed --- setup.py | 1 - talon/quotations.py | 58 +----------------------- talon/utils.py | 84 ++++++++++++++++++++++++++++++++++- tests/html_quotations_test.py | 21 ++------- tests/utils_test.py | 47 ++++++++++++++++++++ 5 files changed, 134 insertions(+), 77 deletions(-) diff --git a/setup.py b/setup.py index e346f5d..a7d0687 100755 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ setup(name='talon', install_requires=[ "lxml>=2.3.3", "regex>=1", - "html2text", "numpy", "scipy", "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild diff --git a/talon/quotations.py b/talon/quotations.py index 146ec46..a40d247 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -10,11 +10,8 @@ import logging from copy import deepcopy from lxml import html, etree -import lxml -from lxml.cssselect import CSSSelector -import html2text -from talon.utils import get_delimiter +from talon.utils import get_delimiter, html_to_text from talon import html_quotations @@ -426,56 +423,3 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail - - -def html_to_text(string): - """ - Dead-simple HTML-to-text converter: - >>> html_to_text("one
two
three") - >>> u"one\ntwo\nthree" - - NOTES: - 1. the string is expected to contain UTF-8 encoded HTML! - 2. returns utf-8 encoded str (not unicode) - """ - retval = None - try: - # append 'utf-8' encoding declaration to HTML string if the first 4KB of the message does not - # contain the charset spec: - if string.lower().find('html; charset=', 0, 4096) == -1: - string = '''''' + string - tree = lxml.html.fromstring(string.replace("\n", "")) - - for style in CSSSelector('style')(tree): - style.getparent().remove(style) - - for c in tree.xpath('//comment()'): - c.getparent().remove(c) - - blocktags = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] - hardbreaks = ['br', 'hr', 'tr'] - text = "" - for el in tree.iter(): - el_text = (el.text or '') + (el.tail or '') - if len(el_text) > 1: - if el.tag in blocktags: - text += "\n" - if el.tag == 'li': - text += " * " - text += el_text.strip() + " " - - # add href to the output - # href = el.attrib.get('href', None) - # if href: - # text += "(%s) " % href - - if el.tag in hardbreaks and len(text) > 0 and text[-1] != "\n": - text += "\n" - - # remove excessive newlines that often happen due to tons of divs: - retval = re.sub("\n{2,10}", "\n\n", text).strip() - if isinstance(retval, unicode): - retval = retval.encode('utf-8') - except Exception as e: - pass - return retval diff --git a/talon/utils.py b/talon/utils.py index 2092d8e..dc47622 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -4,6 +4,10 @@ import logging from random import shuffle import chardet import cchardet +import regex as re + +from lxml import html +from lxml.cssselect import CSSSelector from talon.constants import RE_DELIMITER @@ -58,7 +62,6 @@ def detect_encoding(string): if detected: return detected.get('encoding') or 'utf-8' except Exception, e: - print 11111111111, e pass return 'utf-8' @@ -74,7 +77,6 @@ def quick_detect_encoding(string): if detected: return detected.get('encoding') or detect_encoding(string) except Exception, e: - print 222222222222, e pass return detect_encoding(string) @@ -105,3 +107,81 @@ def get_delimiter(msg_body): delimiter = '\n' return delimiter + + +def html_to_text(string): + """ + Dead-simple HTML-to-text converter: + >>> html_to_text("one
two
three") + >>> "one\ntwo\nthree" + + NOTES: + 1. the string is expected to contain UTF-8 encoded HTML! + 2. returns utf-8 encoded str (not unicode) + """ + s = _prepend_utf8_declaration(string) + s = s.replace("\n", "") + + tree = html.fromstring(s) + + for style in CSSSelector('style')(tree): + style.getparent().remove(style) + + for c in tree.xpath('//comment()'): + c.getparent().remove(c) + + text = "" + for el in tree.iter(): + el_text = (el.text or '') + (el.tail or '') + if len(el_text) > 1: + if el.tag in _BLOCKTAGS: + text += "\n" + if el.tag == 'li': + text += " * " + text += el_text.strip() + " " + + # add href to the output + href = el.attrib.get('href') + if href: + text += "(%s) " % href + + if el.tag in _HARDBREAKS and text and not text.endswith("\n"): + text += "\n" + + retval = _rm_excessive_newlines(text) + return _encode_utf8(retval) + + +def _contains_charset_spec(s): + """Return True if the first 4KB contain charset spec + """ + return s.lower().find('html; charset=', 0, 4096) != -1 + + +def _prepend_utf8_declaration(s): + """Prepend 'utf-8' encoding declaration if the first 4KB don't have any + """ + return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s + + +def _rm_excessive_newlines(s): + """Remove excessive newlines that often happen due to tons of divs + """ + return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() + + +def _encode_utf8(s): + """Encode in 'utf-8' if unicode + """ + return s.encode('utf-8') if isinstance(s, unicode) else s + + +_UTF8_DECLARATION = ('') + + +_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] +_HARDBREAKS = ['br', 'hr', 'tr'] + + +_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 5a496c8..9bf488f 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -5,9 +5,7 @@ from . fixtures import * import regex as re -from talon import quotations - -import html2text +from talon import quotations, utils as u RE_WHITESPACE = re.compile("\s") @@ -270,26 +268,15 @@ def test_reply_separated_by_hr(): '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) -RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") - - def extract_reply_and_check(filename): f = open(filename) msg_body = f.read() reply = quotations.extract_from_html(msg_body) + plain_reply = u.html_to_text(reply) - h = html2text.HTML2Text() - h.body_width = 0 - plain_reply = h.handle(reply) - - #remove   spaces - plain_reply = plain_reply.replace(u'\xa0', u' ') - - if RE_REPLY.match(plain_reply): - eq_(1, 1) - else: - eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) + eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), + RE_WHITESPACE.sub('', plain_reply)) def test_gmail_reply(): diff --git a/tests/utils_test.py b/tests/utils_test.py index 519efe1..c77f0a6 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): detect_encoding.return_value = 'utf-8' eq_('utf-8', u.quick_detect_encoding("qwe")) ok_(detect_encoding.called) + + +def test_html_to_text(): + html = """ +

Hello world!

+
+ +

+Haha +

+""" + text = u.html_to_text(html) + eq_("Hello world! \n\n * One! \n * Two \nHaha", text) + eq_("привет!", u.html_to_text("привет!")) + + html = '

Hi' + eq_ ('Hi', u.html_to_text(html)) + + html = """Hi + + +""" + eq_ ('Hi', u.html_to_text(html)) + + html = """
+ +TEXT 1 +

TEXT 2

+
""" + eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))