From 41457d8fbd042d6193f943cfd1ae3d5f3bac3245 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Sat, 5 Dec 2015 00:37:02 -0800 Subject: [PATCH 1/2] fixes mailgun/talon#38 mailgun/talon#20 --- setup.py | 2 +- talon/html_quotations.py | 10 ++- talon/quotations.py | 117 +++++++++++++++++----------------- tests/html_quotations_test.py | 33 +++++++--- tests/quotations_test.py | 12 ---- 5 files changed, 93 insertions(+), 81 deletions(-) diff --git a/setup.py b/setup.py index ab570a9..e346f5d 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup(name='talon', - version='1.0.9', + version='1.2.0', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 32bf634..9540db8 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): ''' Cuts the outermost block element with class gmail_quote. ''' - gmail_quote = html_message.cssselect('.gmail_quote') + gmail_quote = html_message.cssselect('div.gmail_quote') if gmail_quote: gmail_quote[0].getparent().remove(gmail_quote[0]) return True @@ -138,8 +138,12 @@ def cut_by_id(html_message): def cut_blockquote(html_message): - ''' Cuts the last non-nested blockquote with wrapping elements. ''' - quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]') + ''' Cuts the last non-nested blockquote with wrapping elements.''' + quote = html_message.xpath( + '(.//blockquote)' + '[not(@class="gmail_quote") and not(ancestor::blockquote)]' + '[last()]') + if quote: quote = quote[0] quote.getparent().remove(quote) diff --git a/talon/quotations.py b/talon/quotations.py index dbb58f4..146ec46 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -10,6 +10,8 @@ import logging from copy import deepcopy from lxml import html, etree +import lxml +from lxml.cssselect import CSSSelector import html2text from talon.utils import get_delimiter @@ -108,7 +110,7 @@ RE_EMPTY_QUOTATION = re.compile( ( # quotation border: splitter line or a number of quotation marker lines (?: - s + (?:se*)+ | (?:me*){2,} ) @@ -139,7 +141,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? . SPLITTER_PATTERNS = [ RE_ORIGINAL_MESSAGE, # - re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), + re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), RE_ON_DATE_SMB_WROTE, RE_ON_DATE_WROTE_SMB, RE_FROM_COLON_OR_DATE_COLON, @@ -321,7 +323,7 @@ def extract_from_plain(msg_body): return msg_body -def extract_from_html(s): +def extract_from_html(msg_body): """ Extract not quoted message from provided html message body using tags and plain text algorithm. @@ -337,49 +339,32 @@ def extract_from_html(s): then checking deleted checkpoints, then deleting necessary tags. """ + if msg_body.strip() == '': + return msg_body - if s.strip() == '': - return s - - # replace CRLF with LF temporaraly otherwise CR will be converted to ' ' - # when doing deepcopy on html tree - msg_body, replaced = _CRLF_to_LF(s) - + msg_body = msg_body.replace('\r\n', '').replace('\n', '') html_tree = html.document_fromstring( msg_body, parser=html.HTMLParser(encoding="utf-8") ) - cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_by_id(html_tree) or html_quotations.cut_from_block(html_tree) ) - html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) quotation_checkpoints = [False] * number_of_checkpoints msg_with_checkpoints = html.tostring(html_tree) - - h = html2text.HTML2Text() - h.body_width = 0 # generate plain text without wrap - - # html2text adds unnecessary star symbols. Remove them. - # Mask star symbols - msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') - plain_text = h.handle(msg_with_checkpoints) - # Remove created star symbols - plain_text = plain_text.replace('*', '') - # Unmask saved star symbols - plain_text = plain_text.replace('3423oorkg432', '*') + plain_text = html_to_text(msg_with_checkpoints) plain_text = preprocess(plain_text, '\n', content_type='text/html') lines = plain_text.splitlines() # Don't process too long messages if len(lines) > MAX_LINES_COUNT: - return s + return msg_body # Collect checkpoints on each line line_checkpoints = [ @@ -396,7 +381,6 @@ def extract_from_html(s): return_flags = [] process_marked_lines(lines, markers, return_flags) lines_were_deleted, first_deleted, last_deleted = return_flags - if lines_were_deleted: #collect checkpoints from deleted lines for i in xrange(first_deleted, last_deleted): @@ -404,9 +388,9 @@ def extract_from_html(s): quotation_checkpoints[checkpoint] = True else: if cut_quotations: - return _restore_CRLF(html.tostring(html_tree_copy), replaced) + return html.tostring(html_tree_copy) else: - return s + return msg_body # Remove tags with quotation checkpoints html_quotations.delete_quotation_tags( @@ -444,35 +428,54 @@ def register_xpath_extensions(): ns['tail'] = tail -def _restore_CRLF(s, replaced=True): - """Restore CRLF if previously CRLF was replaced with LF - - >>> _restore_CRLF('a\nb') - 'a\r\nb' - >>> _restore_CRLF('a\nb', replaced=False) - 'a\nb' +def html_to_text(string): """ - if replaced: - return s.replace('\n', '\r\n') - return s + Dead-simple HTML-to-text converter: + >>> html_to_text("one
two
three") + >>> u"one\ntwo\nthree" - -def _CRLF_to_LF(s): - """Replace CRLF with LF - - >>> s, changed = _CRLF_to_LF('a\r\n'b) - >>> s - 'a\nb' - >>> changed - True - - >>> s, changed = _CRLF_to_LF('a\n'b) - >>> s - 'a\nb' - >>> changed - False + NOTES: + 1. the string is expected to contain UTF-8 encoded HTML! + 2. returns utf-8 encoded str (not unicode) """ - delimiter = get_delimiter(s) - if delimiter == '\r\n': - return s.replace(delimiter, '\n'), True - return s, False + retval = None + try: + # append 'utf-8' encoding declaration to HTML string if the first 4KB of the message does not + # contain the charset spec: + if string.lower().find('html; charset=', 0, 4096) == -1: + string = '''''' + string + tree = lxml.html.fromstring(string.replace("\n", "")) + + for style in CSSSelector('style')(tree): + style.getparent().remove(style) + + for c in tree.xpath('//comment()'): + c.getparent().remove(c) + + blocktags = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] + hardbreaks = ['br', 'hr', 'tr'] + text = "" + for el in tree.iter(): + el_text = (el.text or '') + (el.tail or '') + if len(el_text) > 1: + if el.tag in blocktags: + text += "\n" + if el.tag == 'li': + text += " * " + text += el_text.strip() + " " + + # add href to the output + # href = el.attrib.get('href', None) + # if href: + # text += "(%s) " % href + + if el.tag in hardbreaks and len(text) > 0 and text[-1] != "\n": + text += "\n" + + # remove excessive newlines that often happen due to tons of divs: + retval = re.sub("\n{2,10}", "\n\n", text).strip() + if isinstance(retval, unicode): + retval = retval.encode('utf-8') + except Exception as e: + pass + return retval diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 5c4118e..5a496c8 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote(): """ - eq_("

Reply\n

", - quotations.extract_from_html(msg_body)) + eq_("

Reply

", + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) def test_quotation_splitter_outside_blockquote(): @@ -45,7 +45,7 @@ def test_quotation_splitter_outside_blockquote(): """ - eq_("

Reply

", + eq_("

Reply

", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -63,7 +63,7 @@ def test_regular_blockquote(): """ - eq_("

Reply

Regular
", + eq_("

Reply

Regular
", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) @@ -133,6 +133,18 @@ def test_gmail_quote(): RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) +def test_gmail_quote_blockquote(): + msg_body = """Message +
+
+ My name is William Shakespeare. +
+
+
""" + eq_(RE_WHITESPACE.sub('', msg_body), + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + + def test_unicode_in_reply(): msg_body = u"""Reply \xa0 \xa0 Text
@@ -140,7 +152,7 @@ def test_unicode_in_reply():
-
+
Quote
""".encode("utf-8") @@ -315,7 +327,10 @@ def test_yandex_ru_reply(): def test_CRLF(): """CR is not converted to ' ' """ - eq_('\r\n', quotations.extract_from_html('\r\n')) + symbol = ' ' + extracted = quotations.extract_from_html('\r\n') + assert_false(symbol in extracted) + eq_('', RE_WHITESPACE.sub('', extracted)) msg_body = """Reply
@@ -330,5 +345,7 @@ def test_CRLF():
""" msg_body = msg_body.replace('\n', '\r\n') - eq_("

Reply\r\n

", - quotations.extract_from_html(msg_body)) + extracted = quotations.extract_from_html(msg_body) + assert_false(symbol in extracted) + eq_("

Reply

", + RE_WHITESPACE.sub('', extracted)) diff --git a/tests/quotations_test.py b/tests/quotations_test.py index 0cd18b2..7184368 100644 --- a/tests/quotations_test.py +++ b/tests/quotations_test.py @@ -29,15 +29,3 @@ def test_crash_inside_extract_from(): def test_empty_body(): eq_('', quotations.extract_from_plain('')) - - -def test__CRLF_to_LF(): - eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r')) - eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r')) - - -def test__restore_CRLF(): - eq_('\n', quotations._restore_CRLF('\n', replaced=False)) - eq_('\r\n', quotations._restore_CRLF('\n', replaced=True)) - # default - eq_('\r\n', quotations._restore_CRLF('\n')) From 3d9ae356eaddf2a4e442c5e1e4ef9406d8e2d575 Mon Sep 17 00:00:00 2001 From: Sergey Obukhov Date: Fri, 18 Dec 2015 18:56:41 -0800 Subject: [PATCH 2/2] add more tests, make standard reply tests more relaxed --- setup.py | 1 - talon/quotations.py | 58 +----------------------- talon/utils.py | 84 ++++++++++++++++++++++++++++++++++- tests/html_quotations_test.py | 21 ++------- tests/utils_test.py | 47 ++++++++++++++++++++ 5 files changed, 134 insertions(+), 77 deletions(-) diff --git a/setup.py b/setup.py index e346f5d..a7d0687 100755 --- a/setup.py +++ b/setup.py @@ -16,7 +16,6 @@ setup(name='talon', install_requires=[ "lxml>=2.3.3", "regex>=1", - "html2text", "numpy", "scipy", "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild diff --git a/talon/quotations.py b/talon/quotations.py index 146ec46..a40d247 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -10,11 +10,8 @@ import logging from copy import deepcopy from lxml import html, etree -import lxml -from lxml.cssselect import CSSSelector -import html2text -from talon.utils import get_delimiter +from talon.utils import get_delimiter, html_to_text from talon import html_quotations @@ -426,56 +423,3 @@ def register_xpath_extensions(): ns.prefix = 'mg' ns['text_content'] = text_content ns['tail'] = tail - - -def html_to_text(string): - """ - Dead-simple HTML-to-text converter: - >>> html_to_text("one
two
three") - >>> u"one\ntwo\nthree" - - NOTES: - 1. the string is expected to contain UTF-8 encoded HTML! - 2. returns utf-8 encoded str (not unicode) - """ - retval = None - try: - # append 'utf-8' encoding declaration to HTML string if the first 4KB of the message does not - # contain the charset spec: - if string.lower().find('html; charset=', 0, 4096) == -1: - string = '''''' + string - tree = lxml.html.fromstring(string.replace("\n", "")) - - for style in CSSSelector('style')(tree): - style.getparent().remove(style) - - for c in tree.xpath('//comment()'): - c.getparent().remove(c) - - blocktags = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] - hardbreaks = ['br', 'hr', 'tr'] - text = "" - for el in tree.iter(): - el_text = (el.text or '') + (el.tail or '') - if len(el_text) > 1: - if el.tag in blocktags: - text += "\n" - if el.tag == 'li': - text += " * " - text += el_text.strip() + " " - - # add href to the output - # href = el.attrib.get('href', None) - # if href: - # text += "(%s) " % href - - if el.tag in hardbreaks and len(text) > 0 and text[-1] != "\n": - text += "\n" - - # remove excessive newlines that often happen due to tons of divs: - retval = re.sub("\n{2,10}", "\n\n", text).strip() - if isinstance(retval, unicode): - retval = retval.encode('utf-8') - except Exception as e: - pass - return retval diff --git a/talon/utils.py b/talon/utils.py index 2092d8e..dc47622 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -4,6 +4,10 @@ import logging from random import shuffle import chardet import cchardet +import regex as re + +from lxml import html +from lxml.cssselect import CSSSelector from talon.constants import RE_DELIMITER @@ -58,7 +62,6 @@ def detect_encoding(string): if detected: return detected.get('encoding') or 'utf-8' except Exception, e: - print 11111111111, e pass return 'utf-8' @@ -74,7 +77,6 @@ def quick_detect_encoding(string): if detected: return detected.get('encoding') or detect_encoding(string) except Exception, e: - print 222222222222, e pass return detect_encoding(string) @@ -105,3 +107,81 @@ def get_delimiter(msg_body): delimiter = '\n' return delimiter + + +def html_to_text(string): + """ + Dead-simple HTML-to-text converter: + >>> html_to_text("one
two
three") + >>> "one\ntwo\nthree" + + NOTES: + 1. the string is expected to contain UTF-8 encoded HTML! + 2. returns utf-8 encoded str (not unicode) + """ + s = _prepend_utf8_declaration(string) + s = s.replace("\n", "") + + tree = html.fromstring(s) + + for style in CSSSelector('style')(tree): + style.getparent().remove(style) + + for c in tree.xpath('//comment()'): + c.getparent().remove(c) + + text = "" + for el in tree.iter(): + el_text = (el.text or '') + (el.tail or '') + if len(el_text) > 1: + if el.tag in _BLOCKTAGS: + text += "\n" + if el.tag == 'li': + text += " * " + text += el_text.strip() + " " + + # add href to the output + href = el.attrib.get('href') + if href: + text += "(%s) " % href + + if el.tag in _HARDBREAKS and text and not text.endswith("\n"): + text += "\n" + + retval = _rm_excessive_newlines(text) + return _encode_utf8(retval) + + +def _contains_charset_spec(s): + """Return True if the first 4KB contain charset spec + """ + return s.lower().find('html; charset=', 0, 4096) != -1 + + +def _prepend_utf8_declaration(s): + """Prepend 'utf-8' encoding declaration if the first 4KB don't have any + """ + return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s + + +def _rm_excessive_newlines(s): + """Remove excessive newlines that often happen due to tons of divs + """ + return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() + + +def _encode_utf8(s): + """Encode in 'utf-8' if unicode + """ + return s.encode('utf-8') if isinstance(s, unicode) else s + + +_UTF8_DECLARATION = ('') + + +_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] +_HARDBREAKS = ['br', 'hr', 'tr'] + + +_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 5a496c8..9bf488f 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -5,9 +5,7 @@ from . fixtures import * import regex as re -from talon import quotations - -import html2text +from talon import quotations, utils as u RE_WHITESPACE = re.compile("\s") @@ -270,26 +268,15 @@ def test_reply_separated_by_hr(): '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) -RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") - - def extract_reply_and_check(filename): f = open(filename) msg_body = f.read() reply = quotations.extract_from_html(msg_body) + plain_reply = u.html_to_text(reply) - h = html2text.HTML2Text() - h.body_width = 0 - plain_reply = h.handle(reply) - - #remove   spaces - plain_reply = plain_reply.replace(u'\xa0', u' ') - - if RE_REPLY.match(plain_reply): - eq_(1, 1) - else: - eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) + eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), + RE_WHITESPACE.sub('', plain_reply)) def test_gmail_reply(): diff --git a/tests/utils_test.py b/tests/utils_test.py index 519efe1..c77f0a6 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): detect_encoding.return_value = 'utf-8' eq_('utf-8', u.quick_detect_encoding("qwe")) ok_(detect_encoding.called) + + +def test_html_to_text(): + html = """ +

Hello world!

+
+
    +
  • One!
  • +
  • Two
  • +
+

+Haha +

+""" + text = u.html_to_text(html) + eq_("Hello world! \n\n * One! \n * Two \nHaha", text) + eq_("привет!", u.html_to_text("привет!")) + + html = '

Hi' + eq_ ('Hi', u.html_to_text(html)) + + html = """Hi + + +""" + eq_ ('Hi', u.html_to_text(html)) + + html = """
+ +TEXT 1 +

TEXT 2

+
""" + eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))