diff --git a/setup.py b/setup.py index 8f054df..c54dd2d 100755 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ class InstallCommand(install): setup(name='talon', - version='1.2.14', + version='1.2.15', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/quotations.py b/talon/quotations.py index add0e02..2834be0 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -12,7 +12,7 @@ from copy import deepcopy from lxml import html, etree -from talon.utils import get_delimiter, html_to_text +from talon.utils import get_delimiter, html_tree_to_text from talon import html_quotations from six.moves import range import six @@ -407,8 +407,7 @@ def _extract_from_html(msg_body): number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) quotation_checkpoints = [False] * number_of_checkpoints - msg_with_checkpoints = html.tostring(html_tree) - plain_text = html_to_text(msg_with_checkpoints) + plain_text = html_tree_to_text(html_tree) plain_text = preprocess(plain_text, '\n', content_type='text/html') lines = plain_text.splitlines() @@ -431,25 +430,31 @@ def _extract_from_html(msg_body): return_flags = [] process_marked_lines(lines, markers, return_flags) lines_were_deleted, first_deleted, last_deleted = return_flags + + if not lines_were_deleted and not cut_quotations: + return msg_body + if lines_were_deleted: #collect checkpoints from deleted lines for i in range(first_deleted, last_deleted): for checkpoint in line_checkpoints[i]: quotation_checkpoints[checkpoint] = True - else: - if cut_quotations: - return html.tostring(html_tree_copy) - else: - return msg_body - # Remove tags with quotation checkpoints - html_quotations.delete_quotation_tags( - html_tree_copy, 0, quotation_checkpoints - ) + # Remove tags with quotation checkpoints + html_quotations.delete_quotation_tags( + html_tree_copy, 0, quotation_checkpoints + ) + + if _readable_text_empty(html_tree_copy): + return msg_body return html.tostring(html_tree_copy) +def _readable_text_empty(html_tree): + return not bool(html_tree_to_text(html_tree).strip()) + + def is_splitter(line): ''' Returns Matcher object if provided string is a splitter and diff --git a/talon/utils.py b/talon/utils.py index 02a7a92..2da73bf 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -112,25 +112,7 @@ def get_delimiter(msg_body): return delimiter - -def html_to_text(string): - """ - Dead-simple HTML-to-text converter: - >>> html_to_text("one
two
three") - >>> "one\ntwo\nthree" - - NOTES: - 1. the string is expected to contain UTF-8 encoded HTML! - 2. returns utf-8 encoded str (not unicode) - """ - if isinstance(string, six.text_type): - string = string.encode('utf8') - - s = _prepend_utf8_declaration(string) - s = s.replace(b"\n", b"") - - tree = html.fromstring(s) - +def html_tree_to_text(tree): for style in CSSSelector('style')(tree): style.getparent().remove(style) @@ -159,6 +141,26 @@ def html_to_text(string): return _encode_utf8(retval) +def html_to_text(string): + """ + Dead-simple HTML-to-text converter: + >>> html_to_text("one
two
three") + >>> "one\ntwo\nthree" + + NOTES: + 1. the string is expected to contain UTF-8 encoded HTML! + 2. returns utf-8 encoded str (not unicode) + """ + if isinstance(string, six.text_type): + string = string.encode('utf8') + + s = _prepend_utf8_declaration(string) + s = s.replace(b"\n", b"") + + tree = html.fromstring(s) + return html_tree_to_text(tree) + + def _contains_charset_spec(s): """Return True if the first 4KB contain charset spec """ diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 4453033..e0ddcb2 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -393,3 +393,21 @@ def test_too_large_html(): '' eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) + + +def test_readable_html_empty(): + msg_body = """ +
+ Reply +
+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: +
+ +
+ Test +
+ +
""" + + eq_(RE_WHITESPACE.sub('', msg_body), + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))