diff --git a/talon/quotations.py b/talon/quotations.py
index 9999e6a..6a6f746 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,7 @@ from copy import deepcopy
from lxml import html, etree
-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter, html_tree_to_text
from talon import html_quotations
from six.moves import range
import six
@@ -407,8 +407,7 @@ def _extract_from_html(msg_body):
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints
- msg_with_checkpoints = html.tostring(html_tree)
- plain_text = html_to_text(msg_with_checkpoints)
+ plain_text = html_tree_to_text(html_tree)
plain_text = preprocess(plain_text, '\n', content_type='text/html')
lines = plain_text.splitlines()
@@ -431,25 +430,31 @@ def _extract_from_html(msg_body):
return_flags = []
process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags
+
+ if not lines_were_deleted and not cut_quotations:
+ return msg_body
+
if lines_were_deleted:
#collect checkpoints from deleted lines
for i in range(first_deleted, last_deleted):
for checkpoint in line_checkpoints[i]:
quotation_checkpoints[checkpoint] = True
- else:
- if cut_quotations:
- return html.tostring(html_tree_copy)
- else:
- return msg_body
- # Remove tags with quotation checkpoints
- html_quotations.delete_quotation_tags(
- html_tree_copy, 0, quotation_checkpoints
- )
+ # Remove tags with quotation checkpoints
+ html_quotations.delete_quotation_tags(
+ html_tree_copy, 0, quotation_checkpoints
+ )
+
+ if _readable_text_empty(html_tree_copy):
+ return msg_body
return html.tostring(html_tree_copy)
+def _readable_text_empty(html_tree):
+ return not bool(html_tree_to_text(html_tree).strip())
+
+
def is_splitter(line):
'''
Returns Matcher object if provided string is a splitter and
diff --git a/talon/utils.py b/talon/utils.py
index 02a7a92..2da73bf 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -112,25 +112,7 @@ def get_delimiter(msg_body):
return delimiter
-
-def html_to_text(string):
- """
- Dead-simple HTML-to-text converter:
- >>> html_to_text("one<br>two<br>three")
- >>> "one\ntwo\nthree"
-
- NOTES:
- 1. the string is expected to contain UTF-8 encoded HTML!
- 2. returns utf-8 encoded str (not unicode)
- """
- if isinstance(string, six.text_type):
- string = string.encode('utf8')
-
- s = _prepend_utf8_declaration(string)
- s = s.replace(b"\n", b"")
-
- tree = html.fromstring(s)
-
+def html_tree_to_text(tree):
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
@@ -159,6 +141,26 @@ def html_to_text(string):
return _encode_utf8(retval)
+def html_to_text(string):
+ """
+ Dead-simple HTML-to-text converter:
+ >>> html_to_text("one<br>two<br>three")
+ >>> "one\ntwo\nthree"
+
+ NOTES:
+ 1. the string is expected to contain UTF-8 encoded HTML!
+ 2. returns utf-8 encoded str (not unicode)
+ """
+ if isinstance(string, six.text_type):
+ string = string.encode('utf8')
+
+ s = _prepend_utf8_declaration(string)
+ s = s.replace(b"\n", b"")
+
+ tree = html.fromstring(s)
+ return html_tree_to_text(tree)
+
+
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index 03c66a8..2a0b765 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -392,3 +392,21 @@ def test_too_large_html():
''
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_readable_html_empty():
+ msg_body = """
+
+ Reply +""" + + eq_(RE_WHITESPACE.sub('', msg_body), + RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))+ On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: ++ ++ Test ++ +