This commit is contained in:
Sergey Obukhov
2015-09-21 09:51:26 -07:00
parent 2cb9b5399c
commit ae508fe0e5
3 changed files with 82 additions and 12 deletions

View File

@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
return msg_body
def extract_from_html(msg_body):
def extract_from_html(s):
"""
Extract not quoted message from provided html message body
using tags and plain text algorithm.
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
then deleting necessary tags.
"""
if msg_body.strip() == '':
return msg_body
if s.strip() == '':
return s
# replace CRLF with LF temporarily, otherwise CR will be converted to
# '&#13;' when doing deepcopy on html tree
msg_body, replaced = _CRLF_to_LF(s)
html_tree = html.document_fromstring(
msg_body,
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
plain_text = preprocess(plain_text, '\n', content_type='text/html')
lines = plain_text.splitlines()
# Don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return msg_body
return s
# Collect checkpoints on each line
line_checkpoints = [
@@ -397,9 +398,10 @@ def extract_from_html(msg_body):
quotation_checkpoints[checkpoint] = True
else:
if cut_quotations:
return html.tostring(html_tree_copy)
print 1111111111, replaced
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
else:
return msg_body
return s
# Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags(
@@ -435,3 +437,37 @@ def register_xpath_extensions():
ns.prefix = 'mg'
ns['text_content'] = text_content
ns['tail'] = tail
def _restore_CRLF(s, replaced=True):
"""Restore CRLF if previously CRLF was replaced with LF
>>> _restore_CRLF('a\nb')
'a\r\nb'
>>> _restore_CRLF('a\nb', replaced=False)
'a\nb'
"""
if replaced:
return s.replace('\n', '\r\n')
return s
def _CRLF_to_LF(s):
"""Replace CRLF with LF
>>> s, changed = _CRLF_to_LF('a\r\n'b)
>>> s
'a\nb'
>>> changed
True
>>> s, changed = _CRLF_to_LF('a\n'b)
>>> s
'a\nb'
>>> changed
False
"""
delimiter = get_delimiter(s)
if delimiter == '\r\n':
return s.replace(delimiter, '\n'), True
return s, False

View File

@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>"""
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
eq_("<html><body><p>Reply\n</p></body></html>",
quotations.extract_from_html(msg_body))
def test_quotation_splitter_outside_blockquote():
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
def test_yandex_ru_reply():
    """Run extract_reply_and_check on the yandex.ru HTML reply fixture."""
    fixture_path = "tests/fixtures/html_replies/yandex_ru.html"
    extract_reply_and_check(fixture_path)
def test_CRLF():
    """CRLF line endings survive extraction: CR is not converted to '&#13;'."""
    # A CRLF document is expected to come back exactly as it went in.
    crlf_html = '<html>\r\n</html>'
    eq_(crlf_html, quotations.extract_from_html(crlf_html))

    # A reply followed by a quoted block, switched to CRLF line endings.
    lf_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
    crlf_body = '\r\n'.join(lf_body.split('\n'))
    expected = "<html><body><p>Reply\r\n</p></body></html>"
    eq_(expected, quotations.extract_from_html(crlf_body))

View File

@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
def test_empty_body():
    """extract_from_plain returns an empty string for an empty body."""
    extracted = quotations.extract_from_plain('')
    eq_('', extracted)
def test__CRLF_to_LF():
    """_CRLF_to_LF reports whether it replaced CRLF; a lone CR is kept."""
    cases = [
        # (input, expected (string, changed) result)
        ('\r\n\r', ('\n\r', True)),
        ('\n\r', ('\n\r', False)),
    ]
    for given, expected in cases:
        eq_(expected, quotations._CRLF_to_LF(given))
def test__restore_CRLF():
    """_restore_CRLF turns LF into CRLF unless `replaced` is false."""
    restore = quotations._restore_CRLF
    eq_('\n', restore('\n', replaced=False))
    eq_('\r\n', restore('\n', replaced=True))
    # `replaced` is omitted here, so its default applies
    eq_('\r\n', restore('\n'))