@@ -315,7 +315,7 @@ def extract_from_plain(msg_body):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
def extract_from_html(msg_body):
|
def extract_from_html(s):
|
||||||
"""
|
"""
|
||||||
Extract not quoted message from provided html message body
|
Extract not quoted message from provided html message body
|
||||||
using tags and plain text algorithm.
|
using tags and plain text algorithm.
|
||||||
@@ -332,8 +332,12 @@ def extract_from_html(msg_body):
|
|||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if msg_body.strip() == '':
|
if s.strip() == '':
|
||||||
return msg_body
|
return s
|
||||||
|
|
||||||
|
# replace CRLF with LF temporarily otherwise CR will be converted to '&#13;'
|
||||||
|
# when doing deepcopy on html tree
|
||||||
|
msg_body, replaced = _CRLF_to_LF(s)
|
||||||
|
|
||||||
html_tree = html.document_fromstring(
|
html_tree = html.document_fromstring(
|
||||||
msg_body,
|
msg_body,
|
||||||
@@ -364,15 +368,12 @@ def extract_from_html(msg_body):
|
|||||||
plain_text = plain_text.replace('*', '')
|
plain_text = plain_text.replace('*', '')
|
||||||
# Unmask saved star symbols
|
# Unmask saved star symbols
|
||||||
plain_text = plain_text.replace('3423oorkg432', '*')
|
plain_text = plain_text.replace('3423oorkg432', '*')
|
||||||
|
plain_text = preprocess(plain_text, '\n', content_type='text/html')
|
||||||
delimiter = get_delimiter(plain_text)
|
|
||||||
|
|
||||||
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
|
|
||||||
lines = plain_text.splitlines()
|
lines = plain_text.splitlines()
|
||||||
|
|
||||||
# Don't process too long messages
|
# Don't process too long messages
|
||||||
if len(lines) > MAX_LINES_COUNT:
|
if len(lines) > MAX_LINES_COUNT:
|
||||||
return msg_body
|
return s
|
||||||
|
|
||||||
# Collect checkpoints on each line
|
# Collect checkpoints on each line
|
||||||
line_checkpoints = [
|
line_checkpoints = [
|
||||||
@@ -397,9 +398,9 @@ def extract_from_html(msg_body):
|
|||||||
quotation_checkpoints[checkpoint] = True
|
quotation_checkpoints[checkpoint] = True
|
||||||
else:
|
else:
|
||||||
if cut_quotations:
|
if cut_quotations:
|
||||||
return html.tostring(html_tree_copy)
|
return _restore_CRLF(html.tostring(html_tree_copy), replaced)
|
||||||
else:
|
else:
|
||||||
return msg_body
|
return s
|
||||||
|
|
||||||
# Remove tags with quotation checkpoints
|
# Remove tags with quotation checkpoints
|
||||||
html_quotations.delete_quotation_tags(
|
html_quotations.delete_quotation_tags(
|
||||||
@@ -435,3 +436,37 @@ def register_xpath_extensions():
|
|||||||
ns.prefix = 'mg'
|
ns.prefix = 'mg'
|
||||||
ns['text_content'] = text_content
|
ns['text_content'] = text_content
|
||||||
ns['tail'] = tail
|
ns['tail'] = tail
|
||||||
|
|
||||||
|
|
||||||
|
def _restore_CRLF(s, replaced=True):
|
||||||
|
"""Restore CRLF if previously CRLF was replaced with LF
|
||||||
|
|
||||||
|
>>> _restore_CRLF('a\nb')
|
||||||
|
'a\r\nb'
|
||||||
|
>>> _restore_CRLF('a\nb', replaced=False)
|
||||||
|
'a\nb'
|
||||||
|
"""
|
||||||
|
if replaced:
|
||||||
|
return s.replace('\n', '\r\n')
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    Returns a ``(text, changed)`` tuple. The delimiter reported by
    ``get_delimiter(s)`` decides the outcome: if it is CRLF, every CRLF in
    ``s`` is replaced with LF and ``changed`` is True; otherwise ``s`` is
    returned untouched and ``changed`` is False.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ def test_quotation_splitter_inside_blockquote():
|
|||||||
|
|
||||||
</blockquote>"""
|
</blockquote>"""
|
||||||
|
|
||||||
eq_("<html><body><p>Reply</p></body></html>",
|
eq_("<html><body><p>Reply\n</p></body></html>",
|
||||||
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
quotations.extract_from_html(msg_body))
|
||||||
|
|
||||||
|
|
||||||
def test_quotation_splitter_outside_blockquote():
|
def test_quotation_splitter_outside_blockquote():
|
||||||
@@ -310,3 +310,25 @@ def test_windows_mail_reply():
|
|||||||
|
|
||||||
def test_yandex_ru_reply():
    """Run the shared reply-extraction check on the yandex.ru fixture."""
    fixture_path = "tests/fixtures/html_replies/yandex_ru.html"
    extract_reply_and_check(fixture_path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_CRLF():
    """CR is not converted to '&#13;'

    Checks that CRLF line breaks in the input come back as CRLF in the
    output: first for a document that is returned as is, then for a reply
    followed by a quoted blockquote that gets cut.
    """
    eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>'))

    msg_body = """Reply
<blockquote>

<div>
On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:
</div>

<div>
Test
</div>

</blockquote>"""
    # Switch the fixture to CRLF line endings before extraction.
    msg_body = msg_body.replace('\n', '\r\n')
    eq_("<html><body><p>Reply\r\n</p></body></html>",
        quotations.extract_from_html(msg_body))
|
||||||
|
|||||||
@@ -29,3 +29,15 @@ def test_crash_inside_extract_from():
|
|||||||
|
|
||||||
def test_empty_body():
    """Plain-text extraction of an empty body yields an empty string."""
    extracted = quotations.extract_from_plain('')
    eq_('', extracted)
|
||||||
|
|
||||||
|
|
||||||
|
def test__CRLF_to_LF():
    """CRLF input is converted and flagged; LF-first input is left alone."""
    cases = (
        # (input, expected (text, changed) result)
        ('\r\n\r', ('\n\r', True)),
        ('\n\r', ('\n\r', False)),
    )
    for raw, expected in cases:
        eq_(expected, quotations._CRLF_to_LF(raw))
|
||||||
|
|
||||||
|
|
||||||
|
def test__restore_CRLF():
    """LF is expanded to CRLF only when ``replaced`` is true (the default)."""
    restore = quotations._restore_CRLF

    eq_('\n', restore('\n', replaced=False))
    eq_('\r\n', restore('\n', replaced=True))
    # default
    eq_('\r\n', restore('\n'))
|
||||||
|
|||||||
Reference in New Issue
Block a user