Merge pull request #119 from conapart3/master
Addition of new split_emails function for issue #115
This commit is contained in:
@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
|
|||||||
QUOT_PATTERN = re.compile('^>+ ?')
|
QUOT_PATTERN = re.compile('^>+ ?')
|
||||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||||
|
|
||||||
|
# Regular expression to identify whether a line is a header: any line containing ": " (a colon followed by a space) matches.
|
||||||
|
RE_HEADER = re.compile(": ")
|
||||||
|
|
||||||
|
|
||||||
def extract_from(msg_body, content_type='text/plain'):
|
def extract_from(msg_body, content_type='text/plain'):
|
||||||
try:
|
try:
|
||||||
@@ -450,6 +453,54 @@ def _extract_from_html(msg_body):
|
|||||||
return html.tostring(html_tree_copy)
|
return html.tostring(html_tree_copy)
|
||||||
|
|
||||||
|
|
||||||
|
def split_emails(msg):
    """
    Mark every line of a message that may contain a whole conversation thread.

    The message is preprocessed, cut down to at most MAX_LINES_COUNT lines and
    each remaining line is marked as a split line, a content line or an empty
    line.

    Split-line markers that fall inside a header block (consecutive lines
    matching RE_HEADER) are then corrected, so that only the first line of
    such a block remains a split line.

    Returns the corrected markers.
    """
    body = preprocess(msg, get_delimiter(msg))

    # overly long messages are truncated rather than processed in full
    kept_lines = body.splitlines()[:MAX_LINES_COUNT]
    raw_markers = mark_message_lines(kept_lines)

    # split lines are not wanted inside header blocks
    return _correct_splitlines_in_headers(raw_markers, kept_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _correct_splitlines_in_headers(markers, lines):
|
||||||
|
"""Corrects markers by removing splitlines deemed to be inside header blocks"""
|
||||||
|
updated_markers = ""
|
||||||
|
i = 0
|
||||||
|
in_header_block = False
|
||||||
|
|
||||||
|
for m in markers:
|
||||||
|
# Only set in_header_block flag true when we hit an 's' and the line is a header.
|
||||||
|
if m == 's':
|
||||||
|
if not in_header_block:
|
||||||
|
if bool(re.search(RE_HEADER, lines[i])):
|
||||||
|
in_header_block = True
|
||||||
|
else:
|
||||||
|
m = 't'
|
||||||
|
|
||||||
|
# If the line is not a header line, set in_header_block false.
|
||||||
|
if not bool(re.search(RE_HEADER, lines[i])):
|
||||||
|
in_header_block = False
|
||||||
|
|
||||||
|
# Add the marker to the new updated markers string.
|
||||||
|
updated_markers += m
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return updated_markers
|
||||||
|
|
||||||
|
|
||||||
def _readable_text_empty(html_tree):
|
def _readable_text_empty(html_tree):
|
||||||
return not bool(html_tree_to_text(html_tree).strip())
|
return not bool(html_tree_to_text(html_tree).strip())
|
||||||
|
|
||||||
|
|||||||
@@ -696,3 +696,27 @@ def test_standard_replies():
|
|||||||
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
"'%(reply)s' != %(stripped)s for %(fn)s" % \
|
||||||
{'reply': reply_text, 'stripped': stripped_text,
|
{'reply': reply_text, 'stripped': stripped_text,
|
||||||
'fn': filename}
|
'fn': filename}
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_email():
    """split_emails keeps a single split marker per header block.

    The thread below holds two header blocks followed by an
    "-- Original Message --" separator.
    """
    thread = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none

Hello.

-- Original Message --
On 24th February 2016 at 09.32am Conal Wrote:
Hey!
"""
    # One marker per line: 's' on the first header of each block and on the
    # "-- Original Message --" separator, 'e' on blank lines, 't' elsewhere.
    eq_(quotations.split_emails(thread), "stttttsttttetestt")
|
||||||
|
|||||||
Reference in New Issue
Block a user