split_emails function added, test added

This commit is contained in:
smitcona
2016-11-21 12:35:36 +00:00
parent 2444ba87c0
commit adfed748ce
2 changed files with 63 additions and 0 deletions

View File

@@ -172,6 +172,8 @@ MAX_HTML_LEN = 2794202
# Matches the leading run of '>' quote markers (plus one optional space).
QUOT_PATTERN = re.compile(r'^>+ ?')
# Matches a line that does not start with '>' and contains at least one
# non-whitespace character after its first character.
NO_QUOT_LINE = re.compile(r'^[^>].*[\S].*')
# Regular expression to identify if a line is a header.
# NOTE: this matches a colon followed by a space anywhere in the line, so it
# is a loose heuristic rather than a strict "Name: value" header check.
RE_HEADER = re.compile(r": ")
def extract_from(msg_body, content_type='text/plain'):
try:
@@ -450,6 +452,47 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Given a message (which may consist of an email conversation thread with
    multiple emails), mark the lines to identify split lines, content lines
    and empty lines.

    Correct the split line markers inside header blocks. Header blocks are
    identified by the regular expression RE_HEADER.

    The message is first run through get_delimiter and preprocess; only the
    first MAX_LINES_COUNT lines of the preprocessed body are marked.

    Return the corrected markers.
    """
    delimiter = get_delimiter(msg)
    msg_body = preprocess(msg, delimiter)

    # don't process too long messages
    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
    markers = mark_message_lines(lines)

    # we don't want splitlines in header blocks
    return correct_splitlines_in_headers(markers, lines)
def correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.

    A split marker ('s') is replaced with 't' when the line directly before
    it matches RE_HEADER. The first marker is never changed because it has no
    preceding line. Every other marker is copied through unchanged.

    markers -- iterable of one-character markers; marker N is checked against
               lines[N - 1], so lines must cover every position that is looked
               up this way.
    lines   -- the message lines the markers refer to.

    Returns the corrected markers joined into a single string.
    """
    corrected = []
    for idx, marker in enumerate(markers):
        # lines[idx - 1] is the line preceding the one this marker describes.
        if marker == 's' and idx > 0 and RE_HEADER.search(lines[idx - 1]):
            marker = 't'
        corrected.append(marker)
    # Join once instead of growing a string with += on every iteration.
    return ''.join(corrected)
def _readable_text_empty(html_tree):
    """Return True when html_tree_to_text yields only whitespace (or nothing) for the tree."""
    readable_text = html_tree_to_text(html_tree)
    stripped = readable_text.strip()
    return not stripped

View File

@@ -696,3 +696,23 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text,
'fn': filename}
def test_split_email():
# Checks that quotations.split_emails returns the expected marker string for
# a thread made of two messages, each introduced by a run of
# "Name: value" header lines.
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
"""
# 's' is the split-line marker; 't' and 'e' are presumably the content-line
# and empty-line markers mentioned in the split_emails docstring - confirm
# against mark_message_lines.
# NOTE(review): 13 markers are expected but only 12 message lines are visible
# in this view; blank lines inside the literal may have been dropped by the
# diff rendering - verify against the real file before editing the fixture.
expected_markers = "stttttsttttet"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)