Merge pull request #119 from conapart3/master

Add new split_emails method for issue #115
This commit is contained in:
Sergey Obukhov
2016-11-30 12:51:32 -08:00
committed by GitHub
2 changed files with 75 additions and 0 deletions

View File

@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?') QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*') NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
# Regular expression used to decide whether a line is a header: it matches any
# line that contains a colon followed by a space (e.g. "Subject: Hi").
RE_HEADER = re.compile(": ")
def extract_from(msg_body, content_type='text/plain'): def extract_from(msg_body, content_type='text/plain'):
try: try:
@@ -450,6 +453,54 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy) return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Mark each line of a message and return the corrected marker string.

    The message may be a whole conversation thread made of several emails.
    Its lines are marked as split lines, content lines or empty lines, and
    split markers that fall inside a header block (as recognised by
    RE_HEADER) are then corrected.

    :param msg: the message text to analyse.
    :return: the corrected markers.
    """
    body = preprocess(msg, get_delimiter(msg))

    # Cap the number of lines so that overly long messages are not processed
    # in full.
    capped_lines = body.splitlines()[:MAX_LINES_COUNT]
    raw_markers = mark_message_lines(capped_lines)

    # Split markers are not wanted inside header blocks.
    return _correct_splitlines_in_headers(raw_markers, capped_lines)
def _correct_splitlines_in_headers(markers, lines):
    """
    Return markers with split markers inside header blocks turned into 't'.

    A header block opens on an 's' marker whose line matches RE_HEADER and
    stays open until a line that does not match RE_HEADER is reached. Any
    further 's' marker seen while the block is open is replaced by 't'.

    :param markers: marker characters, one per entry of ``lines``.
    :param lines: the message lines the markers were computed from.
    :return: the corrected marker string.
    """
    corrected = []
    inside_headers = False

    for position, marker in enumerate(markers):
        # Evaluate the header test once; both decisions below rely on it.
        looks_like_header = RE_HEADER.search(lines[position]) is not None

        if marker == 's':
            if inside_headers:
                # A split marker within an open header block is demoted.
                marker = 't'
            elif looks_like_header:
                # An 's' on a header line opens a header block.
                inside_headers = True

        # Any non-header line closes the current header block.
        if not looks_like_header:
            inside_headers = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree): def _readable_text_empty(html_tree):
return not bool(html_tree_to_text(html_tree).strip()) return not bool(html_tree_to_text(html_tree).strip())

View File

@@ -696,3 +696,27 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \ "'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text, {'reply': reply_text, 'stripped': stripped_text,
'fn': filename} 'fn': filename}
def test_split_email():
# Checks the marker string returned by quotations.split_emails for a thread
# made of two header blocks (From/Date/To/Subject/Attachments), each followed
# by body text, and a trailing "-- Original Message --" section.
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
-- Original Message --
On 24th February 2016 at 09.32am Conal Wrote:
Hey!
"""
# The expected string lists 17 markers: a single 's' for each header block and
# one for the "Original Message" separator, 't' for the other non-empty lines.
# 'e' presumably marks empty lines -- TODO confirm.
# NOTE(review): only 15 message lines are visible above while 17 markers are
# expected; the two 'e' entries suggest blank lines of the message were lost
# in this rendering -- confirm against the original test file.
expected_markers = "stttttsttttetestt"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)