Merge pull request #127 from conalsmith49/mark-splitlines-in-email-quotation-indents
Split_Email(): Mark splitlines for headers indented with spaces or email quotation indents (">")
This commit is contained in:
@@ -188,6 +188,19 @@ def extract_from(msg_body, content_type='text/plain'):
|
|||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def remove_initial_spaces_and_mark_message_lines(lines):
|
||||||
|
"""
|
||||||
|
Removes the initial spaces in each line before marking message lines.
|
||||||
|
|
||||||
|
This ensures headers can be identified if they are indented with spaces.
|
||||||
|
"""
|
||||||
|
i = 0
|
||||||
|
while i < len(lines):
|
||||||
|
lines[i] = lines[i].lstrip(' ')
|
||||||
|
i += 1
|
||||||
|
return mark_message_lines(lines)
|
||||||
|
|
||||||
|
|
||||||
def mark_message_lines(lines):
|
def mark_message_lines(lines):
|
||||||
"""Mark message lines with markers to distinguish quotation lines.
|
"""Mark message lines with markers to distinguish quotation lines.
|
||||||
|
|
||||||
@@ -290,9 +303,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
|||||||
|
|
||||||
Converts msg_body into a unicode.
|
Converts msg_body into a unicode.
|
||||||
"""
|
"""
|
||||||
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
msg_body = _replace_link_brackets(msg_body)
|
||||||
# so that '>' closing the link couldn't be mistakenly taken for quotation
|
|
||||||
# marker.
|
msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
|
||||||
|
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def _replace_link_brackets(msg_body):
|
||||||
|
"""
|
||||||
|
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
|
||||||
|
so that '>' closing the link couldn't be mistakenly taken for quotation
|
||||||
|
marker.
|
||||||
|
|
||||||
|
Converts msg_body into a unicode string.
|
||||||
|
"""
|
||||||
if isinstance(msg_body, bytes):
|
if isinstance(msg_body, bytes):
|
||||||
msg_body = msg_body.decode('utf8')
|
msg_body = msg_body.decode('utf8')
|
||||||
|
|
||||||
@@ -304,7 +329,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
|
|||||||
return "@@%s@@" % link.group(1)
|
return "@@%s@@" % link.group(1)
|
||||||
|
|
||||||
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
|
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
|
||||||
|
return msg_body
|
||||||
|
|
||||||
|
|
||||||
|
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
|
||||||
|
"""
|
||||||
|
Splits line in two if splitter pattern preceded by some text on the same
|
||||||
|
line (done only for 'On <date> <person> wrote:' pattern).
|
||||||
|
"""
|
||||||
def splitter_wrapper(splitter):
|
def splitter_wrapper(splitter):
|
||||||
"""Wraps splitter with new line"""
|
"""Wraps splitter with new line"""
|
||||||
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
|
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
|
||||||
@@ -455,19 +487,22 @@ def _extract_from_html(msg_body):
|
|||||||
|
|
||||||
def split_emails(msg):
|
def split_emails(msg):
|
||||||
"""
|
"""
|
||||||
Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify
|
Given a message (which may consist of an email conversation thread with
|
||||||
split lines, content lines and empty lines.
|
multiple emails), mark the lines to identify split lines, content lines and
|
||||||
|
empty lines.
|
||||||
|
|
||||||
Correct the split line markers inside header blocks. Header blocks are identified by the regular expression
|
Correct the split line markers inside header blocks. Header blocks are
|
||||||
RE_HEADER.
|
identified by the regular expression RE_HEADER.
|
||||||
|
|
||||||
Return the corrected markers
|
Return the corrected markers
|
||||||
"""
|
"""
|
||||||
delimiter = get_delimiter(msg)
|
msg_body = _replace_link_brackets(msg)
|
||||||
msg_body = preprocess(msg, delimiter)
|
|
||||||
# don't process too long messages
|
# don't process too long messages
|
||||||
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
lines = msg_body.splitlines()[:MAX_LINES_COUNT]
|
||||||
markers = mark_message_lines(lines)
|
markers = remove_initial_spaces_and_mark_message_lines(lines)
|
||||||
|
|
||||||
|
markers = _mark_quoted_email_splitlines(markers, lines)
|
||||||
|
|
||||||
# we don't want splitlines in header blocks
|
# we don't want splitlines in header blocks
|
||||||
markers = _correct_splitlines_in_headers(markers, lines)
|
markers = _correct_splitlines_in_headers(markers, lines)
|
||||||
@@ -475,20 +510,45 @@ def split_emails(msg):
|
|||||||
return markers
|
return markers
|
||||||
|
|
||||||
|
|
||||||
|
def _mark_quoted_email_splitlines(markers, lines):
|
||||||
|
"""
|
||||||
|
When there are headers indented with '>' characters, this method will
|
||||||
|
attempt to identify if the header is a splitline header. If it is, then we
|
||||||
|
mark it with 's' instead of leaving it as 'm' and return the new markers.
|
||||||
|
"""
|
||||||
|
# Create a list of markers to easily alter specific characters
|
||||||
|
markerlist = list(markers)
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
if markerlist[i] != 'm':
|
||||||
|
continue
|
||||||
|
for pattern in SPLITTER_PATTERNS:
|
||||||
|
matcher = re.search(pattern, line)
|
||||||
|
if matcher:
|
||||||
|
markerlist[i] = 's'
|
||||||
|
break
|
||||||
|
|
||||||
|
return "".join(markerlist)
|
||||||
|
|
||||||
|
|
||||||
def _correct_splitlines_in_headers(markers, lines):
|
def _correct_splitlines_in_headers(markers, lines):
|
||||||
"""Corrects markers by removing splitlines deemed to be inside header blocks"""
|
"""
|
||||||
|
Corrects markers by removing splitlines deemed to be inside header blocks.
|
||||||
|
"""
|
||||||
updated_markers = ""
|
updated_markers = ""
|
||||||
i = 0
|
i = 0
|
||||||
in_header_block = False
|
in_header_block = False
|
||||||
|
|
||||||
for m in markers:
|
for m in markers:
|
||||||
# Only set in_header_block flag true when we hit an 's' and the line is a header.
|
# Only set in_header_block flag when we hit an 's' and line is a header
|
||||||
if m == 's':
|
if m == 's':
|
||||||
if not in_header_block:
|
if not in_header_block:
|
||||||
if bool(re.search(RE_HEADER, lines[i])):
|
if bool(re.search(RE_HEADER, lines[i])):
|
||||||
in_header_block = True
|
in_header_block = True
|
||||||
else:
|
else:
|
||||||
m = 't'
|
if QUOT_PATTERN.match(lines[i]):
|
||||||
|
m = 'm'
|
||||||
|
else:
|
||||||
|
m = 't'
|
||||||
|
|
||||||
# If the line is not a header line, set in_header_block false.
|
# If the line is not a header line, set in_header_block false.
|
||||||
if not bool(re.search(RE_HEADER, lines[i])):
|
if not bool(re.search(RE_HEADER, lines[i])):
|
||||||
|
|||||||
@@ -700,23 +700,48 @@ def test_standard_replies():
|
|||||||
|
|
||||||
def test_split_email():
|
def test_split_email():
|
||||||
msg = """From: Mr. X
|
msg = """From: Mr. X
|
||||||
Date: 24 February 2016
|
Date: 24 February 2016
|
||||||
To: Mr. Y
|
To: Mr. Y
|
||||||
Subject: Hi
|
Subject: Hi
|
||||||
Attachments: none
|
Attachments: none
|
||||||
Goodbye.
|
Goodbye.
|
||||||
From: Mr. Y
|
From: Mr. Y
|
||||||
To: Mr. X
|
To: Mr. X
|
||||||
Date: 24 February 2016
|
Date: 24 February 2016
|
||||||
Subject: Hi
|
Subject: Hi
|
||||||
Attachments: none
|
Attachments: none
|
||||||
|
|
||||||
Hello.
|
Hello.
|
||||||
|
|
||||||
-- Original Message --
|
On 24th February 2016 at 09.32am, Conal wrote:
|
||||||
On 24th February 2016 at 09.32am Conal Wrote:
|
|
||||||
Hey!
|
Hey!
|
||||||
|
|
||||||
|
On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
|
||||||
|
> Mohan,
|
||||||
|
>
|
||||||
|
> We have not yet migrated the systems.
|
||||||
|
>
|
||||||
|
> Dan
|
||||||
|
>
|
||||||
|
> > -----Original Message-----
|
||||||
|
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||||
|
> > Subject: Test
|
||||||
|
> > From: bob@xxx.mailgun.org
|
||||||
|
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||||
|
> >
|
||||||
|
> > Hi
|
||||||
|
> >
|
||||||
|
> > > From: bob@xxx.mailgun.org
|
||||||
|
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
|
||||||
|
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
|
||||||
|
> > > Subject: Test
|
||||||
|
> > > Hi
|
||||||
|
> > >
|
||||||
|
> >
|
||||||
|
>
|
||||||
|
>
|
||||||
"""
|
"""
|
||||||
expected_markers = "stttttsttttetestt"
|
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
|
||||||
markers = quotations.split_emails(msg)
|
markers = quotations.split_emails(msg)
|
||||||
eq_(markers, expected_markers)
|
eq_(markers, expected_markers)
|
||||||
|
|||||||
Reference in New Issue
Block a user