Merge pull request #127 from conalsmith49/mark-splitlines-in-email-quotation-indents

Split_Email(): Mark splitlines for headers indented with spaces or email quotation indents (">")
This commit is contained in:
Sergey Obukhov
2017-02-14 11:03:51 -08:00
committed by GitHub
2 changed files with 113 additions and 28 deletions

View File

@@ -188,6 +188,19 @@ def extract_from(msg_body, content_type='text/plain'):
return msg_body return msg_body
def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Strip leading spaces from every line, then mark the message lines.

    The entries of ``lines`` are overwritten in place, so the caller's list
    holds the stripped text afterwards. Only the space character is removed
    from the left of each line. Stripping first ensures headers can be
    identified even if they are indented with spaces.

    Returns the result of ``mark_message_lines`` for the stripped lines.
    """
    for position, text in enumerate(lines):
        lines[position] = text.lstrip(' ')
    return mark_message_lines(lines)
def mark_message_lines(lines): def mark_message_lines(lines):
"""Mark message lines with markers to distinguish quotation lines. """Mark message lines with markers to distinguish quotation lines.
@@ -290,9 +303,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
Converts msg_body into a unicode. Converts msg_body into a unicode.
""" """
# normalize links i.e. replace '<', '>' wrapping the link with some symbols msg_body = _replace_link_brackets(msg_body)
# so that '>' closing the link couldn't be mistakenly taken for quotation
# marker. msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
return msg_body
def _replace_link_brackets(msg_body):
"""
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
so that '>' closing the link couldn't be mistakenly taken for quotation
marker.
Converts msg_body into a unicode
"""
if isinstance(msg_body, bytes): if isinstance(msg_body, bytes):
msg_body = msg_body.decode('utf8') msg_body = msg_body.decode('utf8')
@@ -304,7 +329,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
return "@@%s@@" % link.group(1) return "@@%s@@" % link.group(1)
msg_body = re.sub(RE_LINK, link_wrapper, msg_body) msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
return msg_body
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
"""
Splits line in two if splitter pattern preceded by some text on the same
line (done only for the 'On <date> <person> wrote:' pattern).
"""
def splitter_wrapper(splitter): def splitter_wrapper(splitter):
"""Wraps splitter with new line""" """Wraps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n': if splitter.start() and msg_body[splitter.start() - 1] != '\n':
@@ -455,19 +487,22 @@ def _extract_from_html(msg_body):
def split_emails(msg): def split_emails(msg):
""" """
Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify Given a message (which may consist of an email conversation thread with
split lines, content lines and empty lines. multiple emails), mark the lines to identify split lines, content lines and
empty lines.
Correct the split line markers inside header blocks. Header blocks are identified by the regular expression Correct the split line markers inside header blocks. Header blocks are
RE_HEADER. identified by the regular expression RE_HEADER.
Return the corrected markers Return the corrected markers
""" """
delimiter = get_delimiter(msg) msg_body = _replace_link_brackets(msg)
msg_body = preprocess(msg, delimiter)
# don't process too long messages # don't process too long messages
lines = msg_body.splitlines()[:MAX_LINES_COUNT] lines = msg_body.splitlines()[:MAX_LINES_COUNT]
markers = mark_message_lines(lines) markers = remove_initial_spaces_and_mark_message_lines(lines)
markers = _mark_quoted_email_splitlines(markers, lines)
# we don't want splitlines in header blocks # we don't want splitlines in header blocks
markers = _correct_splitlines_in_headers(markers, lines) markers = _correct_splitlines_in_headers(markers, lines)
@@ -475,20 +510,45 @@ def split_emails(msg):
return markers return markers
def _mark_quoted_email_splitlines(markers, lines):
    """
    Promote 'm' markers to 's' for lines that match a splitter pattern.

    When headers are indented with '>' characters they are left marked as
    'm'. Each line whose marker is 'm' is searched against every pattern in
    SPLITTER_PATTERNS; if any pattern matches, the line is treated as a
    splitline header and its marker becomes 's'. All other markers are left
    untouched.

    Returns the updated markers joined back into a single string.
    """
    # Work on a list so individual marker characters can be replaced.
    updated = list(markers)
    for index, text in enumerate(lines):
        if updated[index] == 'm' and any(
                re.search(pattern, text) for pattern in SPLITTER_PATTERNS):
            updated[index] = 's'
    return "".join(updated)
def _correct_splitlines_in_headers(markers, lines): def _correct_splitlines_in_headers(markers, lines):
"""Corrects markers by removing splitlines deemed to be inside header blocks""" """
Corrects markers by removing splitlines deemed to be inside header blocks.
"""
updated_markers = "" updated_markers = ""
i = 0 i = 0
in_header_block = False in_header_block = False
for m in markers: for m in markers:
# Only set in_header_block flag true when we hit an 's' and the line is a header. # Only set in_header_block flag when we hit an 's' and line is a header
if m == 's': if m == 's':
if not in_header_block: if not in_header_block:
if bool(re.search(RE_HEADER, lines[i])): if bool(re.search(RE_HEADER, lines[i])):
in_header_block = True in_header_block = True
else: else:
m = 't' if QUOT_PATTERN.match(lines[i]):
m = 'm'
else:
m = 't'
# If the line is not a header line, set in_header_block false. # If the line is not a header line, set in_header_block false.
if not bool(re.search(RE_HEADER, lines[i])): if not bool(re.search(RE_HEADER, lines[i])):

View File

@@ -700,23 +700,48 @@ def test_standard_replies():
def test_split_email(): def test_split_email():
msg = """From: Mr. X msg = """From: Mr. X
Date: 24 February 2016 Date: 24 February 2016
To: Mr. Y To: Mr. Y
Subject: Hi Subject: Hi
Attachments: none Attachments: none
Goodbye. Goodbye.
From: Mr. Y From: Mr. Y
To: Mr. X To: Mr. X
Date: 24 February 2016 Date: 24 February 2016
Subject: Hi Subject: Hi
Attachments: none Attachments: none
Hello. Hello.
-- Original Message -- On 24th February 2016 at 09.32am, Conal wrote:
On 24th February 2016 at 09.32am Conal Wrote:
Hey! Hey!
On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
> Mohan,
>
> We have not yet migrated the systems.
>
> Dan
>
> > -----Original Message-----
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > Subject: Test
> > From: bob@xxx.mailgun.org
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> >
> > Hi
> >
> > > From: bob@xxx.mailgun.org
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > > Subject: Test
> > > Hi
> > >
> >
>
>
""" """
expected_markers = "stttttsttttetestt" expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
markers = quotations.split_emails(msg) markers = quotations.split_emails(msg)
eq_(markers, expected_markers) eq_(markers, expected_markers)