def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Removes the initial spaces in each line before marking message lines.

    This ensures headers can be identified if they are indented with spaces.

    The stripping happens in place: the list object passed in as `lines` is
    updated, so the caller's list holds the de-indented lines afterwards.
    Only the space character is stripped; any other leading whitespace
    (e.g. a tab) is left untouched.

    Returns whatever `mark_message_lines` returns for the stripped lines.
    """
    # Slice assignment rewrites the contents of the caller's list instead of
    # rebinding the local name, which keeps the in-place behaviour of the
    # index-based loop it replaces.
    lines[:] = [line.lstrip(' ') for line in lines]
    return mark_message_lines(lines)
+ """ def splitter_wrapper(splitter): """Wraps splitter with new line""" if splitter.start() and msg_body[splitter.start() - 1] != '\n': @@ -455,19 +487,22 @@ def _extract_from_html(msg_body): def split_emails(msg): """ - Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify - split lines, content lines and empty lines. + Given a message (which may consist of an email conversation thread with + multiple emails), mark the lines to identify split lines, content lines and + empty lines. - Correct the split line markers inside header blocks. Header blocks are identified by the regular expression - RE_HEADER. + Correct the split line markers inside header blocks. Header blocks are + identified by the regular expression RE_HEADER. Return the corrected markers """ - delimiter = get_delimiter(msg) - msg_body = preprocess(msg, delimiter) + msg_body = _replace_link_brackets(msg) + # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] - markers = mark_message_lines(lines) + markers = remove_initial_spaces_and_mark_message_lines(lines) + + markers = _mark_quoted_email_splitlines(markers, lines) # we don't want splitlines in header blocks markers = _correct_splitlines_in_headers(markers, lines) @@ -475,20 +510,45 @@ def split_emails(msg): return markers +def _mark_quoted_email_splitlines(markers, lines): + """ + When there are headers indented with '>' characters, this method will + attempt to identify if the header is a splitline header. If it is, then we + mark it with 's' instead of leaving it as 'm' and return the new markers. 
+ """ + # Create a list of markers to easily alter specific characters + markerlist = list(markers) + for i, line in enumerate(lines): + if markerlist[i] != 'm': + continue + for pattern in SPLITTER_PATTERNS: + matcher = re.search(pattern, line) + if matcher: + markerlist[i] = 's' + break + + return "".join(markerlist) + + def _correct_splitlines_in_headers(markers, lines): - """Corrects markers by removing splitlines deemed to be inside header blocks""" + """ + Corrects markers by removing splitlines deemed to be inside header blocks. + """ updated_markers = "" i = 0 in_header_block = False for m in markers: - # Only set in_header_block flag true when we hit an 's' and the line is a header. + # Only set in_header_block flag when we hit an 's' and line is a header if m == 's': if not in_header_block: if bool(re.search(RE_HEADER, lines[i])): in_header_block = True else: - m = 't' + if QUOT_PATTERN.match(lines[i]): + m = 'm' + else: + m = 't' # If the line is not a header line, set in_header_block false. if not bool(re.search(RE_HEADER, lines[i])): diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index ff8722a..622e84f 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -700,23 +700,48 @@ def test_standard_replies(): def test_split_email(): msg = """From: Mr. X -Date: 24 February 2016 -To: Mr. Y -Subject: Hi -Attachments: none -Goodbye. -From: Mr. Y -To: Mr. X -Date: 24 February 2016 -Subject: Hi -Attachments: none + Date: 24 February 2016 + To: Mr. Y + Subject: Hi + Attachments: none + Goodbye. + From: Mr. Y + To: Mr. X + Date: 24 February 2016 + Subject: Hi + Attachments: none -Hello. + Hello. --- Original Message -- -On 24th February 2016 at 09.32am Conal Wrote: -Hey! + On 24th February 2016 at 09.32am, Conal wrote: + + Hey! + + On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: + > Mohan, + > + > We have not yet migrated the systems. 
+ > + > Dan + > + > > -----Original Message----- + > > Date: Mon, 2 Apr 2012 17:44:22 +0400 + > > Subject: Test + > > From: bob@xxx.mailgun.org + > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com + > > + > > Hi + > > + > > > From: bob@xxx.mailgun.org + > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com + > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 + > > > Subject: Test + > > > Hi + > > > + > > + > + > """ - expected_markers = "stttttsttttetestt" + expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers)