From 139edd6104f6824a0b94a7577fd23d6ae7bf2d28 Mon Sep 17 00:00:00 2001 From: smitcona Date: Wed, 1 Feb 2017 17:16:30 +0000 Subject: [PATCH 01/11] Add new method which marks as splitlines, lines which are splitlines but start with email quotation indents ("> ") --- talon/quotations.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/talon/quotations.py b/talon/quotations.py index 8e2b2b7..514617f 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -469,12 +469,33 @@ def split_emails(msg): lines = msg_body.splitlines()[:MAX_LINES_COUNT] markers = mark_message_lines(lines) + markers = _mark_quoted_email_splitlines(markers, lines) + # we don't want splitlines in header blocks markers = _correct_splitlines_in_headers(markers, lines) return markers +def _mark_quoted_email_splitlines(markers, lines): + """ + When there are headers indented with '>' characters, we will attempt to identify if the header is a splitline header + using a slightly altered SPLITTER_PATTERNS list and mark it as 's'. + """ + # Create a list of markers to easily alter specific characters + markerlist = list(markers) + for i, line in enumerate(lines): + if markerlist[i] != 'm': + continue + for pattern in SPLITTER_PATTERNS: + matcher = re.search(pattern, line) + if matcher: + markerlist[i] = 's' + break + + return "".join(markerlist) + + def _correct_splitlines_in_headers(markers, lines): """Corrects markers by removing splitlines deemed to be inside header blocks""" updated_markers = "" From 567467b8ed6557eb521b7d37efd1a999096698aa Mon Sep 17 00:00:00 2001 From: smitcona Date: Wed, 1 Feb 2017 17:29:05 +0000 Subject: [PATCH 02/11] Update comment --- talon/quotations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 514617f..155bf4a 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -479,8 +479,8 @@ def split_emails(msg): def _mark_quoted_email_splitlines(markers, lines): """ - When there are headers indented with '>' characters, we will attempt to identify if the header is a splitline header - using a slightly altered SPLITTER_PATTERNS list and mark it as 's'. + When there are headers indented with '>' characters, this method will attempt to identify if the header is a + splitline header. If it is, then we mark it with 's' instead of leaving it as 'm' and return the new markers. """ # Create a list of markers to easily alter specific characters markerlist = list(markers) From a44713409c0f5cfc95506ef3f1b06f14234d4501 Mon Sep 17 00:00:00 2001 From: smitcona Date: Wed, 1 Feb 2017 17:40:59 +0000 Subject: [PATCH 03/11] Added additional case for testing new functionality of split_emails() --- tests/text_quotations_test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index ff8722a..8cd2023 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -716,7 +716,14 @@ Hello. -- Original Message -- On 24th February 2016 at 09.32am Conal Wrote: Hey! + +> Date: Mon, 2 Apr 2012 17:44:22 +0400 +> Subject: Test +> From: bob@xxx.mailgun.org +> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com +> +> Hi """ - expected_markers = "stttttsttttetestt" + expected_markers = "stttttsttttetesttesmtmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) From a403ecb5c94c3eef60ea212b07e52a6ff90dc3f2 Mon Sep 17 00:00:00 2001 From: smitcona Date: Wed, 1 Feb 2017 18:09:35 +0000 Subject: [PATCH 04/11] Adding two level indentation test --- tests/text_quotations_test.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 8cd2023..c0c1294 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -723,7 +723,13 @@ Hey! > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com > > Hi +> +> > From: bob@xxx.mailgun.org +> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com +> > Date: Mon, 2 Apr 2012 17:44:22 +0400 +> > Subject: Test +> > Hi """ - expected_markers = "stttttsttttetesttesmtmmm" + expected_markers = "stttttsttttetesttesmtmmmmsmtmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) From 984c036b6ee96d3ee6e27535015423ee1087c240 Mon Sep 17 00:00:00 2001 From: smitcona Date: Wed, 1 Feb 2017 18:28:19 +0000 Subject: [PATCH 05/11] Set the marker back to 'm' rather than 't' if it matches the QUOT_PATTERN. Updated test case. --- talon/quotations.py | 5 ++++- tests/text_quotations_test.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 155bf4a..2e68259 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -509,7 +509,10 @@ def _correct_splitlines_in_headers(markers, lines): if bool(re.search(RE_HEADER, lines[i])): in_header_block = True else: - m = 't' + if QUOT_PATTERN.match(lines[i]): + m = 'm' + else: + m = 't' # If the line is not a header line, set in_header_block false. if not bool(re.search(RE_HEADER, lines[i])): diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index c0c1294..9d7deb0 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -730,6 +730,6 @@ Hey! > > Subject: Test > > Hi """ - expected_markers = "stttttsttttetesttesmtmmmmsmtmm" + expected_markers = "stttttsttttetesttesmmmmmmsmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) From 3edb6578ba9b120fdcc028da2c5aa084fcbd4059 Mon Sep 17 00:00:00 2001 From: smitcona Date: Fri, 3 Feb 2017 11:49:23 +0000 Subject: [PATCH 06/11] Dividing preprocess method into two methods, split_emails() now calls one without email content being altered. --- talon/quotations.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 2e68259..e4ab2bb 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -290,9 +290,19 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): Converts msg_body into a unicode. """ - # normalize links i.e. replace '<', '>' wrapping the link with some symbols - # so that '>' closing the link couldn't be mistakenly taken for quotation - # marker. + msg_body = _replace_link_brackets(msg_body) + + msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) + + return msg_body + + +def _replace_link_brackets(msg_body): + """Normalize links i.e. replace '<', '>' wrapping the link with some symbols + so that '>' closing the link couldn't be mistakenly taken for quotation marker. + + Converts msg_body into a unicode + """ if isinstance(msg_body, bytes): msg_body = msg_body.decode('utf8') @@ -304,7 +314,13 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): return "@@%s@@" % link.group(1) msg_body = re.sub(RE_LINK, link_wrapper, msg_body) + return msg_body + +def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): + """Splits line in two if splitter pattern preceded by some text on the same + line (done only for 'On wrote:' pattern. + """ def splitter_wrapper(splitter): """Wraps splitter with new line""" if splitter.start() and msg_body[splitter.start() - 1] != '\n': @@ -463,8 +479,8 @@ def split_emails(msg): Return the corrected markers """ - delimiter = get_delimiter(msg) - msg_body = preprocess(msg, delimiter) + msg_body = _replace_link_brackets(msg) + # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] markers = mark_message_lines(lines) From 34c5b526c3505c6e0ec679a0c10e9fbb7cd03e81 Mon Sep 17 00:00:00 2001 From: smitcona Date: Fri, 3 Feb 2017 12:57:26 +0000 Subject: [PATCH 07/11] Remove the whitespace before the line if the flag is set --- talon/quotations.py | 7 ++-- tests/text_quotations_test.py | 66 +++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index e4ab2bb..1a8c4e8 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -188,7 +188,7 @@ def extract_from(msg_body, content_type='text/plain'): return msg_body -def mark_message_lines(lines): +def mark_message_lines(lines, ignore_initial_spaces=False): """Mark message lines with markers to distinguish quotation lines. Markers: @@ -204,6 +204,8 @@ def mark_message_lines(lines): markers = ['e' for _ in lines] i = 0 while i < len(lines): + if ignore_initial_spaces: + lines[i] = lines[i].lstrip(' ') if not lines[i].strip(): markers[i] = 'e' # empty line elif QUOT_PATTERN.match(lines[i]): @@ -480,10 +482,11 @@ def split_emails(msg): Return the corrected markers """ msg_body = _replace_link_brackets(msg) + ignore_initial_spaces = True # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] - markers = mark_message_lines(lines) + markers = mark_message_lines(lines, ignore_initial_spaces) markers = _mark_quoted_email_splitlines(markers, lines) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 9d7deb0..6a87e9b 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -700,36 +700,48 @@ def test_standard_replies(): def test_split_email(): msg = """From: Mr. X -Date: 24 February 2016 -To: Mr. Y -Subject: Hi -Attachments: none -Goodbye. -From: Mr. Y -To: Mr. X -Date: 24 February 2016 -Subject: Hi -Attachments: none + Date: 24 February 2016 + To: Mr. Y + Subject: Hi + Attachments: none + Goodbye. + From: Mr. Y + To: Mr. X + Date: 24 February 2016 + Subject: Hi + Attachments: none -Hello. + Hello. --- Original Message -- -On 24th February 2016 at 09.32am Conal Wrote: -Hey! + On 24th February 2016 at 09.32am, Conal Wrote: -> Date: Mon, 2 Apr 2012 17:44:22 +0400 -> Subject: Test -> From: bob@xxx.mailgun.org -> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com -> -> Hi -> -> > From: bob@xxx.mailgun.org -> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com -> > Date: Mon, 2 Apr 2012 17:44:22 +0400 -> > Subject: Test -> > Hi + Hey! + + On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: + > Mohan, + > + > We have not yet migrated the systems. + > + > Dan + > + > > -----Original Message----- + > > Date: Mon, 2 Apr 2012 17:44:22 +0400 + > > Subject: Test + > > From: bob@xxx.mailgun.org + > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com + > > + > > Hi + > > + > > > From: bob@xxx.mailgun.org + > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com + > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 + > > > Subject: Test + > > > Hi + > > > + > > + > + > """ - expected_markers = "stttttsttttetesttesmmmmmmsmmmm" + expected_markers = "stttttsttttetetetesmmmmmmssmmmmmmsmmmmmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) From 29f1d21be7597b91c7280f1eb7d376ea72140bcd Mon Sep 17 00:00:00 2001 From: smitcona Date: Mon, 6 Feb 2017 15:03:22 +0000 Subject: [PATCH 08/11] fixed expected markers and incorrect condensed header not matching regex --- tests/text_quotations_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 6a87e9b..622e84f 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -713,7 +713,7 @@ def test_split_email(): Hello. - On 24th February 2016 at 09.32am, Conal Wrote: + On 24th February 2016 at 09.32am, Conal wrote: Hey! @@ -742,6 +742,6 @@ def test_split_email(): > > """ - expected_markers = "stttttsttttetetetesmmmmmmssmmmmmmsmmmmmmmm" + expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) From a1d0a863058ae721b4e5b1bf93336664888582d6 Mon Sep 17 00:00:00 2001 From: smitcona Date: Tue, 7 Feb 2017 12:47:33 +0000 Subject: [PATCH 09/11] Pass ignore_initial_spaces=True as this has better clarity than separate boolean variable --- talon/quotations.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 1a8c4e8..4fc8494 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -482,11 +482,10 @@ def split_emails(msg): Return the corrected markers """ msg_body = _replace_link_brackets(msg) - ignore_initial_spaces = True # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] - markers = mark_message_lines(lines, ignore_initial_spaces) + markers = mark_message_lines(lines, ignore_initial_spaces=True) markers = _mark_quoted_email_splitlines(markers, lines) From 5c71a0ca0776fd0a4aa5cf58606b328541080fab Mon Sep 17 00:00:00 2001 From: smitcona Date: Mon, 13 Feb 2017 16:45:26 +0000 Subject: [PATCH 10/11] Split the comment lines so that they are not over 80 characters --- talon/quotations.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 7546d7f..811a21c 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -300,8 +300,10 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): def _replace_link_brackets(msg_body): - """Normalize links i.e. replace '<', '>' wrapping the link with some symbols - so that '>' closing the link couldn't be mistakenly taken for quotation marker. + """ + Normalize links i.e. replace '<', '>' wrapping the link with some symbols + so that '>' closing the link couldn't be mistakenly taken for quotation + marker. Converts msg_body into a unicode """ @@ -320,7 +322,8 @@ def _replace_link_brackets(msg_body): def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): - """Splits line in two if splitter pattern preceded by some text on the same + """ + Splits line in two if splitter pattern preceded by some text on the same line (done only for 'On wrote:' pattern. """ def splitter_wrapper(splitter): @@ -473,11 +476,12 @@ def _extract_from_html(msg_body): def split_emails(msg): """ - Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify - split lines, content lines and empty lines. + Given a message (which may consist of an email conversation thread with + multiple emails), mark the lines to identify split lines, content lines and + empty lines. - Correct the split line markers inside header blocks. Header blocks are identified by the regular expression - RE_HEADER. + Correct the split line markers inside header blocks. Header blocks are + identified by the regular expression RE_HEADER. Return the corrected markers """ @@ -497,8 +501,9 @@ def split_emails(msg): def _mark_quoted_email_splitlines(markers, lines): """ - When there are headers indented with '>' characters, this method will attempt to identify if the header is a - splitline header. If it is, then we mark it with 's' instead of leaving it as 'm' and return the new markers. + When there are headers indented with '>' characters, this method will + attempt to identify if the header is a splitline header. If it is, then we + mark it with 's' instead of leaving it as 'm' and return the new markers. """ # Create a list of markers to easily alter specific characters markerlist = list(markers) @@ -515,13 +520,15 @@ def _mark_quoted_email_splitlines(markers, lines): def _correct_splitlines_in_headers(markers, lines): - """Corrects markers by removing splitlines deemed to be inside header blocks""" + """ + Corrects markers by removing splitlines deemed to be inside header blocks. + """ updated_markers = "" i = 0 in_header_block = False for m in markers: - # Only set in_header_block flag true when we hit an 's' and the line is a header. + # Only set in_header_block flag when we hit an 's' and line is a header if m == 's': if not in_header_block: if bool(re.search(RE_HEADER, lines[i])): From a2eb0f72015dd5d2453592e1dfa6d86560980528 Mon Sep 17 00:00:00 2001 From: smitcona Date: Tue, 14 Feb 2017 18:19:45 +0000 Subject: [PATCH 11/11] Creating new method which removes initial spaces and marks the message lines. Removing ambiguity introduced to mark_message_lines --- talon/quotations.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index 811a21c..232c69d 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -188,7 +188,20 @@ def extract_from(msg_body, content_type='text/plain'): return msg_body -def mark_message_lines(lines, ignore_initial_spaces=False): +def remove_initial_spaces_and_mark_message_lines(lines): + """ + Removes the initial spaces in each line before marking message lines. + + This ensures headers can be identified if they are indented with spaces. + """ + i = 0 + while i < len(lines): + lines[i] = lines[i].lstrip(' ') + i += 1 + return mark_message_lines(lines) + + +def mark_message_lines(lines): """Mark message lines with markers to distinguish quotation lines. Markers: @@ -204,8 +217,6 @@ def mark_message_lines(lines, ignore_initial_spaces=False): markers = ['e' for _ in lines] i = 0 while i < len(lines): - if ignore_initial_spaces: - lines[i] = lines[i].lstrip(' ') if not lines[i].strip(): markers[i] = 'e' # empty line elif QUOT_PATTERN.match(lines[i]): @@ -489,7 +500,7 @@ def split_emails(msg): # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] - markers = mark_message_lines(lines, ignore_initial_spaces=True) + markers = remove_initial_spaces_and_mark_message_lines(lines) markers = _mark_quoted_email_splitlines(markers, lines)