From adfed748ce6f62bc320435ab5345a54e26910135 Mon Sep 17 00:00:00 2001 From: smitcona Date: Mon, 21 Nov 2016 12:35:36 +0000 Subject: [PATCH 1/6] split_emails function added, test added --- talon/quotations.py | 43 +++++++++++++++++++++++++++++++++++ tests/text_quotations_test.py | 20 ++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/talon/quotations.py b/talon/quotations.py index 8ed3a15..acdc741 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -172,6 +172,8 @@ MAX_HTML_LEN = 2794202 QUOT_PATTERN = re.compile('^>+ ?') NO_QUOT_LINE = re.compile('^[^>].*[\S].*') +# Regular expression to identify if a line is a header. +RE_HEADER = re.compile(": ") def extract_from(msg_body, content_type='text/plain'): try: @@ -450,6 +452,47 @@ def _extract_from_html(msg_body): return html.tostring(html_tree_copy) +def split_emails(msg): + """ + Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify + split lines, content lines and empty lines. + + Correct the split line markers inside header blocks. Header blocks are identified by the regular expression + RE_HEADER. + + Return the corrected markers + """ + print "Conal's split_email method!" + delimiter = get_delimiter(msg) + msg_body = preprocess(msg, delimiter) + # don't process too long messages + lines = msg_body.splitlines()[:MAX_LINES_COUNT] + markers = mark_message_lines(lines) + print "Conal's split_email method obtained initial markers: " + markers + # we don't want splitlines in header blocks + markers = correct_splitlines_in_headers(markers, lines) + + print "Conal's split_email method returning corrected markers: " + markers + return markers + + +def correct_splitlines_in_headers(markers, lines): + """Corrects markers by removing splitlines deemed to be inside header blocks""" + updated_markers = "" + i = -1 + + for m in markers: + if m == 's': + if i > -1: + if bool(re.search(RE_HEADER, lines[i])): + m = 't' + + updated_markers += m + i += 1 + + return updated_markers + + def _readable_text_empty(html_tree): return not bool(html_tree_to_text(html_tree).strip()) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index aed2ce8..5a9f998 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -696,3 +696,23 @@ def test_standard_replies(): "'%(reply)s' != %(stripped)s for %(fn)s" % \ {'reply': reply_text, 'stripped': stripped_text, 'fn': filename} + + +def test_split_email(): + msg = """From: Mr. X +Date: 24 February 2016 +To: Mr. Y +Subject: Hi +Attachments: none +Goodbye. +From: Mr. Y +To: Mr. X +Date: 24 February 2016 +Subject: Hi +Attachments: none + +Hello. +""" + expected_markers = "stttttsttttet" + markers = quotations.split_emails(msg) + eq_(markers, expected_markers) From e5988d447b0c5f4b90fbbdebdf8ea446fb05586e Mon Sep 17 00:00:00 2001 From: smitcona Date: Mon, 21 Nov 2016 12:48:29 +0000 Subject: [PATCH 2/6] Add space --- talon/quotations.py | 1 + 1 file changed, 1 insertion(+) diff --git a/talon/quotations.py b/talon/quotations.py index acdc741..c3ed666 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -175,6 +175,7 @@ NO_QUOT_LINE = re.compile('^[^>].*[\S].*') # Regular expression to identify if a line is a header. RE_HEADER = re.compile(": ") + def extract_from(msg_body, content_type='text/plain'): try: if content_type == 'text/plain': From 31489848be3337352beb46562eacda04251cb8f3 Mon Sep 17 00:00:00 2001 From: smitcona Date: Mon, 21 Nov 2016 17:36:06 +0000 Subject: [PATCH 3/6] Remove print lines --- talon/quotations.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index c3ed666..c8604d0 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -463,17 +463,14 @@ def split_emails(msg): Return the corrected markers """ - print "Conal's split_email method!" delimiter = get_delimiter(msg) msg_body = preprocess(msg, delimiter) # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] markers = mark_message_lines(lines) - print "Conal's split_email method obtained initial markers: " + markers # we don't want splitlines in header blocks markers = correct_splitlines_in_headers(markers, lines) - print "Conal's split_email method returning corrected markers: " + markers return markers From 97b72ef767dba51e37275f5bf300d70a33b26026 Mon Sep 17 00:00:00 2001 From: smitcona Date: Tue, 22 Nov 2016 19:06:34 +0000 Subject: [PATCH 4/6] Adding in_header_block variable for reliability --- talon/quotations.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index c8604d0..debf369 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -468,23 +468,37 @@ def split_emails(msg): # don't process too long messages lines = msg_body.splitlines()[:MAX_LINES_COUNT] markers = mark_message_lines(lines) + # we don't want splitlines in header blocks - markers = correct_splitlines_in_headers(markers, lines) + markers = _correct_splitlines_in_headers(markers, lines) return markers -def correct_splitlines_in_headers(markers, lines): +def _correct_splitlines_in_headers(markers, lines): """Corrects markers by removing splitlines deemed to be inside header blocks""" updated_markers = "" - i = -1 + i = 0 + in_header_block = False for m in markers: + # Only set in_header_block flag true when we hit an 's' and the line is a header. if m == 's': - if i > -1: - if bool(re.search(RE_HEADER, lines[i])): + if not in_header_block: + if i == 0: + in_header_block = True + elif i > 0 and not bool(re.search(RE_HEADER, lines[i-1])): + in_header_block = True + else: m = 't' + else: + m = 't' + # If the line is not a header line, set in_header_block false. + if not m == 's' and not bool(re.search(RE_HEADER, lines[i])): + in_header_block = False + + # Add the marker to the new updated markers string. updated_markers += m i += 1 From 5685a4055ab663b6fae2e9759ab741430f210a1f Mon Sep 17 00:00:00 2001 From: smitcona Date: Tue, 22 Nov 2016 19:56:57 +0000 Subject: [PATCH 5/6] Improved algorithm --- talon/quotations.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/talon/quotations.py b/talon/quotations.py index debf369..8e2b2b7 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -485,17 +485,13 @@ def _correct_splitlines_in_headers(markers, lines): # Only set in_header_block flag true when we hit an 's' and the line is a header. if m == 's': if not in_header_block: - if i == 0: + if bool(re.search(RE_HEADER, lines[i])): in_header_block = True - elif i > 0 and not bool(re.search(RE_HEADER, lines[i-1])): - in_header_block = True - else: - m = 't' else: m = 't' # If the line is not a header line, set in_header_block false. - if not m == 's' and not bool(re.search(RE_HEADER, lines[i])): + if not bool(re.search(RE_HEADER, lines[i])): in_header_block = False # Add the marker to the new updated markers string. From b5e3397b8806ce29090f1134ed6bc7c3155ffd3e Mon Sep 17 00:00:00 2001 From: smitcona Date: Tue, 22 Nov 2016 20:00:31 +0000 Subject: [PATCH 6/6] Updating test to account for --original message-- case --- tests/text_quotations_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 5a9f998..ff8722a 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -712,7 +712,11 @@ Subject: Hi Attachments: none Hello. + +-- Original Message -- +On 24th February 2016 at 09.32am Conal Wrote: +Hey! """ - expected_markers = "stttttsttttet" + expected_markers = "stttttsttttetestt" markers = quotations.split_emails(msg) eq_(markers, expected_markers)