From adfed748ce6f62bc320435ab5345a54e26910135 Mon Sep 17 00:00:00 2001
From: smitcona
Date: Mon, 21 Nov 2016 12:35:36 +0000
Subject: [PATCH] split_emails function added, test added

---
 talon/quotations.py           | 41 +++++++++++++++++++++++++++++++++++++++++
 tests/text_quotations_test.py | 20 ++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8ed3a15..acdc741 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,8 @@ MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 
+# Regular expression to identify if a line is a header.
+RE_HEADER = re.compile(": ")
 
 def extract_from(msg_body, content_type='text/plain'):
     try:
@@ -450,6 +452,39 @@ def _extract_from_html(msg_body):
     return html.tostring(html_tree_copy)
 
 
+def split_emails(msg):
+    """
+    Given a message (which may consist of an email conversation thread with
+    multiple emails), mark the lines to identify split lines, content lines
+    and empty lines.
+
+    Correct the split line markers inside header blocks. Header blocks are
+    identified by the regular expression RE_HEADER.
+
+    Return the corrected markers
+    """
+    delimiter = get_delimiter(msg)
+    msg_body = preprocess(msg, delimiter)
+    # don't process too long messages
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    markers = mark_message_lines(lines)
+
+    # we don't want splitlines in header blocks
+    return correct_splitlines_in_headers(markers, lines)
+
+
+def correct_splitlines_in_headers(markers, lines):
+    """
+    Corrects markers by removing splitlines deemed to be inside header blocks.
+
+    A split line ('s') whose preceding line matches RE_HEADER is considered
+    part of that header block and is re-marked as a text line ('t'). The
+    first marker has no preceding line and is always kept as is.
+    """
+    updated_markers = []
+
+    for i, marker in enumerate(markers):
+        if marker == 's' and i > 0 and RE_HEADER.search(lines[i - 1]):
+            marker = 't'
+        updated_markers.append(marker)
+
+    return ''.join(updated_markers)
+
+
 def _readable_text_empty(html_tree):
     return not bool(html_tree_to_text(html_tree).strip())
 
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index aed2ce8..5a9f998 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,23 @@ def test_standard_replies():
                 "'%(reply)s' != %(stripped)s for %(fn)s" % \
                 {'reply': reply_text, 'stripped': stripped_text,
                  'fn': filename}
+
+
+def test_split_email():
+    msg = """From: Mr. X
+Date: 24 February 2016
+To: Mr. Y
+Subject: Hi
+Attachments: none
+Goodbye.
+From: Mr. Y
+To: Mr. X
+Date: 24 February 2016
+Subject: Hi
+Attachments: none
+
+Hello.
+"""
+    expected_markers = "stttttsttttet"
+    markers = quotations.split_emails(msg)
+    eq_(markers, expected_markers)