From adfed748ce6f62bc320435ab5345a54e26910135 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 21 Nov 2016 12:35:36 +0000
Subject: [PATCH] split_emails function added, test added

---
 talon/quotations.py           | 43 +++++++++++++++++++++++++++++++++++
 tests/text_quotations_test.py | 20 ++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8ed3a15..acdc741 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,8 @@ MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 
+# Loose header test: matches any line that contains ": " (colon followed by a space), e.g. "From: x".
+RE_HEADER = re.compile(": ")
 
 def extract_from(msg_body, content_type='text/plain'):
     try:
@@ -450,6 +452,47 @@ def _extract_from_html(msg_body):
     return html.tostring(html_tree_copy)
 
 
+def split_emails(msg):
+    """
+    Mark the lines of a message, which may be a conversation thread holding
+    several emails, to identify split lines, content lines and empty lines.
+
+    The message is preprocessed using the delimiter from get_delimiter(), cut
+    to at most MAX_LINES_COUNT lines and marked with mark_message_lines().
+    Split line markers inside header blocks (identified by RE_HEADER) are then
+    corrected by correct_splitlines_in_headers().
+
+    Return the corrected marker string.
+    """
+    delimiter = get_delimiter(msg)
+    msg_body = preprocess(msg, delimiter)
+
+    # don't process too long messages
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    markers = mark_message_lines(lines)
+
+    # we don't want splitlines in header blocks
+    markers = correct_splitlines_in_headers(markers, lines)
+    return markers
+
+
+def correct_splitlines_in_headers(markers, lines):
+    """Return markers with splitlines inside header blocks turned into 't'.
+
+    A split marker ('s') at position k > 0 is treated as being inside a header
+    block when the previous line, lines[k - 1], matches RE_HEADER. The first
+    marker and all non-'s' markers are copied through unchanged.
+    """
+    corrected = []
+    for pos, marker in enumerate(markers):
+        # only the line preceding a splitline decides whether it is a header
+        if marker == 's' and pos > 0 and RE_HEADER.search(lines[pos - 1]):
+            marker = 't'
+        corrected.append(marker)
+
+    return ''.join(corrected)
+
+
 def _readable_text_empty(html_tree):
     return not bool(html_tree_to_text(html_tree).strip())
 
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index aed2ce8..5a9f998 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,23 @@ def test_standard_replies():
                 "'%(reply)s' != %(stripped)s for %(fn)s" % \
                 {'reply': reply_text, 'stripped': stripped_text,
                  'fn': filename}
+
+
+def test_split_email():
+    # Two emails back to back: only the "From:" line opening each one stays an 's'.
+    msg = """From: Mr. X
+Date: 24 February 2016
+To: Mr. Y
+Subject: Hi
+Attachments: none
+Goodbye.
+From: Mr. Y
+To: Mr. X
+Date: 24 February 2016
+Subject: Hi
+Attachments: none
+
+Hello.
+"""
+    expected = "stttttsttttet"
+    eq_(quotations.split_emails(msg), expected)