From adfed748ce6f62bc320435ab5345a54e26910135 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 21 Nov 2016 12:35:36 +0000
Subject: [PATCH 1/6] split_emails function added, test added

---
 talon/quotations.py           | 43 +++++++++++++++++++++++++++++++++++
 tests/text_quotations_test.py | 20 ++++++++++++++++
 2 files changed, 63 insertions(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8ed3a15..acdc741 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,8 @@ MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 
+# Regular expression to identify if a line is a header.
+RE_HEADER = re.compile(": ")
 
 def extract_from(msg_body, content_type='text/plain'):
     try:
@@ -450,6 +452,47 @@ def _extract_from_html(msg_body):
     return html.tostring(html_tree_copy)
 
 
+def split_emails(msg):
+    """
+    Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify
+     split lines, content lines and empty lines.
+
+    Correct the split line markers inside header blocks. Header blocks are identified by the regular expression
+    RE_HEADER.
+
+    Return the corrected markers
+    """
+    print "Conal's split_email method!"
+    delimiter = get_delimiter(msg)
+    msg_body = preprocess(msg, delimiter)
+    # don't process too long messages
+    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
+    markers = mark_message_lines(lines)
+    print "Conal's split_email method obtained initial markers: " + markers
+    # we don't want splitlines in header blocks
+    markers = correct_splitlines_in_headers(markers, lines)
+
+    print "Conal's split_email method returning corrected markers: " + markers
+    return markers
+
+
+def correct_splitlines_in_headers(markers, lines):
+    """Corrects markers by removing splitlines deemed to be inside header blocks"""
+    updated_markers = ""
+    i = -1
+
+    for m in markers:
+        if m == 's':
+            if i > -1:
+                if bool(re.search(RE_HEADER, lines[i])):
+                    m = 't'
+
+        updated_markers += m
+        i += 1
+
+    return updated_markers
+
+
 def _readable_text_empty(html_tree):
     return not bool(html_tree_to_text(html_tree).strip())
 
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index aed2ce8..5a9f998 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,23 @@ def test_standard_replies():
                 "'%(reply)s' != %(stripped)s for %(fn)s" % \
                 {'reply': reply_text, 'stripped': stripped_text,
                  'fn': filename}
+
+
+def test_split_email():
+    msg = """From: Mr. X
+Date: 24 February 2016
+To: Mr. Y
+Subject: Hi
+Attachments: none
+Goodbye.
+From: Mr. Y
+To: Mr. X
+Date: 24 February 2016
+Subject: Hi
+Attachments: none
+
+Hello.
+"""
+    expected_markers = "stttttsttttet"
+    markers = quotations.split_emails(msg)
+    eq_(markers, expected_markers)

From e5988d447b0c5f4b90fbbdebdf8ea446fb05586e Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 21 Nov 2016 12:48:29 +0000
Subject: [PATCH 2/6] Add blank line before extract_from

---
 talon/quotations.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index acdc741..c3ed666 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -175,6 +175,7 @@ NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 # Regular expression to identify if a line is a header.
 RE_HEADER = re.compile(": ")
 
+
 def extract_from(msg_body, content_type='text/plain'):
     try:
         if content_type == 'text/plain':

From 31489848be3337352beb46562eacda04251cb8f3 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 21 Nov 2016 17:36:06 +0000
Subject: [PATCH 3/6] Remove print lines

---
 talon/quotations.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index c3ed666..c8604d0 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -463,17 +463,14 @@ def split_emails(msg):
 
     Return the corrected markers
     """
-    print "Conal's split_email method!"
     delimiter = get_delimiter(msg)
     msg_body = preprocess(msg, delimiter)
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
-    print "Conal's split_email method obtained initial markers: " + markers
     # we don't want splitlines in header blocks
     markers = correct_splitlines_in_headers(markers, lines)
 
-    print "Conal's split_email method returning corrected markers: " + markers
     return markers
 
 

From 97b72ef767dba51e37275f5bf300d70a33b26026 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Tue, 22 Nov 2016 19:06:34 +0000
Subject: [PATCH 4/6] Adding in_header_block variable for reliability

---
 talon/quotations.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index c8604d0..debf369 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -468,23 +468,37 @@ def split_emails(msg):
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
+
     # we don't want splitlines in header blocks
-    markers = correct_splitlines_in_headers(markers, lines)
+    markers = _correct_splitlines_in_headers(markers, lines)
 
     return markers
 
 
-def correct_splitlines_in_headers(markers, lines):
+def _correct_splitlines_in_headers(markers, lines):
     """Corrects markers by removing splitlines deemed to be inside header blocks"""
     updated_markers = ""
-    i = -1
+    i = 0
+    in_header_block = False
 
     for m in markers:
+        # Only set in_header_block flag true when we hit an 's' and the line is a header.
         if m == 's':
-            if i > -1:
-                if bool(re.search(RE_HEADER, lines[i])):
+            if not in_header_block:
+                if i == 0:
+                    in_header_block = True
+                elif i > 0 and not bool(re.search(RE_HEADER, lines[i-1])):
+                    in_header_block = True
+                else:
                     m = 't'
+            else:
+                m = 't'
 
+        # If the line is not a header line, set in_header_block false.
+        if not m == 's' and not bool(re.search(RE_HEADER, lines[i])):
+            in_header_block = False
+
+        # Add the marker to the new updated markers string.
         updated_markers += m
         i += 1
 

From 5685a4055ab663b6fae2e9759ab741430f210a1f Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Tue, 22 Nov 2016 19:56:57 +0000
Subject: [PATCH 5/6] Simplify header-block detection to check the current line

---
 talon/quotations.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index debf369..8e2b2b7 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -485,17 +485,13 @@ def _correct_splitlines_in_headers(markers, lines):
         # Only set in_header_block flag true when we hit an 's' and the line is a header.
         if m == 's':
             if not in_header_block:
-                if i == 0:
+                if bool(re.search(RE_HEADER, lines[i])):
                     in_header_block = True
-                elif i > 0 and not bool(re.search(RE_HEADER, lines[i-1])):
-                    in_header_block = True
-                else:
-                    m = 't'
             else:
                 m = 't'
 
         # If the line is not a header line, set in_header_block false.
-        if not m == 's' and not bool(re.search(RE_HEADER, lines[i])):
+        if not bool(re.search(RE_HEADER, lines[i])):
             in_header_block = False
 
         # Add the marker to the new updated markers string.

From b5e3397b8806ce29090f1134ed6bc7c3155ffd3e Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Tue, 22 Nov 2016 20:00:31 +0000
Subject: [PATCH 6/6] Updating test to account for --original message-- case

---
 tests/text_quotations_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 5a9f998..ff8722a 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -712,7 +712,11 @@ Subject: Hi
 Attachments: none
 
 Hello.
+
+-- Original Message --
+On 24th February 2016 at 09.32am Conal Wrote:
+Hey!
 """
-    expected_markers = "stttttsttttet"
+    expected_markers = "stttttsttttetestt"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)