From 139edd6104f6824a0b94a7577fd23d6ae7bf2d28 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Wed, 1 Feb 2017 17:16:30 +0000
Subject: [PATCH 01/11] Add new method which marks lines as splitlines when
 they are splitlines but start with email quotation indents ("> ")

---
 talon/quotations.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8e2b2b7..514617f 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -469,12 +469,33 @@ def split_emails(msg):
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)
 
+    markers = _mark_quoted_email_splitlines(markers, lines)
+
     # we don't want splitlines in header blocks
     markers = _correct_splitlines_in_headers(markers, lines)
 
     return markers
 
 
+def _mark_quoted_email_splitlines(markers, lines):
+    """
+    When there are headers indented with '>' characters, we will attempt to identify if the header is a splitline header
+    using a slightly altered SPLITTER_PATTERNS list and mark it as 's'.
+    """
+    # Create a list of markers to easily alter specific characters
+    markerlist = list(markers)
+    for i, line in enumerate(lines):
+        if markerlist[i] != 'm':
+            continue
+        for pattern in SPLITTER_PATTERNS:
+            matcher = re.search(pattern, line)
+            if matcher:
+                markerlist[i] = 's'
+                break
+
+    return "".join(markerlist)
+
+
 def _correct_splitlines_in_headers(markers, lines):
     """Corrects markers by removing splitlines deemed to be inside header blocks"""
     updated_markers = ""

From 567467b8ed6557eb521b7d37efd1a999096698aa Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Wed, 1 Feb 2017 17:29:05 +0000
Subject: [PATCH 02/11] Update comment

---
 talon/quotations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 514617f..155bf4a 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -479,8 +479,8 @@ def split_emails(msg):
 
 def _mark_quoted_email_splitlines(markers, lines):
     """
-    When there are headers indented with '>' characters, we will attempt to identify if the header is a splitline header
-    using a slightly altered SPLITTER_PATTERNS list and mark it as 's'.
+    When there are headers indented with '>' characters, this method will attempt to identify if the header is a
+    splitline header. If it is, then we mark it with 's' instead of leaving it as 'm' and return the new markers.
     """
     # Create a list of markers to easily alter specific characters
     markerlist = list(markers)

From a44713409c0f5cfc95506ef3f1b06f14234d4501 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Wed, 1 Feb 2017 17:40:59 +0000
Subject: [PATCH 03/11] Added additional case for testing new functionality of
 split_emails()

---
 tests/text_quotations_test.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index ff8722a..8cd2023 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -716,7 +716,14 @@ Hello.
 -- Original Message --
 On 24th February 2016 at 09.32am Conal Wrote:
 Hey!
+
+> Date: Mon, 2 Apr 2012 17:44:22 +0400
+> Subject: Test
+> From: bob@xxx.mailgun.org
+> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+>
+> Hi
 """
-    expected_markers = "stttttsttttetestt"
+    expected_markers = "stttttsttttetesttesmtmmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)

From a403ecb5c94c3eef60ea212b07e52a6ff90dc3f2 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Wed, 1 Feb 2017 18:09:35 +0000
Subject: [PATCH 04/11] Adding two level indentation test

---
 tests/text_quotations_test.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 8cd2023..c0c1294 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -723,7 +723,13 @@ Hey!
 > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
 >
 > Hi
+>
+> > From: bob@xxx.mailgun.org
+> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+> > Date: Mon, 2 Apr 2012 17:44:22 +0400
+> > Subject: Test
+> > Hi
 """
-    expected_markers = "stttttsttttetesttesmtmmm"
+    expected_markers = "stttttsttttetesttesmtmmmmsmtmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)

From 984c036b6ee96d3ee6e27535015423ee1087c240 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Wed, 1 Feb 2017 18:28:19 +0000
Subject: [PATCH 05/11] Set the marker back to 'm' rather than 't' if it
 matches the QUOT_PATTERN. Updated test case.

---
 talon/quotations.py           | 5 ++++-
 tests/text_quotations_test.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 155bf4a..2e68259 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -509,7 +509,10 @@ def _correct_splitlines_in_headers(markers, lines):
                 if bool(re.search(RE_HEADER, lines[i])):
                     in_header_block = True
             else:
-                m = 't'
+                if QUOT_PATTERN.match(lines[i]):
+                    m = 'm'
+                else:
+                    m = 't'
 
         # If the line is not a header line, set in_header_block false.
         if not bool(re.search(RE_HEADER, lines[i])):
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index c0c1294..9d7deb0 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -730,6 +730,6 @@ Hey!
 > > Subject: Test
 > > Hi
 """
-    expected_markers = "stttttsttttetesttesmtmmmmsmtmm"
+    expected_markers = "stttttsttttetesttesmmmmmmsmmmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)

From 3edb6578ba9b120fdcc028da2c5aa084fcbd4059 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Fri, 3 Feb 2017 11:49:23 +0000
Subject: [PATCH 06/11] Dividing preprocess method into two methods,
 split_emails() now calls one without email content being altered.

---
 talon/quotations.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 2e68259..e4ab2bb 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -290,9 +290,19 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
 
     Converts msg_body into a unicode.
     """
-    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
-    # so that '>' closing the link couldn't be mistakenly taken for quotation
-    # marker.
+    msg_body = _replace_link_brackets(msg_body)
+
+    msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
+
+    return msg_body
+
+
+def _replace_link_brackets(msg_body):
+    """Normalize links i.e. replace '<', '>' wrapping the link with some symbols
+    so that '>' closing the link couldn't be mistakenly taken for quotation marker.
+
+    Converts msg_body into a unicode
+    """
     if isinstance(msg_body, bytes):
         msg_body = msg_body.decode('utf8')
 
@@ -304,7 +314,13 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
             return "@@%s@@" % link.group(1)
 
     msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
+    return msg_body
 
+
+def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
+    """Splits line in two if splitter pattern preceded by some text on the same
+    line (done only for 'On <date> <person> wrote:' pattern.
+    """
     def splitter_wrapper(splitter):
         """Wraps splitter with new line"""
         if splitter.start() and msg_body[splitter.start() - 1] != '\n':
@@ -463,8 +479,8 @@ def split_emails(msg):
 
     Return the corrected markers
     """
-    delimiter = get_delimiter(msg)
-    msg_body = preprocess(msg, delimiter)
+    msg_body = _replace_link_brackets(msg)
+
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
     markers = mark_message_lines(lines)

From 34c5b526c3505c6e0ec679a0c10e9fbb7cd03e81 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Fri, 3 Feb 2017 12:57:26 +0000
Subject: [PATCH 07/11] Remove the whitespace before the line if the flag is
 set

---
 talon/quotations.py           |  7 ++--
 tests/text_quotations_test.py | 66 +++++++++++++++++++++--------------
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index e4ab2bb..1a8c4e8 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -188,7 +188,7 @@ def extract_from(msg_body, content_type='text/plain'):
     return msg_body
 
 
-def mark_message_lines(lines):
+def mark_message_lines(lines, ignore_initial_spaces=False):
     """Mark message lines with markers to distinguish quotation lines.
 
     Markers:
@@ -204,6 +204,8 @@ def mark_message_lines(lines):
     markers = ['e' for _ in lines]
     i = 0
     while i < len(lines):
+        if ignore_initial_spaces:
+            lines[i] = lines[i].lstrip(' ')
         if not lines[i].strip():
             markers[i] = 'e'  # empty line
         elif QUOT_PATTERN.match(lines[i]):
@@ -480,10 +482,11 @@ def split_emails(msg):
     Return the corrected markers
     """
     msg_body = _replace_link_brackets(msg)
+    ignore_initial_spaces = True
 
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
-    markers = mark_message_lines(lines)
+    markers = mark_message_lines(lines, ignore_initial_spaces)
 
     markers = _mark_quoted_email_splitlines(markers, lines)
 
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 9d7deb0..6a87e9b 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -700,36 +700,48 @@ def test_standard_replies():
 
 def test_split_email():
     msg = """From: Mr. X
-Date: 24 February 2016
-To: Mr. Y
-Subject: Hi
-Attachments: none
-Goodbye.
-From: Mr. Y
-To: Mr. X
-Date: 24 February 2016
-Subject: Hi
-Attachments: none
+    Date: 24 February 2016
+    To: Mr. Y
+    Subject: Hi
+    Attachments: none
+    Goodbye.
+    From: Mr. Y
+    To: Mr. X
+    Date: 24 February 2016
+    Subject: Hi
+    Attachments: none
 
-Hello.
+    Hello.
 
--- Original Message --
-On 24th February 2016 at 09.32am Conal Wrote:
-Hey!
+        On 24th February 2016 at 09.32am, Conal Wrote:
 
-> Date: Mon, 2 Apr 2012 17:44:22 +0400
-> Subject: Test
-> From: bob@xxx.mailgun.org
-> To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
->
-> Hi
->
-> > From: bob@xxx.mailgun.org
-> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
-> > Date: Mon, 2 Apr 2012 17:44:22 +0400
-> > Subject: Test
-> > Hi
+        Hey!
+
+        On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
+        > Mohan,
+        >
+        > We have not yet migrated the systems.
+        >
+        > Dan
+        >
+        > > -----Original Message-----
+        > > Date: Mon, 2 Apr 2012 17:44:22 +0400
+        > > Subject: Test
+        > > From: bob@xxx.mailgun.org
+        > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+        > >
+        > > Hi
+        > >
+        > > > From: bob@xxx.mailgun.org
+        > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
+        > > > Date: Mon, 2 Apr 2012 17:44:22 +0400
+        > > > Subject: Test
+        > > > Hi
+        > > >
+        > >
+        >
+        >
 """
-    expected_markers = "stttttsttttetesttesmmmmmmsmmmm"
+    expected_markers = "stttttsttttetetetesmmmmmmssmmmmmmsmmmmmmmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)

From 29f1d21be7597b91c7280f1eb7d376ea72140bcd Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 6 Feb 2017 15:03:22 +0000
Subject: [PATCH 08/11] fixed expected markers and incorrect condensed header
 not matching regex

---
 tests/text_quotations_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 6a87e9b..622e84f 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -713,7 +713,7 @@ def test_split_email():
 
     Hello.
 
-        On 24th February 2016 at 09.32am, Conal Wrote:
+        On 24th February 2016 at 09.32am, Conal wrote:
 
         Hey!
 
@@ -742,6 +742,6 @@ def test_split_email():
         >
         >
 """
-    expected_markers = "stttttsttttetetetesmmmmmmssmmmmmmsmmmmmmmm"
+    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)

From a1d0a863058ae721b4e5b1bf93336664888582d6 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Tue, 7 Feb 2017 12:47:33 +0000
Subject: [PATCH 09/11] Pass ignore_initial_spaces=True as this has better
 clarity than separate boolean variable

---
 talon/quotations.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 1a8c4e8..4fc8494 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -482,11 +482,10 @@ def split_emails(msg):
     Return the corrected markers
     """
     msg_body = _replace_link_brackets(msg)
-    ignore_initial_spaces = True
 
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
-    markers = mark_message_lines(lines, ignore_initial_spaces)
+    markers = mark_message_lines(lines, ignore_initial_spaces=True)
 
     markers = _mark_quoted_email_splitlines(markers, lines)
 

From 5c71a0ca0776fd0a4aa5cf58606b328541080fab Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Mon, 13 Feb 2017 16:45:26 +0000
Subject: [PATCH 10/11] Split the comment lines so that they are not over 80
 characters

---
 talon/quotations.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 7546d7f..811a21c 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -300,8 +300,10 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
 
 
 def _replace_link_brackets(msg_body):
-    """Normalize links i.e. replace '<', '>' wrapping the link with some symbols
-    so that '>' closing the link couldn't be mistakenly taken for quotation marker.
+    """
+    Normalize links i.e. replace '<', '>' wrapping the link with some symbols
+    so that '>' closing the link couldn't be mistakenly taken for quotation
+    marker.
 
     Converts msg_body into a unicode
     """
@@ -320,7 +322,8 @@ def _replace_link_brackets(msg_body):
 
 
 def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
-    """Splits line in two if splitter pattern preceded by some text on the same
+    """
+    Splits line in two if splitter pattern preceded by some text on the same
     line (done only for 'On <date> <person> wrote:' pattern.
     """
     def splitter_wrapper(splitter):
@@ -473,11 +476,12 @@ def _extract_from_html(msg_body):
 
 def split_emails(msg):
     """
-    Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify
-     split lines, content lines and empty lines.
+    Given a message (which may consist of an email conversation thread with
+    multiple emails), mark the lines to identify split lines, content lines and
+    empty lines.
 
-    Correct the split line markers inside header blocks. Header blocks are identified by the regular expression
-    RE_HEADER.
+    Correct the split line markers inside header blocks. Header blocks are
+    identified by the regular expression RE_HEADER.
 
     Return the corrected markers
     """
@@ -497,8 +501,9 @@ def split_emails(msg):
 
 def _mark_quoted_email_splitlines(markers, lines):
     """
-    When there are headers indented with '>' characters, this method will attempt to identify if the header is a
-    splitline header. If it is, then we mark it with 's' instead of leaving it as 'm' and return the new markers.
+    When there are headers indented with '>' characters, this method will
+    attempt to identify if the header is a splitline header. If it is, then we
+    mark it with 's' instead of leaving it as 'm' and return the new markers.
     """
     # Create a list of markers to easily alter specific characters
     markerlist = list(markers)
@@ -515,13 +520,15 @@ def _mark_quoted_email_splitlines(markers, lines):
 
 
 def _correct_splitlines_in_headers(markers, lines):
-    """Corrects markers by removing splitlines deemed to be inside header blocks"""
+    """
+    Corrects markers by removing splitlines deemed to be inside header blocks.
+    """
     updated_markers = ""
     i = 0
     in_header_block = False
 
     for m in markers:
-        # Only set in_header_block flag true when we hit an 's' and the line is a header.
+        # Only set in_header_block flag when we hit an 's' and line is a header
         if m == 's':
             if not in_header_block:
                 if bool(re.search(RE_HEADER, lines[i])):

From a2eb0f72015dd5d2453592e1dfa6d86560980528 Mon Sep 17 00:00:00 2001
From: smitcona <conal.smith@hpe.com>
Date: Tue, 14 Feb 2017 18:19:45 +0000
Subject: [PATCH 11/11] Creating new method which removes initial spaces and
 marks the message lines. Removing ambiguity introduced to mark_message_lines

---
 talon/quotations.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 811a21c..232c69d 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -188,7 +188,20 @@ def extract_from(msg_body, content_type='text/plain'):
     return msg_body
 
 
-def mark_message_lines(lines, ignore_initial_spaces=False):
+def remove_initial_spaces_and_mark_message_lines(lines):
+    """
+    Removes the initial spaces in each line before marking message lines.
+
+    This ensures headers can be identified if they are indented with spaces.
+    The list is modified in place, so the caller's lines are stripped as well.
+    """
+    # Assign by index so the stripped text lands in the caller's list.
+    for i, line in enumerate(lines):
+        lines[i] = line.lstrip(' ')
+    return mark_message_lines(lines)
+
+
+def mark_message_lines(lines):
     """Mark message lines with markers to distinguish quotation lines.
 
     Markers:
@@ -204,8 +217,6 @@ def mark_message_lines(lines, ignore_initial_spaces=False):
     markers = ['e' for _ in lines]
     i = 0
     while i < len(lines):
-        if ignore_initial_spaces:
-            lines[i] = lines[i].lstrip(' ')
         if not lines[i].strip():
             markers[i] = 'e'  # empty line
         elif QUOT_PATTERN.match(lines[i]):
@@ -489,7 +500,7 @@ def split_emails(msg):
 
     # don't process too long messages
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]
-    markers = mark_message_lines(lines, ignore_initial_spaces=True)
+    markers = remove_initial_spaces_and_mark_message_lines(lines)
 
     markers = _mark_quoted_email_splitlines(markers, lines)