Merge pull request #119 from conapart3/master

Add new split_emails function for issue #115
2016-11-30 12:51:32 -08:00
parent 2444ba87c0 23cb2a9a53
commit e69a9c7a54
2 changed files with 75 additions and 0 deletions
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
 # Leading quotation markers of a line: one or more '>' plus an optional space.
 QUOT_PATTERN = re.compile(r'^>+ ?')
 # A line that does not start with '>' and contains at least one
 # non-whitespace character somewhere after its first character.
 # Raw string: '\S' must reach the regex engine untouched, and in a plain
 # literal it is an invalid string escape that newer Pythons warn about.
 NO_QUOT_LINE = re.compile(r'^[^>].*[\S].*')
 # Regular expression to identify if a line is a header: any line containing
 # a colon followed by a space (e.g. "From: someone") is treated as one.
 RE_HEADER = re.compile(": ")
 def extract_from(msg_body, content_type='text/plain'):
    try:
@@ -450,6 +453,54 @@ def _extract_from_html(msg_body):
    return html.tostring(html_tree_copy)
 def split_emails(msg):
    """
    Mark every line of a message and return the corrected marker string.

    The message may be a conversation thread made up of several emails.
    Its lines are marked (split lines, content lines and empty lines) and
    split markers that fall inside a header block -- lines matching
    RE_HEADER -- are then corrected.

    Returns the corrected markers, one marker per processed line.
    """
    body = preprocess(msg, get_delimiter(msg))

    # don't process too long messages
    capped_lines = body.splitlines()[:MAX_LINES_COUNT]

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(
        mark_message_lines(capped_lines), capped_lines)
 def _correct_splitlines_in_headers(markers, lines):
    """Correct markers by removing splitlines deemed to be inside header blocks.

    A header block starts at a line marked 's' that matches RE_HEADER and
    lasts until the first line that does not match RE_HEADER. Any further
    's' marker met while inside such a block is rewritten to 't'.

    :param markers: sequence of one-character markers, one per line
    :param lines: the message lines the markers refer to; must hold at
        least as many entries as ``markers``
    :return: the corrected markers joined into a single string
    """
    corrected = []
    in_header_block = False
    for index, marker in enumerate(markers):
        # Search once per line; the result drives both decisions below.
        is_header = RE_HEADER.search(lines[index]) is not None

        if marker == 's':
            if in_header_block:
                # A split line inside a header block is not a real split.
                marker = 't'
            elif is_header:
                # Only an 's' line that is itself a header opens a block.
                in_header_block = True

        # The first non-header line closes the current header block.
        if not is_header:
            in_header_block = False

        corrected.append(marker)
    return "".join(corrected)
 def _readable_text_empty(html_tree):
    """Return True when the text produced by html_tree_to_text is blank."""
    readable_text = html_tree_to_text(html_tree)
    stripped = readable_text.strip()
    return not stripped
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,27 @@ def test_standard_replies():
                "'%(reply)s' != %(stripped)s for %(fn)s" % \
                {'reply': reply_text, 'stripped': stripped_text,
                 'fn': filename}
 def test_split_email():
    # Thread made of two header-led emails followed by an
    # "-- Original Message --" quote. The blank lines around "Hello." are
    # part of the fixture: they are the two 'e' markers in the expected
    # string, which has one marker per line (17 in total).
    msg = """From: Mr. X
 Date: 24 February 2016
 To: Mr. Y
 Subject: Hi
 Attachments: none
 Goodbye.
 From: Mr. Y
 To: Mr. X
 Date: 24 February 2016
 Subject: Hi
 Attachments: none

 Hello.

 -- Original Message --
 On 24th February 2016 at 09.32am Conal Wrote:
 Hey!
 """
    # Only the first header line of each block keeps its 's' marker; the
    # "-- Original Message --" line is the third split point.
    expected_markers = "stttttsttttetestt"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)