10 Commits

Author SHA1 Message Date
Sergey Obukhov
015c8d2a78 Merge pull request #120 from mailgun/sergey/talon-1.3.3
bump talon version
2016-11-30 18:28:39 -08:00
Sergey Obukhov
5af846c13d bump talon version 2016-11-30 12:56:06 -08:00
Sergey Obukhov
e69a9c7a54 Merge pull request #119 from conapart3/master
Addition of new split_email method for issue:115
2016-11-30 12:51:32 -08:00
conapart3
23cb2a9a53 Merge pull request #1 from conapart3/issue-115-date-split-in-headers
split_emails function added, test added
2016-11-22 20:02:54 +00:00
smitcona
b5e3397b88 Updating test to account for --original message-- case 2016-11-22 20:00:31 +00:00
smitcona
5685a4055a Improved algorithm 2016-11-22 19:56:57 +00:00
smitcona
97b72ef767 Adding in_header_block variable for reliability 2016-11-22 19:06:34 +00:00
smitcona
31489848be Remove print lines 2016-11-21 17:36:06 +00:00
smitcona
e5988d447b Add space 2016-11-21 12:48:29 +00:00
smitcona
adfed748ce split_emails function added, test added 2016-11-21 12:35:36 +00:00
3 changed files with 76 additions and 1 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.3.2',
version='1.3.3',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
# Matches the leading quotation markers ('>' characters and an optional
# following space) at the start of a line.
QUOT_PATTERN = re.compile(r'^>+ ?')
# Matches a line that does not start with '>' and contains at least one
# non-whitespace character. Raw string: '\S' is a regex class, not a
# string escape.
NO_QUOT_LINE = re.compile(r'^[^>].*[\S].*')

# Regular expression to identify if a line is a header.
RE_HEADER = re.compile(r": ")
def extract_from(msg_body, content_type='text/plain'):
try:
@@ -450,6 +453,54 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Compute line markers for a message that may contain a whole email thread.

    The message is preprocessed using its detected delimiter, cut down to at
    most MAX_LINES_COUNT lines and marked line by line (split lines, content
    lines and empty lines). Split markers that land inside header blocks are
    then corrected; header lines are recognised with RE_HEADER.

    Returns the corrected markers.
    """
    body = preprocess(msg, get_delimiter(msg))

    # Cap the amount of work done for very long messages.
    capped_lines = body.splitlines()[:MAX_LINES_COUNT]

    # Splitlines inside header blocks are not wanted, so fix them up.
    return _correct_splitlines_in_headers(
        mark_message_lines(capped_lines), capped_lines)
def _correct_splitlines_in_headers(markers, lines):
    """
    Corrects markers by removing splitlines deemed to be inside header blocks.

    A header block opens on a line marked 's' that matches RE_HEADER and
    stays open until the first line that does not match RE_HEADER. Every
    further 's' marker seen while the block is open is rewritten to 't'.

    :param markers: string of marker characters, one per line.
    :param lines: the message lines the markers belong to, in the same order.
        Markers and lines are walked in lockstep, so they are expected to
        have the same length; surplus items on either side are ignored.
    :return: the corrected marker string.
    """
    corrected = []
    in_header_block = False

    for marker, line in zip(markers, lines):
        # Search once per line; the result drives both decisions below.
        is_header = RE_HEADER.search(line) is not None

        if marker == 's':
            if in_header_block:
                # A splitline inside a header block is not a real split.
                marker = 't'
            elif is_header:
                # Only an 's' on a header line opens a header block.
                in_header_block = True

        # The first non-header line closes the block.
        if not is_header:
            in_header_block = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree):
    """Return True when the text extracted from the tree strips to nothing."""
    readable_text = html_tree_to_text(html_tree)
    return not readable_text.strip()

View File

@@ -696,3 +696,27 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text,
'fn': filename}
def test_split_email():
# Checks the markers returned by quotations.split_emails() for a thread made
# of two header blocks followed by an "-- Original Message --" section.
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
-- Original Message --
On 24th February 2016 at 09.32am Conal Wrote:
Hey!
"""
# NOTE(review): this string holds 17 markers while only 15 message lines are
# visible above; the two 'e' markers presumably correspond to blank lines
# (after the second "Attachments: none" and after "Hello.") that are not
# shown in this view -- confirm against the real fixture.
expected_markers = "stttttsttttetestt"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)