12 Commits

Author SHA1 Message Date
Sergey Obukhov
3497b5cab4 Merge pull request #79 from mailgun/sergey/version
bump version
2016-02-29 15:13:51 -08:00
Sergey Obukhov
9c17dca17c bump version 2016-02-29 14:50:52 -08:00
Sergey Obukhov
de342d3177 Merge pull request #78 from defkev/master
Added Zimbra HTML quotation extraction
2016-02-29 14:14:09 -08:00
defkev
743b452daf Added Zimbra HTML quotation extraction 2016-02-21 16:56:52 +01:00
Sergey Obukhov
c762f3c337 Merge pull request #77 from mailgun/sergey/fix-gmail-fwd
fixes mailgun/talon#18
2016-02-19 19:08:37 -08:00
Sergey Obukhov
31803d41bc fixes mailgun/talon#18 2016-02-19 19:07:10 -08:00
Sergey Obukhov
2ecd9779fc bump up version 2016-02-19 18:32:07 -08:00
Sergey Obukhov
5a7047233e Merge pull request #76 from mailgun/sergey/fix-date-splitter
fixes mailgun/talon#19
2016-02-19 18:28:23 -08:00
Sergey Obukhov
999e9c3725 fixes mailgun/talon#19 2016-02-19 17:53:52 -08:00
Sergey Obukhov
f6940fe878 bump up version 2015-12-18 19:15:58 -08:00
Sergey Obukhov
ce65ff8fc8 Merge pull request #71 from clara-labs/ms-2010-issue
First pass at handling issue with ms outlook 2010 with unenclosed quo…
2015-12-18 19:14:13 -08:00
Carlos Correa
f688d074b5 First pass at handling issue with ms outlook 2010 with unenclosed quoted text. 2015-12-10 19:16:13 -08:00
6 changed files with 158 additions and 18 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon',
version='1.2.0',
version='1.2.4',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
def add_checkpoint(html_note, counter):
@@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote:
if gmail_quote and not RE_FWD.match(gmail_quote[0].text):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -159,13 +160,26 @@ def cut_from_block(html_message):
if block:
block = block[-1]
parent_div = None
while block.getparent() is not None:
if block.tag == 'div':
parent_div = block
break
block = block.getparent()
if parent_div is not None:
maybe_body = parent_div.getparent()
# In cases where removing this enclosing div will remove all
# content, we should assume the quote is not enclosed in a tag.
parent_div_is_all_content = (
maybe_body is not None and maybe_body.tag == 'body' and
len(maybe_body.getchildren()) == 1)
if not parent_div_is_all_content:
block.getparent().remove(block)
return True
else:
block = block.getparent()
else:
return False
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
@@ -173,7 +187,17 @@ def cut_from_block(html_message):
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
if RE_FWD.match(block.getparent().text or ''):
return False
while(block.getnext() is not None):
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True
def cut_zimbra_quote(html_message):
zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
if zDivider:
zDivider[0].getparent().remove(zDivider[0])
return True

View File

@@ -137,11 +137,16 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE,
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON,
# 02.04.2012 14:20 пользователь "bob@example.com" <
# bob@xxx.mailgun.org> написал:
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
# 2014-10-17 11:28 GMT+03:00 Bob <
# bob@example.com>:
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
]
@@ -345,6 +350,7 @@ def extract_from_html(msg_body):
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or

View File

@@ -0,0 +1,87 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
h3
{mso-style-priority:9;
mso-style-link:"Heading 3 Char";
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:13.5pt;
font-family:"Times New Roman","serif";
font-weight:bold;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
span.Heading3Char
{mso-style-name:"Heading 3 Char";
mso-style-priority:9;
mso-style-link:"Heading 3";
font-family:"Cambria","serif";
color:#4F81BD;
font-weight:bold;}
span.EmailStyle19
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-family:"Calibri","sans-serif";}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
<b>On Behalf Of </b>baz@bar.com<br>
<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
<b>To:</b> john@bar.com<br>
<b>Cc:</b> jane@bar.io<br>
<b>Subject:</b> Conversation<o:p></o:p></span></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<p>Hello! How are you?<o:p></o:p></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
</div>
</body>
</html>

View File

@@ -299,6 +299,10 @@ def test_ms_outlook_2007_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")
def test_ms_outlook_2010_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
def test_thunderbird_reply():
extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")
@@ -336,3 +340,10 @@ def test_CRLF():
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))
def test_gmail_forwarded_msg():
msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
</div><br></div>"""
extracted = quotations.extract_from_html(msg_body)
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))

View File

@@ -54,6 +54,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_date_time_email_splitter():
msg_body = """Test reply
2014-10-17 11:28 GMT+03:00 Postmaster <
postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
> First from site
>
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote_allows_space_in_front():
msg_body = """Thanks Thanmai
On Mar 8, 2012 9:59 AM, "Example.com" <