18 Commits

Author SHA1 Message Date
Sergey Obukhov
ab6066eafa Merge pull request #87 from mailgun/sergey/1.2.6
bump up version
2016-04-07 17:54:12 -07:00
Sergey Obukhov
42258cdd36 bump up version 2016-04-07 17:51:48 -07:00
Sergey Obukhov
d3de9e6893 Merge pull request #86 from dougkeen/master
Fix #85 (exception when stripping gmail quotes)
2016-04-07 17:47:38 -07:00
Doug Keen
333beb94af Fix #85 (exception when stripping gmail quotes) 2016-04-04 14:22:50 -07:00
Sergey Obukhov
f3c0942c49 Merge pull request #80 from mailgun/sergey/12
fixes mailgun/talon#12
2016-03-04 13:33:46 -08:00
Sergey Obukhov
02adf53ab9 fixes mailgun/talon#12 2016-03-04 13:14:50 -08:00
Sergey Obukhov
3497b5cab4 Merge pull request #79 from mailgun/sergey/version
bump version
2016-02-29 15:13:51 -08:00
Sergey Obukhov
9c17dca17c bump version 2016-02-29 14:50:52 -08:00
Sergey Obukhov
de342d3177 Merge pull request #78 from defkev/master
Added Zimbra HTML quotation extraction
2016-02-29 14:14:09 -08:00
defkev
743b452daf Added Zimbra HTML quotation extraction 2016-02-21 16:56:52 +01:00
Sergey Obukhov
c762f3c337 Merge pull request #77 from mailgun/sergey/fix-gmail-fwd
fixes mailgun/talon#18
2016-02-19 19:08:37 -08:00
Sergey Obukhov
31803d41bc fixes mailgun/talon#18 2016-02-19 19:07:10 -08:00
Sergey Obukhov
2ecd9779fc bump up version 2016-02-19 18:32:07 -08:00
Sergey Obukhov
5a7047233e Merge pull request #76 from mailgun/sergey/fix-date-splitter
fixes mailgun/talon#19
2016-02-19 18:28:23 -08:00
Sergey Obukhov
999e9c3725 fixes mailgun/talon#19 2016-02-19 17:53:52 -08:00
Sergey Obukhov
f6940fe878 bump up version 2015-12-18 19:15:58 -08:00
Sergey Obukhov
ce65ff8fc8 Merge pull request #71 from clara-labs/ms-2010-issue
First pass at handling issue with ms outlook 2010 with unenclosed quo…
2015-12-18 19:14:13 -08:00
Carlos Correa
f688d074b5 First pass at handling issue with ms outlook 2010 with unenclosed quoted text. 2015-12-10 19:16:13 -08:00
6 changed files with 185 additions and 19 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon',
version='1.2.0',
version='1.2.6',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
def add_checkpoint(html_note, counter):
@@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote:
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -159,21 +160,44 @@ def cut_from_block(html_message):
if block:
block = block[-1]
parent_div = None
while block.getparent() is not None:
if block.tag == 'div':
parent_div = block
break
block = block.getparent()
if parent_div is not None:
maybe_body = parent_div.getparent()
# In cases where removing this enclosing div will remove all
# content, we should assume the quote is not enclosed in a tag.
parent_div_is_all_content = (
maybe_body is not None and maybe_body.tag == 'body' and
len(maybe_body.getchildren()) == 1)
if not parent_div_is_all_content:
block.getparent().remove(block)
return True
else:
block = block.getparent()
else:
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:tail(), 'From:')]|"
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
while(block.getnext() is not None):
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True
else:
return False
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:tail(), 'From:')]|"
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
if RE_FWD.match(block.getparent().text or ''):
return False
while(block.getnext() is not None):
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True
def cut_zimbra_quote(html_message):
zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
if zDivider:
zDivider[0].getparent().remove(zDivider[0])
return True

View File

@@ -137,13 +137,20 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE,
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON,
# 02.04.2012 14:20 пользователь "bob@example.com" <
# bob@xxx.mailgun.org> написал:
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
# 2014-10-17 11:28 GMT+03:00 Bob <
# bob@example.com>:
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote')
]
@@ -345,6 +352,7 @@ def extract_from_html(msg_body):
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or

View File

@@ -0,0 +1,87 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
h3
{mso-style-priority:9;
mso-style-link:"Heading 3 Char";
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:13.5pt;
font-family:"Times New Roman","serif";
font-weight:bold;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
span.Heading3Char
{mso-style-name:"Heading 3 Char";
mso-style-priority:9;
mso-style-link:"Heading 3";
font-family:"Cambria","serif";
color:#4F81BD;
font-weight:bold;}
span.EmailStyle19
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-family:"Calibri","sans-serif";}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
<b>On Behalf Of </b>baz@bar.com<br>
<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
<b>To:</b> john@bar.com<br>
<b>Cc:</b> jane@bar.io<br>
<b>Subject:</b> Conversation<o:p></o:p></span></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<p>Hello! How are you?<o:p></o:p></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
</div>
</body>
</html>

View File

@@ -131,6 +131,17 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_compact():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
'<div>Test</div>' \
'</div>' \
'</div>'
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
@@ -299,6 +310,10 @@ def test_ms_outlook_2007_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")
def test_ms_outlook_2010_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
def test_thunderbird_reply():
extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")
@@ -336,3 +351,10 @@ def test_CRLF():
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))
def test_gmail_forwarded_msg():
msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
</div><br></div>"""
extracted = quotations.extract_from_html(msg_body)
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))

View File

@@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_sent_from_samsung_smb_wrote():
msg_body = """Test reply
Sent from Samsung MobileName <address@example.com> wrote:
>
> Test
>
> Roman"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_wrote_somebody():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
@@ -54,6 +67,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_date_time_email_splitter():
msg_body = """Test reply
2014-10-17 11:28 GMT+03:00 Postmaster <
postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
> First from site
>
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote_allows_space_in_front():
msg_body = """Thanks Thanmai
On Mar 8, 2012 9:59 AM, "Example.com" <