29 Commits

Author SHA1 Message Date
Sergey Obukhov
5bcf7403ad Merge pull request #94 from mailgun/obukhov-sergey-patch-1
Update README.rst
2016-05-31 20:16:13 -07:00
Sergey Obukhov
2d6c092b65 bump version 2016-05-31 18:42:47 -07:00
Sergey Obukhov
6d0689cad6 Update README.rst 2016-05-31 18:39:07 -07:00
Sergey Obukhov
3f80e93ee0 Merge pull request #93 from mailgun/sergey/version-bump
bump
2016-05-31 18:15:28 -07:00
Sergey Obukhov
1b18abab1d bump 2016-05-31 16:53:41 -07:00
Sergey Obukhov
03dd5af5ab Merge pull request #91 from KevinCathcart/patch-1
Support outlook 2007/2010 running in en-us locale
2016-05-31 16:50:35 -07:00
Sergey Obukhov
dfba82b07c Merge pull request #92 from mailgun/obukhov-sergey-kuntzcamera
Update README.rst
2016-05-31 15:42:34 -07:00
Sergey Obukhov
08ca02c87f Update README.rst 2016-05-31 15:14:32 -07:00
Kevin Cathcart
b61f4ec095 Support outlook 2007/2010 running in en-us locale
My American English copy of Outlook 2007 is using inches in the reply separator rather than centimeters. The separator is otherwise identical. What a strange thing to localize. I'm guessing it uses whatever it thinks the preferred units for page margins are.
2016-05-23 17:23:53 -04:00
Sergey Obukhov
9dbe6a494b Merge pull request #90 from mailgun/sergey/89
fixes mailgun/talon#89
2016-05-17 16:01:56 -07:00
Sergey Obukhov
44e70939d6 fixes mailgun/talon#89 2016-05-17 15:31:01 -07:00
Sergey Obukhov
ab6066eafa Merge pull request #87 from mailgun/sergey/1.2.6
bump up version
2016-04-07 17:54:12 -07:00
Sergey Obukhov
42258cdd36 bump up version 2016-04-07 17:51:48 -07:00
Sergey Obukhov
d3de9e6893 Merge pull request #86 from dougkeen/master
Fix #85 (exception when stripping gmail quotes)
2016-04-07 17:47:38 -07:00
Doug Keen
333beb94af Fix #85 (exception when stripping gmail quotes) 2016-04-04 14:22:50 -07:00
Sergey Obukhov
f3c0942c49 Merge pull request #80 from mailgun/sergey/12
fixes mailgun/talon#12
2016-03-04 13:33:46 -08:00
Sergey Obukhov
02adf53ab9 fixes mailgun/talon#12 2016-03-04 13:14:50 -08:00
Sergey Obukhov
3497b5cab4 Merge pull request #79 from mailgun/sergey/version
bump version
2016-02-29 15:13:51 -08:00
Sergey Obukhov
9c17dca17c bump version 2016-02-29 14:50:52 -08:00
Sergey Obukhov
de342d3177 Merge pull request #78 from defkev/master
Added Zimbra HTML quotation extraction
2016-02-29 14:14:09 -08:00
defkev
743b452daf Added Zimbra HTML quotation extraction 2016-02-21 16:56:52 +01:00
Sergey Obukhov
c762f3c337 Merge pull request #77 from mailgun/sergey/fix-gmail-fwd
fixes mailgun/talon#18
2016-02-19 19:08:37 -08:00
Sergey Obukhov
31803d41bc fixes mailgun/talon#18 2016-02-19 19:07:10 -08:00
Sergey Obukhov
2ecd9779fc bump up version 2016-02-19 18:32:07 -08:00
Sergey Obukhov
5a7047233e Merge pull request #76 from mailgun/sergey/fix-date-splitter
fixes mailgun/talon#19
2016-02-19 18:28:23 -08:00
Sergey Obukhov
999e9c3725 fixes mailgun/talon#19 2016-02-19 17:53:52 -08:00
Sergey Obukhov
f6940fe878 bump up version 2015-12-18 19:15:58 -08:00
Sergey Obukhov
ce65ff8fc8 Merge pull request #71 from clara-labs/ms-2010-issue
First pass at handling issue with ms outlook 2010 with unenclosed quo…
2015-12-18 19:14:13 -08:00
Carlos Correa
f688d074b5 First pass at handling issue with ms outlook 2010 with unenclosed quoted text. 2015-12-10 19:16:13 -08:00
7 changed files with 225 additions and 22 deletions

View File

@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
apply to a message (``featurespace.py``), how data sets are built
(``dataset.py``), classifiers interface (``classifier.py``).
The data used for training is taken from our personal email
Currently the data used for training is taken from our personal email
conversations and from `ENRON`_ dataset. As a result of applying our set
of features to the dataset we provide files ``classifier`` and
``train.data`` that don't have any personal information but could be

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon',
version='1.2.0',
version='1.2.9',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
def add_checkpoint(html_note, counter):
@@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote:
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -85,9 +86,12 @@ def cut_gmail_quote(html_message):
def cut_microsoft_quote(html_message):
''' Cuts splitter block and all following blocks. '''
splitter = html_message.xpath(
#outlook 2007, 2010
#outlook 2007, 2010 (international)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#outlook 2007, 2010 (american)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#windows mail
"//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); "
@@ -159,13 +163,40 @@ def cut_from_block(html_message):
if block:
block = block[-1]
parent_div = None
while block.getparent() is not None:
if block.tag == 'div':
block.getparent().remove(block)
parent_div = block
break
block = block.getparent()
if parent_div is not None:
maybe_body = parent_div.getparent()
# In cases where removing this enclosing div will remove all
# content, we should assume the quote is not enclosed in a tag.
parent_div_is_all_content = (
maybe_body is not None and maybe_body.tag == 'body' and
len(maybe_body.getchildren()) == 1)
if not parent_div_is_all_content:
parent = block.getparent()
next_sibling = block.getnext()
# remove all tags after found From block
# (From block and quoted message are in separate divs)
while next_sibling is not None:
parent.remove(block)
block = next_sibling
next_sibling = block.getnext()
# remove the last sibling (or the
# From block if no siblings)
if block is not None:
parent.remove(block)
return True
else:
block = block.getparent()
else:
return False
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
@@ -173,7 +204,17 @@ def cut_from_block(html_message):
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
if RE_FWD.match(block.getparent().text or ''):
return False
while(block.getnext() is not None):
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True
def cut_zimbra_quote(html_message):
    """Cut the Zimbra quotation divider out of ``html_message``.

    Searches the tree for ``<hr>`` elements carrying the attribute
    ``data-marker="__DIVIDER__"`` and detaches the first match from its
    parent element.

    :param html_message: parsed HTML tree exposing ``xpath()``; matched
        elements must expose ``getparent()``.
    :returns: ``True`` if a divider was found and removed, ``False``
        otherwise.
    """
    dividers = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
    if not dividers:
        # Return an explicit False rather than falling through to None so
        # that every exit path yields a bool.
        return False

    divider = dividers[0]
    divider.getparent().remove(divider)
    return True

View File

@@ -137,13 +137,20 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE,
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON,
# 02.04.2012 14:20 пользователь "bob@example.com" <
# bob@xxx.mailgun.org> написал:
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
# 2014-10-17 11:28 GMT+03:00 Bob <
# bob@example.com>:
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote')
]
@@ -345,6 +352,7 @@ def extract_from_html(msg_body):
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or

View File

@@ -0,0 +1,87 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
h3
{mso-style-priority:9;
mso-style-link:"Heading 3 Char";
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:13.5pt;
font-family:"Times New Roman","serif";
font-weight:bold;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
span.Heading3Char
{mso-style-name:"Heading 3 Char";
mso-style-priority:9;
mso-style-link:"Heading 3";
font-family:"Cambria","serif";
color:#4F81BD;
font-weight:bold;}
span.EmailStyle19
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-family:"Calibri","sans-serif";}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
<b>On Behalf Of </b>baz@bar.com<br>
<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
<b>To:</b> john@bar.com<br>
<b>Cc:</b> jane@bar.io<br>
<b>Subject:</b> Conversation<o:p></o:p></span></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<p>Hello! How are you?<o:p></o:p></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
</div>
</body>
</html>

View File

@@ -131,6 +131,17 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_compact():
    # A reply directly followed by nested gmail_quote divs (no whitespace
    # between tags) is expected to reduce to just the reply paragraph.
    msg_body = (
        'Reply'
        '<div class="gmail_quote">'
        '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:'
        '<div>Test</div>'
        '</div>'
        '</div>'
    )
    extracted = quotations.extract_from_html(msg_body)
    eq_("<html><body><p>Reply</p></body></html>",
        RE_WHITESPACE.sub('', extracted))
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
@@ -268,6 +279,26 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
def test_from_block_and_quotations_in_separate_divs():
    # The From:/Date: header and the quoted text sit in sibling divs; the
    # expected output keeps only the reply, the enclosing div and the <hr>.
    expected = '<html><body><p>Reply</p><div><hr></div></body></html>'
    msg_body = '''
Reply
<div>
<hr/>
<div>
<font>
<b>From: bob@example.com</b>
<b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
</font>
</div>
<div>
Quoted message
</div>
</div>
'''
    extracted = quotations.extract_from_html(msg_body)
    eq_(expected, RE_WHITESPACE.sub('', extracted))
def extract_reply_and_check(filename):
f = open(filename)
@@ -299,6 +330,10 @@ def test_ms_outlook_2007_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")
def test_ms_outlook_2010_reply():
    # Run the shared fixture check against the Outlook 2010 reply sample.
    fixture_path = "tests/fixtures/html_replies/ms_outlook_2010.html"
    extract_reply_and_check(fixture_path)
def test_thunderbird_reply():
extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")
@@ -336,3 +371,10 @@ def test_CRLF():
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))
def test_gmail_forwarded_msg():
    # A Gmail "Forwarded message" block is expected to come back unchanged;
    # both sides are compared with whitespace stripped.
    msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
</div><br></div>"""
    extracted = quotations.extract_from_html(msg_body)
    expected_compact = RE_WHITESPACE.sub('', msg_body)
    actual_compact = RE_WHITESPACE.sub('', extracted)
    eq_(expected_compact, actual_compact)

View File

@@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_sent_from_samsung_smb_wrote():
    # Everything from the "Sent from Samsung ... wrote:" line onward is
    # expected to be dropped, leaving only the reply text.
    msg_body = """Test reply
Sent from Samsung MobileName <address@example.com> wrote:
>
> Test
>
> Roman"""
    reply = quotations.extract_from_plain(msg_body)
    eq_("Test reply", reply)
def test_pattern_on_date_wrote_somebody():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
@@ -54,6 +67,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_date_time_email_splitter():
    # A "<date> <time> GMT<offset> <name> <address>:" header that wraps
    # onto a second line is expected to be recognised as the splitter.
    msg_body = """Test reply
2014-10-17 11:28 GMT+03:00 Postmaster <
postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
> First from site
>
"""
    reply = quotations.extract_from_plain(msg_body)
    eq_("Test reply", reply)
def test_pattern_on_date_somebody_wrote_allows_space_in_front():
msg_body = """Thanks Thanmai
On Mar 8, 2012 9:59 AM, "Example.com" <