8 Commits

Author SHA1 Message Date
Sergey Obukhov
9dbe6a494b Merge pull request #90 from mailgun/sergey/89
fixes mailgun/talon#89
2016-05-17 16:01:56 -07:00
Sergey Obukhov
44e70939d6 fixes mailgun/talon#89 2016-05-17 15:31:01 -07:00
Sergey Obukhov
ab6066eafa Merge pull request #87 from mailgun/sergey/1.2.6
bump up version
2016-04-07 17:54:12 -07:00
Sergey Obukhov
42258cdd36 bump up version 2016-04-07 17:51:48 -07:00
Sergey Obukhov
d3de9e6893 Merge pull request #86 from dougkeen/master
Fix #85 (exception when stripping gmail quotes)
2016-04-07 17:47:38 -07:00
Doug Keen
333beb94af Fix #85 (exception when stripping gmail quotes) 2016-04-04 14:22:50 -07:00
Sergey Obukhov
f3c0942c49 Merge pull request #80 from mailgun/sergey/12
fixes mailgun/talon#12
2016-03-04 13:33:46 -08:00
Sergey Obukhov
02adf53ab9 fixes mailgun/talon#12 2016-03-04 13:14:50 -08:00
5 changed files with 65 additions and 5 deletions

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon',
version='1.2.4',
version='1.2.7',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
if gmail_quote and not RE_FWD.match(gmail_quote[0].text):
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -175,7 +175,21 @@ def cut_from_block(html_message):
len(maybe_body.getchildren()) == 1)
if not parent_div_is_all_content:
block.getparent().remove(block)
parent = block.getparent()
next_sibling = block.getnext()
# remove all tags after found From block
# (From block and quoted message are in separate divs)
while next_sibling is not None:
parent.remove(block)
block = next_sibling
next_sibling = block.getnext()
# remove the last sibling (or the
# From block if no siblings)
if block is not None:
parent.remove(block)
return True
else:
return False

View File

@@ -148,7 +148,9 @@ SPLITTER_PATTERNS = [
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote')
]

View File

@@ -131,6 +131,17 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_compact():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
'<div>Test</div>' \
'</div>' \
'</div>'
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
@@ -268,6 +279,26 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
def test_from_block_and_quotations_in_separate_divs():
msg_body = '''
Reply
<div>
<hr/>
<div>
<font>
<b>From: bob@example.com</b>
<b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
</font>
</div>
<div>
Quoted message
</div>
</div>
'''
eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def extract_reply_and_check(filename):
f = open(filename)

View File

@@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_sent_from_samsung_smb_wrote():
msg_body = """Test reply
Sent from Samsung MobileName <address@example.com> wrote:
>
> Test
>
> Roman"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_wrote_somebody():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem