diff --git a/setup.py b/setup.py index a620806..0e8bf4d 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup(name='talon', - version='1.2.2', + version='1.2.3', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(), diff --git a/talon/html_quotations.py b/talon/html_quotations.py index fcf5a8a..e903b69 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) # HTML quote indicators (tag ids) QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] +RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) def add_checkpoint(html_note, counter): @@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): def cut_gmail_quote(html_message): ''' Cuts the outermost block element with class gmail_quote. ''' gmail_quote = html_message.cssselect('div.gmail_quote') - if gmail_quote: + if gmail_quote and not RE_FWD.match(gmail_quote[0].text or ''): gmail_quote[0].getparent().remove(gmail_quote[0]) return True @@ -172,6 +173,7 @@ def cut_from_block(html_message): parent_div_is_all_content = ( maybe_body is not None and maybe_body.tag == 'body' and len(maybe_body.getchildren()) == 1) + if not parent_div_is_all_content: block.getparent().remove(block) return True @@ -185,6 +187,10 @@ def cut_from_block(html_message): "//*[starts-with(mg:tail(), 'Date:')]")) if block: block = block[0] + + if RE_FWD.match(block.getparent().text or ''): + return False + while(block.getnext() is not None): block.getparent().remove(block.getnext()) block.getparent().remove(block) diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py index 44f7ed2..665dbc1 100644 --- a/tests/html_quotations_test.py +++ b/tests/html_quotations_test.py @@ -340,3 +340,10 @@ def test_CRLF(): assert_false(symbol in extracted) eq_("

Reply

", RE_WHITESPACE.sub('', extracted)) + + +def test_gmail_forwarded_msg(): + msg_body = """

---------- Forwarded message ----------
From: Bob <bob@example.com>
Date: Fri, Feb 11, 2010 at 5:59 PM
Subject: Bob WFH today
To: Mary <mary@example.com>


eom
+

""" + extracted = quotations.extract_from_html(msg_body) + eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))