Compare commits
	
		
			8 Commits
		
	
	
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | f3c0942c49 | ||
|  | 02adf53ab9 | ||
|  | 3497b5cab4 | ||
|  | 9c17dca17c | ||
|  | de342d3177 | ||
|  | 743b452daf | ||
|  | c762f3c337 | ||
|  | 31803d41bc | ||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from setuptools import setup, find_packages | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.2', |       version='1.2.5', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
|   | |||||||
| @@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) | |||||||
|  |  | ||||||
| # HTML quote indicators (tag ids) | # HTML quote indicators (tag ids) | ||||||
| QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | ||||||
|  | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
|  |  | ||||||
| def add_checkpoint(html_note, counter): | def add_checkpoint(html_note, counter): | ||||||
| @@ -77,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('div.gmail_quote') |     gmail_quote = html_message.cssselect('div.gmail_quote') | ||||||
|     if gmail_quote: |     if gmail_quote and not RE_FWD.match(gmail_quote[0].text): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -172,6 +173,7 @@ def cut_from_block(html_message): | |||||||
|             parent_div_is_all_content = ( |             parent_div_is_all_content = ( | ||||||
|                 maybe_body is not None and maybe_body.tag == 'body' and |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|                 len(maybe_body.getchildren()) == 1) |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|             if not parent_div_is_all_content: |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 block.getparent().remove(block) | ||||||
|                 return True |                 return True | ||||||
| @@ -185,7 +187,17 @@ def cut_from_block(html_message): | |||||||
|          "//*[starts-with(mg:tail(), 'Date:')]")) |          "//*[starts-with(mg:tail(), 'Date:')]")) | ||||||
|     if block: |     if block: | ||||||
|         block = block[0] |         block = block[0] | ||||||
|  |  | ||||||
|  |         if RE_FWD.match(block.getparent().text or ''): | ||||||
|  |             return False | ||||||
|  |          | ||||||
|         while(block.getnext() is not None): |         while(block.getnext() is not None): | ||||||
|             block.getparent().remove(block.getnext()) |             block.getparent().remove(block.getnext()) | ||||||
|         block.getparent().remove(block) |         block.getparent().remove(block) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  | def cut_zimbra_quote(html_message): | ||||||
|  |     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') | ||||||
|  |     if zDivider: | ||||||
|  |         zDivider[0].getparent().remove(zDivider[0]) | ||||||
|  |         return True | ||||||
| @@ -148,7 +148,9 @@ SPLITTER_PATTERNS = [ | |||||||
|     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), | ||||||
|     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:') |                '( \S+){3,6}@\S+:'), | ||||||
|  |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |     re.compile('Sent from Samsung .*@.*> wrote') | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -350,6 +352,7 @@ def extract_from_html(msg_body): | |||||||
|         parser=html.HTMLParser(encoding="utf-8") |         parser=html.HTMLParser(encoding="utf-8") | ||||||
|     ) |     ) | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|  |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|   | |||||||
| @@ -340,3 +340,10 @@ def test_CRLF(): | |||||||
|     assert_false(symbol in extracted)     |     assert_false(symbol in extracted)     | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|         RE_WHITESPACE.sub('', extracted)) |         RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_forwarded_msg(): | ||||||
|  |     msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr"><<a href="mailto:bob@example.com">bob@example.com</a>></span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary <<a href="mailto:mary@example.com">mary@example.com</a>><br><br><br><div dir="ltr">eom</div> | ||||||
|  | </div><br></div>""" | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|   | |||||||
| @@ -32,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |  | ||||||
|  | > | ||||||
|  | > Test | ||||||
|  | > | ||||||
|  | > Roman""" | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_wrote_somebody(): | def test_pattern_on_date_wrote_somebody(): | ||||||
|     eq_('Lorem', quotations.extract_from_plain( |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|     """Lorem |     """Lorem | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user