Compare commits
	
		
			15 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 5bcf7403ad | ||
|  | 2d6c092b65 | ||
|  | 6d0689cad6 | ||
|  | 3f80e93ee0 | ||
|  | 1b18abab1d | ||
|  | 03dd5af5ab | ||
|  | dfba82b07c | ||
|  | 08ca02c87f | ||
|  | b61f4ec095 | ||
|  | 9dbe6a494b | ||
|  | 44e70939d6 | ||
|  | ab6066eafa | ||
|  | 42258cdd36 | ||
|  | d3de9e6893 | ||
|  | 333beb94af | ||
| @@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in | |||||||
| apply to a message (``featurespace.py``), how data sets are built | apply to a message (``featurespace.py``), how data sets are built | ||||||
| (``dataset.py``), classifier’s interface (``classifier.py``). | (``dataset.py``), classifier’s interface (``classifier.py``). | ||||||
|  |  | ||||||
| The data used for training is taken from our personal email | Currently the data used for training is taken from our personal email | ||||||
| conversations and from `ENRON`_ dataset. As a result of applying our set | conversations and from `ENRON`_ dataset. As a result of applying our set | ||||||
| of features to the dataset we provide files ``classifier`` and | of features to the dataset we provide files ``classifier`` and | ||||||
| ``train.data`` that don’t have any personal information but could be | ``train.data`` that don’t have any personal information but could be | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from setuptools import setup, find_packages | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.5', |       version='1.2.9', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
|   | |||||||
| @@ -78,7 +78,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('div.gmail_quote') |     gmail_quote = html_message.cssselect('div.gmail_quote') | ||||||
|     if gmail_quote and not RE_FWD.match(gmail_quote[0].text): |     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -86,9 +86,12 @@ def cut_gmail_quote(html_message): | |||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 |         #outlook 2007, 2010 (international) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2007, 2010 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
| @@ -175,7 +178,21 @@ def cut_from_block(html_message): | |||||||
|                 len(maybe_body.getchildren()) == 1) |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|             if not parent_div_is_all_content: |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 parent = block.getparent() | ||||||
|  |                 next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove all tags after found From block | ||||||
|  |                 # (From block and quoted message are in separate divs) | ||||||
|  |                 while next_sibling is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |                     block = next_sibling | ||||||
|  |                     next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove the last sibling (or the | ||||||
|  |                 # From block if no siblings) | ||||||
|  |                 if block is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |  | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|             return False |             return False | ||||||
|   | |||||||
| @@ -131,6 +131,17 @@ def test_gmail_quote(): | |||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_compact(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_quote_blockquote(): | def test_gmail_quote_blockquote(): | ||||||
|     msg_body = """Message |     msg_body = """Message | ||||||
| <blockquote class="gmail_quote"> | <blockquote class="gmail_quote"> | ||||||
| @@ -268,6 +279,26 @@ def test_reply_separated_by_hr(): | |||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_from_block_and_quotations_in_separate_divs(): | ||||||
|  |     msg_body = ''' | ||||||
|  | Reply | ||||||
|  | <div> | ||||||
|  |   <hr/> | ||||||
|  |   <div> | ||||||
|  |     <font> | ||||||
|  |       <b>From: bob@example.com</b> | ||||||
|  |       <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> | ||||||
|  |     </font> | ||||||
|  |   </div> | ||||||
|  |   <div> | ||||||
|  |     Quoted message | ||||||
|  |   </div> | ||||||
|  | </div> | ||||||
|  | ''' | ||||||
|  |     eq_('<html><body><p>Reply</p><div><hr></div></body></html>', | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user