Compare commits
	
		
			11 Commits
		
	
	
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | 5bcf7403ad | ||
|  | 2d6c092b65 | ||
|  | 6d0689cad6 | ||
|  | 3f80e93ee0 | ||
|  | 1b18abab1d | ||
|  | 03dd5af5ab | ||
|  | dfba82b07c | ||
|  | 08ca02c87f | ||
|  | b61f4ec095 | ||
|  | 9dbe6a494b | ||
|  | 44e70939d6 | ||
| @@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in | |||||||
| apply to a message (``featurespace.py``), how data sets are built | apply to a message (``featurespace.py``), how data sets are built | ||||||
| (``dataset.py``), classifier’s interface (``classifier.py``). | (``dataset.py``), classifier’s interface (``classifier.py``). | ||||||
|  |  | ||||||
| The data used for training is taken from our personal email | Currently the data used for training is taken from our personal email | ||||||
| conversations and from `ENRON`_ dataset. As a result of applying our set | conversations and from `ENRON`_ dataset. As a result of applying our set | ||||||
| of features to the dataset we provide files ``classifier`` and | of features to the dataset we provide files ``classifier`` and | ||||||
| ``train.data`` that don’t have any personal information but could be | ``train.data`` that don’t have any personal information but could be | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
| @@ -2,7 +2,7 @@ from setuptools import setup, find_packages | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.6', |       version='1.2.9', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
|   | |||||||
| @@ -86,9 +86,12 @@ def cut_gmail_quote(html_message): | |||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 |         #outlook 2007, 2010 (international) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2007, 2010 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
| @@ -175,7 +178,21 @@ def cut_from_block(html_message): | |||||||
|                 len(maybe_body.getchildren()) == 1) |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|             if not parent_div_is_all_content: |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 parent = block.getparent() | ||||||
|  |                 next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove all tags after found From block | ||||||
|  |                 # (From block and quoted message are in separate divs) | ||||||
|  |                 while next_sibling is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |                     block = next_sibling | ||||||
|  |                     next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove the last sibling (or the | ||||||
|  |                 # From block if no siblings) | ||||||
|  |                 if block is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |  | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|             return False |             return False | ||||||
|   | |||||||
| @@ -279,6 +279,26 @@ def test_reply_separated_by_hr(): | |||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_from_block_and_quotations_in_separate_divs(): | ||||||
|  |     msg_body = ''' | ||||||
|  | Reply | ||||||
|  | <div> | ||||||
|  |   <hr/> | ||||||
|  |   <div> | ||||||
|  |     <font> | ||||||
|  |       <b>From: bob@example.com</b> | ||||||
|  |       <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> | ||||||
|  |     </font> | ||||||
|  |   </div> | ||||||
|  |   <div> | ||||||
|  |     Quoted message | ||||||
|  |   </div> | ||||||
|  | </div> | ||||||
|  | ''' | ||||||
|  |     eq_('<html><body><p>Reply</p><div><hr></div></body></html>', | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user