| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | # -*- coding: utf-8 -*- | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-12 17:25:46 +05:00
										 |  |  | from __future__ import absolute_import | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | from ... import * | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-03-08 00:06:01 -05:00
										 |  |  | from numpy import genfromtxt | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | from talon.signature.learning import dataset as d | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from talon.signature.learning.featurespace import features | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_is_sender_filename(): | 
					
						
							|  |  |  |     assert_false(d.is_sender_filename("foo/bar")) | 
					
						
							|  |  |  |     assert_false(d.is_sender_filename("foo/bar_body")) | 
					
						
							|  |  |  |     ok_(d.is_sender_filename("foo/bar_sender")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_build_sender_filename(): | 
					
						
							|  |  |  |     eq_("foo/bar_sender", d.build_sender_filename("foo/bar_body")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_parse_msg_sender(): | 
					
						
							|  |  |  |     sender, msg = d.parse_msg_sender(EML_MSG_FILENAME) | 
					
						
							|  |  |  |     # if the message in eml format | 
					
						
							|  |  |  |     with open(EML_MSG_FILENAME) as f: | 
					
						
							|  |  |  |         eq_(sender, | 
					
						
							|  |  |  |             " Alex Q <xxx@yahoo.com>") | 
					
						
							|  |  |  |         eq_(msg, f.read()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # if the message sender is stored in a separate file | 
					
						
							|  |  |  |     sender, msg = d.parse_msg_sender(MSG_FILENAME_WITH_BODY_SUFFIX) | 
					
						
							|  |  |  |     with open(MSG_FILENAME_WITH_BODY_SUFFIX) as f: | 
					
						
							|  |  |  |         eq_(sender, u"john@example.com") | 
					
						
							|  |  |  |         eq_(msg, f.read()) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_build_extraction_dataset(): | 
					
						
							|  |  |  |     if os.path.exists(os.path.join(TMP_DIR, 'extraction.data')): | 
					
						
							|  |  |  |         os.remove(os.path.join(TMP_DIR, 'extraction.data')) | 
					
						
							|  |  |  |     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), | 
					
						
							|  |  |  |                                os.path.join(TMP_DIR, | 
					
						
							|  |  |  |                                             'extraction.data'), 1) | 
					
						
							| 
									
										
										
										
											2015-03-08 00:06:01 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  |     filename = os.path.join(TMP_DIR, 'extraction.data') | 
					
						
							|  |  |  |     file_data = genfromtxt(filename, delimiter=",") | 
					
						
							|  |  |  |     test_data = file_data[:, :-1] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  |     # the result is a loadable signature extraction dataset | 
					
						
							|  |  |  |     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be | 
					
						
							|  |  |  |     # a signature, one email has only 10 lines | 
					
						
							| 
									
										
										
										
											2015-03-08 00:06:01 -05:00
										 |  |  |     eq_(test_data.shape[0], 32) | 
					
						
							|  |  |  |     eq_(len(features('')), test_data.shape[1]) |