| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | # -*- coding: utf-8 -*- | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2016-07-12 17:25:46 +05:00
										 |  |  | from __future__ import absolute_import | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | from ... import * | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import regex as re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from talon.signature.learning import helpers as h | 
					
						
							|  |  |  | from talon.signature.learning.helpers import * | 
					
						
							| 
									
										
										
										
											2016-07-12 17:25:46 +05:00
										 |  |  | from six.moves import range | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | # First testing regex constants. | 
					
						
							|  |  |  | VALID = '''
 | 
					
						
							|  |  |  | 15615552323 | 
					
						
							|  |  |  | 1-561-555-1212 | 
					
						
							|  |  |  | 5613333 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 18008793262 | 
					
						
							|  |  |  | 800-879-3262 | 
					
						
							|  |  |  | 0-800.879.3262 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 04 3452488 | 
					
						
							|  |  |  | 04 -3452488 | 
					
						
							|  |  |  | 04 - 3452499 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (610) 310-5555 x5555 | 
					
						
							|  |  |  | 533-1123 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | (021)1234567 | 
					
						
							|  |  |  | (021)123456 | 
					
						
							|  |  |  | (000)000000 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | +7 920 34 57 23 | 
					
						
							|  |  |  | +7(920) 34 57 23 | 
					
						
							|  |  |  | +7(920)345723 | 
					
						
							|  |  |  | +7920345723 | 
					
						
							|  |  |  | 8920345723 | 
					
						
							|  |  |  | 21143 | 
					
						
							|  |  |  | 2-11-43 | 
					
						
							|  |  |  | 2 - 11 - 43 | 
					
						
							|  |  |  | '''
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_match_phone_numbers(): | 
					
						
							|  |  |  |     for phone in VALID_PHONE_NUMBERS: | 
					
						
							| 
									
										
										
										
											2015-03-08 00:36:19 -05:00
										 |  |  |         ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone)) | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_match_names(): | 
					
						
							|  |  |  |     names = ['John R. Doe'] | 
					
						
							|  |  |  |     for name in names: | 
					
						
							|  |  |  |         ok_(RE_NAME.match(name), "{} should be matched".format(name)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
# Now test the helper functions.
					
						
							|  |  |  | def test_binary_regex_search(): | 
					
						
							|  |  |  |     eq_(1, h.binary_regex_search(re.compile("12"))("12")) | 
					
						
							|  |  |  |     eq_(0, h.binary_regex_search(re.compile("12"))("34")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def binary_regex_match(prog): | 
					
						
							|  |  |  |     eq_(1, h.binary_regex_match(re.compile("12"))("12 3")) | 
					
						
							|  |  |  |     eq_(0, h.binary_regex_match(re.compile("12"))("3 12")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_flatten_list(): | 
					
						
							|  |  |  |     eq_([1, 2, 3, 4, 5], h.flatten_list([[1, 2], [3, 4, 5]])) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @patch.object(h.re, 'compile') | 
					
						
							|  |  |  | def test_contains_sender_names(re_compile): | 
					
						
							|  |  |  |     with patch.object(h, 'extract_names', | 
					
						
							|  |  |  |                       Mock(return_value=['bob', 'smith'])) as extract_names: | 
					
						
							|  |  |  |         has_sender_names = h.contains_sender_names("bob.smith@example.com") | 
					
						
							|  |  |  |         extract_names.assert_called_with("bob.smith@example.com") | 
					
						
							|  |  |  |         for name in ["bob", "Bob", "smith", "Smith"]: | 
					
						
							|  |  |  |             ok_(has_sender_names(name)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         extract_names.return_value = '' | 
					
						
							|  |  |  |         has_sender_names = h.contains_sender_names("bob.smith@example.com") | 
					
						
							|  |  |  |         # if no names could be extracted fallback to the email address | 
					
						
							|  |  |  |         ok_(has_sender_names('bob.smith@example.com')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # don't crash if there are no sender | 
					
						
							|  |  |  |         extract_names.return_value = '' | 
					
						
							|  |  |  |         has_sender_names = h.contains_sender_names("") | 
					
						
							|  |  |  |         assert_false(has_sender_names('')) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_extract_names(): | 
					
						
							|  |  |  |     senders_names = { | 
					
						
							|  |  |  |         # from example dataset | 
					
						
							|  |  |  |         ('Jay Rickerts <eCenter@example.com>@EXAMPLE <XXX-Jay+20Rickerts' | 
					
						
							|  |  |  |          '+20+3CeCenter+40example+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'): | 
					
						
							|  |  |  |         ['Jay', 'Rickerts'], | 
					
						
							|  |  |  |         # if `,` is used in sender's name | 
					
						
							|  |  |  |         'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>': | 
					
						
							|  |  |  |         ['Williams', 'III', 'Bill'], | 
					
						
							|  |  |  |         # if somehow `'` or `"` are used in sender's name | 
					
						
							|  |  |  |         'Laura" "Goldberg <laura.goldberg@example.com>': | 
					
						
							|  |  |  |         ['Laura', 'Goldberg'], | 
					
						
							|  |  |  |         # extract from senders email address | 
					
						
							|  |  |  |         '<sergey@xxx.ru>': ['sergey'], | 
					
						
							|  |  |  |         # extract from sender's email address | 
					
						
							|  |  |  |         # if dots are used in the email address | 
					
						
							|  |  |  |         '<sergey.obukhov@xxx.ru>': ['sergey', 'obukhov'], | 
					
						
							|  |  |  |         # extract from sender's email address | 
					
						
							|  |  |  |         # if dashes are used in the email address | 
					
						
							|  |  |  |         '<sergey-obukhov@xxx.ru>': ['sergey', 'obukhov'], | 
					
						
							|  |  |  |         # extract from sender's email address | 
					
						
							|  |  |  |         # if `_` are used in the email address | 
					
						
							|  |  |  |         '<sergey_obukhov@xxx.ru>': ['sergey', 'obukhov'], | 
					
						
							|  |  |  |         # old style From field, found in jangada dataset | 
					
						
							|  |  |  |         'wcl@example.com (Wayne Long)': ['Wayne', 'Long'], | 
					
						
							|  |  |  |         # if only sender's name provided | 
					
						
							|  |  |  |         'Wayne Long': ['Wayne', 'Long'], | 
					
						
							|  |  |  |         # if middle name is shortened with dot | 
					
						
							|  |  |  |         'Sergey N.  Obukhov <serobnic@xxx.ru>': ['Sergey', 'Obukhov'], | 
					
						
							|  |  |  |         # not only spaces could be used as name splitters | 
					
						
							|  |  |  |         '  Sergey  Obukhov  <serobnic@xxx.ru>': ['Sergey', 'Obukhov'], | 
					
						
							|  |  |  |         # finally normal example | 
					
						
							|  |  |  |         'Sergey <serobnic@xxx.ru>': ['Sergey'], | 
					
						
							|  |  |  |         # if middle name is shortened with `,` | 
					
						
							|  |  |  |         'Sergey N, Obukhov': ['Sergey', 'Obukhov'], | 
					
						
							|  |  |  |         # if mailto used with email address and sender's name is specified | 
					
						
							|  |  |  |         'Sergey N, Obukhov [mailto: serobnic@xxx.ru]': ['Sergey', 'Obukhov'], | 
					
						
							|  |  |  |         # when only email address is given | 
					
						
							|  |  |  |         'serobnic@xxx.ru': ['serobnic'], | 
					
						
							|  |  |  |         # when nothing is given | 
					
						
							|  |  |  |         '': [], | 
					
						
							|  |  |  |         # if phone is specified in the `From:` header | 
					
						
							|  |  |  |         'wcl@example.com (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'], | 
					
						
							|  |  |  |         # from crash reports `nothing to repeat` | 
					
						
							|  |  |  |         '* * * * <the_pod1@example.com>': ['the', 'pod'], | 
					
						
							|  |  |  |         '"**Bobby B**" <copymycashsystem@example.com>': | 
					
						
							|  |  |  |         ['Bobby', 'copymycashsystem'], | 
					
						
							|  |  |  |         # from crash reports `bad escape` | 
					
						
							|  |  |  |         '"M Ali B Azlan \(GHSE/PETH\)" <aliazlan@example.com>': | 
					
						
							|  |  |  |         ['Ali', 'Azlan'], | 
					
						
							|  |  |  |         ('"Ridthauddin B A Rahim \(DD/PCSB\)"' | 
					
						
							|  |  |  |          ' <ridthauddin_arahim@example.com>'): ['Ridthauddin', 'Rahim'], | 
					
						
							|  |  |  |         ('"Boland, Patrick \(Global Xxx Group, Ireland \)"' | 
					
						
							|  |  |  |          ' <Patrick.Boland@example.com>'): ['Boland', 'Patrick'], | 
					
						
							|  |  |  |         '"Mates Rate \(Wine\)" <amen@example.com.com>': | 
					
						
							|  |  |  |         ['Mates', 'Rate', 'Wine'], | 
					
						
							|  |  |  |         ('"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"' | 
					
						
							|  |  |  |          ' <paul.morgan@example.com>'): ['Morgan', 'Paul'], | 
					
						
							|  |  |  |         '"David DECOSTER \(Domicile\)" <decosterdavid@xxx.be>': | 
					
						
							|  |  |  |         ['David', 'DECOSTER', 'Domicile'] | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for sender, expected_names in senders_names.items(): | 
					
						
							|  |  |  |         extracted_names = h.extract_names(sender) | 
					
						
							|  |  |  |         # check that extracted names could be compiled | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             re.compile("|".join(extracted_names)) | 
					
						
							| 
									
										
										
										
											2016-07-12 17:25:46 +05:00
										 |  |  |         except Exception as e: | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  |             ok_(False, ("Failed to compile extracted names {}" | 
					
						
							|  |  |  |                         "\n\nReason: {}").format(extracted_names, e)) | 
					
						
							|  |  |  |         if expected_names: | 
					
						
							|  |  |  |             for name in expected_names: | 
					
						
							|  |  |  |                 assert_in(name, extracted_names) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             eq_(expected_names, extracted_names) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # words like `ru`, `gmail`, `com`, `org`, etc. are not considered | 
					
						
							|  |  |  |     # sender's names | 
					
						
							|  |  |  |     for word in h.BAD_SENDER_NAMES: | 
					
						
							|  |  |  |         eq_(h.extract_names(word), []) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # duplicates are not allowed | 
					
						
							|  |  |  |     eq_(h.extract_names("sergey <sergey@example.com"), ["sergey"]) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_categories_percent(): | 
					
						
							|  |  |  |     eq_(0.0, h.categories_percent("qqq ggg hhh", ["Po"])) | 
					
						
							|  |  |  |     eq_(50.0, h.categories_percent("q,w.", ["Po"])) | 
					
						
							|  |  |  |     eq_(0.0, h.categories_percent("qqq ggg hhh", ["Nd"])) | 
					
						
							|  |  |  |     eq_(50.0, h.categories_percent("q5", ["Nd"])) | 
					
						
							|  |  |  |     eq_(50.0, h.categories_percent("s.s,5s", ["Po", "Nd"])) | 
					
						
							|  |  |  |     eq_(0.0, h.categories_percent("", ["Po", "Nd"])) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @patch.object(h, 'categories_percent') | 
					
						
							|  |  |  | def test_punctuation_percent(categories_percent): | 
					
						
							|  |  |  |     h.punctuation_percent("qqq") | 
					
						
							|  |  |  |     categories_percent.assert_called_with("qqq", ['Po']) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_capitalized_words_percent(): | 
					
						
							|  |  |  |     eq_(0.0, h.capitalized_words_percent('')) | 
					
						
							|  |  |  |     eq_(100.0, h.capitalized_words_percent('Example Corp')) | 
					
						
							| 
									
										
										
										
											2016-07-19 16:22:04 -07:00
										 |  |  |     eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss')) | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  |     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) | 
					
						
							|  |  |  |     eq_(100.0, h.capitalized_words_percent('8th Floor')) | 
					
						
							|  |  |  |     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) | 
					
						
							| 
									
										
										
										
											2016-07-19 16:22:04 -07:00
										 |  |  |     eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE')) | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_has_signature(): | 
					
						
							|  |  |  |     ok_(h.has_signature('sender', 'sender@example.com')) | 
					
						
							|  |  |  |     ok_(h.has_signature('http://www.example.com\n555 555 5555', | 
					
						
							|  |  |  |                         'sender@example.com')) | 
					
						
							|  |  |  |     ok_(h.has_signature('http://www.example.com\naddress@example.com', | 
					
						
							|  |  |  |                         'sender@example.com')) | 
					
						
							|  |  |  |     assert_false(h.has_signature('http://www.example.com/555-555-5555', | 
					
						
							|  |  |  |                                  'sender@example.com')) | 
					
						
							| 
									
										
										
										
											2016-07-12 17:25:46 +05:00
										 |  |  |     long_line = ''.join(['q' for e in range(28)]) | 
					
						
							| 
									
										
										
										
											2014-07-23 21:12:54 -07:00
										 |  |  |     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com')) | 
					
						
							|  |  |  |     # wont crash on an empty string | 
					
						
							|  |  |  |     assert_false(h.has_signature('', '')) | 
					
						
							|  |  |  |     # dont consider empty strings when analysing signature | 
					
						
							|  |  |  |     with patch.object(h, 'SIGNATURE_MAX_LINES', 1): | 
					
						
							|  |  |  |         ok_('sender\n\n', 'sender@example.com') |