217 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			217 lines
		
	
	
		
			7.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # -*- coding: utf-8 -*-
 | |
| 
 | |
| from __future__ import absolute_import
 | |
| from ... import *
 | |
| 
 | |
| import regex as re
 | |
| 
 | |
| from talon.signature.learning import helpers as h
 | |
| from talon.signature.learning.helpers import *
 | |
| from six.moves import range
 | |
| 
 | |
# Regex constants are tested first.
# Sample phone numbers, one per line; blank lines only group related formats.
VALID = '''
15615552323
1-561-555-1212
5613333

18008793262
800-879-3262
0-800.879.3262

04 3452488
04 -3452488
04 - 3452499

(610) 310-5555 x5555
533-1123

(021)1234567
(021)123456
(000)000000

+7 920 34 57 23
+7(920) 34 57 23
+7(920)345723
+7920345723
8920345723
21143
2-11-43
2 - 11 - 43
'''

# One entry per non-blank line of VALID, with surrounding whitespace removed.
VALID_PHONE_NUMBERS = [
    number for number in map(str.strip, VALID.splitlines()) if number
]
 | |
| 
 | |
| 
 | |
def test_match_phone_numbers():
    # Every sample number must be found by the relaxed phone regex.
    for number in VALID_PHONE_NUMBERS:
        found = RE_RELAX_PHONE.search(number)
        ok_(found, "{} should be matched".format(number))
 | |
| 
 | |
| 
 | |
def test_match_names():
    # A full name with a dotted middle initial must match from its start.
    for candidate in ('John R. Doe',):
        matched = RE_NAME.match(candidate)
        ok_(matched, "{} should be matched".format(candidate))
 | |
| 
 | |
| 
 | |
# Now test the helper functions.
 | |
def test_binary_regex_search():
    # The wrapped pattern is expected to yield 1 on a hit and 0 on a miss.
    search_12 = h.binary_regex_search(re.compile("12"))
    eq_(1, search_12("12"))
    eq_(0, search_12("34"))
 | |
| 
 | |
| 
 | |
def test_binary_regex_match():
    # Named with the `test_` prefix and no required arguments so that test
    # runners actually collect it; it also no longer shadows the
    # `binary_regex_match` helper pulled in by the star import.
    #
    # "12" at the start of the string must yield 1 ...
    eq_(1, h.binary_regex_match(re.compile("12"))("12 3"))
    # ... while "12" appearing later in the string must yield 0.
    eq_(0, h.binary_regex_match(re.compile("12"))("3 12"))
 | |
| 
 | |
| 
 | |
def test_flatten_list():
    # A list of lists comes back as a single flat list, order preserved.
    nested = [[1, 2], [3, 4, 5]]
    eq_([1, 2, 3, 4, 5], h.flatten_list(nested))
 | |
| 
 | |
| 
 | |
@patch.object(h.re, 'compile')
def test_contains_sender_names(re_compile):
    # `re.compile` on the helpers module is mocked for the whole test and
    # `extract_names` is replaced by a mock whose return value is controlled.
    fake_extract = Mock(return_value=['bob', 'smith'])
    with patch.object(h, 'extract_names', fake_extract) as extract_names:
        checker = h.contains_sender_names("bob.smith@example.com")
        extract_names.assert_called_with("bob.smith@example.com")
        for variant in ["bob", "Bob", "smith", "Smith"]:
            ok_(checker(variant))

        # if no names could be extracted fallback to the email address
        extract_names.return_value = ''
        checker = h.contains_sender_names("bob.smith@example.com")
        ok_(checker('bob.smith@example.com'))

        # don't crash if there is no sender
        extract_names.return_value = ''
        checker = h.contains_sender_names("")
        assert_false(checker(''))
 | |
| 
 | |
| 
 | |
def test_extract_names():
    # Maps a raw `From:` value to names that must appear among the names
    # extracted from it (an empty list means nothing may be extracted).
    #
    # Backslashes inside the sender strings are written as `\\` so that the
    # literals contain a real backslash without relying on the invalid
    # escape sequences `\(` / `\)`, which newer Python versions warn about.
    senders_names = {
        # from example dataset
        ('Jay Rickerts <eCenter@example.com>@EXAMPLE <XXX-Jay+20Rickerts'
         '+20+3CeCenter+40example+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'):
        ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <laura.goldberg@example.com>':
        ['Laura', 'Goldberg'],
        # extract from sender's email address
        '<sergey@xxx.ru>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<sergey.obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<sergey-obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<sergey_obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        'wcl@example.com (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N.  Obukhov <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        '  Sergey  Obukhov  <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <serobnic@xxx.ru>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: serobnic@xxx.ru]': ['Sergey', 'Obukhov'],
        # when only email address is given
        'serobnic@xxx.ru': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        'wcl@example.com (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <the_pod1@example.com>': ['the', 'pod'],
        '"**Bobby B**" <copymycashsystem@example.com>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        '"M Ali B Azlan \\(GHSE/PETH\\)" <aliazlan@example.com>':
        ['Ali', 'Azlan'],
        ('"Ridthauddin B A Rahim \\(DD/PCSB\\)"'
         ' <ridthauddin_arahim@example.com>'): ['Ridthauddin', 'Rahim'],
        ('"Boland, Patrick \\(Global Xxx Group, Ireland \\)"'
         ' <Patrick.Boland@example.com>'): ['Boland', 'Patrick'],
        '"Mates Rate \\(Wine\\)" <amen@example.com.com>':
        ['Mates', 'Rate', 'Wine'],
        ('"Morgan, Paul \\(Business Xxx RI, Xxx Xxx Group\\)"'
         ' <paul.morgan@example.com>'): ['Morgan', 'Paul'],
        '"David DECOSTER \\(Domicile\\)" <decosterdavid@xxx.be>':
        ['David', 'DECOSTER', 'Domicile'],
    }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)
        # check that extracted names could be compiled into one alternation;
        # any failure here is reported as a test failure with the reason
        try:
            re.compile("|".join(extracted_names))
        except Exception as e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <sergey@example.com"), ["sergey"])
 | |
| 
 | |
| 
 | |
def test_categories_percent():
    # (expected percent, text, unicode categories) checked in order.
    cases = (
        (0.0, "qqq ggg hhh", ["Po"]),
        (50.0, "q,w.", ["Po"]),
        (0.0, "qqq ggg hhh", ["Nd"]),
        (50.0, "q5", ["Nd"]),
        (50.0, "s.s,5s", ["Po", "Nd"]),
        (0.0, "", ["Po", "Nd"]),
    )
    for expected, text, categories in cases:
        eq_(expected, h.categories_percent(text, categories))
 | |
| 
 | |
| 
 | |
@patch.object(h, 'categories_percent')
def test_punctuation_percent(categories_percent):
    # punctuation_percent must hand its text to categories_percent together
    # with the 'Po' category list.
    text = "qqq"
    h.punctuation_percent(text)
    categories_percent.assert_called_with(text, ['Po'])
 | |
| 
 | |
| 
 | |
def test_capitalized_words_percent():
    # (expected percent, input text) checked in order.
    cases = (
        (0.0, ''),
        (100.0, 'Example Corp'),
        (50.0, 'Qqq qqq Aqs 123 sss'),
        (100.0, 'Cell 713-444-7368'),
        (100.0, '8th Floor'),
        (0.0, '(212) 230-9276'),
        (50.0, 'Password: REMARKABLE'),
    )
    for expected, text in cases:
        eq_(expected, h.capitalized_words_percent(text))
 | |
| 
 | |
| 
 | |
def test_has_signature():
    sender = 'sender@example.com'

    # Bodies that must be recognised as containing a signature.
    ok_(h.has_signature('sender', sender))
    ok_(h.has_signature('http://www.example.com\n555 555 5555', sender))
    ok_(h.has_signature('http://www.example.com\naddress@example.com',
                        sender))

    # Bodies that must not be recognised as containing a signature.
    assert_false(h.has_signature('http://www.example.com/555-555-5555',
                                 sender))
    long_line = 'q' * 28
    assert_false(h.has_signature(long_line + ' sender', sender))
    # won't crash on an empty string
    assert_false(h.has_signature('', ''))

    # don't consider empty strings when analysing signature: with the line
    # limit patched to 1, the trailing blank lines must not hide 'sender'.
    # The result of has_signature itself is asserted here; passing the two
    # strings straight to ok_ would only check that a literal is truthy.
    with patch.object(h, 'SIGNATURE_MAX_LINES', 1):
        ok_(h.has_signature('sender\n\n', sender))
 |