# -*- coding: utf-8 -*-

from ... import *

import regex as re

from talon.signature.learning import helpers as h
from talon.signature.learning.helpers import *

# First testing regex constants.

# NOTE(review): VALID is split with ``splitlines()`` below, which suggests it
# originally held one phone number per line. The literal is kept exactly as it
# appears in this file; confirm the intended line breaks before re-wrapping it.
VALID = ''' 15615552323 1-561-555-1212 5613333 18008793262 800-879-3262 0-800.879.3262 04 3452488 04 -3452488 04 - 3452499 (610) 310-5555 x5555 533-1123 (021)1234567 (021)123456 (000)000000 +7 920 34 57 23 +7(920) 34 57 23 +7(920)345723 +7920345723 8920345723 21143 2-11-43 2 - 11 - 43 '''

# Non-empty, whitespace-stripped lines of VALID.
VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]


def test_match_phone_numbers():
    """Every entry of VALID_PHONE_NUMBERS must match RE_RELAX_PHONE."""
    for phone in VALID_PHONE_NUMBERS:
        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))


def test_match_names():
    """RE_NAME must match a full name with a dotted middle initial."""
    names = ['John R. Doe']
    for name in names:
        ok_(RE_NAME.match(name), "{} should be matched".format(name))


# Now test helpers functions
def test_binary_regex_search():
    """binary_regex_search yields 1 when the pattern is found, 0 otherwise."""
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
    eq_(0, h.binary_regex_search(re.compile("12"))("34"))


def test_binary_regex_match():
    """binary_regex_match yields 1 only when the pattern matches at the start.

    This function was previously named ``binary_regex_match(prog)``: without
    the ``test_`` prefix, and with a required argument it never used, test
    discovery skipped it and these assertions never ran.
    """
    eq_(1, h.binary_regex_match(re.compile("12"))("12 3"))
    eq_(0, h.binary_regex_match(re.compile("12"))("3 12"))


def test_flatten_list():
    """flatten_list concatenates a list of lists into one flat list."""
    eq_([1, 2, 3, 4, 5], h.flatten_list([[1, 2], [3, 4, 5]]))


@patch.object(h.re, 'compile')
def test_contains_sender_names(re_compile):
    """Exercise contains_sender_names with extract_names mocked out.

    ``re_compile`` is the mock that ``patch.object`` installs in place of
    ``h.re.compile`` for the duration of the test; it is not used directly.
    """
    with patch.object(h, 'extract_names',
                      Mock(return_value=['bob', 'smith'])) as extract_names:
        has_sender_names = h.contains_sender_names("bob.smith@example.com")
        extract_names.assert_called_with("bob.smith@example.com")
        for name in ["bob", "Bob", "smith", "Smith"]:
            ok_(has_sender_names(name))

        extract_names.return_value = ''
        has_sender_names = h.contains_sender_names("bob.smith@example.com")
        # if no names could be extracted fallback to the email address
        ok_(has_sender_names('bob.smith@example.com'))

        # don't crash if there are no sender
        extract_names.return_value = ''
        has_sender_names = h.contains_sender_names("")
        assert_false(has_sender_names(''))


# NOTE(review): test_extract_names begins here and continues on the following
# lines, which are still whitespace-collapsed. Its opening fragment is kept
# verbatim below so that the text that follows remains contiguous with it.
# def test_extract_names(): senders_names = { # from
example dataset ('Jay Rickerts @EXAMPLE '): ['Jay', 'Rickerts'], # if `,` is used in sender's name 'Williams III, Bill ': ['Williams', 'III', 'Bill'], # if somehow `'` or `"` are used in sender's name 'Laura" "Goldberg ': ['Laura', 'Goldberg'], # extract from senders email address '': ['sergey'], # extract from sender's email address # if dots are used in the email address '': ['sergey', 'obukhov'], # extract from sender's email address # if dashes are used in the email address '': ['sergey', 'obukhov'], # extract from sender's email address # if `_` are used in the email address '': ['sergey', 'obukhov'], # old style From field, found in jangada dataset 'wcl@example.com (Wayne Long)': ['Wayne', 'Long'], # if only sender's name provided 'Wayne Long': ['Wayne', 'Long'], # if middle name is shortened with dot 'Sergey N. Obukhov ': ['Sergey', 'Obukhov'], # not only spaces could be used as name splitters ' Sergey Obukhov ': ['Sergey', 'Obukhov'], # finally normal example 'Sergey ': ['Sergey'], # if middle name is shortened with `,` 'Sergey N, Obukhov': ['Sergey', 'Obukhov'], # if mailto used with email address and sender's name is specified 'Sergey N, Obukhov [mailto: serobnic@xxx.ru]': ['Sergey', 'Obukhov'], # when only email address is given 'serobnic@xxx.ru': ['serobnic'], # when nothing is given '': [], # if phone is specified in the `From:` header 'wcl@example.com (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'], # from crash reports `nothing to repeat` '* * * * ': ['the', 'pod'], '"**Bobby B**" ': ['Bobby', 'copymycashsystem'], # from crash reports `bad escape` '"M Ali B Azlan \(GHSE/PETH\)" ': ['Ali', 'Azlan'], ('"Ridthauddin B A Rahim \(DD/PCSB\)"' ' '): ['Ridthauddin', 'Rahim'], ('"Boland, Patrick \(Global Xxx Group, Ireland \)"' ' '): ['Boland', 'Patrick'], '"Mates Rate \(Wine\)" ': ['Mates', 'Rate', 'Wine'], ('"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"' ' '): ['Morgan', 'Paul'], '"David DECOSTER \(Domicile\)" ': ['David', 'DECOSTER', 'Domicile'] } 
for sender, expected_names in senders_names.items(): extracted_names = h.extract_names(sender) # check that extracted names could be compiled try: re.compile("|".join(extracted_names)) except Exception, e: ok_(False, ("Failed to compile extracted names {}" "\n\nReason: {}").format(extracted_names, e)) if expected_names: for name in expected_names: assert_in(name, extracted_names) else: eq_(expected_names, extracted_names) # words like `ru`, `gmail`, `com`, `org`, etc. are not considered # sender's names for word in h.BAD_SENDER_NAMES: eq_(h.extract_names(word), []) # duplicates are not allowed eq_(h.extract_names("sergey