talon/tests/signature/learning/helpers_test.py

# -*- coding: utf-8 -*-

from ... import *

import regex as re

from talon.signature.learning import helpers as h
from talon.signature.learning.helpers import *

# First testing regex constants.
VALID = '''
15615552323
1-561-555-1212
5613333

18008793262
800-879-3262
0-800.879.3262

04 3452488
04 -3452488
04 - 3452499

(610) 310-5555 x5555
533-1123

(021)1234567
(021)123456
(000)000000

+7 920 34 57 23
+7(920) 34 57 23
+7(920)345723
+7920345723
8920345723
21143
2-11-43
2 - 11 - 43
'''

VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]


def test_match_phone_numbers():
    for phone in VALID_PHONE_NUMBERS:
        ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))


def test_match_names():
    names = ['John R. Doe']
    for name in names:
        ok_(RE_NAME.match(name), "{} should be matched".format(name))


def test_sender_with_name():
    ok_lines = ['Sergey Obukhov <serobnic@example.com>',
                '\tSergey  <serobnic@example.com>',
                ('"Doe, John (TX)"'
                 '<DowJ@example.com>@EXAMPLE'
                 '<IMCEANOTES-+22Doe+2C+20John+20'
                 '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'
                 '+40EXAMPLE@EXAMPLE.com>'),
                ('Company Sleuth <csleuth@email.xxx.com>'
                 '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'
                 '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),
                ('Doe III, John '
                 '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]
    for line in ok_lines:
        ok_(RE_SENDER_WITH_NAME.match(line),
            '{} should be matched'.format(line))

    nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']
    for line in nok_lines:
        assert_false(RE_SENDER_WITH_NAME.match(line),
                     '{} should not be matched'.format(line))


# Now test helpers functions
def test_binary_regex_search():
    eq_(1, h.binary_regex_search(re.compile("12"))("12"))
    eq_(0, h.binary_regex_search(re.compile("12"))("34"))


def binary_regex_match(prog):
    eq_(1, h.binary_regex_match(re.compile("12"))("12 3"))
    eq_(0, h.binary_regex_match(re.compile("12"))("3 12"))


def test_flatten_list():
    eq_([1, 2, 3, 4, 5], h.flatten_list([[1, 2], [3, 4, 5]]))


@patch.object(h.re, 'compile')
def test_contains_sender_names(re_compile):
    with patch.object(h, 'extract_names',
                      Mock(return_value=['bob', 'smith'])) as extract_names:
        has_sender_names = h.contains_sender_names("bob.smith@example.com")
        extract_names.assert_called_with("bob.smith@example.com")
        for name in ["bob", "Bob", "smith", "Smith"]:
            ok_(has_sender_names(name))

        extract_names.return_value = ''
        has_sender_names = h.contains_sender_names("bob.smith@example.com")
        # if no names could be extracted fallback to the email address
        ok_(has_sender_names('bob.smith@example.com'))

        # don't crash if there are no sender
        extract_names.return_value = ''
        has_sender_names = h.contains_sender_names("")
        assert_false(has_sender_names(''))


def test_extract_names():
    senders_names = {
        # from example dataset
        ('Jay Rickerts <eCenter@example.com>@EXAMPLE <XXX-Jay+20Rickerts'
         '+20+3CeCenter+40example+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'):
        ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <laura.goldberg@example.com>':
        ['Laura', 'Goldberg'],
        # extract from senders email address
        '<sergey@xxx.ru>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<sergey.obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<sergey-obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<sergey_obukhov@xxx.ru>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        'wcl@example.com (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N.  Obukhov <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        '  Sergey  Obukhov  <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <serobnic@xxx.ru>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: serobnic@xxx.ru]': ['Sergey', 'Obukhov'],
        # when only email address is given
        'serobnic@xxx.ru': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        'wcl@example.com (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <the_pod1@example.com>': ['the', 'pod'],
        '"**Bobby B**" <copymycashsystem@example.com>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        '"M Ali B Azlan \(GHSE/PETH\)" <aliazlan@example.com>':
        ['Ali', 'Azlan'],
        ('"Ridthauddin B A Rahim \(DD/PCSB\)"'
         ' <ridthauddin_arahim@example.com>'): ['Ridthauddin', 'Rahim'],
        ('"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         ' <Patrick.Boland@example.com>'): ['Boland', 'Patrick'],
        '"Mates Rate \(Wine\)" <amen@example.com.com>':
        ['Mates', 'Rate', 'Wine'],
        ('"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         ' <paul.morgan@example.com>'): ['Morgan', 'Paul'],
        '"David DECOSTER \(Domicile\)" <decosterdavid@xxx.be>':
        ['David', 'DECOSTER', 'Domicile']
        }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception, e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <sergey@example.com"), ["sergey"])


def test_categories_percent():
    eq_(0.0, h.categories_percent("qqq ggg hhh", ["Po"]))
    eq_(50.0, h.categories_percent("q,w.", ["Po"]))
    eq_(0.0, h.categories_percent("qqq ggg hhh", ["Nd"]))
    eq_(50.0, h.categories_percent("q5", ["Nd"]))
    eq_(50.0, h.categories_percent("s.s,5s", ["Po", "Nd"]))
    eq_(0.0, h.categories_percent("", ["Po", "Nd"]))


@patch.object(h, 'categories_percent')
def test_punctuation_percent(categories_percent):
    h.punctuation_percent("qqq")
    categories_percent.assert_called_with("qqq", ['Po'])


def test_capitalized_words_percent():
    eq_(0.0, h.capitalized_words_percent(''))
    eq_(100.0, h.capitalized_words_percent('Example Corp'))
    eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))
    eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
    eq_(100.0, h.capitalized_words_percent('8th Floor'))
    eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))


def test_has_signature():
    ok_(h.has_signature('sender', 'sender@example.com'))
    ok_(h.has_signature('http://www.example.com\n555 555 5555',
                        'sender@example.com'))
    ok_(h.has_signature('http://www.example.com\naddress@example.com',
                        'sender@example.com'))
    assert_false(h.has_signature('http://www.example.com/555-555-5555',
                                 'sender@example.com'))
    long_line = ''.join(['q' for e in xrange(28)])
    assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))
    # wont crash on an empty string
    assert_false(h.has_signature('', ''))
    # dont consider empty strings when analysing signature
    with patch.object(h, 'SIGNATURE_MAX_LINES', 1):
        ok_('sender\n\n', 'sender@example.com')
initial commit 2014-07-23 21:12:54 -07:00			`# -- coding: utf-8 --`

			`from ... import *`

			`import regex as re`

			`from talon.signature.learning import helpers as h`
			`from talon.signature.learning.helpers import *`

			`# First testing regex constants.`
			`VALID = '''`
			`15615552323`
			`1-561-555-1212`
			`5613333`

			`18008793262`
			`800-879-3262`
			`0-800.879.3262`

			`04 3452488`
			`04 -3452488`
			`04 - 3452499`

			`(610) 310-5555 x5555`
			`533-1123`

			`(021)1234567`
			`(021)123456`
			`(000)000000`

			`+7 920 34 57 23`
			`+7(920) 34 57 23`
			`+7(920)345723`
			`+7920345723`
			`8920345723`
			`21143`
			`2-11-43`
			`2 - 11 - 43`
			`'''`

			`VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()]`


			`def test_match_phone_numbers():`
			`for phone in VALID_PHONE_NUMBERS:`
			`ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone))`


			`def test_match_names():`
			`names = ['John R. Doe']`
			`for name in names:`
			`ok_(RE_NAME.match(name), "{} should be matched".format(name))`


			`def test_sender_with_name():`
			`ok_lines = ['Sergey Obukhov <serobnic@example.com>',`
			`'\tSergey <serobnic@example.com>',`
			`('"Doe, John (TX)"'`
			`'<DowJ@example.com>@EXAMPLE'`
			`'<IMCEANOTES-+22Doe+2C+20John+20'`
			`'+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E'`
			`'+40EXAMPLE@EXAMPLE.com>'),`
			`('Company Sleuth <csleuth@email.xxx.com>'`
			`'@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth'`
			`'+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'),`
			`('Doe III, John '`
			`'</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')]`
			`for line in ok_lines:`
			`ok_(RE_SENDER_WITH_NAME.match(line),`
			`'{} should be matched'.format(line))`

			`nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru']`
			`for line in nok_lines:`
			`assert_false(RE_SENDER_WITH_NAME.match(line),`
			`'{} should not be matched'.format(line))`


			`# Now test helpers functions`
			`def test_binary_regex_search():`
			`eq_(1, h.binary_regex_search(re.compile("12"))("12"))`
			`eq_(0, h.binary_regex_search(re.compile("12"))("34"))`


			`def binary_regex_match(prog):`
			`eq_(1, h.binary_regex_match(re.compile("12"))("12 3"))`
			`eq_(0, h.binary_regex_match(re.compile("12"))("3 12"))`


			`def test_flatten_list():`
			`eq_([1, 2, 3, 4, 5], h.flatten_list([[1, 2], [3, 4, 5]]))`


			`@patch.object(h.re, 'compile')`
			`def test_contains_sender_names(re_compile):`
			`with patch.object(h, 'extract_names',`
			`Mock(return_value=['bob', 'smith'])) as extract_names:`
			`has_sender_names = h.contains_sender_names("bob.smith@example.com")`
			`extract_names.assert_called_with("bob.smith@example.com")`
			`for name in ["bob", "Bob", "smith", "Smith"]:`
			`ok_(has_sender_names(name))`

			`extract_names.return_value = ''`
			`has_sender_names = h.contains_sender_names("bob.smith@example.com")`
			`# if no names could be extracted fallback to the email address`
			`ok_(has_sender_names('bob.smith@example.com'))`

			`# don't crash if there are no sender`
			`extract_names.return_value = ''`
			`has_sender_names = h.contains_sender_names("")`
			`assert_false(has_sender_names(''))`


			`def test_extract_names():`
			`senders_names = {`
			`# from example dataset`
			`('Jay Rickerts <eCenter@example.com>@EXAMPLE <XXX-Jay+20Rickerts'`
			`'+20+3CeCenter+40example+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'):`
			`['Jay', 'Rickerts'],`
			# if `,` is used in sender's name
			`'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':`
			`['Williams', 'III', 'Bill'],`
			# if somehow `'` or `"` are used in sender's name
			`'Laura" "Goldberg <laura.goldberg@example.com>':`
			`['Laura', 'Goldberg'],`
			`# extract from senders email address`
			`'<sergey@xxx.ru>': ['sergey'],`
			`# extract from sender's email address`
			`# if dots are used in the email address`
			`'<sergey.obukhov@xxx.ru>': ['sergey', 'obukhov'],`
			`# extract from sender's email address`
			`# if dashes are used in the email address`
			`'<sergey-obukhov@xxx.ru>': ['sergey', 'obukhov'],`
			`# extract from sender's email address`
			# if `_` are used in the email address
			`'<sergey_obukhov@xxx.ru>': ['sergey', 'obukhov'],`
			`# old style From field, found in jangada dataset`
			`'wcl@example.com (Wayne Long)': ['Wayne', 'Long'],`
			`# if only sender's name provided`
			`'Wayne Long': ['Wayne', 'Long'],`
			`# if middle name is shortened with dot`
			`'Sergey N. Obukhov <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],`
			`# not only spaces could be used as name splitters`
			`' Sergey Obukhov <serobnic@xxx.ru>': ['Sergey', 'Obukhov'],`
			`# finally normal example`
			`'Sergey <serobnic@xxx.ru>': ['Sergey'],`
			# if middle name is shortened with `,`
			`'Sergey N, Obukhov': ['Sergey', 'Obukhov'],`
			`# if mailto used with email address and sender's name is specified`
			`'Sergey N, Obukhov [mailto: serobnic@xxx.ru]': ['Sergey', 'Obukhov'],`
			`# when only email address is given`
			`'serobnic@xxx.ru': ['serobnic'],`
			`# when nothing is given`
			`'': [],`
			# if phone is specified in the `From:` header
			`'wcl@example.com (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],`
			# from crash reports `nothing to repeat`
			`'* * * * <the_pod1@example.com>': ['the', 'pod'],`
			`'"Bobby B" <copymycashsystem@example.com>':`
			`['Bobby', 'copymycashsystem'],`
			# from crash reports `bad escape`
			`'"M Ali B Azlan \(GHSE/PETH\)" <aliazlan@example.com>':`
			`['Ali', 'Azlan'],`
			`('"Ridthauddin B A Rahim \(DD/PCSB\)"'`
			`' <ridthauddin_arahim@example.com>'): ['Ridthauddin', 'Rahim'],`
			`('"Boland, Patrick \(Global Xxx Group, Ireland \)"'`
			`' <Patrick.Boland@example.com>'): ['Boland', 'Patrick'],`
			`'"Mates Rate \(Wine\)" <amen@example.com.com>':`
			`['Mates', 'Rate', 'Wine'],`
			`('"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'`
			`' <paul.morgan@example.com>'): ['Morgan', 'Paul'],`
			`'"David DECOSTER \(Domicile\)" <decosterdavid@xxx.be>':`
			`['David', 'DECOSTER', 'Domicile']`
			`}`

			`for sender, expected_names in senders_names.items():`
			`extracted_names = h.extract_names(sender)`
			`# check that extracted names could be compiled`
			`try:`
			`re.compile("\|".join(extracted_names))`
			`except Exception, e:`
			`ok_(False, ("Failed to compile extracted names {}"`
			`"\n\nReason: {}").format(extracted_names, e))`
			`if expected_names:`
			`for name in expected_names:`
			`assert_in(name, extracted_names)`
			`else:`
			`eq_(expected_names, extracted_names)`

			# words like `ru`, `gmail`, `com`, `org`, etc. are not considered
			`# sender's names`
			`for word in h.BAD_SENDER_NAMES:`
			`eq_(h.extract_names(word), [])`

			`# duplicates are not allowed`
			`eq_(h.extract_names("sergey <sergey@example.com"), ["sergey"])`


			`def test_categories_percent():`
			`eq_(0.0, h.categories_percent("qqq ggg hhh", ["Po"]))`
			`eq_(50.0, h.categories_percent("q,w.", ["Po"]))`
			`eq_(0.0, h.categories_percent("qqq ggg hhh", ["Nd"]))`
			`eq_(50.0, h.categories_percent("q5", ["Nd"]))`
			`eq_(50.0, h.categories_percent("s.s,5s", ["Po", "Nd"]))`
			`eq_(0.0, h.categories_percent("", ["Po", "Nd"]))`


			`@patch.object(h, 'categories_percent')`
			`def test_punctuation_percent(categories_percent):`
			`h.punctuation_percent("qqq")`
			`categories_percent.assert_called_with("qqq", ['Po'])`


			`def test_capitalized_words_percent():`
			`eq_(0.0, h.capitalized_words_percent(''))`
			`eq_(100.0, h.capitalized_words_percent('Example Corp'))`
			`eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))`
			`eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))`
			`eq_(100.0, h.capitalized_words_percent('8th Floor'))`
			`eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))`


			`def test_has_signature():`
			`ok_(h.has_signature('sender', 'sender@example.com'))`
			`ok_(h.has_signature('http://www.example.com\n555 555 5555',`
			`'sender@example.com'))`
			`ok_(h.has_signature('http://www.example.com\naddress@example.com',`
			`'sender@example.com'))`
			`assert_false(h.has_signature('http://www.example.com/555-555-5555',`
			`'sender@example.com'))`
			`long_line = ''.join(['q' for e in xrange(28)])`
			`assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))`
			`# wont crash on an empty string`
			`assert_false(h.has_signature('', ''))`
			`# dont consider empty strings when analysing signature`
			`with patch.object(h, 'SIGNATURE_MAX_LINES', 1):`
			`ok_('sender\n\n', 'sender@example.com')`