diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index b7d72eb..1c3a4b0 100644 Binary files a/talon/signature/data/classifier and b/talon/signature/data/classifier differ diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy index 0f965ba..2cec729 100644 Binary files a/talon/signature/data/classifier_02.npy and b/talon/signature/data/classifier_02.npy differ diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy index 5a35962..e5762ae 100644 Binary files a/talon/signature/data/classifier_03.npy and b/talon/signature/data/classifier_03.npy differ diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 953662b..7085a74 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES rc = re.compile -RE_EMAIL = rc('@') +RE_EMAIL = rc('\S@\S') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') @@ -120,7 +120,7 @@ def contains_sender_names(sender): names = names or sender if names != '': return binary_regex_search(re.compile(names)) - return lambda s: False + return lambda s: 0 def extract_names(sender): diff --git a/tests/signature/learning/featurespace_test.py b/tests/signature/learning/featurespace_test.py index a2c81cb..70df62b 100644 --- a/tests/signature/learning/featurespace_test.py +++ b/tests/signature/learning/featurespace_test.py @@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs def test_apply_features(): - s = '''John Doe + s = '''This is John Doe + +Tuesday @3pm suits. I'll chat to you then. VP Research and Development, Xxxx Xxxx Xxxxx @@ -19,11 +21,12 @@ john@example.com''' # note that we don't consider the first line because signatures don't # usually take all the text, empty lines are not considered eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) - with patch.object(fs, 'SIGNATURE_MAX_LINES', 4): + with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): features = fs.features(sender) new_result = fs.apply_features(s, features) # result remains the same because we don't consider empty lines