initial commit
This commit is contained in:
73
talon/signature/learning/featurespace.py
Normal file
73
talon/signature/learning/featurespace.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
""" The module provides functions for convertion of a message body/body lines
|
||||
into classifiers features space.
|
||||
|
||||
The body and the message sender string are converted into unicode before
|
||||
applying features to them.
|
||||
"""
|
||||
|
||||
from talon.signature.constants import SIGNATURE_MAX_LINES
|
||||
from talon.signature.learning.helpers import *
|
||||
|
||||
|
||||
def features(sender=''):
|
||||
'''Returns a list of signature features.'''
|
||||
return [
|
||||
# This one isn't from paper.
|
||||
# Meant to match companies names, sender's names, address.
|
||||
many_capitalized_words,
|
||||
# This one is not from paper.
|
||||
# Line is too long.
|
||||
# This one is less aggressive than `Line is too short`
|
||||
lambda line: 1 if len(line) > 60 else 0,
|
||||
# Line contains email pattern.
|
||||
binary_regex_search(RE_EMAIL),
|
||||
# Line contains url.
|
||||
binary_regex_search(RE_URL),
|
||||
# Line contains phone number pattern.
|
||||
binary_regex_search(RE_RELAX_PHONE),
|
||||
# Line matches the regular expression "^[\s]*---*[\s]*$".
|
||||
binary_regex_match(RE_SEPARATOR),
|
||||
# Line has a sequence of 10 or more special characters.
|
||||
binary_regex_search(RE_SPECIAL_CHARS),
|
||||
# Line contains any typical signature words.
|
||||
binary_regex_search(RE_SIGNATURE_WORDS),
|
||||
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
|
||||
binary_regex_search(RE_NAME),
|
||||
# Percentage of punctuation symbols in the line is larger than 50%
|
||||
lambda line: 1 if punctuation_percent(line) > 50 else 0,
|
||||
# Percentage of punctuation symbols in the line is larger than 90%
|
||||
lambda line: 1 if punctuation_percent(line) > 90 else 0,
|
||||
contains_sender_names(sender)
|
||||
]
|
||||
|
||||
|
||||
def apply_features(body, features):
|
||||
'''Applies features to message body lines.
|
||||
|
||||
Returns list of lists. Each of the lists corresponds to the body line
|
||||
and is constituted by the numbers of features occurances (0 or 1).
|
||||
E.g. if element j of list i equals 1 this means that
|
||||
feature j occured in line i (counting from the last line of the body).
|
||||
'''
|
||||
# collect all non empty lines
|
||||
lines = [line for line in body.splitlines() if line.strip()]
|
||||
|
||||
# take the last SIGNATURE_MAX_LINES
|
||||
last_lines = lines[-SIGNATURE_MAX_LINES:]
|
||||
|
||||
# apply features, fallback to zeros
|
||||
return ([[f(line) for f in features] for line in last_lines] or
|
||||
[[0 for f in features]])
|
||||
|
||||
|
||||
def build_pattern(body, features):
|
||||
'''Converts body into a pattern i.e. a point in the features space.
|
||||
|
||||
Applies features to the body lines and sums up the results.
|
||||
Elements of the pattern indicate how many times a certain feature occured
|
||||
in the last lines of the body.
|
||||
'''
|
||||
line_patterns = apply_features(body, features)
|
||||
return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)
|
||||
Reference in New Issue
Block a user