initial commit

This commit is contained in:
Sergey Obukhov
2014-07-23 21:12:54 -07:00
commit 170f11038b
80 changed files with 7481 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
""" The module provides functions for convertion of a message body/body lines
into classifiers features space.
The body and the message sender string are converted into unicode before
applying features to them.
"""
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.helpers import *
def features(sender=''):
'''Returns a list of signature features.'''
return [
# This one isn't from paper.
# Meant to match companies names, sender's names, address.
many_capitalized_words,
# This one is not from paper.
# Line is too long.
# This one is less aggressive than `Line is too short`
lambda line: 1 if len(line) > 60 else 0,
# Line contains email pattern.
binary_regex_search(RE_EMAIL),
# Line contains url.
binary_regex_search(RE_URL),
# Line contains phone number pattern.
binary_regex_search(RE_RELAX_PHONE),
# Line matches the regular expression "^[\s]*---*[\s]*$".
binary_regex_match(RE_SEPARATOR),
# Line has a sequence of 10 or more special characters.
binary_regex_search(RE_SPECIAL_CHARS),
# Line contains any typical signature words.
binary_regex_search(RE_SIGNATURE_WORDS),
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
binary_regex_search(RE_NAME),
# Percentage of punctuation symbols in the line is larger than 50%
lambda line: 1 if punctuation_percent(line) > 50 else 0,
# Percentage of punctuation symbols in the line is larger than 90%
lambda line: 1 if punctuation_percent(line) > 90 else 0,
contains_sender_names(sender)
]
def apply_features(body, features):
'''Applies features to message body lines.
Returns list of lists. Each of the lists corresponds to the body line
and is constituted by the numbers of features occurances (0 or 1).
E.g. if element j of list i equals 1 this means that
feature j occured in line i (counting from the last line of the body).
'''
# collect all non empty lines
lines = [line for line in body.splitlines() if line.strip()]
# take the last SIGNATURE_MAX_LINES
last_lines = lines[-SIGNATURE_MAX_LINES:]
# apply features, fallback to zeros
return ([[f(line) for f in features] for line in last_lines] or
[[0 for f in features]])
def build_pattern(body, features):
'''Converts body into a pattern i.e. a point in the features space.
Applies features to the body lines and sums up the results.
Elements of the pattern indicate how many times a certain feature occured
in the last lines of the body.
'''
line_patterns = apply_features(body, features)
return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)