Files
talon/talon/signature/learning/featurespace.py

78 lines
3.0 KiB
Python
Raw Normal View History

2014-07-23 21:12:54 -07:00
# -*- coding: utf-8 -*-
2014-07-25 02:40:37 +00:00
""" The module provides functions for conversion of a message body/body lines
2014-07-23 21:12:54 -07:00
into the classifier's feature space.
The body and the message sender string are converted into unicode before
applying features to them.
"""
2016-07-12 17:25:46 +05:00
from __future__ import absolute_import
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
2014-07-23 21:12:54 -07:00
from talon.signature.learning.helpers import *
2016-07-12 17:25:46 +05:00
from six.moves import zip
from functools import reduce
2014-07-23 21:12:54 -07:00
def features(sender=''):
    """Returns a list of signature features.

    The list order is fixed. Consumers in this module call every entry
    with a single body line. `sender` is passed to contains_sender_names
    to build the last feature.
    """
    def line_is_too_long(line):
        # 1 when the line exceeds TOO_LONG_SIGNATURE_LINE characters, else 0.
        if len(line) > TOO_LONG_SIGNATURE_LINE:
            return 1
        return 0

    def punctuation_above(threshold):
        # Builds a feature flagging lines whose punctuation percentage is
        # strictly greater than `threshold`.
        def check(line):
            if punctuation_percent(line) > threshold:
                return 1
            return 0
        return check

    return [
        # Not from the paper.
        # Meant to match company names, sender's names, addresses.
        many_capitalized_words,
        # Not from the paper.
        # Line is too long; less aggressive than `Line is too short`.
        line_is_too_long,
        # Line contains an email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains a url.
        binary_regex_search(RE_URL),
        # Line contains a phone number pattern.
        binary_regex_search(RE_RELAX_PHONE),
        # Line matches the regular expression "^[\s]*---*[\s]*$".
        binary_regex_match(RE_SEPARATOR),
        # Line has a sequence of 10 or more special characters.
        binary_regex_search(RE_SPECIAL_CHARS),
        # Line contains any typical signature words.
        binary_regex_search(RE_SIGNATURE_WORDS),
        # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
        binary_regex_search(RE_NAME),
        # Percentage of punctuation symbols in the line is larger than 50%.
        punctuation_above(50),
        # Percentage of punctuation symbols in the line is larger than 90%.
        punctuation_above(90),
        # Built from the sender string by contains_sender_names.
        contains_sender_names(sender),
    ]
def apply_features(body, features):
    """Applies features to message body lines.

    Blank (whitespace-only) lines are dropped, then only the last
    SIGNATURE_MAX_LINES remaining lines are examined.

    Returns a list of lists, one inner list per examined line, in the
    same order the lines appear in the body. Element j of inner list i is
    the value feature j returned for line i. When the body has no
    non-blank lines, a single all-zero row is returned instead.
    """
    # Collect the lines that hold something other than whitespace.
    non_blank = []
    for line in body.splitlines():
        if line.strip():
            non_blank.append(line)

    # Only the tail of the message is inspected.
    tail = non_blank[-SIGNATURE_MAX_LINES:]

    # Nothing to inspect: fall back to one row of zeros, one per feature.
    if not tail:
        return [[0 for _ in features]]

    return [[feature(line) for feature in features] for line in tail]
def build_pattern(body, features):
    """Converts body into a pattern, i.e. a point in the features space.

    Runs apply_features over the body and adds the resulting per-line rows
    together element by element, so entry j of the returned list is the
    sum of feature j's values over the lines apply_features examined.
    """
    def add_rows(totals, row):
        # Pairwise sum of two rows; zip stops at the shorter one.
        return [total + value for total, value in zip(totals, row)]

    per_line = apply_features(body, features)
    # No initial value is given: with a single row, reduce returns it as is.
    return reduce(add_rows, per_line)