talon/talon/signature/learning/featurespace.py

# -*- coding: utf-8 -*-

""" The module provides functions for conversion of a message body/body lines
into classifiers features space.

The body and the message sender string are converted into unicode before
applying features to them.
"""

from __future__ import absolute_import
from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)
from talon.signature.learning.helpers import *
from six.moves import zip
from functools import reduce


def features(sender=''):
    '''Builds the ordered list of per-line signature feature callables.

    Every entry is a callable that accepts a single body line. The features
    defined inline in this function return 1 when their condition holds and
    0 otherwise; the remaining entries come from the helpers imported from
    talon.signature.learning.helpers (presumably with the same 0/1
    contract -- confirm against that module).

    `sender` is only forwarded to contains_sender_names() to build the last
    feature.

    Keep the order stable: apply_features() emits one column per entry, in
    exactly this order.
    '''
    def line_is_too_long(line):
        # Not from the paper.
        # Less aggressive than `Line is too short`.
        if len(line) > TOO_LONG_SIGNATURE_LINE:
            return 1
        return 0

    def punctuation_above(threshold):
        # Factory for "percentage of punctuation symbols in the line is
        # larger than `threshold`%" features.
        def feature(line):
            if punctuation_percent(line) > threshold:
                return 1
            return 0
        return feature

    signature_features = [
        # Not from the paper.
        # Meant to match companies names, sender's names, address.
        many_capitalized_words,
        # Line is too long.
        line_is_too_long,
        # Line contains email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains url.
        binary_regex_search(RE_URL),
        # Line contains phone number pattern.
        binary_regex_search(RE_RELAX_PHONE),
        # Line matches the regular expression "^[\s]*---*[\s]*$".
        binary_regex_match(RE_SEPARATOR),
        # Line has a sequence of 10 or more special characters.
        binary_regex_search(RE_SPECIAL_CHARS),
        # Line contains any typical signature words.
        binary_regex_search(RE_SIGNATURE_WORDS),
        # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
        binary_regex_search(RE_NAME),
        # Percentage of punctuation symbols in the line is larger than 50%
        punctuation_above(50),
        # Percentage of punctuation symbols in the line is larger than 90%
        punctuation_above(90),
        # Built from the sender string passed by the caller.
        contains_sender_names(sender),
    ]
    return signature_features


def apply_features(body, features):
    '''Evaluates every feature on the trailing non-blank lines of a body.

    The body is split with splitlines(), lines that are empty or
    whitespace-only are dropped, and only the last SIGNATURE_MAX_LINES of
    the remaining lines are kept.

    Returns a list of rows, one per kept line, in the same order as the
    lines appear in the body. Row i holds the result of each feature applied
    to kept line i, in the order the features were given (element j of row i
    is features[j](line_i)).

    If the body has no non-blank lines, a single all-zero row is returned so
    that callers always get at least one row.
    '''
    # Keep only lines that have some non-whitespace content.
    non_blank = [text for text in body.splitlines() if text.strip()]

    # Only the tail of the message is examined.
    tail = non_blank[-SIGNATURE_MAX_LINES:]

    # Nothing to inspect: fall back to one row of zeros, one per feature.
    if not tail:
        return [[0 for _ in features]]

    return [[feature(text) for feature in features] for text in tail]


def build_pattern(body, features):
    '''Collapses a message body into a single point of the features space.

    The per-line feature rows produced by apply_features() are added
    together element-wise, so element j of the result is the number of
    examined lines (the last non-blank lines of the body) on which feature j
    fired.
    '''
    def add_rows(totals, row):
        # Element-wise sum of the running totals and the next line's row.
        return [left + right for left, right in zip(totals, row)]

    rows = apply_features(body, features)
    return reduce(add_rows, rows)
initial commit 2014-07-23 21:12:54 -07:00			`# -*- coding: utf-8 -*-`

Fixed typos 2014-07-25 02:40:37 +00:00			`""" The module provides functions for conversion of a message body/body lines`
initial commit 2014-07-23 21:12:54 -07:00			`into classifiers features space.`

			`The body and the message sender string are converted into unicode before`
			`applying features to them.`
			`"""`

Run modernizer on the code. 2016-07-12 17:25:46 +05:00			`from __future__ import absolute_import`
Utilising the Constants Checking for the length of a line to determine if it is possibly a signature or not could be done in a more generic way by determining the maximum size of the line via a constant. Hence advocating the spirit of the modifying the code in only one place and propagating that change everywhere. This exact approach has already been used at: 2015-01-21 15:54:57 +01:00			`from talon.signature.constants import (SIGNATURE_MAX_LINES,`
			`TOO_LONG_SIGNATURE_LINE)`
initial commit 2014-07-23 21:12:54 -07:00			`from talon.signature.learning.helpers import *`
Run modernizer on the code. 2016-07-12 17:25:46 +05:00			`from six.moves import zip`
			`from functools import reduce`
initial commit 2014-07-23 21:12:54 -07:00

			`def features(sender=''):`
			`'''Returns a list of signature features.'''`
			`return [`
			`# This one isn't from paper.`
			`# Meant to match companies names, sender's names, address.`
			`many_capitalized_words,`
			`# This one is not from paper.`
			`# Line is too long.`
			# This one is less aggressive than `Line is too short`
Utilising the Constants Checking for the length of a line to determine if it is possibly a signature or not could be done in a more generic way by determining the maximum size of the line via a constant. Hence advocating the spirit of the modifying the code in only one place and propagating that change everywhere. This exact approach has already been used at: 2015-01-21 15:54:57 +01:00			`lambda line: 1 if len(line) > TOO_LONG_SIGNATURE_LINE else 0,`
initial commit 2014-07-23 21:12:54 -07:00			`# Line contains email pattern.`
			`binary_regex_search(RE_EMAIL),`
			`# Line contains url.`
			`binary_regex_search(RE_URL),`
			`# Line contains phone number pattern.`
			`binary_regex_search(RE_RELAX_PHONE),`
			`# Line matches the regular expression "^[\s]*---*[\s]*$".`
			`binary_regex_match(RE_SEPARATOR),`
			`# Line has a sequence of 10 or more special characters.`
			`binary_regex_search(RE_SPECIAL_CHARS),`
			`# Line contains any typical signature words.`
			`binary_regex_search(RE_SIGNATURE_WORDS),`
			`# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.`
			`binary_regex_search(RE_NAME),`
			`# Percentage of punctuation symbols in the line is larger than 50%`
			`lambda line: 1 if punctuation_percent(line) > 50 else 0,`
			`# Percentage of punctuation symbols in the line is larger than 90%`
			`lambda line: 1 if punctuation_percent(line) > 90 else 0,`
			`contains_sender_names(sender)`
			`]`


			`def apply_features(body, features):`
			`'''Applies features to message body lines.`

			`Returns list of lists. Each of the lists corresponds to the body line`
Fixed typos 2014-07-25 02:40:37 +00:00			`and is constituted by the numbers of features occurrences (0 or 1).`
initial commit 2014-07-23 21:12:54 -07:00			`E.g. if element j of list i equals 1 this means that`
Fixed typos 2014-07-25 02:40:37 +00:00			`feature j occurred in line i (counting from the last line of the body).`
initial commit 2014-07-23 21:12:54 -07:00			`'''`
			`# collect all non empty lines`
			`lines = [line for line in body.splitlines() if line.strip()]`

			`# take the last SIGNATURE_MAX_LINES`
			`last_lines = lines[-SIGNATURE_MAX_LINES:]`

			`# apply features, fallback to zeros`
			`return ([[f(line) for f in features] for line in last_lines] or`
			`[[0 for f in features]])`


			`def build_pattern(body, features):`
			`'''Converts body into a pattern i.e. a point in the features space.`

			`Applies features to the body lines and sums up the results.`
Fixed typos 2014-07-25 02:40:37 +00:00			`Elements of the pattern indicate how many times a certain feature occurred`
initial commit 2014-07-23 21:12:54 -07:00			`in the last lines of the body.`
			`'''`
			`line_patterns = apply_features(body, features)`
			`return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)`