initial commit

2014-07-23 21:12:54 -07:00
commit 170f11038b
80 changed files with 7481 additions and 0 deletions
--- a/talon/signature/learning/featurespace.py
+++ b/talon/signature/learning/featurespace.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+
+""" The module provides functions for convertion of a message body/body lines
+into classifiers features space.
+
+The body and the message sender string are converted into unicode before
+applying features to them.
+"""
+
+from talon.signature.constants import SIGNATURE_MAX_LINES
+from talon.signature.learning.helpers import *
+
+
+def features(sender=''):
+    '''Returns a list of signature features.'''
+    return [
+        # This one isn't from paper.
+        # Meant to match companies names, sender's names, address.
+        many_capitalized_words,
+        # This one is not from paper.
+        # Line is too long.
+        # This one is less aggressive than `Line is too short`
+        lambda line: 1 if len(line) > 60 else 0,
+        # Line contains email pattern.
+        binary_regex_search(RE_EMAIL),
+        # Line contains url.
+        binary_regex_search(RE_URL),
+        # Line contains phone number pattern.
+        binary_regex_search(RE_RELAX_PHONE),
+        # Line matches the regular expression "^[\s]*---*[\s]*$".
+        binary_regex_match(RE_SEPARATOR),
+        # Line has a sequence of 10 or more special characters.
+        binary_regex_search(RE_SPECIAL_CHARS),
+        # Line contains any typical signature words.
+        binary_regex_search(RE_SIGNATURE_WORDS),
+        # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
+        binary_regex_search(RE_NAME),
+        # Percentage of punctuation symbols in the line is larger than 50%
+        lambda line: 1 if punctuation_percent(line) > 50 else 0,
+        # Percentage of punctuation symbols in the line is larger than 90%
+        lambda line: 1 if punctuation_percent(line) > 90 else 0,
+        contains_sender_names(sender)
+        ]
+
+
+def apply_features(body, features):
+    '''Applies features to message body lines.
+
+    Returns list of lists. Each of the lists corresponds to the body line
+    and is constituted by the numbers of features occurances (0 or 1).
+    E.g. if element j of list i equals 1 this means that
+    feature j occured in line i (counting from the last line of the body).
+    '''
+    # collect all non empty lines
+    lines = [line for line in body.splitlines() if line.strip()]
+
+    # take the last SIGNATURE_MAX_LINES
+    last_lines = lines[-SIGNATURE_MAX_LINES:]
+
+    # apply features, fallback to zeros
+    return ([[f(line) for f in features] for line in last_lines] or
+            [[0 for f in features]])
+
+
+def build_pattern(body, features):
+    '''Converts body into a pattern i.e. a point in the features space.
+
+    Applies features to the body lines and sums up the results.
+    Elements of the pattern indicate how many times a certain feature occured
+    in the last lines of the body.
+    '''
+    line_patterns = apply_features(body, features)
+    return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)