talon/talon/signature/learning/helpers.py

# -*- coding: utf-8 -*-

""" The module provides:
* functions used when evaluating signature's features
* regexp's constants used when evaluating signature's features

"""
import unicodedata

import regex as re

from talon.signature.constants import SIGNATURE_MAX_LINES

rc = re.compile

RE_EMAIL = rc('\S@\S')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc('^[\s]*---*[\s]*$')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
                       '[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))

RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
                         '^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
                         '(C|c)orporation|Group'))

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

INVALID_WORD_START = rc('\(|\+|[\d]')

BAD_SENDER_NAMES = [
    # known mail domains
    'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
    'example',
    # first level domains
    'com', 'org', 'net', 'ru',
    # bad words
    'mailto'
    ]


def binary_regex_search(prog):
    """Returns a function that returns 1 or 0 depending on regex search result.

    If regular expression compiled into prog is present in a string
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_search(re.compile("12"))("12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    """
    return lambda s: 1 if prog.search(s) else 0


def binary_regex_match(prog):
    """Returns a function that returns 1 or 0 depending on regex match result.

    If a string matches regular expression compiled into prog
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    """
    return lambda s: 1 if prog.match(s) else 0


def flatten_list(list_to_flatten):
    """Simple list comprehension to flatten list.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
    >>> flatten_list([[1], [[2]]])
    [1, [2]]
    >>> flatten_list([1, [2]])
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not iterable
    """
    return [e for sublist in list_to_flatten for e in sublist]


def contains_sender_names(sender):
    """Returns a functions to search sender\'s name or it\'s part.

    >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    """
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: 0


def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N.  Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    names = list()
    for word in sender.split():
        if len(word) < 2:
            continue
        if word in BAD_SENDER_NAMES:
            continue
        if word in names:
            continue
        names.append(word)

    return names


def categories_percent(s, categories):
    """Returns category characters percent.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    count = 0
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
    return 100 * float(count) / len(s) if len(s) else 0


def punctuation_percent(s):
    """Returns punctuation percent.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    """
    return categories_percent(s, ['Po'])


def capitalized_words_percent(s):
    """Returns capitalized words percent."""
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter

    return 0


def many_capitalized_words(s):
    """Returns a function to check percentage of capitalized words.

    The function returns 1 if percentage greater then 65% and 0 otherwise.
    """
    return 1 if capitalized_words_percent(s) > 66 else 0


def has_signature(body, sender):
    """Checks if the body has signature. Returns True or False."""
    non_empty = [line for line in body.splitlines() if line.strip()]
    candidate = non_empty[-SIGNATURE_MAX_LINES:]
    upvotes = 0
    sender_check = contains_sender_names(sender)
    for line in candidate:
        # we check lines for sender's name, phone, email and url,
        # those signature lines don't take more then 27 lines
        if len(line.strip()) > 27:
            continue

        if sender_check(line):
            return True

        if (binary_regex_search(RE_RELAX_PHONE)(line) +
                binary_regex_search(RE_EMAIL)(line) +
                binary_regex_search(RE_URL)(line) == 1):
            upvotes += 1

    if upvotes > 1:
        return True

    return False