Files
talon/talon/signature/learning/helpers.py
2022-02-04 17:31:53 +03:00

233 lines
6.6 KiB
Python

# -*- coding: utf-8 -*-
""" The module provides:
* functions used when evaluating a signature's features
* regex constants used when evaluating a signature's features
"""
import unicodedata
import regex as re
from talon.signature.constants import SIGNATURE_MAX_LINES
# Short alias: every pattern below is compiled once, at import time.
rc = re.compile

# All patterns are written as raw strings so that regex escapes such as
# ``\S`` or ``\(`` are not parsed as (invalid) Python string escapes.

# An "@" with a non-whitespace character on each side.
RE_EMAIL = rc(r'\S@\S')

# Two or more groups of 2-3 digits, each optionally wrapped in parentheses
# and followed by up to three arbitrary characters.
RE_RELAX_PHONE = rc(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')

# Either an explicit http(s) scheme or a "www.<something>.<something>" host.
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc(r'^[\s]*---*[\s]*$')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc((r'^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
                       r'[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))

# Words and phrases that commonly appear in a signature.
RE_SIGNATURE_WORDS = rc((r'(T|t)hank.*,|(B|b)est|(R|r)egards|'
                         r'^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
                         r'(C|c)orporation|Group'))

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc(r'[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

# A word starting with "(", "+" or a digit.
INVALID_WORD_START = rc(r'\(|\+|[\d]')
# Words that are dropped when extracting a sender's names from a
# ``From:`` header (see ``extract_names``).
BAD_SENDER_NAMES = [
    # known mail domains
    'hotmail',
    'gmail',
    'yandex',
    'mail',
    'yahoo',
    'mailgun',
    'mailgunhq',
    'example',
    # first level domains
    'com',
    'org',
    'net',
    'ru',
    # bad words
    'mailto',
]
def binary_regex_search(prog):
    """Build a 1/0 feature function on top of ``prog.search``.

    The returned callable takes a string and yields 1 when the compiled
    pattern ``prog`` is found anywhere in it, and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_search(re.compile("12"))("12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    """
    def found_anywhere(s):
        if prog.search(s):
            return 1
        return 0

    return found_anywhere
def binary_regex_match(prog):
    """Build a 1/0 feature function on top of ``prog.match``.

    The returned callable takes a string and yields 1 when the compiled
    pattern ``prog`` matches at the beginning of it, and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    """
    def matches_at_start(s):
        if prog.match(s):
            return 1
        return 0

    return matches_at_start
def flatten_list(list_to_flatten):
    """Flatten exactly one level of nesting.

    Every element of ``list_to_flatten`` must itself be iterable; deeper
    nesting is left untouched.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
    >>> flatten_list([[1], [[2]]])
    [1, [2]]
    >>> flatten_list([1, [2]])
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not iterable
    """
    flattened = []
    for sublist in list_to_flatten:
        flattened.extend(sublist)
    return flattened
def contains_sender_names(sender):
    """Returns a function to search for the sender's name or a part of it.

    The returned function takes a line of text and returns 1 if any name
    extracted from ``sender`` (as-is or capitalized) is found in it,
    and 0 otherwise.

    If no names can be extracted, the literal ``sender`` text is searched
    for instead; if ``sender`` is empty the returned function always
    returns 0.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    >>> contains_sender_names("(")("hello (")
    1
    >>> contains_sender_names("")("anything")
    0
    """
    # Offer every extracted name both as-is and capitalized.
    variants = flatten_list([[name, name.capitalize()]
                             for name in extract_names(sender)])
    if variants:
        # Extracted names are purely alphabetic, so they are safe to embed
        # in a pattern without escaping.
        # NOTE(review): joining leaves the last alternative without the
        # trailing "( |$)" boundary; kept as-is to preserve behaviour.
        pattern = '( |$)|'.join(variants)
    else:
        # No usable name: fall back to the literal sender text. It must be
        # escaped, otherwise regex metacharacters in the header (e.g. a
        # lone "(") either fail to compile or match unintended text.
        pattern = re.escape(sender)

    if pattern != '':
        return binary_regex_search(re.compile(pattern))
    return lambda s: 0
def extract_names(sender):
    """Try to pull the sender's names out of a `From:` header value.

    Besides actual names, the result may contain other words from the
    header, e.g. a company name or parts of the e-mail address.
    Words are returned in order of first appearance, without duplicates.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    # Replace every non-alphabetical character with a space so that
    # splitting on whitespace yields purely alphabetic words.
    cleaned = "".join(ch if ch.isalpha() else ' ' for ch in sender)

    result = []
    for token in cleaned.split():
        # Keep a word only if it is long enough, is not on the "black"
        # list (`ru`, `gmail`, `com`, `org`, etc.) and was not seen before.
        keep = (
            len(token) >= 2
            and token not in BAD_SENDER_NAMES
            and token not in result
        )
        if keep:
            result.append(token)
    return result
def categories_percent(s, categories):
    """Return the percentage of characters of ``s`` in given categories.

    ``categories`` holds Unicode general category codes as returned by
    ``unicodedata.category`` (e.g. "Po", "Nd"). An empty string gives 0.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    total = len(s)
    if not total:
        return 0
    hits = sum(1 for char in s if unicodedata.category(char) in categories)
    return 100 * float(hits) / total
def punctuation_percent(s):
    """Return the percentage of ``s`` made up of "Po" category characters.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    """
    # "Po" is the Unicode general category code passed to the lookup.
    punctuation_categories = ['Po']
    return categories_percent(s, punctuation_categories)
def capitalized_words_percent(s):
    """Returns capitalized words percent.

    Only whitespace-separated words longer than two characters are
    considered. Words starting with "(", "+" or a digit
    (``INVALID_WORD_START``) are not counted as valid words. A valid word
    is capitalized when its first character is uppercase and its second
    character is not.

    Returns the share of capitalized words among valid words, as a float
    percentage. Returns 0 when there are no valid words or when fewer
    than two words longer than two characters are present.
    """
    # The pattern is a raw string so that ``\s`` is not parsed as an
    # (invalid) Python string escape.
    words = re.split(r'\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]

    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            # word[1] is safe: every remaining word has at least 3 chars.
            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1

    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
def many_capitalized_words(s):
    """Tell whether most words of ``s`` are capitalized.

    Returns 1 if the capitalized words percentage of ``s`` is greater
    than 66, and 0 otherwise.
    """
    return int(capitalized_words_percent(s) > 66)
def has_signature(body, sender):
    """Check whether the body has a signature. Returns True or False.

    Only the last ``SIGNATURE_MAX_LINES`` non-empty lines of ``body`` are
    inspected, and among those only lines of at most 27 characters
    (after stripping). The body is considered signed when one such line
    contains the sender's name, or when more than one such line matches
    exactly one of the phone, e-mail and URL patterns.
    """
    lines = [ln for ln in body.splitlines() if ln.strip()]
    tail = lines[-SIGNATURE_MAX_LINES:]

    matches_sender = contains_sender_names(sender)
    contact_patterns = (RE_RELAX_PHONE, RE_EMAIL, RE_URL)

    votes = 0
    for ln in tail:
        # Lines are checked for the sender's name, a phone, an e-mail and
        # a URL; lines longer than 27 characters are not considered.
        if len(ln.strip()) > 27:
            continue

        if matches_sender(ln):
            return True

        # A line earns a vote only when exactly one pattern is found.
        hits = sum(1 for pattern in contact_patterns if pattern.search(ln))
        if hits == 1:
            votes += 1

    return votes > 1