233 lines
6.6 KiB
Python
233 lines
6.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
""" The module provides:
|
|
* functions used when evaluating signature's features
|
|
* regexp's constants used when evaluating signature's features
|
|
|
|
"""
|
|
import unicodedata
|
|
|
|
import regex as re
|
|
|
|
from talon.signature.constants import SIGNATURE_MAX_LINES
|
|
|
|
# Short alias that keeps the pattern table below compact.
rc = re.compile

# All patterns are raw strings so that backslash sequences such as \S, \( or
# \d reach the regex engine verbatim instead of relying on Python passing
# unknown string escapes through unchanged (deprecated behaviour that newer
# interpreters warn about).

# A non-whitespace character on each side of "@".
RE_EMAIL = rc(r'\S@\S')
# Two or more groups of 2-3 digits, each optionally parenthesised and
# followed by at most three arbitrary characters.
RE_RELAX_PHONE = rc(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
# Either an http(s) scheme or "www." followed by a dotted host name.
RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""")

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc(r'^[\s]*---*[\s]*$')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(
    r'^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
    r'[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'
)

# Closing phrases, "sent from my ..." footers and company-like words.
RE_SIGNATURE_WORDS = rc(
    r'(T|t)hank.*,|(B|b)est|(R|r)egards|'
    r'^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
    r'(C|c)orporation|Group'
)

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc(r'[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

# A word starting with "(", "+" or a digit is not counted as a regular word.
INVALID_WORD_START = rc(r'\(|\+|[\d]')

# Words that must never be treated as a part of the sender's name.
BAD_SENDER_NAMES = [
    # known mail domains
    'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
    'example',
    # first level domains
    'com', 'org', 'net', 'ru',
    # bad words
    'mailto'
]
|
|
|
|
|
|
def binary_regex_search(prog):
    """Wrap ``prog.search`` into a feature function that yields 1 or 0.

    The returned callable takes a string and gives 1 when the compiled
    regular expression `prog` is found anywhere in it, and 0 when it is not.

    >>> import regex as re
    >>> binary_regex_search(re.compile("12"))("12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    """
    def found_anywhere(s):
        if prog.search(s):
            return 1
        return 0

    return found_anywhere
|
|
|
|
|
|
def binary_regex_match(prog):
    """Wrap ``prog.match`` into a feature function that yields 1 or 0.

    The returned callable takes a string and gives 1 when the compiled
    regular expression `prog` matches at the beginning of it, and 0 when
    it does not.

    >>> import regex as re
    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    """
    def matches_at_start(s):
        if prog.match(s):
            return 1
        return 0

    return matches_at_start
|
|
|
|
|
|
def flatten_list(list_to_flatten):
    """Flatten exactly one level of nesting into a new list.

    Every element of `list_to_flatten` must itself be iterable; deeper
    nesting is left untouched.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
    >>> flatten_list([[1], [[2]]])
    [1, [2]]
    >>> flatten_list([1, [2]])
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not iterable
    """
    flat = []
    for sublist in list_to_flatten:
        flat.extend(sublist)
    return flat
|
|
|
|
|
|
def contains_sender_names(sender):
    """Return a feature function that searches for the sender's name parts.

    The name parts come from `extract_names(sender)`; each one is looked for
    both as extracted and capitalized. The returned callable takes a line
    and gives 1 if any alternative is found in it and 0 otherwise.

    When no name part can be extracted, the sender text itself is searched
    for literally. An empty sender yields a function that always returns 0.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    >>> contains_sender_names("(")("call (555)")
    1
    >>> contains_sender_names("(")("no names here")
    0
    """
    variants = flatten_list([[name, name.capitalize()]
                             for name in extract_names(sender)])
    # The "( |$)" boundary is the join separator, so it follows every
    # alternative except the last one.
    pattern = '( |$)|'.join(variants)
    if not pattern:
        # Fall back to the sender text. It is arbitrary header content, not
        # a regular expression, so it must be escaped before compiling;
        # otherwise a sender such as "(" raises a regex error.
        pattern = re.escape(sender)
    if pattern != '':
        return binary_regex_search(re.compile(pattern))
    return lambda s: 0
|
|
|
|
|
|
def extract_names(sender):
    """Pull candidate name words out of a `From:` header value.

    The result is not limited to real names: it may also contain e.g.
    the company name or parts of the email address. Words keep their
    original order and each word appears only once.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    # Turn every non-alphabetical character into a space so that
    # punctuation, digits and address syntax all act as word separators.
    letters_only = "".join(ch if ch.isalpha() else ' ' for ch in sender)

    unique_names = []
    for candidate in letters_only.split():
        # Drop one-letter words, "black" list words such as `ru`, `gmail`,
        # `com`, `org`, and words that were already collected.
        if (len(candidate) < 2
                or candidate in BAD_SENDER_NAMES
                or candidate in unique_names):
            continue
        unique_names.append(candidate)

    return unique_names
|
|
|
|
|
|
def categories_percent(s, categories):
    """Return the percentage of characters of `s` in the given categories.

    `categories` holds Unicode general category codes as reported by
    `unicodedata.category`, e.g. "Po" or "Nd". An empty `s` gives 0.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    matching = sum(1 for char in s
                   if unicodedata.category(char) in categories)
    if not len(s):
        return 0
    return 100 * float(matching) / len(s)
|
|
|
|
|
|
def punctuation_percent(s):
    """Return the percentage of characters of `s` in Unicode category "Po".

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    """
    punctuation_categories = ['Po']
    return categories_percent(s, punctuation_categories)
|
|
|
|
|
|
def capitalized_words_percent(s):
    """Return the percentage of capitalized words among the valid words.

    Only whitespace-separated words longer than two characters are
    considered. A word is valid unless it starts the way
    `INVALID_WORD_START` describes; it is capitalized when its first
    character is upper case and its second one is not.

    Returns 0 when there are no valid words or when fewer than two words
    are longer than two characters; otherwise returns a float.

    >>> capitalized_words_percent("Hello World")
    100.0
    >>> capitalized_words_percent("Hello")
    0
    """
    # The pattern is a raw string: a backslash-s in a plain literal is an
    # invalid escape sequence that newer interpreters warn about.
    words = [w for w in re.split(r'\s', s) if w.strip() and len(w) > 2]

    valid_words_counter = 0
    capitalized_words_counter = 0
    for word in words:
        if INVALID_WORD_START.match(word):
            continue
        valid_words_counter += 1
        # Indexing word[1] is safe: every remaining word has length > 2.
        if word[0].isupper() and not word[1].isupper():
            capitalized_words_counter += 1

    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter

    return 0
|
|
|
|
|
|
def many_capitalized_words(s):
    """Tell whether most of the words in `s` are capitalized.

    Returns 1 if `capitalized_words_percent(s)` is greater than 66
    and 0 otherwise.
    """
    return int(capitalized_words_percent(s) > 66)
|
|
|
|
|
|
def has_signature(body, sender):
    """Check whether the message body has a signature.

    Only the last `SIGNATURE_MAX_LINES` non-empty lines of `body` are
    inspected. Returns True as soon as one of them contains the sender's
    name, or when more than one of them gets an upvote for contact details;
    returns False otherwise.
    """
    non_empty = [line for line in body.splitlines() if line.strip()]
    candidate = non_empty[-SIGNATURE_MAX_LINES:]
    sender_check = contains_sender_names(sender)
    contact_checks = [binary_regex_search(pattern)
                      for pattern in (RE_RELAX_PHONE, RE_EMAIL, RE_URL)]

    upvotes = 0
    for line in candidate:
        # Lines holding the sender's name, a phone, an email or a url are
        # short, so anything longer than 27 characters is skipped.
        if len(line.strip()) > 27:
            continue

        if sender_check(line):
            return True

        # A line is upvoted only when exactly one of the three
        # contact patterns is found in it.
        if sum(check(line) for check in contact_checks) == 1:
            upvotes += 1

    return upvotes > 1
|