Files
talon/talon/signature/bruteforce.py

188 lines
5.7 KiB
Python
Raw Normal View History

2016-07-12 17:25:46 +05:00
from __future__ import absolute_import
2017-05-23 15:39:50 -07:00
2014-07-23 21:12:54 -07:00
import logging
import regex as re
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
2017-05-23 15:39:50 -07:00
from talon.utils import get_delimiter
2014-07-23 21:12:54 -07:00
log = logging.getLogger(__name__)
# Regex to fetch a signature based on common signature words.
#
# A match starts at a line that is either
#   * optional whitespace, one or more dashes, then optional whitespace and
#     letters/spaces/dots (e.g. "--" or "-- Bob"), or
#   * one of "thanks", "regards", "cheers" or "best ..." followed only by
#     whitespace, commas or exclamation marks,
# and, because of the trailing ``.*`` combined with re.S, extends to the very
# end of the searched text.
#
# Flags: re.I - case-insensitive, re.X - verbose pattern layout,
#        re.M - ^/$ match at line boundaries, re.S - "." also matches newlines.
RE_SIGNATURE = re.compile(r'''
(
(?:
^[\s]*--*[\s]*[a-z \.]*$
|
^thanks[\s,!]*$
|
^regards[\s,!]*$
|
^cheers[\s,!]*$
|
^best[ a-z]*[\s,!]*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# Signatures appended by phone email clients.
#
# A match starts at a line beginning with one of:
#   * "sent from my ..."
#   * "sent from Mailbox for iPhone ..."
#   * "sent [<word> ]from my BlackBerry ..."
#   * "Enviado desde mi [up to two words ]BlackBerry ..." (Spanish variant)
# and, because of the trailing ``.*`` combined with re.S, extends to the very
# end of the searched text.
#
# Flags: re.I - case-insensitive, re.X - verbose pattern layout,
#        re.M - ^/$ match at line boundaries, re.S - "." also matches newlines.
RE_PHONE_SIGNATURE = re.compile(r'''
(
(?:
^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
|
^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
|
^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
|
^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
2014-07-25 02:40:37 +00:00
# Pattern applied (via .match) to the REVERSED marker string, i.e. it scans
# the candidate lines from the last one upwards. The ``candidate`` group
# captures the run of markers that may belong to the signature:
#   * one or more 'c' lines followed by a single 'd' line, provided the next
#     marker is not another 'd' (or the string ends there);
#   * otherwise just the run of 'c' lines;
#   * otherwise a single 'd' line not followed by another 'd'.
# NOTE: the group name ``candidate`` is reused across alternatives, which the
# third-party ``regex`` module (imported here as ``re``) permits.
# The pattern must contain only the alternatives below: in verbose mode any
# stray non-whitespace text becomes part of the pattern.
RE_SIGNATURE_CANDIDATE = re.compile(r'''
    (?P<candidate>c+d)[^d]
    |
    (?P<candidate>c+d)$
    |
    (?P<candidate>c+)
    |
    (?P<candidate>d)[^d]
    |
    (?P<candidate>d)$
''', re.I | re.X | re.M | re.S)
def extract_signature(msg_body):
    r'''
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    A phone-client signature (see RE_PHONE_SIGNATURE) is cut off first; the
    remaining text is then searched for a regular signature
    (see RE_SIGNATURE) among its last lines. When both are found they are
    joined with the message's line delimiter, regular signature first.

    If anything raises while processing, the error is logged and
    ``(msg_body, None)`` is returned unchanged.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    '''
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # work on the message without surrounding whitespace
        stripped_body = msg_body.strip()

        # The phone-signature regex runs on the unstripped message, so its
        # match offsets have to be shifted by the amount of leading
        # whitespace that strip() removed before slicing stripped_body.
        leading = len(msg_body) - len(msg_body.lstrip())

        # strip off phone signature
        phone_signature = None
        phone_match = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_match:
            cut = max(phone_match.start() - leading, 0)
            stripped_body = stripped_body[:cut]
            phone_signature = phone_match.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = delimiter.join(get_signature_candidate(lines))

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)

        signature = signature.group()

        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        # the signature runs to the end of the candidate, which itself is
        # the tail of the joined lines, so cut it off by length
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(), signature.strip())
    except Exception:
        # deliberate best effort: never fail the caller, hand back the input
        log.exception('ERROR extracting signature')
        return (msg_body, None)
def get_signature_candidate(lines):
    """Return the trailing lines of a message that could hold a signature.

    The returned lines:

    * start within the last SIGNATURE_MAX_LINES non-empty lines,
    * never include the first non-empty line,
    * are further narrowed by _mark_candidate_indexes() and
      _process_marked_candidate_indexes() (long lines, dash-prefixed lines).

    An empty list is returned when the message has at most one non-empty
    line or when no line qualifies.
    """
    # positions of the lines that contain something besides whitespace
    non_blank = [idx for idx, text in enumerate(lines) if text.strip()]

    # an empty or single-line message cannot carry a signature
    if len(non_blank) < 2:
        return []

    # skip the first line, then keep at most SIGNATURE_MAX_LINES positions
    tail = non_blank[1:][-SIGNATURE_MAX_LINES:]

    marks = _mark_candidate_indexes(lines, tail)
    kept = _process_marked_candidate_indexes(tail, marks)
    if not kept:
        return []

    # translate the first surviving position back into actual lines
    return lines[kept[0]:]
def _mark_candidate_indexes(lines, candidate):
"""Mark candidate indexes with markers
Markers:
* c - line that could be a signature line
* l - long line
* d - line that starts with dashes but has other chars as well
>>> _mark_candidate_lines(['Some text', '', '-', 'Bob'], [0, 2, 3])
'cdc'
"""
# at first consider everything to be potential signature lines
2017-05-23 15:39:50 -07:00
markers = list('c' * len(candidate))
2014-07-23 21:12:54 -07:00
# mark lines starting from bottom up
for i, line_idx in reversed(list(enumerate(candidate))):
if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
markers[i] = 'l'
else:
line = lines[line_idx].strip()
if line.startswith('-') and line.strip("-"):
markers[i] = 'd'
2017-05-23 15:39:50 -07:00
return "".join(markers)
2014-07-23 21:12:54 -07:00
def _process_marked_candidate_indexes(candidate, markers):
    """
    Run regexes against candidate's marked indexes to strip
    signature candidate.

    ``markers`` holds one marker per entry of ``candidate``. It is reversed
    and matched against RE_SIGNATURE_CANDIDATE; the end of the ``candidate``
    group is the number of trailing indexes to keep. An empty list is
    returned when the pattern does not match.

    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
    [17]

    >>> _process_marked_candidate_indexes([4, 6, 7], 'lcc')
    [6, 7]
    """
    # reverse so that the pattern scans from the last line upwards
    match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
    if not match:
        return []
    return candidate[-match.end('candidate'):]