2016-07-12 17:25:46 +05:00
|
|
|
from __future__ import absolute_import
|
2017-05-23 15:39:50 -07:00
|
|
|
|
2014-07-23 21:12:54 -07:00
|
|
|
import logging
|
|
|
|
|
|
|
|
|
|
import regex as re
|
|
|
|
|
|
|
|
|
|
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
|
|
|
|
TOO_LONG_SIGNATURE_LINE)
|
2017-05-23 15:39:50 -07:00
|
|
|
from talon.utils import get_delimiter
|
2014-07-23 21:12:54 -07:00
|
|
|
|
|
|
|
|
# Module-level logger named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Regex to fetch a signature based on common signature words.
# The inner non-capturing group lists the line-anchored openers: a line made
# of dashes (optionally followed by letters, spaces or dots), "thanks",
# "regards", "cheers" or "best ...". Because of re.S the trailing `.*` then
# takes everything from that opener line up to the end of the text.
# Flags: case-insensitive, verbose, multi-line anchors, dot matches newline.
RE_SIGNATURE = re.compile(r'''
               (
                   (?:
                       ^[\s]*--*[\s]*[a-z \.]*$
                       |
                       ^thanks[\s,!]*$
                       |
                       ^regards[\s,!]*$
                       |
                       ^cheers[\s,!]*$
                       |
                       ^best[ a-z]*[\s,!]*$
                   )
                   .*
               )
               ''', re.I | re.X | re.M | re.S)
|
|
|
|
|
|
|
|
|
|
# Signatures appended by phone email clients ("Sent from my ...",
# "Sent from Mailbox for iPhone", BlackBerry footers in English and Spanish).
# As with RE_SIGNATURE, the match starts at the opener line and, thanks to
# re.S and the trailing `.*`, runs to the end of the text.
RE_PHONE_SIGNATURE = re.compile(r'''
               (
                   (?:
                       ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
                       |
                       ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
                       |
                       ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
                       |
                       ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
                   )
                   .*
               )
               ''', re.I | re.X | re.M | re.S)
|
|
|
|
|
|
|
|
|
|
# Pattern applied to a string of per-line markers; see
# _mark_candidate_indexes() for details on how the markers are produced:
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
#
# _process_marked_candidate_indexes() matches this against the REVERSED
# marker string, i.e. it reads the message from the last line upwards.
# A `d` is only accepted when it is not directly followed (in that reversed
# order) by another `d`.
# NOTE: the group name `candidate` is reused in every alternative; this
# relies on the `regex` module imported as `re` at the top of the file.
RE_SIGNATURE_CANDIDATE = re.compile(r'''
    (?P<candidate>c+d)[^d]
    |
    (?P<candidate>c+d)$
    |
    (?P<candidate>c+)
    |
    (?P<candidate>d)[^d]
    |
    (?P<candidate>d)$
''', re.I | re.X | re.M | re.S)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_signature(msg_body):
    '''
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    A footer added by phone mail clients (see RE_PHONE_SIGNATURE) is cut off
    first. When it is the only signature found it is returned as the second
    element; when a regular signature is found as well, the phone footer is
    appended to it using the message's line delimiter.

    Any exception raised while processing is logged and ``(msg_body, None)``
    is returned, so the function never raises to the caller.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    '''
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()

        # strip off phone signature
        phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_signature:
            # The match offset is relative to msg_body, whereas stripped_body
            # has already lost any leading whitespace; shift the offset so
            # the cut lands on the same character in stripped_body.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut = max(phone_signature.start() - leading, 0)
            stripped_body = stripped_body[:cut]
            phone_signature = phone_signature.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)

        signature = signature.group()
        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        # the signature runs to the end of the candidate, which in turn is
        # the tail of the joined lines, so drop that many trailing chars
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(),
                signature.strip())
    except Exception:
        # best effort: report the problem and hand the message back untouched
        log.exception('ERROR extracting signature')
        return (msg_body, None)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_signature_candidate(lines):
    """Return the trailing lines of a message that may contain a signature.

    Only non-empty lines are considered when choosing where the candidate
    starts. The first non-empty line is never part of it, and at most the
    last SIGNATURE_MAX_LINES non-empty lines are examined. Those lines are
    marked by _mark_candidate_indexes() and narrowed down by
    _process_marked_candidate_indexes(), which takes care of long lines and
    of lines starting with dashes.

    The result is the slice of `lines` from the first selected line to the
    end (empty lines in between included), or an empty list when the message
    has fewer than two non-empty lines or nothing was selected.
    """
    # indexes of the lines that have some content
    filled = [idx for idx, text in enumerate(lines) if text.strip()]

    # an empty or single-line message cannot carry a signature
    if len(filled) < 2:
        return []

    # skip the first line, then keep only the last SIGNATURE_MAX_LINES indexes
    tail = filled[1:][-SIGNATURE_MAX_LINES:]

    marks = _mark_candidate_indexes(lines, tail)
    kept = _process_marked_candidate_indexes(tail, marks)

    # turn the surviving indexes back into actual lines
    return lines[kept[0]:] if kept else []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _mark_candidate_indexes(lines, candidate):
    """Build a marker string for the candidate line indexes.

    One character is produced per index in `candidate`, in the same order:

    * l - the stripped line is longer than TOO_LONG_SIGNATURE_LINE
    * d - the stripped line starts with a dash and contains something
          other than dashes
    * c - anything else, i.e. a line that could be a signature line
          (a line consisting of dashes only falls in this group)

    Example, assuming no line exceeds TOO_LONG_SIGNATURE_LINE:

    >>> _mark_candidate_indexes(['Some text', '', '- Bob', 'Bob'], [0, 2, 3])
    'cdc'
    """
    def marker_for(text):
        # the length check takes precedence over the dash check
        if len(text) > TOO_LONG_SIGNATURE_LINE:
            return 'l'
        if text.startswith('-') and text.strip("-"):
            return 'd'
        return 'c'

    return "".join(marker_for(lines[idx].strip()) for idx in candidate)
|
2014-07-23 21:12:54 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def _process_marked_candidate_indexes(candidate, markers):
    """
    Run RE_SIGNATURE_CANDIDATE against the candidate's markers to strip
    the signature candidate.

    The marker string is reversed first so that the pattern is applied from
    the last line upwards. The end of the `candidate` group tells how many
    trailing indexes to keep; an empty list is returned when nothing matches.

    >>> _process_marked_candidate_indexes([2, 13, 15], 'dcc')
    [2, 13, 15]

    >>> _process_marked_candidate_indexes([2, 13, 15], 'ddc')
    [15]
    """
    bottom_up = markers[::-1]
    found = RE_SIGNATURE_CANDIDATE.match(bottom_up)
    if not found:
        return []

    # number of trailing candidate indexes covered by the match
    keep = found.end('candidate')
    return candidate[-keep:]
|