initial commit

2014-07-23 21:12:54 -07:00
commit 170f11038b
80 changed files with 7481 additions and 0 deletions
--- a/talon/signature/bruteforce.py
+++ b/talon/signature/bruteforce.py
@@ -0,0 +1,188 @@
+import logging
+
+import regex as re
+
+from talon.utils import get_delimiter
+from talon.signature.constants import (SIGNATURE_MAX_LINES,
+                                       TOO_LONG_SIGNATURE_LINE)
+
+log = logging.getLogger(__name__)
+
+
+# regex to fetch signature based on common signature words
+RE_SIGNATURE = re.compile(r'''
+               (
+                   (?:
+                       ^[\s]*--*[\s]*[a-z \.]*$
+                       |
+                       ^thanks[\s,!]*$
+                       |
+                       ^regards[\s,!]*$
+                       |
+                       ^cheers[\s,!]*$
+                       |
+                       ^best[ a-z]*[\s,!]*$
+                   )
+                   .*
+               )
+               ''', re.I | re.X | re.M | re.S)
+
+
+# signatures appended by phone email clients
+RE_PHONE_SIGNATURE = re.compile(r'''
+               (
+                   (?:
+                       ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
+                       |
+                       ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
+                       |
+                       ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
+                       |
+                       ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
+                   )
+                   .*
+               )
+               ''', re.I | re.X | re.M | re.S)
+
+
+# see _mark_candidate_indexes() for details
+# c - could be signature line
+# d - line starts with dashes (could be signature or list item)
+# l - long line
+RE_SIGNATURE_CANDIDAATE = re.compile(r'''
+    (?P<candidate>c+d)[^d]
+    |
+    (?P<candidate>c+d)$
+    |
+    (?P<candidate>c+)
+    |
+    (?P<candidate>d)[^d]
+    |
+    (?P<candidate>d)$
+''', re.I | re.X | re.M | re.S)
+
+
+def extract_signature(msg_body):
+    '''
+    Analyzes message for a presence of signature block (by common patterns)
+    and returns tuple with two elements: message text without signature block
+    and the signature itself.
+
+    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
+    ('Hey man! How r u?', '--\nRegards,\nRoman')
+
+    >>> extract_signature('Hey man!')
+    ('Hey man!', None)
+    '''
+    try:
+        # identify line delimiter first
+        delimiter = get_delimiter(msg_body)
+
+        # make an assumption
+        stripped_body = msg_body.strip()
+        phone_signature = None
+
+        # strip off phone signature
+        phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
+        if phone_signature:
+            stripped_body = stripped_body[:phone_signature.start()]
+            phone_signature = phone_signature.group()
+
+        # decide on signature candidate
+        lines = stripped_body.splitlines()
+        candidate = get_signature_candidate(lines)
+        candidate = delimiter.join(candidate)
+
+        # try to extract signature
+        signature = RE_SIGNATURE.search(candidate)
+        if not signature:
+            return (stripped_body.strip(), phone_signature)
+        else:
+            signature = signature.group()
+            # when we splitlines() and then join them
+            # we can lose a new line at the end
+            # we did it when identifying a candidate
+            # so we had to do it for stripped_body now
+            stripped_body = delimiter.join(lines)
+            stripped_body = stripped_body[:-len(signature)]
+
+            if phone_signature:
+                signature = delimiter.join([signature, phone_signature])
+
+            return (stripped_body.strip(),
+                    signature.strip())
+    except Exception, e:
+        log.exception('ERROR extracting signature')
+        return (msg_body, None)
+
+
+def get_signature_candidate(lines):
+    """Return lines that could hold signature
+
+    The lines should:
+
+    * be among last SIGNATURE_MAX_LINES non-empty lines.
+    * not include first line
+    * be shorter than TOO_LONG_SIGNATURE_LINE
+    * not include more than one line that starts with dashes
+    """
+    # non empty lines indexes
+    non_empty = [i for i, line in enumerate(lines) if line.strip()]
+
+    # if message is empty or just one line then there is no signature
+    if len(non_empty) <= 1:
+        return []
+
+    # we don't expect signature to start at the 1st line
+    candidate = non_empty[1:]
+    # signature shouldn't be longer then SIGNATURE_MAX_LINES
+    candidate = candidate[-SIGNATURE_MAX_LINES:]
+
+    markers = _mark_candidate_indexes(lines, candidate)
+    candidate = _process_marked_candidate_indexes(candidate, markers)
+
+    # get actual lines for the candidate instead of indexes
+    if candidate:
+        candidate = lines[candidate[0]:]
+        return candidate
+
+    return []
+
+
+def _mark_candidate_indexes(lines, candidate):
+    """Mark candidate indexes with markers
+
+    Markers:
+
+    * c - line that could be a signature line
+    * l - long line
+    * d - line that starts with dashes but has other chars as well
+
+    >>> _mark_candidate_lines(['Some text', '', '-', 'Bob'], [0, 2, 3])
+    'cdc'
+    """
+    # at first consider everything to be potential signature lines
+    markers = bytearray('c'*len(candidate))
+
+    # mark lines starting from bottom up
+    for i, line_idx in reversed(list(enumerate(candidate))):
+        if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
+            markers[i] = 'l'
+        else:
+            line = lines[line_idx].strip()
+            if line.startswith('-') and line.strip("-"):
+                markers[i] = 'd'
+
+    return markers
+
+
+def _process_marked_candidate_indexes(candidate, markers):
+    """
+    Run regexes against candidate's marked indexes to strip
+    signature candidate.
+
+    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
+    [15, 17]
+    """
+    match = RE_SIGNATURE_CANDIDAATE.match(markers[::-1])
+    return candidate[-match.end('candidate'):] if match else []