initial commit

Sergey Obukhov
2014-07-23 21:12:54 -07:00
commit 170f11038b
80 changed files with 7481 additions and 0 deletions

7
talon/__init__.py Normal file

@@ -0,0 +1,7 @@
from talon.quotations import register_xpath_extensions
from talon import signature
def init():
register_xpath_extensions()
signature.initialize()
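
For context, a minimal usage sketch (illustrative, not part of this commit; it assumes the package is installed together with its lxml/PyML dependencies and the shipped classifier data):

>>> import talon
>>> from talon import quotations
>>> talon.init()  # register XPath extensions, load the signature classifiers
>>> quotations.extract_from('Hi\n\nOn Tue, Bob <bob@example.com> wrote:\n\n> old text', 'text/plain')
'Hi'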

4
talon/constants.py Normal file

@@ -0,0 +1,4 @@
import regex as re
RE_DELIMITER = re.compile('\r?\n')

174
talon/html_quotations.py Normal file

@@ -0,0 +1,174 @@
"""
The module's functions operate on message bodies trying to extract original
messages (without quoted messages) from html
"""
import regex as re
CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
def add_checkpoint(html_note, counter):
"""Recursively adds checkpoints to html tree.
"""
if html_note.text:
html_note.text = (html_note.text + CHECKPOINT_PREFIX +
str(counter) + CHECKPOINT_SUFFIX)
else:
html_note.text = (CHECKPOINT_PREFIX + str(counter) +
CHECKPOINT_SUFFIX)
counter += 1
for child in html_note.iterchildren():
counter = add_checkpoint(child, counter)
if html_note.tail:
html_note.tail = (html_note.tail + CHECKPOINT_PREFIX +
str(counter) + CHECKPOINT_SUFFIX)
else:
html_note.tail = (CHECKPOINT_PREFIX + str(counter) +
CHECKPOINT_SUFFIX)
counter += 1
return counter
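
Illustrative sketch of what add_checkpoint produces (not part of the file): it annotates every text and tail slot in document order and returns the checkpoint count.

>>> from lxml import html
>>> tree = html.fragment_fromstring('<div>reply<b>quote</b>tail</div>')
>>> add_checkpoint(tree, 0)
4
>>> html.tostring(tree)
'<div>reply#!%!0!%!#<b>quote#!%!1!%!#</b>tail#!%!2!%!#</div>#!%!3!%!#'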
def delete_quotation_tags(html_note, counter, quotation_checkpoints):
"""Deletes tags with quotation checkpoints from html tree.
"""
tag_in_quotation = True
if quotation_checkpoints[counter]:
html_note.text = ''
else:
tag_in_quotation = False
counter += 1
quotation_children = [] # Children tags which are in quotation.
for child in html_note.iterchildren():
counter, child_tag_in_quotation = delete_quotation_tags(
child, counter,
quotation_checkpoints
)
if child_tag_in_quotation:
quotation_children.append(child)
if quotation_checkpoints[counter]:
html_note.tail = ''
else:
tag_in_quotation = False
counter += 1
if tag_in_quotation:
return counter, tag_in_quotation
else:
# Remove quotation children.
for child in quotation_children:
html_note.remove(child)
return counter, tag_in_quotation
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('.gmail_quote')
if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
def cut_microsoft_quote(html_message):
''' Cuts splitter block and all following blocks. '''
splitter = html_message.xpath(
#outlook 2007, 2010
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#windows mail
"//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); "
"border-top-width: 1px; border-top-style: solid;']"
)
if splitter:
splitter = splitter[0]
#outlook 2010
if splitter == splitter.getparent().getchildren()[0]:
splitter = splitter.getparent()
else:
#outlook 2003
splitter = html_message.xpath(
"//div"
"/div[@class='MsoNormal' and @align='center' "
"and @style='text-align:center']"
"/font"
"/span"
"/hr[@size='3' and @width='100%' and @align='center' "
"and @tabindex='-1']"
)
if len(splitter):
splitter = splitter[0]
splitter = splitter.getparent().getparent()
splitter = splitter.getparent().getparent()
if len(splitter):
parent = splitter.getparent()
after_splitter = splitter.getnext()
while after_splitter is not None:
parent.remove(after_splitter)
after_splitter = splitter.getnext()
parent.remove(splitter)
return True
return False
def cut_by_id(html_message):
found = False
for quote_id in QUOTE_IDS:
quote = html_message.cssselect('#{}'.format(quote_id))
if quote:
found = True
quote[0].getparent().remove(quote[0])
return found
def cut_blockquote(html_message):
''' Cuts blockquote with wrapping elements. '''
quote = html_message.find('.//blockquote')
if quote is not None:
quote.getparent().remove(quote)
return True
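
Illustrative sketch of the cut helpers (not part of the file; cssselect support for lxml is assumed):

>>> from lxml import html
>>> tree = html.document_fromstring(
...     '<html><body>reply<div class="gmail_quote">quoted</div></body></html>')
>>> cut_gmail_quote(tree)
True
>>> html.tostring(tree)
'<html><body>reply</body></html>'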
def cut_from_block(html_message):
"""Cuts div tag which wraps block starting with "From:"."""
# handle the case when From: block is enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:text_content(), 'From:')]|"
"//*[starts-with(mg:text_content(), 'Date:')]"))
if block:
block = block[-1]
while block.getparent() is not None:
if block.tag == 'div':
block.getparent().remove(block)
return True
else:
block = block.getparent()
else:
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:tail(), 'From:')]|"
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
        while block.getnext() is not None:
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True

376
talon/quotations.py Normal file

@@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-
"""
The module's functions operate on message bodies trying to extract
original messages (without quoted messages)
"""
import regex as re
import logging
from copy import deepcopy
from lxml import html, etree
import html2text
from talon.constants import RE_DELIMITER
from talon.utils import random_token, get_delimiter
from talon import html_quotations
log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
r'''
(
-* # could include dashes
[ ]?On[ ].*, # date part ends with comma
(.*\n){0,2} # splitter takes 4 lines at most
.*(wrote|sent):
)
''', re.VERBOSE)
RE_QUOTATION = re.compile(
r'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
s
|
(?:me*){2,}
)
# quotation lines could be marked as splitter or text, etc.
.*
# but we expect it to end with a quotation marker line
me*
)
# after quotations should be text only or nothing at all
[te]*$
''', re.VERBOSE)
RE_EMPTY_QUOTATION = re.compile(
r'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
s
|
(?:me*){2,}
)
)
e*
''', re.VERBOSE)
SPLITTER_PATTERNS = [
# ------Original Message------ or ---- Reply Message ----
re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
RE_ON_DATE_SMB_WROTE,
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
]
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
RE_PARANTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000
QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
def extract_from(msg_body, content_type='text/plain'):
    """Extracts the original message (without quotations) from msg_body.
    Dispatches on content_type; on any error returns msg_body unchanged.
    """
try:
if content_type == 'text/plain':
return extract_from_plain(msg_body)
elif content_type == 'text/html':
return extract_from_html(msg_body)
    except Exception:
log.exception('ERROR extracting message')
return msg_body
def mark_message_lines(lines):
"""Mark message lines with markers to distinguish quotation lines.
Markers:
    * e - empty line
    * f - line that matches the forwarded-message pattern (RE_FWD)
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * t - presumably lines from the last message in the conversation
>>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
'tsem'
"""
markers = bytearray(len(lines))
i = 0
while i < len(lines):
if not lines[i].strip():
markers[i] = 'e' # empty line
elif QUOT_PATTERN.match(lines[i]):
markers[i] = 'm' # line with quotation marker
elif RE_FWD.match(lines[i]):
markers[i] = 'f' # ---- Forwarded message ----
else:
# in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter:
# append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines()
for j in xrange(len(splitter_lines)):
markers[i + j] = 's'
# skip splitter lines
i += len(splitter_lines) - 1
else:
# probably the line from the last message in the conversation
markers[i] = 't'
i += 1
return markers
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.
    Returns only the last message's lines.
    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
    ['Hello']
    The outcome is also reported through the mutable `return_flags` argument:
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    # if there is no splitter there should be no quotation markers
if 's' not in markers and not re.search('(me*){3}', markers):
markers = markers.replace('m', 't')
if re.match('[te]*f', markers):
return_flags[:] = [False, -1, -1]
return lines
# inlined reply
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
# both 't' entries should be found
for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
# long links could break sequence of quotation lines but they shouldn't
# be considered an inline reply
links = (
RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip()))
if not links:
return_flags[:] = [False, -1, -1]
return lines
# cut out text lines coming after splitter if there are no markers there
quotation = re.search('(se*)+((t|f)+e*)+', markers)
if quotation:
return_flags[:] = [True, quotation.start(), len(lines)]
return lines[:quotation.start()]
# handle the case with markers
quotation = (RE_QUOTATION.search(markers) or
RE_EMPTY_QUOTATION.search(markers))
if quotation:
return_flags[:] = True, quotation.start(1), quotation.end(1)
return lines[:quotation.start(1)] + lines[quotation.end(1):]
return_flags[:] = [False, -1, -1]
return lines
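
Illustrative run of the two-step pipeline (not part of the file): mark the lines first, then strip the marked quotation.

>>> lines = ['Thanks!', '', 'On Tue, Bob <bob@example.com> wrote:', '> we should meet']
>>> markers = mark_message_lines(lines)  # bytearray(b'tesm')
>>> process_marked_lines(lines, markers)
['Thanks!', '']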
def preprocess(msg_body, delimiter, content_type='text/plain'):
"""Prepares msg_body for being stripped.
Replaces link brackets so that they couldn't be taken for quotation marker.
Splits line in two if splitter pattern preceeded by some text on the same
line (done only for 'On <date> <person> wrote:' pattern).
"""
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
# so that '>' closing the link couldn't be mistakenly taken for quotation
# marker.
def link_wrapper(link):
newline_index = msg_body[:link.start()].rfind("\n")
if msg_body[newline_index + 1] == ">":
return link.group()
else:
return "@@%s@@" % link.group(1)
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
def splitter_wrapper(splitter):
"""Wrapps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
return '%s%s' % (delimiter, splitter.group())
else:
return splitter.group()
if content_type == 'text/plain':
msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)
return msg_body
def postprocess(msg_body):
"""Make up for changes done at preprocessing message.
Replace link brackets back to '<' and '>'.
"""
return re.sub(RE_NORMALIZED_LINK, r'<\1>', msg_body).strip()
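
Illustrative round trip (not part of the file): preprocess masks link brackets, postprocess restores them.

>>> preprocess('See <http://example.com/docs> for details', '\n')
'See @@http://example.com/docs@@ for details'
>>> postprocess('See @@http://example.com/docs@@ for details')
'See <http://example.com/docs> for details'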
def extract_from_plain(msg_body):
"""Extracts a non quoted message from provided plain text."""
stripped_text = msg_body
delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return stripped_text
markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers)
# concatenate lines, change links back, strip and return
msg_body = delimiter.join(lines)
msg_body = postprocess(msg_body)
return msg_body
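
End-to-end sketch for plain text (illustrative, not part of the file):

>>> extract_from_plain('Hello,\n\nOn Tue, 2014 at 2 PM, Bob <bob@example.com> wrote:\n\n> Previous email')
'Hello,'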
def extract_from_html(msg_body):
"""
    Extract a non-quoted message from the provided HTML message body
    using tags and the plain text algorithm.
    Cuts out the 'blockquote' and 'gmail_quote' tags.
    Cuts Microsoft quotations.
    Then uses the plain text algorithm to cut out a splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
"""
if msg_body.strip() == '':
return msg_body
html_tree = html.document_fromstring(
msg_body,
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree)
)
html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
msg_with_checkpoints = html.tostring(html_tree)
h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines()
# Don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return msg_body
# Collect checkpoints on each line
line_checkpoints = [
[int(i[4:-4]) # Only checkpoint number
for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
for line in lines]
# Remove checkpoints
lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
for line in lines]
# Use plain text quotation extracting algorithm
markers = mark_message_lines(lines)
return_flags = []
process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted:
#collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted):
for checkpoint in line_checkpoints[i]:
quotation_checkpoints[checkpoint] = True
else:
if cut_quotations:
return html.tostring(html_tree_copy)
else:
return msg_body
# Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags(
html_tree_copy, 0, quotation_checkpoints
)
return html.tostring(html_tree_copy)
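
End-to-end sketch for HTML (illustrative; the exact serialization may vary with the lxml version):

>>> extract_from_html('<html><body>Reply<blockquote>Old thread</blockquote></body></html>')
'<html><body>Reply</body></html>'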
def is_splitter(line):
'''
    Returns a match object if the provided string is a splitter and
    None otherwise.
'''
for pattern in SPLITTER_PATTERNS:
matcher = re.match(pattern, line)
if matcher:
return matcher
def text_content(context):
'''XPath Extension function to return a node text content.'''
return context.context_node.text_content().strip()
def tail(context):
'''XPath Extension function to return a node tail text.'''
return context.context_node.tail or ''
def register_xpath_extensions():
ns = etree.FunctionNamespace("http://mailgun.net")
ns.prefix = 'mg'
ns['text_content'] = text_content
ns['tail'] = tail
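
Illustrative use of the registered extensions (not part of the file). Note that //* also matches ancestors whose text_content() starts with 'From:', which is why cut_from_block picks the last, i.e. innermost, hit:

>>> from lxml import html
>>> register_xpath_extensions()
>>> tree = html.document_fromstring('<html><body><div><span>From: Bob</span></div></body></html>')
>>> [e.tag for e in tree.xpath("//*[starts-with(mg:text_content(), 'From:')]")]
['html', 'body', 'div', 'span']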

48
talon/signature/__init__.py Normal file

@@ -0,0 +1,48 @@
"""The package exploits machine learning for parsing message signatures.
The public interface consists of only one `extract` function:
>>> (body, signature) = extract(body, sender)
Where body is the original message `body` and `sender` corresponds to a person
who sent the message.
When importing the package, classifier instances are loaded.
So each process will have its classifiers in memory.
The import of the package and the call to the `extract` function are better
enclosed in a try/except block in case they fail.
.. warning:: When making changes to features or emails the classifier is
trained against, don't forget to regenerate:
* signature/data/train.data and
* signature/data/classifier
"""
import os
import sys
from cStringIO import StringIO
from . import extraction
from . extraction import extract
from . learning import classifier
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
def initialize():
try:
# redirect output
so, sys.stdout = sys.stdout, StringIO()
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
EXTRACTOR_DATA)
sys.stdout = so
except Exception, e:
raise Exception(
"Failed initializing signature parsing with classifiers", e)

188
talon/signature/bruteforce.py Normal file

@@ -0,0 +1,188 @@
import logging
import regex as re
from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
log = logging.getLogger(__name__)
# regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r'''
(
(?:
^[\s]*--*[\s]*[a-z \.]*$
|
^thanks[\s,!]*$
|
^regards[\s,!]*$
|
^cheers[\s,!]*$
|
^best[ a-z]*[\s,!]*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r'''
(
(?:
^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
|
^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
|
^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
|
^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
RE_SIGNATURE_CANDIDATE = re.compile(r'''
(?P<candidate>c+d)[^d]
|
(?P<candidate>c+d)$
|
(?P<candidate>c+)
|
(?P<candidate>d)[^d]
|
(?P<candidate>d)$
''', re.I | re.X | re.M | re.S)
def extract_signature(msg_body):
'''
Analyzes message for a presence of signature block (by common patterns)
and returns tuple with two elements: message text without signature block
and the signature itself.
>>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
('Hey man! How r u?', '--\nRegards,\nRoman')
>>> extract_signature('Hey man!')
('Hey man!', None)
'''
try:
# identify line delimiter first
delimiter = get_delimiter(msg_body)
        # assume the whole body is the stripped body until proven otherwise
stripped_body = msg_body.strip()
phone_signature = None
# strip off phone signature
phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
if phone_signature:
stripped_body = stripped_body[:phone_signature.start()]
phone_signature = phone_signature.group()
# decide on signature candidate
lines = stripped_body.splitlines()
candidate = get_signature_candidate(lines)
candidate = delimiter.join(candidate)
# try to extract signature
signature = RE_SIGNATURE.search(candidate)
if not signature:
return (stripped_body.strip(), phone_signature)
else:
signature = signature.group()
            # splitlines() followed by join can lose a trailing newline;
            # that happened when the candidate was built, so rebuild
            # stripped_body the same way before slicing the signature off
stripped_body = delimiter.join(lines)
stripped_body = stripped_body[:-len(signature)]
if phone_signature:
signature = delimiter.join([signature, phone_signature])
return (stripped_body.strip(),
signature.strip())
    except Exception:
log.exception('ERROR extracting signature')
return (msg_body, None)
def get_signature_candidate(lines):
"""Return lines that could hold signature
The lines should:
* be among last SIGNATURE_MAX_LINES non-empty lines.
* not include first line
* be shorter than TOO_LONG_SIGNATURE_LINE
* not include more than one line that starts with dashes
"""
# non empty lines indexes
non_empty = [i for i, line in enumerate(lines) if line.strip()]
# if message is empty or just one line then there is no signature
if len(non_empty) <= 1:
return []
# we don't expect signature to start at the 1st line
candidate = non_empty[1:]
    # signature shouldn't be longer than SIGNATURE_MAX_LINES
candidate = candidate[-SIGNATURE_MAX_LINES:]
markers = _mark_candidate_indexes(lines, candidate)
candidate = _process_marked_candidate_indexes(candidate, markers)
# get actual lines for the candidate instead of indexes
if candidate:
candidate = lines[candidate[0]:]
return candidate
return []
def _mark_candidate_indexes(lines, candidate):
"""Mark candidate indexes with markers
Markers:
* c - line that could be a signature line
* l - long line
* d - line that starts with dashes but has other chars as well
    >>> _mark_candidate_indexes(['Some text', '', '-Roman', 'Bob'], [0, 2, 3])
'cdc'
"""
# at first consider everything to be potential signature lines
markers = bytearray('c'*len(candidate))
# mark lines starting from bottom up
for i, line_idx in reversed(list(enumerate(candidate))):
if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
markers[i] = 'l'
else:
line = lines[line_idx].strip()
if line.startswith('-') and line.strip("-"):
markers[i] = 'd'
return markers
def _process_marked_candidate_indexes(candidate, markers):
"""
Run regexes against candidate's marked indexes to strip
signature candidate.
    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clldc')
[15, 17]
"""
    match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
return candidate[-match.end('candidate'):] if match else []

2
talon/signature/constants.py Normal file

@@ -0,0 +1,2 @@
SIGNATURE_MAX_LINES = 11
TOO_LONG_SIGNATURE_LINE = 60

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

116
talon/signature/extraction.py Normal file

@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
import os
import logging
import regex as re
from PyML import SparseDataSet
from talon.constants import RE_DELIMITER
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
from talon.signature.bruteforce import get_signature_candidate
from talon.signature.learning.helpers import has_signature
log = logging.getLogger(__name__)
EXTRACTOR = None
# regex signature pattern for reversed lines
# assumes that all long lines have been excluded
RE_REVERSE_SIGNATURE = re.compile(r'''
    # signature should consist of blocks like this
(?:
# it could end with empty line
e*
# there could be text lines but no more than 2 in a row
(te*){,2}
# every block should end with signature line
s
)+
''', re.I | re.X | re.M | re.S)
def is_signature_line(line, sender, classifier):
'''Checks if the line belongs to signature. Returns True or False.'''
data = SparseDataSet([build_pattern(line, features(sender))])
return classifier.decisionFunc(data, 0) > 0
def extract(body, sender):
"""Strips signature from the body of the message.
Returns stripped body and signature as a tuple.
If no signature is found the corresponding returned value is None.
"""
try:
delimiter = get_delimiter(body)
body = body.strip()
if has_signature(body, sender):
lines = body.splitlines()
markers = _mark_lines(lines, sender)
text, signature = _process_marked_lines(lines, markers)
if signature:
text = delimiter.join(text)
if text.strip():
return (text, delimiter.join(signature))
    except Exception:
log.exception('ERROR when extracting signature with classifiers')
return (body, None)
def _mark_lines(lines, sender):
"""Mark message lines with markers to distinguish signature lines.
Markers:
* e - empty line
* s - line identified as signature
* t - other i.e. ordinary text line
    >>> _mark_lines(['Some text', '', 'Bob'], 'Bob')
'tes'
"""
global EXTRACTOR
candidate = get_signature_candidate(lines)
    # at first, consider everything to be text, not signature
markers = bytearray('t'*len(lines))
# mark lines starting from bottom up
# mark only lines that belong to candidate
# no need to mark all lines of the message
for i, line in reversed(list(enumerate(candidate))):
        # markers correspond to `lines`, not to `candidate`,
        # so recalculate the index to be relative to `lines`
j = len(lines) - len(candidate) + i
if not line.strip():
markers[j] = 'e'
elif is_signature_line(line, sender, EXTRACTOR):
markers[j] = 's'
return markers
def _process_marked_lines(lines, markers):
"""Run regexes against message's marked lines to strip signature.
>>> _process_marked_lines(['Some text', '', 'Bob'], 'tes')
(['Some text', ''], ['Bob'])
"""
# reverse lines and match signature pattern for reversed lines
signature = RE_REVERSE_SIGNATURE.match(markers[::-1])
if signature:
return (lines[:-signature.end()], lines[-signature.end():])
return (lines, None)

talon/signature/learning/__init__.py Normal file (empty)

36
talon/signature/learning/classifier.py Normal file

@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""The module's functions could init, train, save and load a classifier.
The classifier could be used to detect if a certain line of the message
body belongs to the signature.
"""
import os
import sys
from PyML import SparseDataSet, SVM
def init():
'''Inits classifier with optimal options.'''
return SVM(C=10, optimization='liblinear')
def train(classifier, train_data_filename, save_classifier_filename=None):
'''Trains and saves classifier so that it could be easily loaded later.'''
data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier.train(data)
if save_classifier_filename:
classifier.save(save_classifier_filename)
return classifier
def load(saved_classifier_filename, train_data_filename):
"""Loads saved classifier.
Classifier should be loaded with the same data it was trained against
"""
train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier = init()
classifier.load(saved_classifier_filename, train_data)
return classifier
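
Hedged workflow sketch (the file names here are hypothetical; PyML must be installed):

>>> clf = train(init(), 'train.data', 'classifier')  # fit and persist
>>> clf = load('classifier', 'train.data')  # reload later, e.g. in another process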

161
talon/signature/learning/dataset.py Normal file

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""The module's functions build datasets to train/assess classifiers.
For signature detection the input should be a folder with two directories
that contain emails with and without signatures.
For signature extraction the input should be a folder with annotated emails.
To indicate that a line is a signature line use #sig# at the start of the line.
A sender of an email could be specified in the same file as
the message body e.g. when .eml format is used or in a separate file.
In the latter case it is assumed that a body filename ends with the `_body`
suffix and the corresponding sender file has the same name except for the
suffix which should be `_sender`.
"""
import os
import regex as re
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.featurespace import build_pattern, features
SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'
SIGNATURE_ANNOTATION = '#sig#'
REPLY_ANNOTATION = '#reply#'
ANNOTATIONS = [SIGNATURE_ANNOTATION, REPLY_ANNOTATION]
def is_sender_filename(filename):
"""Checks if the file could contain message sender's name."""
return filename.endswith(SENDER_SUFFIX)
def build_sender_filename(msg_filename):
"""By the message filename gives expected sender's filename."""
return msg_filename[:-len(BODY_SUFFIX)] + SENDER_SUFFIX
def parse_msg_sender(filename, sender_known=True):
"""Given a filename returns the sender and the message.
Here the message is assumed to be a whole MIME message or just
message body.
>>> sender, msg = parse_msg_sender('msg.eml')
>>> sender, msg = parse_msg_sender('msg_body')
If you don't want to consider the sender's name in your classification
algorithm:
>>> parse_msg_sender(filename, False)
"""
sender, msg = None, None
if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename) as f:
msg = f.read()
sender = u''
if sender_known:
sender_filename = build_sender_filename(filename)
if os.path.exists(sender_filename):
with open(sender_filename) as sender_file:
sender = sender_file.read().strip()
else:
                    # no separate sender file, so fall back to the 'From:'
                    # line of the message body itself
lines = msg.splitlines()
for line in lines:
match = re.match('From:(.*)', line)
if match:
sender = match.group(1)
break
return (sender, msg)
def build_detection_class(folder, dataset_filename,
label, sender_known=True):
"""Builds signature detection class.
Signature detection dataset includes patterns for two classes:
* class for positive patterns (goes with label 1)
* class for negative patterns (goes with label -1)
    The patterns are built from emails in `folder` and appended to the
    dataset file.
    >>> build_detection_class('emails/P', 'train.data', 1)
"""
with open(dataset_filename, 'a') as dataset:
for filename in os.listdir(folder):
filename = os.path.join(folder, filename)
sender, msg = parse_msg_sender(filename, sender_known)
if sender is None or msg is None:
continue
msg = re.sub('|'.join(ANNOTATIONS), '', msg)
X = build_pattern(msg, features(sender))
X.append(label)
labeled_pattern = ','.join([str(e) for e in X])
dataset.write(labeled_pattern + '\n')
def build_detection_dataset(folder, dataset_filename,
sender_known=True):
"""Builds signature detection dataset using emails from folder.
    `folder` should have the following structure:
    folder
    |-- P
    |   |-- positive sample email 1
    |   |-- positive sample email 2
    |   `-- ...
    `-- N
        |-- negative sample email 1
        |-- negative sample email 2
        `-- ...
    If the dataset file already exists it is rewritten.
"""
if os.path.exists(dataset_filename):
os.remove(dataset_filename)
build_detection_class(os.path.join(folder, u'P'),
dataset_filename, 1)
build_detection_class(os.path.join(folder, u'N'),
dataset_filename, -1)
def build_extraction_dataset(folder, dataset_filename,
sender_known=True):
"""Builds signature extraction dataset using emails in the `folder`.
The emails in the `folder` should be annotated i.e. signature lines
should be marked with `#sig#`.
"""
if os.path.exists(dataset_filename):
os.remove(dataset_filename)
with open(dataset_filename, 'a') as dataset:
for filename in os.listdir(folder):
filename = os.path.join(folder, filename)
sender, msg = parse_msg_sender(filename, sender_known)
if not sender or not msg:
continue
lines = msg.splitlines()
for i in xrange(1, min(SIGNATURE_MAX_LINES,
len(lines)) + 1):
line = lines[-i]
label = -1
if line[:len(SIGNATURE_ANNOTATION)] == \
SIGNATURE_ANNOTATION:
label = 1
line = line[len(SIGNATURE_ANNOTATION):]
elif line[:len(REPLY_ANNOTATION)] == REPLY_ANNOTATION:
line = line[len(REPLY_ANNOTATION):]
X = build_pattern(line, features(sender))
X.append(label)
labeled_pattern = ','.join([str(e) for e in X])
dataset.write(labeled_pattern + '\n')
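
Hedged usage sketch (folder layout as described above; the paths are hypothetical):

>>> build_detection_dataset('emails', 'detection.data')
>>> build_extraction_dataset('annotated_emails', 'extraction.data')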

73
talon/signature/learning/featurespace.py Normal file

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
""" The module provides functions for convertion of a message body/body lines
into classifiers features space.
The body and the message sender string are converted into unicode before
applying features to them.
"""
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.helpers import *
def features(sender=''):
'''Returns a list of signature features.'''
return [
# This one isn't from paper.
        # Meant to match company names, sender names, addresses.
many_capitalized_words,
# This one is not from paper.
# Line is too long.
# This one is less aggressive than `Line is too short`
lambda line: 1 if len(line) > 60 else 0,
# Line contains email pattern.
binary_regex_search(RE_EMAIL),
# Line contains url.
binary_regex_search(RE_URL),
# Line contains phone number pattern.
binary_regex_search(RE_RELAX_PHONE),
# Line matches the regular expression "^[\s]*---*[\s]*$".
binary_regex_match(RE_SEPARATOR),
# Line has a sequence of 10 or more special characters.
binary_regex_search(RE_SPECIAL_CHARS),
# Line contains any typical signature words.
binary_regex_search(RE_SIGNATURE_WORDS),
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
binary_regex_search(RE_NAME),
# Percentage of punctuation symbols in the line is larger than 50%
lambda line: 1 if punctuation_percent(line) > 50 else 0,
# Percentage of punctuation symbols in the line is larger than 90%
lambda line: 1 if punctuation_percent(line) > 90 else 0,
contains_sender_names(sender)
]
def apply_features(body, features):
'''Applies features to message body lines.
    Returns a list of lists. Each inner list corresponds to a body line
    and holds the feature occurrence indicators (0 or 1).
    E.g. if element j of list i equals 1 this means that
    feature j occurred in line i (counting from the last line of the body).
'''
# collect all non empty lines
lines = [line for line in body.splitlines() if line.strip()]
# take the last SIGNATURE_MAX_LINES
last_lines = lines[-SIGNATURE_MAX_LINES:]
# apply features, fallback to zeros
return ([[f(line) for f in features] for line in last_lines] or
[[0 for f in features]])
def build_pattern(body, features):
'''Converts body into a pattern i.e. a point in the features space.
Applies features to the body lines and sums up the results.
    Elements of the pattern indicate how many times a certain feature occurred
in the last lines of the body.
'''
line_patterns = apply_features(body, features)
return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)
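
Illustrative shape check (not part of the file): the pattern holds one summed count per feature.

>>> fs = features('Sergey Obukhov <serobnic@mail.ru>')
>>> len(fs)
12
>>> pattern = build_pattern('Thanks,\nSergey\nserobnic@mail.ru', fs)
>>> len(pattern)
12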

233
talon/signature/learning/helpers.py Normal file

@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
""" The module provides:
* functions used when evaluating signature's features
* regexp constants used when evaluating signature's features
"""
import unicodedata
import regex as re
from talon.utils import to_unicode
from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile
RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc('^[\s]*---*[\s]*$')
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
'[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))
RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
'^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
'(C|c)orporation|Group'))
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')
# Pattern to match if e.g. 'Sender:' header field has sender names.
SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
# Reply line clue line endings, as in regular expression:
# " wrote:$" or " writes:$"
RE_CLUE_LINE_END = rc('.*(W|w)(rote|rites):$')
INVALID_WORD_START = rc('\(|\+|[\d]')
BAD_SENDER_NAMES = [
# known mail domains
'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
'example',
# first level domains
'com', 'org', 'net', 'ru',
# bad words
'mailto'
]
def binary_regex_search(prog):
'''Returns a function that returns 1 or 0 depending on regex search result.
If regular expression compiled into prog is present in a string
the result of calling the returned function with the string will be 1
and 0 otherwise.
>>> import regex as re
>>> binary_regex_search(re.compile("12"))("12")
1
>>> binary_regex_search(re.compile("12"))("34")
0
'''
return lambda s: 1 if prog.search(s) else 0
def binary_regex_match(prog):
'''Returns a function that returns 1 or 0 depending on regex match result.
If a string matches regular expression compiled into prog
the result of calling the returned function with the string will be 1
and 0 otherwise.
>>> import regex as re
>>> binary_regex_match(re.compile("12"))("12 3")
1
>>> binary_regex_match(re.compile("12"))("3 12")
0
'''
return lambda s: 1 if prog.match(s) else 0
def flatten_list(list_to_flatten):
"""Simple list comprehesion to flatten list.
>>> flatten_list([[1, 2], [3, 4, 5]])
[1, 2, 3, 4, 5]
>>> flatten_list([[1], [[2]]])
[1, [2]]
>>> flatten_list([1, [2]])
Traceback (most recent call last):
...
TypeError: 'int' object is not iterable
"""
return [e for sublist in list_to_flatten for e in sublist]
def contains_sender_names(sender):
    '''Returns a function that searches a string for the sender's name or its parts.
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
>>> feature("Sergey Obukhov")
1
>>> feature("BR, Sergey N.")
1
>>> feature("Sergey")
1
>>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
1
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
1
'''
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
for e in extract_names(sender)]))
names = names or sender
if names != '':
return binary_regex_search(re.compile(names))
    return lambda s: 0
def extract_names(sender):
"""Tries to extract sender's names from `From:` header.
It could extract not only the actual names but e.g.
the name of the company, parts of email, etc.
>>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
['Sergey', 'Obukhov', 'serobnic']
>>> extract_names('')
[]
"""
sender = to_unicode(sender)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
# words like `ru`, `gmail`, `com`, `org`, etc.
sender = [word for word in sender.split() if len(word) > 1 and
not word in BAD_SENDER_NAMES]
# Remove duplicates
names = list(set(sender))
return names
def categories_percent(s, categories):
    '''Returns the percent of characters from the given Unicode categories.
>>> categories_percent("qqq ggg hhh", ["Po"])
0.0
>>> categories_percent("q,w.", ["Po"])
50.0
>>> categories_percent("qqq ggg hhh", ["Nd"])
0.0
>>> categories_percent("q5", ["Nd"])
50.0
>>> categories_percent("s.s,5s", ["Po", "Nd"])
50.0
'''
count = 0
s = to_unicode(s)
for c in s:
if unicodedata.category(c) in categories:
count += 1
return 100 * float(count) / len(s) if len(s) else 0
def punctuation_percent(s):
    '''Returns punctuation percent.
>>> punctuation_percent("qqq ggg hhh")
0.0
>>> punctuation_percent("q,w.")
50.0
'''
return categories_percent(s, ['Po'])
def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s)
words = re.split('\s', s)
words = [w for w in words if w.strip()]
capitalized_words_counter = 0
valid_words_counter = 0
for word in words:
if not INVALID_WORD_START.match(word):
valid_words_counter += 1
if word[0].isupper():
capitalized_words_counter += 1
if valid_words_counter > 0 and len(words) > 1:
return 100 * float(capitalized_words_counter) / valid_words_counter
return 0
def many_capitalized_words(s):
"""Returns a function to check percentage of capitalized words.
The function returns 1 if percentage greater then 65% and 0 otherwise.
"""
return 1 if capitalized_words_percent(s) > 66 else 0
def has_signature(body, sender):
'''Checks if the body has signature. Returns True or False.'''
non_empty = [line for line in body.splitlines() if line.strip()]
candidate = non_empty[-SIGNATURE_MAX_LINES:]
upvotes = 0
for line in candidate:
        # we check the line for the sender's name, phone, email and url;
        # such signature lines rarely exceed 27 characters
if len(line.strip()) > 27:
continue
elif contains_sender_names(sender)(line):
return True
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
binary_regex_search(RE_EMAIL)(line) +
binary_regex_search(RE_URL)(line) == 1):
upvotes += 1
if upvotes > 1:
return True
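
Illustrative check (not part of the file): a short trailing line that matches the sender's name is enough.

>>> has_signature('Thanks,\nBob\nbob@example.com', 'Bob <bob@example.com>')
True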

76
talon/utils.py Normal file

@@ -0,0 +1,76 @@
# coding:utf-8
import logging
from random import shuffle
from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs):
"""
Helper: formats string with any combination of bytestrings/unicode
strings without raising exceptions
"""
try:
if not args and not kwargs:
return format_string
else:
return format_string.format(*args, **kwargs)
# catch encoding errors and transform everything into utf-8 string
# before logging:
except (UnicodeEncodeError, UnicodeDecodeError):
format_string = to_utf8(format_string)
args = [to_utf8(p) for p in args]
kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
return format_string.format(*args, **kwargs)
# ignore other errors
except:
return u''
def to_unicode(str_or_unicode, precise=False):
"""
Safely returns a unicode version of a given string
>>> utils.to_unicode('привет')
u'привет'
>>> utils.to_unicode(u'привет')
u'привет'
If `precise` flag is True, tries to guess the correct encoding first.
"""
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode
def to_utf8(str_or_unicode):
"""
Safely returns a UTF-8 version of a given string
>>> utils.to_utf8(u'hi')
'hi'
"""
if isinstance(str_or_unicode, unicode):
return str_or_unicode.encode("utf-8", "ignore")
return str(str_or_unicode)
def random_token(length=7):
    """Returns a random token of [a-z0-9] characters
    (sampled without replacement, so length must be <= 36).
    """
vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z "
"0 1 2 3 4 5 6 7 8 9").split(' ')
shuffle(vals)
return ''.join(vals[:length])
def get_delimiter(msg_body):
delimiter = RE_DELIMITER.search(msg_body)
if delimiter:
delimiter = delimiter.group()
else:
delimiter = '\n'
return delimiter
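
Illustrative behavior (not part of the file):

>>> get_delimiter('foo\r\nbar')
'\r\n'
>>> get_delimiter('no line breaks here')
'\n'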