initial commit
talon/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from talon.quotations import register_xpath_extensions
from talon import signature


def init():
    register_xpath_extensions()
    signature.initialize()
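
Usage sketch for the package this commit introduces (illustrative, not part of the diff; the message text is made up, and the dependencies the package imports -- regex, lxml, html2text, PyML -- must be installed):

    import talon
    from talon import quotations

    talon.init()  # registers the mg: xpath extensions, loads the signature classifier

    msg = 'Thanks!\n\nOn 11/10/2013 at 1:00 PM, John wrote:\n> Hi'
    reply = quotations.extract_from(msg, 'text/plain')
    # expected: 'Thanks!' -- the splitter and the quoted thread are stripped
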
talon/constants.py (new file, 4 lines)
@@ -0,0 +1,4 @@
import regex as re


RE_DELIMITER = re.compile('\r?\n')
talon/html_quotations.py (new file, 174 lines)
@@ -0,0 +1,174 @@
"""
The module's functions operate on message bodies trying to extract original
messages (without quoted messages) from html
"""

import regex as re


CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)

# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']


def add_checkpoint(html_note, counter):
    """Recursively adds checkpoints to html tree.
    """
    if html_note.text:
        html_note.text = (html_note.text + CHECKPOINT_PREFIX +
                          str(counter) + CHECKPOINT_SUFFIX)
    else:
        html_note.text = (CHECKPOINT_PREFIX + str(counter) +
                          CHECKPOINT_SUFFIX)
    counter += 1

    for child in html_note.iterchildren():
        counter = add_checkpoint(child, counter)

    if html_note.tail:
        html_note.tail = (html_note.tail + CHECKPOINT_PREFIX +
                          str(counter) + CHECKPOINT_SUFFIX)
    else:
        html_note.tail = (CHECKPOINT_PREFIX + str(counter) +
                          CHECKPOINT_SUFFIX)
    counter += 1

    return counter


def delete_quotation_tags(html_note, counter, quotation_checkpoints):
    """Deletes tags with quotation checkpoints from html tree.
    """
    tag_in_quotation = True

    if quotation_checkpoints[counter]:
        html_note.text = ''
    else:
        tag_in_quotation = False
    counter += 1

    quotation_children = []  # Children tags which are in quotation.
    for child in html_note.iterchildren():
        counter, child_tag_in_quotation = delete_quotation_tags(
            child, counter,
            quotation_checkpoints
        )
        if child_tag_in_quotation:
            quotation_children.append(child)

    if quotation_checkpoints[counter]:
        html_note.tail = ''
    else:
        tag_in_quotation = False
    counter += 1

    if tag_in_quotation:
        return counter, tag_in_quotation
    else:
        # Remove quotation children.
        for child in quotation_children:
            html_note.remove(child)
        return counter, tag_in_quotation


def cut_gmail_quote(html_message):
    ''' Cuts the outermost block element with class gmail_quote. '''
    gmail_quote = html_message.cssselect('.gmail_quote')
    if gmail_quote:
        gmail_quote[0].getparent().remove(gmail_quote[0])
        return True


def cut_microsoft_quote(html_message):
    ''' Cuts splitter block and all following blocks. '''
    splitter = html_message.xpath(
        # outlook 2007, 2010
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
        # windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
        "border-top-width: 1px; border-top-style: solid;']"
    )

    if splitter:
        splitter = splitter[0]
        # outlook 2010
        if splitter == splitter.getparent().getchildren()[0]:
            splitter = splitter.getparent()
    else:
        # outlook 2003
        splitter = html_message.xpath(
            "//div"
            "/div[@class='MsoNormal' and @align='center' "
            "and @style='text-align:center']"
            "/font"
            "/span"
            "/hr[@size='3' and @width='100%' and @align='center' "
            "and @tabindex='-1']"
        )
        if len(splitter):
            splitter = splitter[0]
            splitter = splitter.getparent().getparent()
            splitter = splitter.getparent().getparent()

    if len(splitter):
        parent = splitter.getparent()
        after_splitter = splitter.getnext()
        while after_splitter is not None:
            parent.remove(after_splitter)
            after_splitter = splitter.getnext()
        parent.remove(splitter)
        return True

    return False


def cut_by_id(html_message):
    found = False
    for quote_id in QUOTE_IDS:
        quote = html_message.cssselect('#{}'.format(quote_id))
        if quote:
            found = True
            quote[0].getparent().remove(quote[0])
    return found


def cut_blockquote(html_message):
    ''' Cuts blockquote with wrapping elements. '''
    quote = html_message.find('.//blockquote')
    if quote is not None:
        quote.getparent().remove(quote)
        return True


def cut_from_block(html_message):
    """Cuts div tag which wraps block starting with "From:"."""
    # handle the case when From: block is enclosed in some tag
    block = html_message.xpath(
        ("//*[starts-with(mg:text_content(), 'From:')]|"
         "//*[starts-with(mg:text_content(), 'Date:')]"))

    if block:
        block = block[-1]
        while block.getparent() is not None:
            if block.tag == 'div':
                block.getparent().remove(block)
                return True
            else:
                block = block.getparent()
    else:
        # handle the case when From: block goes right after e.g. <hr>
        # and is not enclosed in some tag
        block = html_message.xpath(
            ("//*[starts-with(mg:tail(), 'From:')]|"
             "//*[starts-with(mg:tail(), 'Date:')]"))
        if block:
            block = block[0]
            while block.getnext() is not None:
                block.getparent().remove(block.getnext())
            block.getparent().remove(block)
            return True
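
A sketch of how the checkpoint machinery above is meant to be used (illustrative, not part of the commit; the serialized output in the comments is indicative):

    from lxml import html
    from talon import html_quotations

    tree = html.fragment_fromstring('<div>reply<blockquote>quoted</blockquote></div>')
    count = html_quotations.add_checkpoint(tree, 0)
    # count == 4; every text/tail slot now carries a '#!%!<n>!%!#' marker:
    # <div>reply#!%!0!%!#<blockquote>quoted#!%!1!%!#</blockquote>#!%!2!%!#</div>#!%!3!%!#
    # Checkpoints that land on lines deleted by the plain-text algorithm are
    # flagged, and delete_quotation_tags then removes the matching tags.
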
talon/quotations.py (new file, 376 lines)
@@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-

"""
The module's functions operate on message bodies trying to extract
original messages (without quoted messages)
"""

import regex as re
import logging
from copy import deepcopy

from lxml import html, etree
import html2text

from talon.constants import RE_DELIMITER
from talon.utils import random_token, get_delimiter
from talon import html_quotations


log = logging.getLogger(__name__)


RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

RE_ON_DATE_SMB_WROTE = re.compile(
    r'''
    (
        -*  # could include dashes
        [ ]?On[ ].*,  # date part ends with comma
        (.*\n){0,2}  # splitter takes 4 lines at most
        .*(wrote|sent):
    )
    ''', re.VERBOSE)

RE_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )

        # quotation lines could be marked as splitter or text, etc.
        .*

        # but we expect it to end with a quotation marker line
        me*
    )

    # after quotations should be text only or nothing at all
    [te]*$
    ''', re.VERBOSE)

RE_EMPTY_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )
    )
    e*
    ''', re.VERBOSE)

SPLITTER_PATTERNS = [
    # ------Original Message------ or ---- Reply Message ----
    re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
    # <date> <person>
    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]


RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

RE_PARENTHESIS_LINK = re.compile("\(https?://")

SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000

QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')


def extract_from(msg_body, content_type='text/plain'):
    try:
        if content_type == 'text/plain':
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
    except Exception, e:
        log.exception('ERROR extracting message')

    return msg_body


def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

    Markers:

    * e - empty line
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * f - forward line (---- Forwarded message ----)
    * t - presumably lines from the last message in the conversation

    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
    'tsem'
    """
    markers = bytearray(len(lines))
    i = 0
    while i < len(lines):
        if not lines[i].strip():
            markers[i] = 'e'  # empty line
        elif QUOT_PATTERN.match(lines[i]):
            markers[i] = 'm'  # line with quotation marker
        elif RE_FWD.match(lines[i]):
            markers[i] = 'f'  # ---- Forwarded message ----
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
                for j in xrange(len(splitter_lines)):
                    markers[i + j] = 's'

                # skip splitter lines
                i += len(splitter_lines) - 1
            else:
                # probably the line from the last message in the conversation
                markers[i] = 't'
        i += 1

    return markers


def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.

    Returns only the last message lines.
    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
    ['Hello']

    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    # if there is no splitter there should be no quotation markers
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # inlined reply
    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
    # both 't' entries should be found
    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (
            RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
            RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
        if not links:
            return_flags[:] = [False, -1, -1]
            return lines

    # cut out text lines coming after splitter if there are no markers there
    quotation = re.search('(se*)+((t|f)+e*)+', markers)
    if quotation:
        return_flags[:] = [True, quotation.start(), len(lines)]
        return lines[:quotation.start()]

    # handle the case with markers
    quotation = (RE_QUOTATION.search(markers) or
                 RE_EMPTY_QUOTATION.search(markers))

    if quotation:
        return_flags[:] = True, quotation.start(1), quotation.end(1)
        return lines[:quotation.start(1)] + lines[quotation.end(1):]

    return_flags[:] = [False, -1, -1]
    return lines


def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they couldn't be taken for quotation marker.
    Splits a line in two if the splitter pattern is preceded by some text on
    the same line (done only for the 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body


def postprocess(msg_body):
    """Make up for changes done at preprocessing message.

    Replaces link brackets back to '<' and '>'.
    """
    return re.sub(RE_NORMALIZED_LINK, r'<\1>', msg_body).strip()


def extract_from_plain(msg_body):
    """Extracts a non quoted message from provided plain text."""
    stripped_text = msg_body

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
    lines = msg_body.splitlines()

    # don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return stripped_text

    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

    # concatenate lines, change links back, strip and return
    msg_body = delimiter.join(lines)
    msg_body = postprocess(msg_body)
    return msg_body


def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """

    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )

    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        # collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy)


def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
    None otherwise.
    '''
    for pattern in SPLITTER_PATTERNS:
        matcher = re.match(pattern, line)
        if matcher:
            return matcher


def text_content(context):
    '''XPath Extension function to return a node text content.'''
    return context.context_node.text_content().strip()


def tail(context):
    '''XPath Extension function to return a node tail text.'''
    return context.context_node.tail or ''


def register_xpath_extensions():
    ns = etree.FunctionNamespace("http://mailgun.net")
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail
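
The marker pipeline in this file can be exercised directly; a sketch (illustrative message, not part of the diff; the marker string shown is what mark_message_lines produces for this input):

    from talon import quotations

    lines = ['Thanks!', '', 'On 11/10/2013 at 1:00 PM, John wrote:', '> Hi']
    markers = quotations.mark_message_lines(lines)
    # bytearray('tesm'): text, empty, splitter, quotation marker
    print quotations.process_marked_lines(lines, markers)
    # expected: ['Thanks!', ''] -- the splitter and the quoted line are cut
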
talon/signature/__init__.py (new file, 48 lines)
@@ -0,0 +1,48 @@
"""The package exploits machine learning for parsing message signatures.

The public interface consists of only one `extract` function:

>>> (body, signature) = extract(body, sender)

Where `body` is the original message body and `sender` is the person
who sent the message.

Classifier instances are loaded when the package is imported,
so each process keeps its classifiers in memory.

The import of the package and the call to the `extract` function are best
enclosed in a try-except block in case they fail.

.. warning:: When making changes to features or emails the classifier is
             trained against, don't forget to regenerate:

             * signature/data/train.data and
             * signature/data/classifier
"""

import os
import sys
from cStringIO import StringIO

from . import extraction
from . extraction import extract
from . learning import classifier


DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')

EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')


def initialize():
    try:
        # redirect output
        so, sys.stdout = sys.stdout, StringIO()

        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
                                               EXTRACTOR_DATA)
        sys.stdout = so
    except Exception, e:
        raise Exception(
            "Failed initializing signature parsing with classifiers", e)
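
The public interface described above, as a usage sketch (hypothetical body and sender; the exact split depends on the shipped classifier):

    import talon
    from talon import signature

    talon.init()  # loads signature/data/classifier via PyML

    body = 'Hi John,\n\nthanks for the fix!\n\n--\nRoman'
    text, sig = signature.extract(body, sender='roman@example.com')
    # expected: text == 'Hi John,\n\nthanks for the fix!', sig == '--\nRoman'
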
talon/signature/bruteforce.py (new file, 188 lines)
@@ -0,0 +1,188 @@
import logging

import regex as re

from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)

log = logging.getLogger(__name__)


# regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r'''
    (
        (?:
            ^[\s]*--*[\s]*[a-z \.]*$
            |
            ^thanks[\s,!]*$
            |
            ^regards[\s,!]*$
            |
            ^cheers[\s,!]*$
            |
            ^best[ a-z]*[\s,!]*$
        )
        .*
    )
''', re.I | re.X | re.M | re.S)


# signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r'''
    (
        (?:
            ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
            |
            ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
            |
            ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
            |
            ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
        )
        .*
    )
''', re.I | re.X | re.M | re.S)


# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
RE_SIGNATURE_CANDIDATE = re.compile(r'''
    (?P<candidate>c+d)[^d]
    |
    (?P<candidate>c+d)$
    |
    (?P<candidate>c+)
    |
    (?P<candidate>d)[^d]
    |
    (?P<candidate>d)$
''', re.I | re.X | re.M | re.S)


def extract_signature(msg_body):
    '''
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    '''
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()
        phone_signature = None

        # strip off phone signature
        phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_signature:
            stripped_body = stripped_body[:phone_signature.start()]
            phone_signature = phone_signature.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)
        else:
            signature = signature.group()
            # when we splitlines() and then join them
            # we can lose a new line at the end
            # we did it when identifying a candidate
            # so we have to do the same for stripped_body now
            stripped_body = delimiter.join(lines)
            stripped_body = stripped_body[:-len(signature)]

            if phone_signature:
                signature = delimiter.join([signature, phone_signature])

            return (stripped_body.strip(),
                    signature.strip())
    except Exception, e:
        log.exception('ERROR extracting signature')
        return (msg_body, None)


def get_signature_candidate(lines):
    """Return lines that could hold signature

    The lines should:

    * be among last SIGNATURE_MAX_LINES non-empty lines.
    * not include first line
    * be shorter than TOO_LONG_SIGNATURE_LINE
    * not include more than one line that starts with dashes
    """
    # non empty lines indexes
    non_empty = [i for i, line in enumerate(lines) if line.strip()]

    # if message is empty or just one line then there is no signature
    if len(non_empty) <= 1:
        return []

    # we don't expect signature to start at the 1st line
    candidate = non_empty[1:]
    # signature shouldn't be longer than SIGNATURE_MAX_LINES
    candidate = candidate[-SIGNATURE_MAX_LINES:]

    markers = _mark_candidate_indexes(lines, candidate)
    candidate = _process_marked_candidate_indexes(candidate, markers)

    # get actual lines for the candidate instead of indexes
    if candidate:
        candidate = lines[candidate[0]:]
        return candidate

    return []


def _mark_candidate_indexes(lines, candidate):
    """Mark candidate indexes with markers

    Markers:

    * c - line that could be a signature line
    * l - long line
    * d - line that starts with dashes but has other chars as well

    >>> _mark_candidate_indexes(['Some text', '', '-', 'Bob'], [0, 2, 3])
    'cdc'
    """
    # at first consider everything to be potential signature lines
    markers = bytearray('c'*len(candidate))

    # mark lines starting from bottom up
    for i, line_idx in reversed(list(enumerate(candidate))):
        if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
            markers[i] = 'l'
        else:
            line = lines[line_idx].strip()
            if line.startswith('-') and line.strip("-"):
                markers[i] = 'd'

    return markers


def _process_marked_candidate_indexes(candidate, markers):
    """
    Run regexes against candidate's marked indexes to strip
    signature candidate.

    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
    [15, 17]
    """
    match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
    return candidate[-match.end('candidate'):] if match else []
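
A quick sketch of the pattern-based extractor above (illustrative input; '\n' stands for the line delimiter):

    from talon.signature.bruteforce import extract_signature

    text, sig = extract_signature('Wow. Awesome!\n--\nBob Smith')
    # expected: ('Wow. Awesome!', '--\nBob Smith')
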
talon/signature/constants.py (new file, 2 lines)
@@ -0,0 +1,2 @@
SIGNATURE_MAX_LINES = 11
TOO_LONG_SIGNATURE_LINE = 60
talon/signature/data/classifier (new file, 1 line)
File diff suppressed because one or more lines are too long
talon/signature/data/train.data (new file, 2912 lines)
File diff suppressed because it is too large
talon/signature/extraction.py (new file, 116 lines)
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-

import os
import logging

import regex as re
from PyML import SparseDataSet

from talon.constants import RE_DELIMITER
from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
from talon.signature.bruteforce import get_signature_candidate
from talon.signature.learning.helpers import has_signature


log = logging.getLogger(__name__)

EXTRACTOR = None

# regex signature pattern for reversed lines
# assumes that all long lines have been excluded
RE_REVERSE_SIGNATURE = re.compile(r'''
    # signature should consist of blocks like this
    (?:
        # it could end with empty line
        e*
        # there could be text lines but no more than 2 in a row
        (te*){,2}
        # every block should end with signature line
        s
    )+
''', re.I | re.X | re.M | re.S)


def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
    data = SparseDataSet([build_pattern(line, features(sender))])
    return classifier.decisionFunc(data, 0) > 0


def extract(body, sender):
    """Strips signature from the body of the message.

    Returns stripped body and signature as a tuple.
    If no signature is found the corresponding returned value is None.
    """
    try:
        delimiter = get_delimiter(body)

        body = body.strip()

        if has_signature(body, sender):
            lines = body.splitlines()

            markers = _mark_lines(lines, sender)
            text, signature = _process_marked_lines(lines, markers)

            if signature:
                text = delimiter.join(text)
                if text.strip():
                    return (text, delimiter.join(signature))
    except Exception, e:
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)


def _mark_lines(lines, sender):
    """Mark message lines with markers to distinguish signature lines.

    Markers:

    * e - empty line
    * s - line identified as signature
    * t - other i.e. ordinary text line

    >>> _mark_lines(['Some text', '', 'Bob'], 'Bob')
    'tes'
    """
    global EXTRACTOR

    candidate = get_signature_candidate(lines)

    # at first consider everything to be text, no signature
    markers = bytearray('t'*len(lines))

    # mark lines starting from bottom up
    # mark only lines that belong to candidate
    # no need to mark all lines of the message
    for i, line in reversed(list(enumerate(candidate))):
        # markers correspond to lines, not candidate,
        # so we need to recalculate our index to be
        # relative to lines, not candidate
        j = len(lines) - len(candidate) + i
        if not line.strip():
            markers[j] = 'e'
        elif is_signature_line(line, sender, EXTRACTOR):
            markers[j] = 's'

    return markers


def _process_marked_lines(lines, markers):
    """Run regexes against message's marked lines to strip signature.

    >>> _process_marked_lines(['Some text', '', 'Bob'], 'tes')
    (['Some text', ''], ['Bob'])
    """
    # reverse lines and match signature pattern for reversed lines
    signature = RE_REVERSE_SIGNATURE.match(markers[::-1])
    if signature:
        return (lines[:-signature.end()], lines[-signature.end():])

    return (lines, None)
talon/signature/learning/__init__.py (new file, 0 lines)
talon/signature/learning/classifier.py (new file, 36 lines)
@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-

"""The module's functions could init, train, save and load a classifier.
The classifier could be used to detect if a certain line of the message
body belongs to the signature.
"""

import os
import sys

from PyML import SparseDataSet, SVM


def init():
    '''Inits classifier with optimal options.'''
    return SVM(C=10, optimization='liblinear')


def train(classifier, train_data_filename, save_classifier_filename=None):
    '''Trains and saves classifier so that it could be easily loaded later.'''
    data = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier.train(data)
    if save_classifier_filename:
        classifier.save(save_classifier_filename)
    return classifier


def load(saved_classifier_filename, train_data_filename):
    """Loads saved classifier.

    Classifier should be loaded with the same data it was trained against.
    """
    train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier = init()
    classifier.load(saved_classifier_filename, train_data)
    return classifier
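
The three functions above chain into a train/persist/reload workflow; a sketch (filenames are placeholders):

    from talon.signature.learning import classifier as c

    clf = c.init()  # SVM(C=10, optimization='liblinear')
    c.train(clf, 'train.data', 'classifier')  # fit on labeled patterns, save to disk
    clf = c.load('classifier', 'train.data')  # reload against the same training data
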
talon/signature/learning/dataset.py (new file, 161 lines)
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-

"""The module's functions build datasets to train/assess classifiers.

For signature detection the input should be a folder with two directories
that contain emails with and without signatures.

For signature extraction the input should be a folder with annotated emails.
To indicate that a line is a signature line use #sig# at the start of the line.

A sender of an email could be specified in the same file as
the message body e.g. when .eml format is used, or in a separate file.

In the latter case it is assumed that a body filename ends with the `_body`
suffix and the corresponding sender file has the same name except for the
suffix, which should be `_sender`.
"""

import os
import regex as re

from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.featurespace import build_pattern, features


SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'

SIGNATURE_ANNOTATION = '#sig#'
REPLY_ANNOTATION = '#reply#'

ANNOTATIONS = [SIGNATURE_ANNOTATION, REPLY_ANNOTATION]


def is_sender_filename(filename):
    """Checks if the file could contain message sender's name."""
    return filename.endswith(SENDER_SUFFIX)


def build_sender_filename(msg_filename):
    """By the message filename gives expected sender's filename."""
    return msg_filename[:-len(BODY_SUFFIX)] + SENDER_SUFFIX


def parse_msg_sender(filename, sender_known=True):
    """Given a filename returns the sender and the message.

    Here the message is assumed to be a whole MIME message or just
    a message body.

    >>> sender, msg = parse_msg_sender('msg.eml')
    >>> sender, msg = parse_msg_sender('msg_body')

    If you don't want to consider the sender's name in your classification
    algorithm:
    >>> parse_msg_sender(filename, False)
    """
    sender, msg = None, None
    if os.path.isfile(filename) and not is_sender_filename(filename):
        with open(filename) as f:
            msg = f.read()
            sender = u''
            if sender_known:
                sender_filename = build_sender_filename(filename)
                if os.path.exists(sender_filename):
                    with open(sender_filename) as sender_file:
                        sender = sender_file.read().strip()
                else:
                    # if sender isn't found then the next line fails
                    # and it is ok
                    lines = msg.splitlines()
                    for line in lines:
                        match = re.match('From:(.*)', line)
                        if match:
                            sender = match.group(1)
                            break
    return (sender, msg)


def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:
    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built from emails in `folder` and appended to the
    dataset file.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if sender is None or msg is None:
                continue
            msg = re.sub('|'.join(ANNOTATIONS), '', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            labeled_pattern = ','.join([str(e) for e in X])
            dataset.write(labeled_pattern + '\n')


def build_detection_dataset(folder, dataset_filename,
                            sender_known=True):
    """Builds signature detection dataset using emails from folder.

    folder should have the following structure:
    x-- folder
    |    x-- P
    |    |    | -- positive sample email 1
    |    |    | -- positive sample email 2
    |    |    | -- ...
    |    x-- N
    |    |    | -- negative sample email 1
    |    |    | -- negative sample email 2
    |    |    | -- ...

    If the dataset file already exists it is rewritten.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    build_detection_class(os.path.join(folder, u'P'),
                          dataset_filename, 1, sender_known)
    build_detection_class(os.path.join(folder, u'N'),
                          dataset_filename, -1, sender_known)


def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            lines = msg.splitlines()
            for i in xrange(1, min(SIGNATURE_MAX_LINES,
                                   len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line[:len(SIGNATURE_ANNOTATION)] == \
                        SIGNATURE_ANNOTATION:
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line[:len(REPLY_ANNOTATION)] == REPLY_ANNOTATION:
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
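
A sketch of building both datasets (folder names are placeholders matching the docstrings above):

    from talon.signature.learning import dataset

    # emails/P and emails/N hold sample emails with and without signatures
    dataset.build_detection_dataset('emails', 'detection.data')

    # annotated/ holds emails whose signature lines start with #sig#
    dataset.build_extraction_dataset('annotated', 'train.data')
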
talon/signature/learning/featurespace.py (new file, 73 lines)
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-

"""The module provides functions for conversion of a message body/body lines
into the classifier's feature space.

The body and the message sender string are converted into unicode before
applying features to them.
"""

from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.helpers import *


def features(sender=''):
    '''Returns a list of signature features.'''
    return [
        # This one isn't from paper.
        # Meant to match company names, sender's names, addresses.
        many_capitalized_words,
        # This one is not from paper.
        # Line is too long.
        # This one is less aggressive than `Line is too short`
        lambda line: 1 if len(line) > 60 else 0,
        # Line contains email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains url.
        binary_regex_search(RE_URL),
        # Line contains phone number pattern.
        binary_regex_search(RE_RELAX_PHONE),
        # Line matches the regular expression "^[\s]*---*[\s]*$".
        binary_regex_match(RE_SEPARATOR),
        # Line has a sequence of 10 or more special characters.
        binary_regex_search(RE_SPECIAL_CHARS),
        # Line contains any typical signature words.
        binary_regex_search(RE_SIGNATURE_WORDS),
        # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
        binary_regex_search(RE_NAME),
        # Percentage of punctuation symbols in the line is larger than 50%
        lambda line: 1 if punctuation_percent(line) > 50 else 0,
        # Percentage of punctuation symbols in the line is larger than 90%
        lambda line: 1 if punctuation_percent(line) > 90 else 0,
        contains_sender_names(sender)
    ]


def apply_features(body, features):
    '''Applies features to message body lines.

    Returns a list of lists. Each inner list corresponds to a body line
    and is constituted by the feature occurrence numbers (0 or 1).
    E.g. if element j of list i equals 1 this means that
    feature j occurred in line i (counting from the last line of the body).
    '''
    # collect all non empty lines
    lines = [line for line in body.splitlines() if line.strip()]

    # take the last SIGNATURE_MAX_LINES
    last_lines = lines[-SIGNATURE_MAX_LINES:]

    # apply features, fallback to zeros
    return ([[f(line) for f in features] for line in last_lines] or
            [[0 for f in features]])


def build_pattern(body, features):
    '''Converts body into a pattern i.e. a point in the feature space.

    Applies features to the body lines and sums up the results.
    Elements of the pattern indicate how many times a certain feature occurred
    in the last lines of the body.
    '''
    line_patterns = apply_features(body, features)
    return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)
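
A sketch of the feature pipeline (made-up body and sender; each element of the resulting pattern counts how often one feature fired in the last lines of the body):

    from talon.signature.learning.featurespace import features, build_pattern

    body = 'Thanks,\nJohn Doe\njohn@example.com'
    pattern = build_pattern(body, features('John Doe <john@example.com>'))
    # pattern is a list with one integer per feature, e.g. the email-pattern
    # feature contributes 1 for the 'john@example.com' line
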
talon/signature/learning/helpers.py (new file, 233 lines)
@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-

"""The module provides:
* functions used when evaluating signature's features
* regexp constants used when evaluating signature's features
"""

import unicodedata
import regex as re

from talon.utils import to_unicode

from talon.signature.constants import SIGNATURE_MAX_LINES


rc = re.compile

RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc('^[\s]*---*[\s]*$')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
                       '[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))

RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
                         '^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
                         '(C|c)orporation|Group'))

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

# Pattern to match if e.g. 'Sender:' header field has sender names.
SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)

# Reply line clue line endings, as in regular expression:
# " wrote:$" or " writes:$"
RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')

INVALID_WORD_START = rc('\(|\+|[\d]')

BAD_SENDER_NAMES = [
    # known mail domains
    'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
    'example',
    # first level domains
    'com', 'org', 'net', 'ru',
    # bad words
    'mailto'
]


def binary_regex_search(prog):
    '''Returns a function that returns 1 or 0 depending on regex search result.

    If regular expression compiled into prog is present in a string
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_search(re.compile("12"))("12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    '''
    return lambda s: 1 if prog.search(s) else 0


def binary_regex_match(prog):
    '''Returns a function that returns 1 or 0 depending on regex match result.

    If a string matches regular expression compiled into prog
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    '''
    return lambda s: 1 if prog.match(s) else 0


def flatten_list(list_to_flatten):
    """Simple list comprehension to flatten list.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
    >>> flatten_list([[1], [[2]]])
    [1, [2]]
    >>> flatten_list([1, [2]])
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not iterable
    """
    return [e for sublist in list_to_flatten for e in sublist]


def contains_sender_names(sender):
    '''Returns a function to search for sender's name or its part.

    >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    return lambda s: False


def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    sender = [word for word in sender.split() if len(word) > 1 and
              word not in BAD_SENDER_NAMES]
    # Remove duplicates
    names = list(set(sender))
    return names


def categories_percent(s, categories):
    '''Returns the percent of characters that fall into the given
    unicode categories.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    '''
    count = 0
    s = to_unicode(s)
    for c in s:
        if unicodedata.category(c) in categories:
            count += 1
    return 100 * float(count) / len(s) if len(s) else 0


def punctuation_percent(s):
    '''Returns punctuation percent.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    '''
    return categories_percent(s, ['Po'])


def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
    s = to_unicode(s)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            if word[0].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter

    return 0


def many_capitalized_words(s):
    """Returns 1 if the percentage of capitalized words in the string
    is greater than 66% and 0 otherwise.
    """
    return 1 if capitalized_words_percent(s) > 66 else 0


def has_signature(body, sender):
    '''Checks if the body has signature. Returns True or False.'''
    non_empty = [line for line in body.splitlines() if line.strip()]
    candidate = non_empty[-SIGNATURE_MAX_LINES:]
    upvotes = 0
    for line in candidate:
        # we check lines for sender's name, phone, email and url;
        # such signature lines are no longer than 27 characters
        if len(line.strip()) > 27:
            continue
        elif contains_sender_names(sender)(line):
            return True
        elif (binary_regex_search(RE_RELAX_PHONE)(line) +
                binary_regex_search(RE_EMAIL)(line) +
                binary_regex_search(RE_URL)(line) == 1):
            upvotes += 1
    if upvotes > 1:
        return True
talon/utils.py (new file, 76 lines)
@@ -0,0 +1,76 @@
# coding:utf-8

import logging
from random import shuffle

from talon.constants import RE_DELIMITER


log = logging.getLogger(__name__)


def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
    strings without raising exceptions
    """
    try:
        if not args and not kwargs:
            return format_string
        else:
            return format_string.format(*args, **kwargs)

    # catch encoding errors and transform everything into utf-8 string
    # before logging:
    except (UnicodeEncodeError, UnicodeDecodeError):
        format_string = to_utf8(format_string)
        args = [to_utf8(p) for p in args]
        kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
        return format_string.format(*args, **kwargs)

    # ignore other errors
    except:
        return u''


def to_unicode(str_or_unicode, precise=False):
    """
    Safely returns a unicode version of a given string
    >>> utils.to_unicode('привет')
    u'привет'
    >>> utils.to_unicode(u'привет')
    u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.
    """
    # NOTE: detect_encoding is assumed to be provided elsewhere;
    # it is only needed when precise=True.
    encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
    if isinstance(str_or_unicode, str):
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode


def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
    >>> utils.to_utf8(u'hi')
    'hi'
    """
    if isinstance(str_or_unicode, unicode):
        return str_or_unicode.encode("utf-8", "ignore")
    return str(str_or_unicode)


def random_token(length=7):
    vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z "
            "0 1 2 3 4 5 6 7 8 9").split(' ')
    shuffle(vals)
    return ''.join(vals[:length])


def get_delimiter(msg_body):
    delimiter = RE_DELIMITER.search(msg_body)
    if delimiter:
        delimiter = delimiter.group()
    else:
        delimiter = '\n'

    return delimiter