170 lines
5.9 KiB
Python
170 lines
5.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
"""The module's functions build datasets to train/assess classifiers.
|
|
|
|
For signature detection the input should be a folder with two directories
|
|
that contain emails with and without signatures.
|
|
|
|
For signature extraction the input should be a folder with annotated emails.
|
|
To indicate that a line is a signature line use #sig# at the start of the line.
|
|
|
|
A sender of an email could be specified in the same file as
|
|
the message body e.g. when .eml format is used or in a separate file.
|
|
|
|
In the latter case it is assumed that a body filename ends with the `_body`
|
|
suffix and the corresponding sender file has the same name except for the
|
|
suffix which should be `_sender`.
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
|
|
import os
|
|
|
|
import regex as re
|
|
from six.moves import range
|
|
|
|
from talon.signature.constants import SIGNATURE_MAX_LINES
|
|
from talon.signature.learning.featurespace import build_pattern, features
|
|
|
|
# Filename suffixes that pair a message body file with its sender file.
SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'

# Markers found at the start of an annotated message line.
SIGNATURE_ANNOTATION = '#sig#'
REPLY_ANNOTATION = '#reply#'

# All known markers; joined into a regex alternation when they are stripped.
ANNOTATIONS = [SIGNATURE_ANNOTATION, REPLY_ANNOTATION]
|
|
|
|
|
|
def is_sender_filename(filename):
    """Tell whether `filename` is named like a message sender file.

    Returns True when the name ends with SENDER_SUFFIX, False otherwise.
    """
    named_like_sender = filename.endswith(SENDER_SUFFIX)
    return named_like_sender
|
|
|
|
|
|
def build_sender_filename(msg_filename):
    """Return the sender filename expected for a message filename.

    A trailing BODY_SUFFIX is replaced with SENDER_SUFFIX, so `msg_body`
    gives `msg_sender`.

    A filename that does not end with BODY_SUFFIX (e.g. `msg.eml`) has no
    suffix to replace; SENDER_SUFFIX is appended to it as is, rather than
    cutting off its last characters and possibly naming an unrelated file.
    """
    if msg_filename.endswith(BODY_SUFFIX):
        msg_filename = msg_filename[:-len(BODY_SUFFIX)]
    return msg_filename + SENDER_SUFFIX
|
|
|
|
|
|
def parse_msg_sender(filename, sender_known=True):
    """Given a filename returns the sender and the message.

    Here the message is assumed to be a whole MIME message or just
    message body.

    >>> sender, msg = parse_msg_sender('msg.eml')
    >>> sender, msg = parse_msg_sender('msg_body')

    If you don't want to consider the sender's name in your classification
    algorithm:
    >>> parse_msg_sender(filename, False)

    Returns a `(sender, msg)` tuple.  Both items are None when `filename`
    is not a regular file or is itself a sender file.  Otherwise `msg` is
    the file content and `sender` is:

    * an empty string when `sender_known` is false;
    * the stripped content of the matching sender file when it exists;
    * the text following `From:` on the first message line that starts
      with it, or an empty string when there is no such line.
    """
    import sys

    # Python 2's built-in open() does not accept `encoding`, so the
    # argument is only passed on Python 3.
    open_kwargs = {}
    if sys.version_info > (3, 0):
        open_kwargs["encoding"] = "utf8"

    if not os.path.isfile(filename) or is_sender_filename(filename):
        return (None, None)

    with open(filename, **open_kwargs) as f:
        msg = f.read()

    sender = u''
    if not sender_known:
        return (sender, msg)

    sender_filename = build_sender_filename(filename)
    if os.path.exists(sender_filename):
        # Read the sender file with the same explicit encoding as the
        # message instead of the platform default one.
        with open(sender_filename, **open_kwargs) as sender_file:
            sender = sender_file.read().strip()
    else:
        # No separate sender file: fall back to the first `From:` line.
        # The sender stays empty if the message has none.
        for line in msg.splitlines():
            match = re.match('From:(.*)', line)
            if match:
                sender = match.group(1)
                break
    return (sender, msg)
|
|
|
|
|
|
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Appends one class of signature detection patterns to a dataset file.

    A signature detection dataset holds patterns of two classes:
    * positive patterns (written with label 1)
    * negative patterns (written with label -1)

    Every file in `folder` that `parse_msg_sender` can read is stripped of
    its annotation markers, turned into a pattern and appended to
    `dataset_filename` as one comma-separated row ending with `label`.
    Files for which no sender or message is returned are skipped.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            if sender is None or msg is None:
                continue
            # Annotation markers must not leak into the features.
            clean_msg = re.sub('|'.join(ANNOTATIONS), '', msg)
            pattern = build_pattern(clean_msg, features(sender))
            pattern.append(label)
            dataset.write(','.join(map(str, pattern)) + '\n')
|
|
|
|
|
|
def build_detection_dataset(folder, dataset_filename,
                            sender_known=True):
    """Builds signature detection dataset using emails from folder.

    folder should have the following structure:
    x-- folder
    |    x-- P
    |    |    | -- positive sample email 1
    |    |    | -- positive sample email 2
    |    |    | -- ...
    |    x-- N
    |    |    | -- negative sample email 1
    |    |    | -- negative sample email 2
    |    |    | -- ...

    Emails from `P` are written with label 1 and emails from `N` with
    label -1.  `sender_known` is forwarded to `build_detection_class` for
    both classes.

    If the dataset file already exists it is rewritten.
    """
    # build_detection_class appends, so start from a clean file.
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    for subfolder, label in ((u'P', 1), (u'N', -1)):
        build_detection_class(os.path.join(folder, subfolder),
                              dataset_filename, label, sender_known)
|
|
|
|
|
|
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    For each message, at most SIGNATURE_MAX_LINES lines are taken from its
    end, last line first.  Every such line gives one comma-separated row:
    its pattern followed by label 1 if the line starts with `#sig#` and
    -1 otherwise.  A leading `#sig#` or `#reply#` marker is removed before
    the pattern is built.

    Files for which `parse_msg_sender` returns no sender or message are
    skipped.  An empty sender, which is what `sender_known=False` gives,
    does not cause a message to be skipped.

    If the dataset file already exists it is rewritten.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for entry in os.listdir(folder):
            path = os.path.join(folder, entry)
            sender, msg = parse_msg_sender(path, sender_known)
            # Only None marks an unusable file; an empty sender is the
            # normal result when sender_known is false.
            if sender is None or msg is None:
                continue
            lines = msg.splitlines()
            # Walk the tail of the message bottom-up.
            for i in range(1, min(SIGNATURE_MAX_LINES,
                                  len(lines)) + 1):
                line = lines[-i]
                label = -1
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
|