149 lines
4.6 KiB
Python
149 lines
4.6 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
|
||
|
|
from .. import *
|
||
|
|
|
||
|
|
import os
|
||
|
|
|
||
|
|
from PyML import SparseDataSet
|
||
|
|
|
||
|
|
from talon.signature.learning import dataset
|
||
|
|
from talon import signature
|
||
|
|
from talon.signature import extraction as e
|
||
|
|
from talon.signature import bruteforce
|
||
|
|
|
||
|
|
|
||
|
|
def test_message_shorter_SIGNATURE_MAX_LINES():
|
||
|
|
sender = "bob@foo.bar"
|
||
|
|
body = """Call me ASAP, please.This is about the last changes you deployed.
|
||
|
|
|
||
|
|
Thanks in advance,
|
||
|
|
Bob"""
|
||
|
|
text, extracted_signature = signature.extract(body, sender)
|
||
|
|
eq_('\n'.join(body.splitlines()[:2]), text)
|
||
|
|
eq_('\n'.join(body.splitlines()[-2:]), extracted_signature)
|
||
|
|
|
||
|
|
|
||
|
|
def test_messages_longer_SIGNATURE_MAX_LINES():
|
||
|
|
for filename in os.listdir(STRIPPED):
|
||
|
|
filename = os.path.join(STRIPPED, filename)
|
||
|
|
if not filename.endswith('_body'):
|
||
|
|
continue
|
||
|
|
sender, body = dataset.parse_msg_sender(filename)
|
||
|
|
text, extracted_signature = signature.extract(body, sender)
|
||
|
|
extracted_signature = extracted_signature or ''
|
||
|
|
with open(filename[:-len('body')] + 'signature') as ms:
|
||
|
|
msg_signature = ms.read()
|
||
|
|
eq_(msg_signature.strip(), extracted_signature.strip())
|
||
|
|
stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)]
|
||
|
|
eq_(stripped_msg.strip(), text.strip())
|
||
|
|
|
||
|
|
|
||
|
|
def test_text_line_in_signature():
|
||
|
|
# test signature should consist of one solid part
|
||
|
|
sender = "bob@foo.bar"
|
||
|
|
body = """Call me ASAP, please.This is about the last changes you deployed.
|
||
|
|
|
||
|
|
Thanks in advance,
|
||
|
|
some text which doesn't seem to be a signature at all
|
||
|
|
Bob"""
|
||
|
|
|
||
|
|
text, extracted_signature = signature.extract(body, sender)
|
||
|
|
eq_('\n'.join(body.splitlines()[:2]), text)
|
||
|
|
eq_('\n'.join(body.splitlines()[-3:]), extracted_signature)
|
||
|
|
|
||
|
|
|
||
|
|
def test_long_line_in_signature():
|
||
|
|
sender = "bob@foo.bar"
|
||
|
|
body = """Call me ASAP, please.This is about the last changes you deployed.
|
||
|
|
|
||
|
|
Thanks in advance,
|
||
|
|
some long text here which doesn't seem to be a signature at all
|
||
|
|
Bob"""
|
||
|
|
|
||
|
|
text, extracted_signature = signature.extract(body, sender)
|
||
|
|
eq_('\n'.join(body.splitlines()[:-1]), text)
|
||
|
|
eq_('Bob', extracted_signature)
|
||
|
|
|
||
|
|
body = """Thanks David,
|
||
|
|
|
||
|
|
some *long* text here which doesn't seem to be a signature at all
|
||
|
|
"""
|
||
|
|
((body, None), signature.extract(body, "david@example.com"))
|
||
|
|
|
||
|
|
|
||
|
|
def test_basic():
|
||
|
|
msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov'
|
||
|
|
eq_(('Blah', '--\r\n\r\nSergey Obukhov'),
|
||
|
|
signature.extract(msg_body, 'Sergey'))
|
||
|
|
|
||
|
|
|
||
|
|
def test_over_2_text_lines_after_signature():
|
||
|
|
body = """Blah
|
||
|
|
|
||
|
|
Bob,
|
||
|
|
If there are more than
|
||
|
|
2 non signature lines in the end
|
||
|
|
It's not signature
|
||
|
|
"""
|
||
|
|
text, extracted_signature = signature.extract(body, "Bob")
|
||
|
|
eq_(extracted_signature, None)
|
||
|
|
|
||
|
|
|
||
|
|
def test_no_signature():
|
||
|
|
sender, body = "bob@foo.bar", "Hello"
|
||
|
|
eq_((body, None), signature.extract(body, sender))
|
||
|
|
|
||
|
|
|
||
|
|
def test_handles_unicode():
|
||
|
|
sender, body = dataset.parse_msg_sender(UNICODE_MSG)
|
||
|
|
text, extracted_signature = signature.extract(body, sender)
|
||
|
|
|
||
|
|
|
||
|
|
@patch.object(signature.extraction, 'has_signature')
|
||
|
|
def test_signature_extract_crash(has_signature):
|
||
|
|
has_signature.side_effect = Exception('Bam!')
|
||
|
|
msg_body = u'Blah\r\n--\r\n\r\nСергей'
|
||
|
|
eq_((msg_body, None), signature.extract(msg_body, 'Сергей'))
|
||
|
|
|
||
|
|
|
||
|
|
def test_mark_lines():
|
||
|
|
with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 2):
|
||
|
|
# we analyse the 2nd line as well though it's the 6th line
|
||
|
|
# (starting from the bottom) because we don't count empty line
|
||
|
|
eq_('ttset',
|
||
|
|
e._mark_lines(['Bob Smith',
|
||
|
|
'Bob Smith',
|
||
|
|
'Bob Smith',
|
||
|
|
'',
|
||
|
|
'some text'], 'Bob Smith'))
|
||
|
|
|
||
|
|
with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
|
||
|
|
# we don't analyse the 1st line because
|
||
|
|
# signature cant start from the 1st line
|
||
|
|
eq_('tset',
|
||
|
|
e._mark_lines(['Bob Smith',
|
||
|
|
'Bob Smith',
|
||
|
|
'',
|
||
|
|
'some text'], 'Bob Smith'))
|
||
|
|
|
||
|
|
|
||
|
|
def test_process_marked_lines():
|
||
|
|
# no signature found
|
||
|
|
eq_((range(5), None), e._process_marked_lines(range(5), 'telt'))
|
||
|
|
|
||
|
|
# signature in the middle of the text
|
||
|
|
eq_((range(9), None), e._process_marked_lines(range(9), 'tesestelt'))
|
||
|
|
|
||
|
|
# long line splits signature
|
||
|
|
eq_((range(7), [7, 8]),
|
||
|
|
e._process_marked_lines(range(9), 'tsslsless'))
|
||
|
|
|
||
|
|
eq_((range(20), [20]),
|
||
|
|
e._process_marked_lines(range(21), 'ttttttstttesllelelets'))
|
||
|
|
|
||
|
|
# some signature lines could be identified as text
|
||
|
|
eq_(([0], range(1, 9)), e._process_marked_lines(range(9), 'tsetetest'))
|
||
|
|
|
||
|
|
eq_(([], range(5)),
|
||
|
|
e._process_marked_lines(range(5), "ststt"))
|