Files
talon/tests/signature/extraction_test.py
Yacine Filali 15e61768f2 Encoding fixes
2017-05-23 16:17:39 -07:00

179 lines
5.1 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from .. import *
def test_message_shorter_SIGNATURE_MAX_LINES():
    """Check the text/signature split on a short, three-line body."""
    author = "bob@foo.bar"
    message = """Call me ASAP, please.This is about the last changes you deployed.
Thanks in advance,
Bob"""
    message_lines = message.splitlines()
    # Expected parts are derived from the body itself: the first two lines
    # are the text, the last two lines are the signature.
    expected_text = '\n'.join(message_lines[:2])
    expected_signature = '\n'.join(message_lines[-2:])

    result_text, result_signature = extract(message, author)

    eq_(expected_text, result_text)
    eq_(expected_signature, result_signature)
def test_messages_longer_SIGNATURE_MAX_LINES():
    """Run extract() over every ``*_body`` fixture found in STRIPPED.

    For each body file the sibling ``*_signature`` file holds the expected
    signature; the expected text is the body with that many trailing
    characters cut off.
    """
    import sys

    # ``open`` only takes ``encoding`` on Python 3, so pass it conditionally.
    open_kwargs = {"encoding": "utf8"} if sys.version_info > (3, 0) else {}

    for entry in os.listdir(STRIPPED):
        body_path = os.path.join(STRIPPED, entry)
        if not body_path.endswith('_body'):
            continue

        sender, body = dataset.parse_msg_sender(body_path)
        text, extracted_signature = extract(body, sender)
        # extract() may hand back None for the signature; normalise to ''.
        extracted_signature = extracted_signature or ''

        # Swap the trailing 'body' for 'signature' to get the fixture name.
        signature_path = body_path[:-len('body')] + 'signature'
        with open(signature_path, **open_kwargs) as signature_file:
            msg_signature = signature_file.read()

        eq_(msg_signature.strip(), extracted_signature.strip())

        trimmed_body = body.strip()
        stripped_msg = trimmed_body[:len(trimmed_body) - len(msg_signature)]
        eq_(stripped_msg.strip(), text.strip())
def test_text_line_in_signature():
    """The signature should consist of one solid part.

    A plain-text line sitting between the closing phrase and the name is
    expected to be kept inside the extracted signature.
    """
    author = "bob@foo.bar"
    message = """Call me ASAP, please.This is about the last changes you deployed.
Thanks in advance,
some text which doesn't seem to be a signature at all
Bob"""
    message_lines = message.splitlines()
    expected_text = '\n'.join(message_lines[:2])
    expected_signature = '\n'.join(message_lines[-3:])

    result_text, result_signature = extract(message, author)

    eq_(expected_text, result_text)
    eq_(expected_signature, result_signature)
def test_long_line_in_signature():
    """Check how a long line near the end of the body affects extraction.

    First scenario: a long line sits right above the sender's name, and only
    the final line ('Bob') is expected as the signature.

    Second scenario: a body made of a greeting and a long line is expected
    to yield no signature at all.
    """
    sender = "bob@foo.bar"
    body = """Call me ASAP, please.This is about the last changes you deployed.
Thanks in advance,
some long text here which doesn't seem to be a signature at all
Bob"""
    text, extracted_signature = extract(body, sender)
    eq_('\n'.join(body.splitlines()[:-1]), text)
    eq_('Bob', extracted_signature)

    body = """Thanks David,
some *long* text here which doesn't seem to be a signature at all
"""
    # The original statement here was a bare tuple expression without
    # ``eq_``: it called extract() but compared nothing, so this scenario
    # could never fail. Assert the outcome explicitly.
    #
    # Only the signature half is checked, mirroring
    # test_over_2_text_lines_after_signature: this body ends with a newline
    # and whether extract() normalises whitespace in the returned text is
    # not visible from this file.
    # NOTE(review): confirm the expectation against the trained classifier.
    _, extracted_signature = extract(body, "david@example.com")
    eq_(None, extracted_signature)
def test_basic():
    """A '--' delimited block after the text is returned as the signature."""
    raw_message = 'Blah\r\n--\r\n\r\nSergey Obukhov'
    expected = ('Blah', '--\r\n\r\nSergey Obukhov')

    actual = extract(raw_message, 'Sergey')

    eq_(expected, actual)
def test_capitalized():
    """Only the trailing three-line block is expected as the signature.

    The sender name ('Doe') also appears earlier in the body; the expected
    signature is just the last three lines.
    """
    expected_signature = """John Smith
Doe Inc
555-531-7967"""
    message = """Hi Mary,
Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
DJ Doe
http://example.com
Password: SUPERPASSWORD
Would you like to check out more?
At your service,
John Smith
Doe Inc
555-531-7967"""

    _, actual_signature = extract(message, 'Doe')

    eq_(expected_signature, actual_signature)
def test_over_2_text_lines_after_signature():
    """No signature is expected when several text lines follow the name."""
    message = """Blah
Bob,
If there are more than
2 non signature lines in the end
It's not signature
"""
    result = extract(message, "Bob")
    # Unpack to keep the check that extract() yields exactly two values.
    _text, found_signature = result

    eq_(found_signature, None)
def test_no_signature():
    """A one-word body comes back unchanged with ``None`` as signature."""
    author = "bob@foo.bar"
    message = "Hello"
    expected = (message, None)

    eq_(expected, extract(message, author))
def test_handles_unicode():
    """Smoke test: extract() runs on the UNICODE_MSG fixture without raising.

    Nothing is asserted about the result beyond it unpacking into two values.
    """
    author, message = dataset.parse_msg_sender(UNICODE_MSG)
    _text, _signature = extract(message, author)
@patch.object(extraction, 'has_signature')
def test_signature_extract_crash(has_signature):
    """extract() is expected to return (body, None) if has_signature raises."""
    # Make the patched helper blow up on any call.
    has_signature.side_effect = Exception('Bam!')

    msg_body = u'Blah\r\n--\r\n\r\nСергей'
    expected = (msg_body, None)

    eq_(expected, extract(msg_body, 'Сергей'))
def test_mark_lines():
    """Check the marker string from _mark_lines under patched line limits."""
    sender = 'Bob Smith'

    with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 2):
        # we analyse the 2nd line as well though it's the 6th line
        # (starting from the bottom) because we don't count empty lines
        five_lines = ['Bob Smith', 'Bob Smith', 'Bob Smith', '', 'some text']
        markers = e._mark_lines(five_lines, sender)
        eq_('ttset', markers)

    with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
        # we don't analyse the 1st line because
        # a signature can't start from the 1st line
        four_lines = ['Bob Smith', 'Bob Smith', '', 'some text']
        markers = e._mark_lines(four_lines, sender)
        eq_('tset', markers)
def test_process_marked_lines():
    """Table-driven checks of _process_marked_lines.

    Each case is ``(line_count, markers, expected)``: the function is called
    with ``list(range(line_count))`` standing in for the lines, plus the
    marker string, and must return ``expected`` as ``(text, signature)``.
    """
    cases = [
        # no signature found
        (5, 'telt', (list(range(5)), None)),
        # signature in the middle of the text
        (9, 'tesestelt', (list(range(9)), None)),
        # long line splits signature
        (9, 'tsslsless', (list(range(7)), [7, 8])),
        (21, 'ttttttstttesllelelets', (list(range(20)), [20])),
        # some signature lines could be identified as text
        (9, 'tsetetest', ([0], list(range(1, 9)))),
        (5, "ststt", ([], list(range(5)))),
    ]

    for line_count, markers, expected in cases:
        fake_lines = list(range(line_count))
        eq_(expected, e._process_marked_lines(fake_lines, markers))