Merge branch 'master' into patch-1

This commit is contained in:
Sergey Obukhov
2017-06-18 22:51:46 -07:00
committed by GitHub
10 changed files with 148 additions and 90 deletions

2
.gitignore vendored
View File

@@ -39,6 +39,8 @@ nosetests.xml
/.emacs.desktop /.emacs.desktop
/.emacs.desktop.lock /.emacs.desktop.lock
.elc .elc
.idea
.cache
auto-save-list auto-save-list
tramp tramp
.\#* .\#*

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.3.7', version='1.4.0',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),

View File

@@ -1,15 +1,15 @@
from __future__ import absolute_import from __future__ import absolute_import
import logging import logging
import regex as re import regex as re
from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES, from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE) TOO_LONG_SIGNATURE_LINE)
from talon.utils import get_delimiter
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# regex to fetch signature based on common signature words # regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r''' RE_SIGNATURE = re.compile(r'''
( (
@@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r'''
) )
''', re.I | re.X | re.M | re.S) ''', re.I | re.X | re.M | re.S)
# signatures appended by phone email clients # signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r''' RE_PHONE_SIGNATURE = re.compile(r'''
( (
@@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r'''
) )
''', re.I | re.X | re.M | re.S) ''', re.I | re.X | re.M | re.S)
# see _mark_candidate_indexes() for details # see _mark_candidate_indexes() for details
# c - could be signature line # c - could be signature line
# d - line starts with dashes (could be signature or list item) # d - line starts with dashes (could be signature or list item)
@@ -112,7 +110,7 @@ def extract_signature(msg_body):
return (stripped_body.strip(), return (stripped_body.strip(),
signature.strip()) signature.strip())
except Exception as e: except Exception:
log.exception('ERROR extracting signature') log.exception('ERROR extracting signature')
return (msg_body, None) return (msg_body, None)
@@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate):
'cdc' 'cdc'
""" """
# at first consider everything to be potential signature lines # at first consider everything to be potential signature lines
markers = bytearray('c'*len(candidate)) markers = list('c' * len(candidate))
# mark lines starting from bottom up # mark lines starting from bottom up
for i, line_idx in reversed(list(enumerate(candidate))): for i, line_idx in reversed(list(enumerate(candidate))):
@@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate):
if line.startswith('-') and line.strip("-"): if line.startswith('-') and line.strip("-"):
markers[i] = 'd' markers[i] = 'd'
return markers return "".join(markers)
def _process_marked_candidate_indexes(candidate, markers): def _process_marked_candidate_indexes(candidate, markers):

View File

@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import from __future__ import absolute_import
import logging import logging
import regex as re
import numpy import numpy
import regex as re
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
from talon.signature.bruteforce import get_signature_candidate from talon.signature.bruteforce import get_signature_candidate
from talon.signature.learning.featurespace import features, build_pattern
from talon.signature.learning.helpers import has_signature from talon.signature.learning.helpers import has_signature
from talon.utils import get_delimiter
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -58,7 +57,7 @@ def extract(body, sender):
text = delimiter.join(text) text = delimiter.join(text)
if text.strip(): if text.strip():
return (text, delimiter.join(signature)) return (text, delimiter.join(signature))
except Exception: except Exception as e:
log.exception('ERROR when extracting signature with classifiers') log.exception('ERROR when extracting signature with classifiers')
return (body, None) return (body, None)
@@ -81,7 +80,7 @@ def _mark_lines(lines, sender):
candidate = get_signature_candidate(lines) candidate = get_signature_candidate(lines)
# at first consider everything to be text no signature # at first consider everything to be text no signature
markers = bytearray('t'*len(lines)) markers = list('t' * len(lines))
# mark lines starting from bottom up # mark lines starting from bottom up
# mark only lines that belong to candidate # mark only lines that belong to candidate
@@ -96,7 +95,7 @@ def _mark_lines(lines, sender):
elif is_signature_line(line, sender, EXTRACTOR): elif is_signature_line(line, sender, EXTRACTOR):
markers[j] = 's' markers[j] = 's'
return markers return "".join(markers)
def _process_marked_lines(lines, markers): def _process_marked_lines(lines, markers):
@@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers):
return (lines[:-signature.end()], lines[-signature.end():]) return (lines[:-signature.end()], lines[-signature.end():])
return (lines, None) return (lines, None)

View File

@@ -6,9 +6,10 @@ body belongs to the signature.
""" """
from __future__ import absolute_import from __future__ import absolute_import
from numpy import genfromtxt from numpy import genfromtxt
from sklearn.svm import LinearSVC
from sklearn.externals import joblib from sklearn.externals import joblib
from sklearn.svm import LinearSVC
def init(): def init():
@@ -29,4 +30,40 @@ def train(classifier, train_data_filename, save_classifier_filename=None):
def load(saved_classifier_filename, train_data_filename):
    """Loads saved classifier.

    First asks joblib to load `saved_classifier_filename`. If that raises
    and the interpreter is Python 3, the file is handed to `load_compat()`
    instead; on older interpreters the original exception is re-raised.

    `train_data_filename` is accepted but not used by this function.
    """
    import sys

    try:
        classifier = joblib.load(saved_classifier_filename)
    except Exception:
        if sys.version_info <= (3, 0):
            # no compatibility path on Python 2: surface the joblib error
            raise
        classifier = load_compat(saved_classifier_filename)
    return classifier
def load_compat(saved_classifier_filename):
    """Load a classifier file that plain `joblib.load` could not read.

    The file is unpickled with ``encoding='latin1'`` and then re-dumped with
    joblib, either over the original file or, if that write fails, into a
    temporary file. The result is finally read back with `joblib.load`.

    The working directory is switched to the classifier's directory for the
    duration of the call and is always restored afterwards, even when one
    of the steps raises.

    :param saved_classifier_filename: path to the saved classifier; may be
        relative to the current working directory.
    :return: the classifier object returned by `joblib.load`.
    """
    import os
    import pickle
    import tempfile

    # Resolve the path BEFORE changing directory: a relative name would stop
    # resolving after chdir, and a bare filename has an empty dirname which
    # os.chdir() rejects.
    classifier_path = os.path.abspath(saved_classifier_filename)

    # we need to switch to the data path to properly load the related _xx.npy files
    cwd = os.getcwd()
    os.chdir(os.path.dirname(classifier_path))
    try:
        # convert encoding using pickle.load
        # NOTE: unpickling executes code embedded in the file; only use this
        # on classifier files that come from a trusted source.
        with open(classifier_path, 'rb') as pickle_file:
            classifier = pickle.load(pickle_file, encoding='latin1')

        try:
            # save our conversion if permissions allow
            joblib.dump(classifier, classifier_path)
        except Exception:
            # can't write to classifier, use a temp file
            with tempfile.SpooledTemporaryFile() as tmp:
                joblib.dump(classifier, tmp)
                # rewind explicitly instead of relying on the loader to do it
                tmp.seek(0)
                return joblib.load(tmp)

        # important, use joblib.load before switching back to original cwd
        return joblib.load(classifier_path)
    finally:
        os.chdir(cwd)

View File

@@ -17,13 +17,14 @@ suffix which should be `_sender`.
""" """
from __future__ import absolute_import from __future__ import absolute_import
import os import os
import regex as re import regex as re
from six.moves import range
from talon.signature.constants import SIGNATURE_MAX_LINES from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.featurespace import build_pattern, features from talon.signature.learning.featurespace import build_pattern, features
from six.moves import range
SENDER_SUFFIX = '_sender' SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body' BODY_SUFFIX = '_body'
@@ -57,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True):
algorithm: algorithm:
>>> parse_msg_sender(filename, False) >>> parse_msg_sender(filename, False)
""" """
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
sender, msg = None, None sender, msg = None, None
if os.path.isfile(filename) and not is_sender_filename(filename): if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename) as f: with open(filename, **kwargs) as f:
msg = f.read() msg = f.read()
sender = u'' sender = u''
if sender_known: if sender_known:

View File

@@ -1,19 +1,18 @@
# coding:utf-8 # coding:utf-8
from __future__ import absolute_import from __future__ import absolute_import
import logging
from random import shuffle from random import shuffle
import chardet
import cchardet import cchardet
import regex as re import chardet
from lxml.html import html5parser
from lxml.cssselect import CSSSelector
import html5lib import html5lib
import regex as re
import six
from lxml.cssselect import CSSSelector
from lxml.html import html5parser
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
import six
def safe_format(format_string, *args, **kwargs): def safe_format(format_string, *args, **kwargs):
@@ -177,6 +176,8 @@ def html_to_text(string):
def html_fromstring(s): def html_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed. """Parse html tree from string. Return None if the string can't be parsed.
""" """
if isinstance(s, six.text_type):
s = s.encode('utf8')
try: try:
if html_too_big(s): if html_too_big(s):
return None return None
@@ -189,6 +190,8 @@ def html_fromstring(s):
def html_document_fromstring(s): def html_document_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed. """Parse html tree from string. Return None if the string can't be parsed.
""" """
if isinstance(s, six.text_type):
s = s.encode('utf8')
try: try:
if html_too_big(s): if html_too_big(s):
return None return None
@@ -203,7 +206,9 @@ def cssselect(expr, tree):
def html_too_big(s):
    """Return True when `s` holds more than _MAX_TAGS_COUNT '<' bytes.

    Text input is encoded to UTF-8 first so the count is always done on
    bytes; any other input is counted as given.
    """
    raw = s.encode('utf8') if isinstance(s, six.text_type) else s
    opening_brackets = raw.count(b'<')
    return opening_brackets > _MAX_TAGS_COUNT
def _contains_charset_spec(s): def _contains_charset_spec(s):
@@ -248,7 +253,6 @@ def _html5lib_parser():
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
b'charset=utf-8">') b'charset=utf-8">')
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr'] _HARDBREAKS = ['br', 'hr', 'tr']

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import from __future__ import absolute_import
from . import *
from . fixtures import *
import regex as re # noinspection PyUnresolvedReferences
import re
from talon import quotations, utils as u from talon import quotations, utils as u
from . import *
from .fixtures import *
RE_WHITESPACE = re.compile("\s") RE_WHITESPACE = re.compile("\s")
RE_DOUBLE_WHITESPACE = re.compile("\s") RE_DOUBLE_WHITESPACE = re.compile("\s")
@@ -303,7 +303,12 @@ Reply
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
f = open(filename, **kwargs)
msg_body = f.read() msg_body = f.read()
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)

View File

@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from __future__ import absolute_import from __future__ import absolute_import
from .. import *
import os import os
from talon.signature.learning import dataset
from talon import signature
from talon.signature import extraction as e
from talon.signature import bruteforce
from six.moves import range from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from .. import *
def test_message_shorter_SIGNATURE_MAX_LINES(): def test_message_shorter_SIGNATURE_MAX_LINES():
sender = "bob@foo.bar" sender = "bob@foo.bar"
@@ -18,20 +18,25 @@ def test_message_shorter_SIGNATURE_MAX_LINES():
Thanks in advance, Thanks in advance,
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) eq_('\n'.join(body.splitlines()[-2:]), extracted_signature)
def test_messages_longer_SIGNATURE_MAX_LINES(): def test_messages_longer_SIGNATURE_MAX_LINES():
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
for filename in os.listdir(STRIPPED): for filename in os.listdir(STRIPPED):
filename = os.path.join(STRIPPED, filename) filename = os.path.join(STRIPPED, filename)
if not filename.endswith('_body'): if not filename.endswith('_body'):
continue continue
sender, body = dataset.parse_msg_sender(filename) sender, body = dataset.parse_msg_sender(filename)
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
extracted_signature = extracted_signature or '' extracted_signature = extracted_signature or ''
with open(filename[:-len('body')] + 'signature') as ms: with open(filename[:-len('body')] + 'signature', **kwargs) as ms:
msg_signature = ms.read() msg_signature = ms.read()
eq_(msg_signature.strip(), extracted_signature.strip()) eq_(msg_signature.strip(), extracted_signature.strip())
stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)]
@@ -47,7 +52,7 @@ Thanks in advance,
some text which doesn't seem to be a signature at all some text which doesn't seem to be a signature at all
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text) eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) eq_('\n'.join(body.splitlines()[-3:]), extracted_signature)
@@ -60,7 +65,7 @@ Thanks in advance,
some long text here which doesn't seem to be a signature at all some long text here which doesn't seem to be a signature at all
Bob""" Bob"""
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:-1]), text) eq_('\n'.join(body.splitlines()[:-1]), text)
eq_('Bob', extracted_signature) eq_('Bob', extracted_signature)
@@ -68,13 +73,13 @@ Bob"""
some *long* text here which doesn't seem to be a signature at all some *long* text here which doesn't seem to be a signature at all
""" """
((body, None), signature.extract(body, "david@example.com")) ((body, None), extract(body, "david@example.com"))
def test_basic(): def test_basic():
msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov'
eq_(('Blah', '--\r\n\r\nSergey Obukhov'), eq_(('Blah', '--\r\n\r\nSergey Obukhov'),
signature.extract(msg_body, 'Sergey')) extract(msg_body, 'Sergey'))
def test_capitalized(): def test_capitalized():
@@ -99,7 +104,7 @@ Doe Inc
Doe Inc Doe Inc
555-531-7967""" 555-531-7967"""
eq_(sig, signature.extract(msg_body, 'Doe')[1]) eq_(sig, extract(msg_body, 'Doe')[1])
def test_over_2_text_lines_after_signature(): def test_over_2_text_lines_after_signature():
@@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature():
2 non signature lines in the end 2 non signature lines in the end
It's not signature It's not signature
""" """
text, extracted_signature = signature.extract(body, "Bob") text, extracted_signature = extract(body, "Bob")
eq_(extracted_signature, None) eq_(extracted_signature, None)
def test_no_signature(): def test_no_signature():
sender, body = "bob@foo.bar", "Hello" sender, body = "bob@foo.bar", "Hello"
eq_((body, None), signature.extract(body, sender)) eq_((body, None), extract(body, sender))
def test_handles_unicode(): def test_handles_unicode():
sender, body = dataset.parse_msg_sender(UNICODE_MSG) sender, body = dataset.parse_msg_sender(UNICODE_MSG)
text, extracted_signature = signature.extract(body, sender) text, extracted_signature = extract(body, sender)
@patch.object(signature.extraction, 'has_signature') @patch.object(extraction, 'has_signature')
def test_signature_extract_crash(has_signature): def test_signature_extract_crash(has_signature):
has_signature.side_effect = Exception('Bam!') has_signature.side_effect = Exception('Bam!')
msg_body = u'Blah\r\n--\r\n\r\nСергей' msg_body = u'Blah\r\n--\r\n\r\nСергей'
eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) eq_((msg_body, None), extract(msg_body, 'Сергей'))
def test_mark_lines(): def test_mark_lines():

View File

@@ -1,12 +1,12 @@
# coding:utf-8 # coding:utf-8
from __future__ import absolute_import from __future__ import absolute_import
from . import *
from talon import utils as u
import cchardet import cchardet
import six import six
from lxml import html
from talon import utils as u
from . import *
def test_get_delimiter(): def test_get_delimiter():
@@ -115,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
def test_comment_no_parent(): def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment" s = b'<!-- COMMENT 1 --> no comment'
d = u.html_document_fromstring(s) d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d)) eq_(b"no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
def test_html_fromstring_exception(): def test_html_fromstring_exception():
eq_(None, u.html_fromstring("<html></html>")) eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock()) @patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring') @patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring): def test_html_fromstring_too_big(fromstring):
@@ -158,5 +159,5 @@ def test_html_too_big():
@patch.object(u, '_MAX_TAGS_COUNT', 3) @patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text(): def test_html_to_text():
eq_("Hello", u.html_to_text("<div>Hello</div>")) eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
eq_(None, u.html_to_text("<div><span>Hi</span></div>")) eq_(None, u.html_to_text("<div><span>Hi</span></div>"))