Run modernizer on the code.
This commit is contained in:
1
setup.py
1
setup.py
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
from setuptools import setup, find_packages
|
from setuptools import setup, find_packages
|
||||||
from setuptools.command.install import install
|
from setuptools.command.install import install
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
from talon.quotations import register_xpath_extensions
|
from talon.quotations import register_xpath_extensions
|
||||||
try:
|
try:
|
||||||
from talon import signature
|
from talon import signature
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ The module's functions operate on message bodies trying to extract original
|
|||||||
messages (without quoted messages) from html
|
messages (without quoted messages) from html
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ The module's functions operate on message bodies trying to extract
|
|||||||
original messages (without quoted messages)
|
original messages (without quoted messages)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import regex as re
|
import regex as re
|
||||||
import logging
|
import logging
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
@@ -13,6 +14,7 @@ from lxml import html, etree
|
|||||||
|
|
||||||
from talon.utils import get_delimiter, html_to_text
|
from talon.utils import get_delimiter, html_to_text
|
||||||
from talon import html_quotations
|
from talon import html_quotations
|
||||||
|
from six.moves import range
|
||||||
|
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@@ -207,7 +209,7 @@ def mark_message_lines(lines):
|
|||||||
if splitter:
|
if splitter:
|
||||||
# append as many splitter markers as lines in splitter
|
# append as many splitter markers as lines in splitter
|
||||||
splitter_lines = splitter.group().splitlines()
|
splitter_lines = splitter.group().splitlines()
|
||||||
for j in xrange(len(splitter_lines)):
|
for j in range(len(splitter_lines)):
|
||||||
markers[i + j] = 's'
|
markers[i + j] = 's'
|
||||||
|
|
||||||
# skip splitter lines
|
# skip splitter lines
|
||||||
@@ -388,7 +390,7 @@ def extract_from_html(msg_body):
|
|||||||
lines_were_deleted, first_deleted, last_deleted = return_flags
|
lines_were_deleted, first_deleted, last_deleted = return_flags
|
||||||
if lines_were_deleted:
|
if lines_were_deleted:
|
||||||
#collect checkpoints from deleted lines
|
#collect checkpoints from deleted lines
|
||||||
for i in xrange(first_deleted, last_deleted):
|
for i in range(first_deleted, last_deleted):
|
||||||
for checkpoint in line_checkpoints[i]:
|
for checkpoint in line_checkpoints[i]:
|
||||||
quotation_checkpoints[checkpoint] = True
|
quotation_checkpoints[checkpoint] = True
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ trained against, don't forget to regenerate:
|
|||||||
* signature/data/classifier
|
* signature/data/classifier
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from . import extraction
|
from . import extraction
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
@@ -111,7 +112,7 @@ def extract_signature(msg_body):
|
|||||||
|
|
||||||
return (stripped_body.strip(),
|
return (stripped_body.strip(),
|
||||||
signature.strip())
|
signature.strip())
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
log.exception('ERROR extracting signature')
|
log.exception('ERROR extracting signature')
|
||||||
return (msg_body, None)
|
return (msg_body, None)
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message
|
|||||||
body belongs to the signature.
|
body belongs to the signature.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from numpy import genfromtxt
|
from numpy import genfromtxt
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import LinearSVC
|
||||||
from sklearn.externals import joblib
|
from sklearn.externals import joblib
|
||||||
|
|||||||
@@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the
|
|||||||
suffix which should be `_sender`.
|
suffix which should be `_sender`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
from talon.signature.constants import SIGNATURE_MAX_LINES
|
from talon.signature.constants import SIGNATURE_MAX_LINES
|
||||||
from talon.signature.learning.featurespace import build_pattern, features
|
from talon.signature.learning.featurespace import build_pattern, features
|
||||||
|
from six.moves import range
|
||||||
|
|
||||||
|
|
||||||
SENDER_SUFFIX = '_sender'
|
SENDER_SUFFIX = '_sender'
|
||||||
@@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename,
|
|||||||
if not sender or not msg:
|
if not sender or not msg:
|
||||||
continue
|
continue
|
||||||
lines = msg.splitlines()
|
lines = msg.splitlines()
|
||||||
for i in xrange(1, min(SIGNATURE_MAX_LINES,
|
for i in range(1, min(SIGNATURE_MAX_LINES,
|
||||||
len(lines)) + 1):
|
len(lines)) + 1):
|
||||||
line = lines[-i]
|
line = lines[-i]
|
||||||
label = -1
|
label = -1
|
||||||
|
|||||||
@@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before
|
|||||||
applying features to them.
|
applying features to them.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
||||||
TOO_LONG_SIGNATURE_LINE)
|
TOO_LONG_SIGNATURE_LINE)
|
||||||
from talon.signature.learning.helpers import *
|
from talon.signature.learning.helpers import *
|
||||||
|
from six.moves import zip
|
||||||
|
from functools import reduce
|
||||||
|
|
||||||
|
|
||||||
def features(sender=''):
|
def features(sender=''):
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# coding:utf-8
|
# coding:utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import logging
|
import logging
|
||||||
from random import shuffle
|
from random import shuffle
|
||||||
import chardet
|
import chardet
|
||||||
@@ -10,6 +11,7 @@ from lxml import html
|
|||||||
from lxml.cssselect import CSSSelector
|
from lxml.cssselect import CSSSelector
|
||||||
|
|
||||||
from talon.constants import RE_DELIMITER
|
from talon.constants import RE_DELIMITER
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
def safe_format(format_string, *args, **kwargs):
|
def safe_format(format_string, *args, **kwargs):
|
||||||
@@ -28,7 +30,7 @@ def safe_format(format_string, *args, **kwargs):
|
|||||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||||
format_string = to_utf8(format_string)
|
format_string = to_utf8(format_string)
|
||||||
args = [to_utf8(p) for p in args]
|
args = [to_utf8(p) for p in args]
|
||||||
kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
|
kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)}
|
||||||
return format_string.format(*args, **kwargs)
|
return format_string.format(*args, **kwargs)
|
||||||
|
|
||||||
# ignore other errors
|
# ignore other errors
|
||||||
@@ -47,7 +49,7 @@ def to_unicode(str_or_unicode, precise=False):
|
|||||||
"""
|
"""
|
||||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||||
if isinstance(str_or_unicode, str):
|
if isinstance(str_or_unicode, str):
|
||||||
return unicode(str_or_unicode, encoding, 'replace')
|
return six.text_type(str_or_unicode, encoding, 'replace')
|
||||||
return str_or_unicode
|
return str_or_unicode
|
||||||
|
|
||||||
|
|
||||||
@@ -61,7 +63,7 @@ def detect_encoding(string):
|
|||||||
detected = chardet.detect(string)
|
detected = chardet.detect(string)
|
||||||
if detected:
|
if detected:
|
||||||
return detected.get('encoding') or 'utf-8'
|
return detected.get('encoding') or 'utf-8'
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
return 'utf-8'
|
return 'utf-8'
|
||||||
|
|
||||||
@@ -76,7 +78,7 @@ def quick_detect_encoding(string):
|
|||||||
detected = cchardet.detect(string)
|
detected = cchardet.detect(string)
|
||||||
if detected:
|
if detected:
|
||||||
return detected.get('encoding') or detect_encoding(string)
|
return detected.get('encoding') or detect_encoding(string)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
pass
|
pass
|
||||||
return detect_encoding(string)
|
return detect_encoding(string)
|
||||||
|
|
||||||
@@ -87,7 +89,7 @@ def to_utf8(str_or_unicode):
|
|||||||
>>> utils.to_utf8(u'hi')
|
>>> utils.to_utf8(u'hi')
|
||||||
'hi'
|
'hi'
|
||||||
"""
|
"""
|
||||||
if isinstance(str_or_unicode, unicode):
|
if isinstance(str_or_unicode, six.text_type):
|
||||||
return str_or_unicode.encode("utf-8", "ignore")
|
return str_or_unicode.encode("utf-8", "ignore")
|
||||||
return str(str_or_unicode)
|
return str(str_or_unicode)
|
||||||
|
|
||||||
@@ -173,7 +175,7 @@ def _rm_excessive_newlines(s):
|
|||||||
def _encode_utf8(s):
|
def _encode_utf8(s):
|
||||||
"""Encode in 'utf-8' if unicode
|
"""Encode in 'utf-8' if unicode
|
||||||
"""
|
"""
|
||||||
return s.encode('utf-8') if isinstance(s, unicode) else s
|
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
||||||
|
|
||||||
|
|
||||||
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
|
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
from nose.tools import *
|
from nose.tools import *
|
||||||
from mock import *
|
from mock import *
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from . import *
|
from . import *
|
||||||
from . fixtures import *
|
from . fixtures import *
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from . import *
|
from . import *
|
||||||
from . fixtures import *
|
from . fixtures import *
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from .. import *
|
from .. import *
|
||||||
|
|
||||||
from talon.signature import bruteforce
|
from talon.signature import bruteforce
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from .. import *
|
from .. import *
|
||||||
|
|
||||||
import os
|
import os
|
||||||
@@ -8,6 +9,7 @@ from talon.signature.learning import dataset
|
|||||||
from talon import signature
|
from talon import signature
|
||||||
from talon.signature import extraction as e
|
from talon.signature import extraction as e
|
||||||
from talon.signature import bruteforce
|
from talon.signature import bruteforce
|
||||||
|
from six.moves import range
|
||||||
|
|
||||||
|
|
||||||
def test_message_shorter_SIGNATURE_MAX_LINES():
|
def test_message_shorter_SIGNATURE_MAX_LINES():
|
||||||
@@ -127,20 +129,20 @@ def test_mark_lines():
|
|||||||
|
|
||||||
def test_process_marked_lines():
|
def test_process_marked_lines():
|
||||||
# no signature found
|
# no signature found
|
||||||
eq_((range(5), None), e._process_marked_lines(range(5), 'telt'))
|
eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt'))
|
||||||
|
|
||||||
# signature in the middle of the text
|
# signature in the middle of the text
|
||||||
eq_((range(9), None), e._process_marked_lines(range(9), 'tesestelt'))
|
eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt'))
|
||||||
|
|
||||||
# long line splits signature
|
# long line splits signature
|
||||||
eq_((range(7), [7, 8]),
|
eq_((list(range(7)), [7, 8]),
|
||||||
e._process_marked_lines(range(9), 'tsslsless'))
|
e._process_marked_lines(list(range(9)), 'tsslsless'))
|
||||||
|
|
||||||
eq_((range(20), [20]),
|
eq_((list(range(20)), [20]),
|
||||||
e._process_marked_lines(range(21), 'ttttttstttesllelelets'))
|
e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets'))
|
||||||
|
|
||||||
# some signature lines could be identified as text
|
# some signature lines could be identified as text
|
||||||
eq_(([0], range(1, 9)), e._process_marked_lines(range(9), 'tsetetest'))
|
eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest'))
|
||||||
|
|
||||||
eq_(([], range(5)),
|
eq_(([], list(range(5))),
|
||||||
e._process_marked_lines(range(5), "ststt"))
|
e._process_marked_lines(list(range(5)), "ststt"))
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from ... import *
|
from ... import *
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from ... import *
|
from ... import *
|
||||||
|
|
||||||
from talon.signature.learning import featurespace as fs
|
from talon.signature.learning import featurespace as fs
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from ... import *
|
from ... import *
|
||||||
|
|
||||||
import regex as re
|
import regex as re
|
||||||
|
|
||||||
from talon.signature.learning import helpers as h
|
from talon.signature.learning import helpers as h
|
||||||
from talon.signature.learning.helpers import *
|
from talon.signature.learning.helpers import *
|
||||||
|
from six.moves import range
|
||||||
|
|
||||||
# First testing regex constants.
|
# First testing regex constants.
|
||||||
VALID = '''
|
VALID = '''
|
||||||
@@ -154,7 +156,7 @@ def test_extract_names():
|
|||||||
# check that extracted names could be compiled
|
# check that extracted names could be compiled
|
||||||
try:
|
try:
|
||||||
re.compile("|".join(extracted_names))
|
re.compile("|".join(extracted_names))
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
ok_(False, ("Failed to compile extracted names {}"
|
ok_(False, ("Failed to compile extracted names {}"
|
||||||
"\n\nReason: {}").format(extracted_names, e))
|
"\n\nReason: {}").format(extracted_names, e))
|
||||||
if expected_names:
|
if expected_names:
|
||||||
@@ -204,7 +206,7 @@ def test_has_signature():
|
|||||||
'sender@example.com'))
|
'sender@example.com'))
|
||||||
assert_false(h.has_signature('http://www.example.com/555-555-5555',
|
assert_false(h.has_signature('http://www.example.com/555-555-5555',
|
||||||
'sender@example.com'))
|
'sender@example.com'))
|
||||||
long_line = ''.join(['q' for e in xrange(28)])
|
long_line = ''.join(['q' for e in range(28)])
|
||||||
assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))
|
assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))
|
||||||
# wont crash on an empty string
|
# wont crash on an empty string
|
||||||
assert_false(h.has_signature('', ''))
|
assert_false(h.has_signature('', ''))
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from . import *
|
from . import *
|
||||||
from . fixtures import *
|
from . fixtures import *
|
||||||
|
|
||||||
@@ -7,6 +8,8 @@ import os
|
|||||||
|
|
||||||
import email.iterators
|
import email.iterators
|
||||||
from talon import quotations
|
from talon import quotations
|
||||||
|
import six
|
||||||
|
from six.moves import range
|
||||||
|
|
||||||
|
|
||||||
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
|
||||||
@@ -138,7 +141,7 @@ def _check_pattern_original_message(original_message_indicator):
|
|||||||
-----{}-----
|
-----{}-----
|
||||||
|
|
||||||
Test"""
|
Test"""
|
||||||
eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))
|
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
|
||||||
|
|
||||||
def test_english_original_message():
|
def test_english_original_message():
|
||||||
_check_pattern_original_message('Original Message')
|
_check_pattern_original_message('Original Message')
|
||||||
@@ -669,7 +672,7 @@ def test_standard_replies():
|
|||||||
continue
|
continue
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
message = email.message_from_file(f)
|
message = email.message_from_file(f)
|
||||||
body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
|
body = next(email.iterators.typed_subpart_iterator(message, subtype='plain'))
|
||||||
text = ''.join(email.iterators.body_line_iterator(body, True))
|
text = ''.join(email.iterators.body_line_iterator(body, True))
|
||||||
|
|
||||||
stripped_text = quotations.extract_from_plain(text)
|
stripped_text = quotations.extract_from_plain(text)
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
# coding:utf-8
|
# coding:utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from . import *
|
from . import *
|
||||||
|
|
||||||
from talon import utils as u
|
from talon import utils as u
|
||||||
import cchardet
|
import cchardet
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
def test_get_delimiter():
|
def test_get_delimiter():
|
||||||
@@ -14,10 +16,10 @@ def test_get_delimiter():
|
|||||||
|
|
||||||
def test_unicode():
|
def test_unicode():
|
||||||
eq_ (u'hi', u.to_unicode('hi'))
|
eq_ (u'hi', u.to_unicode('hi'))
|
||||||
eq_ (type(u.to_unicode('hi')), unicode )
|
eq_ (type(u.to_unicode('hi')), six.text_type )
|
||||||
eq_ (type(u.to_unicode(u'hi')), unicode )
|
eq_ (type(u.to_unicode(u'hi')), six.text_type )
|
||||||
eq_ (type(u.to_unicode('привет')), unicode )
|
eq_ (type(u.to_unicode('привет')), six.text_type )
|
||||||
eq_ (type(u.to_unicode(u'привет')), unicode )
|
eq_ (type(u.to_unicode(u'привет')), six.text_type )
|
||||||
eq_ (u"привет", u.to_unicode('привет'))
|
eq_ (u"привет", u.to_unicode('привет'))
|
||||||
eq_ (u"привет", u.to_unicode(u'привет'))
|
eq_ (u"привет", u.to_unicode(u'привет'))
|
||||||
# some latin1 stuff
|
# some latin1 stuff
|
||||||
|
|||||||
1
train.py
1
train.py
@@ -1,3 +1,4 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
|
from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
|
||||||
from talon.signature.learning.classifier import train, init
|
from talon.signature.learning.classifier import train, init
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user