Run modernizer on the code.

Umair Khan
2016-07-12 17:25:46 +05:00
parent 07f68815df
commit da998ddb60
24 changed files with 61 additions and 27 deletions
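
This is an automated Python 2/3 compatibility pass of the kind the modernizer tool produces with the six library: every module gains `from __future__ import absolute_import`, `xrange` becomes `six.moves.range`, `except Exception, e:` becomes `except Exception as e:`, `unicode`, `dict.iteritems()` and `iterator.next()` become `six.text_type`, `six.iteritems()` and `next()`, and tests that compared results against `range(...)` now wrap it in `list(...)`. A minimal sketch of these idioms follows; the function and variable names are made up for illustration and are not code from talon:

# Illustrative only -- hypothetical names, not code from talon.
from __future__ import absolute_import

import six
from six.moves import range  # xrange on Python 2, the built-in range on Python 3


def demo(values, mapping):
    try:
        # old: for i in xrange(len(values)):
        for i in range(len(values)):
            # old: isinstance(values[i], unicode)
            if isinstance(values[i], six.text_type):
                values[i] = values[i].encode('utf-8', 'ignore')
    # old: except Exception, e:
    except Exception as e:
        raise ValueError('could not encode values: {}'.format(e))

    # old: mapping.iteritems()
    pairs = {k: v for k, v in six.iteritems(mapping)}
    # old: iter(values).next()
    first = next(iter(values), None)
    return values, pairs, first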

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from setuptools import setup, find_packages
 from setuptools.command.install import install

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from talon.quotations import register_xpath_extensions
 try:
     from talon import signature

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import regex as re

View File

@@ -3,6 +3,7 @@ The module's functions operate on message bodies trying to extract original
 messages (without quoted messages) from html
 """
+from __future__ import absolute_import
 import regex as re

View File

@@ -5,6 +5,7 @@ The module's functions operate on message bodies trying to extract
 original messages (without quoted messages)
 """
+from __future__ import absolute_import
 import regex as re
 import logging
 from copy import deepcopy
@@ -13,6 +14,7 @@ from lxml import html, etree
 from talon.utils import get_delimiter, html_to_text
 from talon import html_quotations
+from six.moves import range
 log = logging.getLogger(__name__)
@@ -207,7 +209,7 @@ def mark_message_lines(lines):
             if splitter:
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
-                for j in xrange(len(splitter_lines)):
+                for j in range(len(splitter_lines)):
                     markers[i + j] = 's'
                 # skip splitter lines
@@ -388,7 +390,7 @@ def extract_from_html(msg_body):
     lines_were_deleted, first_deleted, last_deleted = return_flags
     if lines_were_deleted:
         #collect checkpoints from deleted lines
-        for i in xrange(first_deleted, last_deleted):
+        for i in range(first_deleted, last_deleted):
             for checkpoint in line_checkpoints[i]:
                 quotation_checkpoints[checkpoint] = True
     else:

View File

@@ -20,6 +20,7 @@ trained against, don't forget to regenerate:
 * signature/data/classifier
 """
+from __future__ import absolute_import
 import os
 from . import extraction

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 import logging
 import regex as re
@@ -111,7 +112,7 @@ def extract_signature(msg_body):
         return (stripped_body.strip(),
                 signature.strip())
-    except Exception, e:
+    except Exception as e:
         log.exception('ERROR extracting signature')
         return (msg_body, None)

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 import logging
 import regex as re

View File

@@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message
 body belongs to the signature.
 """
+from __future__ import absolute_import
 from numpy import genfromtxt
 from sklearn.svm import LinearSVC
 from sklearn.externals import joblib

View File

@@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the
 suffix which should be `_sender`.
 """
+from __future__ import absolute_import
 import os
 import regex as re
 from talon.signature.constants import SIGNATURE_MAX_LINES
 from talon.signature.learning.featurespace import build_pattern, features
+from six.moves import range
 SENDER_SUFFIX = '_sender'
@@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename,
             if not sender or not msg:
                 continue
             lines = msg.splitlines()
-            for i in xrange(1, min(SIGNATURE_MAX_LINES,
+            for i in range(1, min(SIGNATURE_MAX_LINES,
                                    len(lines)) + 1):
                 line = lines[-i]
                 label = -1

View File

@@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before
 applying features to them.
 """
+from __future__ import absolute_import
 from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                        TOO_LONG_SIGNATURE_LINE)
 from talon.signature.learning.helpers import *
+from six.moves import zip
+from functools import reduce
 def features(sender=''):

View File

@@ -6,6 +6,7 @@
 """
+from __future__ import absolute_import
 import unicodedata
 import regex as re

View File

@@ -1,5 +1,6 @@
 # coding:utf-8
+from __future__ import absolute_import
 import logging
 from random import shuffle
 import chardet
@@ -10,6 +11,7 @@ from lxml import html
 from lxml.cssselect import CSSSelector
 from talon.constants import RE_DELIMITER
+import six
 def safe_format(format_string, *args, **kwargs):
@@ -28,7 +30,7 @@ def safe_format(format_string, *args, **kwargs):
     except (UnicodeEncodeError, UnicodeDecodeError):
         format_string = to_utf8(format_string)
         args = [to_utf8(p) for p in args]
-        kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
+        kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)}
         return format_string.format(*args, **kwargs)
     # ignore other errors
@@ -47,7 +49,7 @@ def to_unicode(str_or_unicode, precise=False):
     """
     encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
     if isinstance(str_or_unicode, str):
-        return unicode(str_or_unicode, encoding, 'replace')
+        return six.text_type(str_or_unicode, encoding, 'replace')
     return str_or_unicode
@@ -61,7 +63,7 @@ def detect_encoding(string):
         detected = chardet.detect(string)
         if detected:
             return detected.get('encoding') or 'utf-8'
-    except Exception, e:
+    except Exception as e:
         pass
     return 'utf-8'
@@ -76,7 +78,7 @@ def quick_detect_encoding(string):
         detected = cchardet.detect(string)
         if detected:
             return detected.get('encoding') or detect_encoding(string)
-    except Exception, e:
+    except Exception as e:
         pass
     return detect_encoding(string)
@@ -87,7 +89,7 @@ def to_utf8(str_or_unicode):
     >>> utils.to_utf8(u'hi')
     'hi'
     """
-    if isinstance(str_or_unicode, unicode):
+    if isinstance(str_or_unicode, six.text_type):
         return str_or_unicode.encode("utf-8", "ignore")
     return str(str_or_unicode)
@@ -173,7 +175,7 @@ def _rm_excessive_newlines(s):
 def _encode_utf8(s):
     """Encode in 'utf-8' if unicode
     """
-    return s.encode('utf-8') if isinstance(s, unicode) else s
+    return s.encode('utf-8') if isinstance(s, six.text_type) else s
 _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
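
For reference, the `six` aliases introduced in the hunks above resolve to the interpreter's own types and methods; the following quick check is illustrative only and not part of the commit:

# Not part of the commit: what the six aliases resolve to.
import six

# six.text_type is `unicode` on Python 2 and `str` on Python 3, so the
# isinstance() checks above behave the same under both interpreters.
print(six.text_type is str)              # True on Python 3

# six.iteritems(d) wraps d.iteritems() on Python 2 and iter(d.items()) on
# Python 3, yielding key/value pairs either way.
print(dict(six.iteritems({'k': 'v'})))   # {'k': 'v'}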

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from nose.tools import *
 from mock import *

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from . import *
 from . fixtures import *

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from . import *
 from . fixtures import *

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from .. import *
 from talon.signature import bruteforce

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from .. import *
 import os
@@ -8,6 +9,7 @@ from talon.signature.learning import dataset
 from talon import signature
 from talon.signature import extraction as e
 from talon.signature import bruteforce
+from six.moves import range
 def test_message_shorter_SIGNATURE_MAX_LINES():
@@ -127,20 +129,20 @@ def test_mark_lines():
 def test_process_marked_lines():
     # no signature found
-    eq_((range(5), None), e._process_marked_lines(range(5), 'telt'))
+    eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt'))
     # signature in the middle of the text
-    eq_((range(9), None), e._process_marked_lines(range(9), 'tesestelt'))
+    eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt'))
     # long line splits signature
-    eq_((range(7), [7, 8]),
-        e._process_marked_lines(range(9), 'tsslsless'))
+    eq_((list(range(7)), [7, 8]),
+        e._process_marked_lines(list(range(9)), 'tsslsless'))
-    eq_((range(20), [20]),
-        e._process_marked_lines(range(21), 'ttttttstttesllelelets'))
+    eq_((list(range(20)), [20]),
+        e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets'))
     # some signature lines could be identified as text
-    eq_(([0], range(1, 9)), e._process_marked_lines(range(9), 'tsetetest'))
+    eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest'))
-    eq_(([], range(5)),
-        e._process_marked_lines(range(5), "ststt"))
+    eq_(([], list(range(5))),
+        e._process_marked_lines(list(range(5)), "ststt"))
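
The `list(...)` wrappers above are needed because `six.moves.range` (like Python 3's built-in `range`) is a lazy sequence object rather than a list, while the function under test returns plain lists. A small illustration, assuming Python 3 semantics and not taken from the test suite:

# Illustrative only: why the assertions wrap range() in list().
from six.moves import range

assert range(5) != [0, 1, 2, 3, 4]        # a range object never equals a list
assert list(range(5)) == [0, 1, 2, 3, 4]  # materializing it restores equality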

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from ... import *
 import os

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from ... import *
 from talon.signature.learning import featurespace as fs

View File

@@ -1,11 +1,13 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from ... import *
 import regex as re
 from talon.signature.learning import helpers as h
 from talon.signature.learning.helpers import *
+from six.moves import range
 # First testing regex constants.
 VALID = '''
@@ -154,7 +156,7 @@ def test_extract_names():
         # check that extracted names could be compiled
         try:
             re.compile("|".join(extracted_names))
-        except Exception, e:
+        except Exception as e:
             ok_(False, ("Failed to compile extracted names {}"
                         "\n\nReason: {}").format(extracted_names, e))
         if expected_names:
@@ -204,7 +206,7 @@ def test_has_signature():
                                  'sender@example.com'))
     assert_false(h.has_signature('http://www.example.com/555-555-5555',
                                  'sender@example.com'))
-    long_line = ''.join(['q' for e in xrange(28)])
+    long_line = ''.join(['q' for e in range(28)])
     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com'))
     # wont crash on an empty string
     assert_false(h.has_signature('', ''))

View File

@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
+from __future__ import absolute_import
 from . import *
 from . fixtures import *
@@ -7,6 +8,8 @@ import os
 import email.iterators
 from talon import quotations
+import six
+from six.moves import range
 @patch.object(quotations, 'MAX_LINES_COUNT', 1)
@@ -138,7 +141,7 @@ def _check_pattern_original_message(original_message_indicator):
 -----{}-----
 Test"""
-    eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator))))
+    eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
 def test_english_original_message():
     _check_pattern_original_message('Original Message')
@@ -669,7 +672,7 @@ def test_standard_replies():
             continue
         with open(filename) as f:
             message = email.message_from_file(f)
-            body = email.iterators.typed_subpart_iterator(message, subtype='plain').next()
+            body = next(email.iterators.typed_subpart_iterator(message, subtype='plain'))
             text = ''.join(email.iterators.body_line_iterator(body, True))
             stripped_text = quotations.extract_from_plain(text)

View File

@@ -1,9 +1,11 @@
 # coding:utf-8
+from __future__ import absolute_import
 from . import *
 from talon import utils as u
 import cchardet
+import six
 def test_get_delimiter():
@@ -14,10 +16,10 @@ def test_get_delimiter():
 def test_unicode():
     eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), unicode )
-    eq_ (type(u.to_unicode(u'hi')), unicode )
-    eq_ (type(u.to_unicode('привет')), unicode )
-    eq_ (type(u.to_unicode(u'привет')), unicode )
+    eq_ (type(u.to_unicode('hi')), six.text_type )
+    eq_ (type(u.to_unicode(u'hi')), six.text_type )
+    eq_ (type(u.to_unicode('привет')), six.text_type )
+    eq_ (type(u.to_unicode(u'привет')), six.text_type )
     eq_ (u"привет", u.to_unicode('привет'))
     eq_ (u"привет", u.to_unicode(u'привет'))
     # some latin1 stuff

View File

@@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA
 from talon.signature.learning.classifier import train, init