From da998ddb6039f4ee660787b6f50b821eee332831 Mon Sep 17 00:00:00 2001 From: Umair Khan Date: Tue, 12 Jul 2016 17:25:46 +0500 Subject: [PATCH] Run modernizer on the code. --- setup.py | 1 + talon/__init__.py | 1 + talon/constants.py | 1 + talon/html_quotations.py | 1 + talon/quotations.py | 6 ++++-- talon/signature/__init__.py | 1 + talon/signature/bruteforce.py | 3 ++- talon/signature/extraction.py | 1 + talon/signature/learning/classifier.py | 1 + talon/signature/learning/dataset.py | 4 +++- talon/signature/learning/featurespace.py | 3 +++ talon/signature/learning/helpers.py | 1 + talon/utils.py | 14 +++++++------ tests/__init__.py | 1 + tests/html_quotations_test.py | 1 + tests/quotations_test.py | 1 + tests/signature/bruteforce_test.py | 1 + tests/signature/extraction_test.py | 20 ++++++++++--------- tests/signature/learning/dataset_test.py | 1 + tests/signature/learning/featurespace_test.py | 1 + tests/signature/learning/helpers_test.py | 6 ++++-- tests/text_quotations_test.py | 7 +++++-- tests/utils_test.py | 10 ++++++---- train.py | 1 + 24 files changed, 61 insertions(+), 27 deletions(-) diff --git a/setup.py b/setup.py index 8253212..79e3f20 100755 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import from setuptools import setup, find_packages from setuptools.command.install import install diff --git a/talon/__init__.py b/talon/__init__.py index de27ae6..7060f5b 100644 --- a/talon/__init__.py +++ b/talon/__init__.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import from talon.quotations import register_xpath_extensions try: from talon import signature diff --git a/talon/constants.py b/talon/constants.py index 68fa04c..0e7276d 100644 --- a/talon/constants.py +++ b/talon/constants.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import regex as re diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 1af78ac..4aa7e74 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -3,6 +3,7 @@ The module's functions operate on message bodies trying to extract original messages (without quoted messages) from html """ +from __future__ import absolute_import import regex as re diff --git a/talon/quotations.py b/talon/quotations.py index ff23daa..b294de5 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -5,6 +5,7 @@ The module's functions operate on message bodies trying to extract original messages (without quoted messages) """ +from __future__ import absolute_import import regex as re import logging from copy import deepcopy @@ -13,6 +14,7 @@ from lxml import html, etree from talon.utils import get_delimiter, html_to_text from talon import html_quotations +from six.moves import range log = logging.getLogger(__name__) @@ -207,7 +209,7 @@ def mark_message_lines(lines): if splitter: # append as many splitter markers as lines in splitter splitter_lines = splitter.group().splitlines() - for j in xrange(len(splitter_lines)): + for j in range(len(splitter_lines)): markers[i + j] = 's' # skip splitter lines @@ -388,7 +390,7 @@ def extract_from_html(msg_body): lines_were_deleted, first_deleted, last_deleted = return_flags if lines_were_deleted: #collect checkpoints from deleted lines - for i in xrange(first_deleted, last_deleted): + for i in range(first_deleted, last_deleted): for checkpoint in line_checkpoints[i]: quotation_checkpoints[checkpoint] = True else: diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py index a871447..fc60e1d 100644 --- a/talon/signature/__init__.py +++ b/talon/signature/__init__.py @@ -20,6 +20,7 @@ trained against, don't forget to regenerate: * signature/data/classifier """ +from __future__ import absolute_import import os from . import extraction diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py index d3493bb..7f666bd 100644 --- a/talon/signature/bruteforce.py +++ b/talon/signature/bruteforce.py @@ -1,3 +1,4 @@ +from __future__ import absolute_import import logging import regex as re @@ -111,7 +112,7 @@ def extract_signature(msg_body): return (stripped_body.strip(), signature.strip()) - except Exception, e: + except Exception as e: log.exception('ERROR extracting signature') return (msg_body, None) diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py index 995ad27..3259171 100644 --- a/talon/signature/extraction.py +++ b/talon/signature/extraction.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +from __future__ import absolute_import import logging import regex as re diff --git a/talon/signature/learning/classifier.py b/talon/signature/learning/classifier.py index 9ce5e75..8ec3228 100644 --- a/talon/signature/learning/classifier.py +++ b/talon/signature/learning/classifier.py @@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message body belongs to the signature. """ +from __future__ import absolute_import from numpy import genfromtxt from sklearn.svm import LinearSVC from sklearn.externals import joblib diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py index b0a9f45..308995b 100644 --- a/talon/signature/learning/dataset.py +++ b/talon/signature/learning/dataset.py @@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the suffix which should be `_sender`. """ +from __future__ import absolute_import import os import regex as re from talon.signature.constants import SIGNATURE_MAX_LINES from talon.signature.learning.featurespace import build_pattern, features +from six.moves import range SENDER_SUFFIX = '_sender' @@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename, if not sender or not msg: continue lines = msg.splitlines() - for i in xrange(1, min(SIGNATURE_MAX_LINES, + for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1): line = lines[-i] label = -1 diff --git a/talon/signature/learning/featurespace.py b/talon/signature/learning/featurespace.py index 60676f9..649e859 100644 --- a/talon/signature/learning/featurespace.py +++ b/talon/signature/learning/featurespace.py @@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before applying features to them. """ +from __future__ import absolute_import from talon.signature.constants import (SIGNATURE_MAX_LINES, TOO_LONG_SIGNATURE_LINE) from talon.signature.learning.helpers import * +from six.moves import zip +from functools import reduce def features(sender=''): diff --git a/talon/signature/learning/helpers.py b/talon/signature/learning/helpers.py index 7085a74..f94c688 100644 --- a/talon/signature/learning/helpers.py +++ b/talon/signature/learning/helpers.py @@ -6,6 +6,7 @@ """ +from __future__ import absolute_import import unicodedata import regex as re diff --git a/talon/utils.py b/talon/utils.py index dc47622..e4bd19b 100644 --- a/talon/utils.py +++ b/talon/utils.py @@ -1,5 +1,6 @@ # coding:utf-8 +from __future__ import absolute_import import logging from random import shuffle import chardet @@ -10,6 +11,7 @@ from lxml import html from lxml.cssselect import CSSSelector from talon.constants import RE_DELIMITER +import six def safe_format(format_string, *args, **kwargs): @@ -28,7 +30,7 @@ def safe_format(format_string, *args, **kwargs): except (UnicodeEncodeError, UnicodeDecodeError): format_string = to_utf8(format_string) args = [to_utf8(p) for p in args] - kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()} + kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)} return format_string.format(*args, **kwargs) # ignore other errors @@ -47,7 +49,7 @@ def to_unicode(str_or_unicode, precise=False): """ encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' if isinstance(str_or_unicode, str): - return unicode(str_or_unicode, encoding, 'replace') + return six.text_type(str_or_unicode, encoding, 'replace') return str_or_unicode @@ -61,7 +63,7 @@ def detect_encoding(string): detected = chardet.detect(string) if detected: return detected.get('encoding') or 'utf-8' - except Exception, e: + except Exception as e: pass return 'utf-8' @@ -76,7 +78,7 @@ def quick_detect_encoding(string): detected = cchardet.detect(string) if detected: return detected.get('encoding') or detect_encoding(string) - except Exception, e: + except Exception as e: pass return detect_encoding(string) @@ -87,7 +89,7 @@ def to_utf8(str_or_unicode): >>> utils.to_utf8(u'hi') 'hi' """ - if isinstance(str_or_unicode, unicode): + if isinstance(str_or_unicode, six.text_type): return str_or_unicode.encode("utf-8", "ignore") return str(str_or_unicode) @@ -173,7 +175,7 @@ def _rm_excessive_newlines(s): def _encode_utf8(s): """Encode in 'utf-8' if unicode """ - return s.encode('utf-8') if isinstance(s, unicode) else s + return s.encode('utf-8') if isinstance(s, six.text_type) else s _UTF8_DECLARATION = ('