Run modernizer on the code for Python 2/3 compatibility.
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from talon.quotations import register_xpath_extensions
|
||||
try:
|
||||
from talon import signature
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ The module's functions operate on message bodies trying to extract original
|
||||
messages (without quoted messages) from html
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ The module's functions operate on message bodies trying to extract
|
||||
original messages (without quoted messages)
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import regex as re
|
||||
import logging
|
||||
from copy import deepcopy
|
||||
@@ -13,6 +14,7 @@ from lxml import html, etree
|
||||
|
||||
from talon.utils import get_delimiter, html_to_text
|
||||
from talon import html_quotations
|
||||
from six.moves import range
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -207,7 +209,7 @@ def mark_message_lines(lines):
|
||||
if splitter:
|
||||
# append as many splitter markers as lines in splitter
|
||||
splitter_lines = splitter.group().splitlines()
|
||||
for j in xrange(len(splitter_lines)):
|
||||
for j in range(len(splitter_lines)):
|
||||
markers[i + j] = 's'
|
||||
|
||||
# skip splitter lines
|
||||
@@ -388,7 +390,7 @@ def extract_from_html(msg_body):
|
||||
lines_were_deleted, first_deleted, last_deleted = return_flags
|
||||
if lines_were_deleted:
|
||||
# collect checkpoints from deleted lines
|
||||
for i in xrange(first_deleted, last_deleted):
|
||||
for i in range(first_deleted, last_deleted):
|
||||
for checkpoint in line_checkpoints[i]:
|
||||
quotation_checkpoints[checkpoint] = True
|
||||
else:
|
||||
|
||||
@@ -20,6 +20,7 @@ trained against, don't forget to regenerate:
|
||||
* signature/data/classifier
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
from . import extraction
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
|
||||
import regex as re
|
||||
@@ -111,7 +112,7 @@ def extract_signature(msg_body):
|
||||
|
||||
return (stripped_body.strip(),
|
||||
signature.strip())
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
log.exception('ERROR extracting signature')
|
||||
return (msg_body, None)
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
|
||||
import regex as re
|
||||
|
||||
@@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message
|
||||
body belongs to the signature.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from numpy import genfromtxt
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.externals import joblib
|
||||
|
||||
@@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the
|
||||
suffix which should be `_sender`.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
import regex as re
|
||||
|
||||
from talon.signature.constants import SIGNATURE_MAX_LINES
|
||||
from talon.signature.learning.featurespace import build_pattern, features
|
||||
from six.moves import range
|
||||
|
||||
|
||||
SENDER_SUFFIX = '_sender'
|
||||
@@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename,
|
||||
if not sender or not msg:
|
||||
continue
|
||||
lines = msg.splitlines()
|
||||
for i in xrange(1, min(SIGNATURE_MAX_LINES,
|
||||
for i in range(1, min(SIGNATURE_MAX_LINES,
|
||||
len(lines)) + 1):
|
||||
line = lines[-i]
|
||||
label = -1
|
||||
|
||||
@@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before
|
||||
applying features to them.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from talon.signature.constants import (SIGNATURE_MAX_LINES,
|
||||
TOO_LONG_SIGNATURE_LINE)
|
||||
from talon.signature.learning.helpers import *
|
||||
from six.moves import zip
|
||||
from functools import reduce
|
||||
|
||||
|
||||
def features(sender=''):
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
import unicodedata
|
||||
import regex as re
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# coding:utf-8
|
||||
|
||||
from __future__ import absolute_import
|
||||
import logging
|
||||
from random import shuffle
|
||||
import chardet
|
||||
@@ -10,6 +11,7 @@ from lxml import html
|
||||
from lxml.cssselect import CSSSelector
|
||||
|
||||
from talon.constants import RE_DELIMITER
|
||||
import six
|
||||
|
||||
|
||||
def safe_format(format_string, *args, **kwargs):
|
||||
@@ -28,7 +30,7 @@ def safe_format(format_string, *args, **kwargs):
|
||||
except (UnicodeEncodeError, UnicodeDecodeError):
|
||||
format_string = to_utf8(format_string)
|
||||
args = [to_utf8(p) for p in args]
|
||||
kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
|
||||
kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)}
|
||||
return format_string.format(*args, **kwargs)
|
||||
|
||||
# ignore other errors
|
||||
@@ -47,7 +49,7 @@ def to_unicode(str_or_unicode, precise=False):
|
||||
"""
|
||||
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8'
|
||||
if isinstance(str_or_unicode, str):
|
||||
return unicode(str_or_unicode, encoding, 'replace')
|
||||
return six.text_type(str_or_unicode, encoding, 'replace')
|
||||
return str_or_unicode
|
||||
|
||||
|
||||
@@ -61,7 +63,7 @@ def detect_encoding(string):
|
||||
detected = chardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or 'utf-8'
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
pass
|
||||
return 'utf-8'
|
||||
|
||||
@@ -76,7 +78,7 @@ def quick_detect_encoding(string):
|
||||
detected = cchardet.detect(string)
|
||||
if detected:
|
||||
return detected.get('encoding') or detect_encoding(string)
|
||||
except Exception, e:
|
||||
except Exception as e:
|
||||
pass
|
||||
return detect_encoding(string)
|
||||
|
||||
@@ -87,7 +89,7 @@ def to_utf8(str_or_unicode):
|
||||
>>> utils.to_utf8(u'hi')
|
||||
'hi'
|
||||
"""
|
||||
if isinstance(str_or_unicode, unicode):
|
||||
if isinstance(str_or_unicode, six.text_type):
|
||||
return str_or_unicode.encode("utf-8", "ignore")
|
||||
return str(str_or_unicode)
|
||||
|
||||
@@ -173,7 +175,7 @@ def _rm_excessive_newlines(s):
|
||||
def _encode_utf8(s):
|
||||
"""Encode in 'utf-8' if unicode
|
||||
"""
|
||||
return s.encode('utf-8') if isinstance(s, unicode) else s
|
||||
return s.encode('utf-8') if isinstance(s, six.text_type) else s
|
||||
|
||||
|
||||
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
|
||||
|
||||
Reference in New Issue
Block a user