Drops Python 2 support
REP-1030 In addition to some Python 2 => 3 fixes, this change bumps the scikit-learn version to the latest release. The previously pinned version of scikit-learn failed to compile all of its necessary C modules under Python 3.7+ because it included header files that weren't compatible with the C API implemented in Python 3.7+. At the same time, given the restrictive compatibility supported by scikit-learn, it seemed prudent to drop Python 2 support altogether; otherwise, we'd be stuck with Python 3.4 as the newest possible version we could support. With this change, tests are currently passing under 3.9.2. Lastly, this change imports the original training data. At some point, a new version of the training data was committed to the repo, but no classifier was trained from it. Using a classifier trained from this new data resulted in most of the tests failing.
This commit is contained in:
20
.build/Dockerfile
Normal file
20
.build/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.9-slim-buster AS deps
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev
|
||||
|
||||
FROM deps AS testable
|
||||
ARG REPORT_PATH
|
||||
|
||||
VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}]
|
||||
|
||||
ADD . /app
|
||||
WORKDIR /app
|
||||
COPY wheel/* /wheel/
|
||||
|
||||
RUN mkdir -p ${REPORT_PATH}
|
||||
|
||||
RUN python ./setup.py build bdist_wheel -d /wheel && \
|
||||
pip install --no-deps /wheel/*
|
||||
|
||||
ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"]
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -54,3 +54,6 @@ _trial_temp
|
||||
|
||||
# OSX
|
||||
.DS_Store
|
||||
|
||||
# vim-backup
|
||||
*.bak
|
||||
|
||||
11
requirements.txt
Normal file
11
requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
chardet>=1.0.1
|
||||
cchardet>=0.3.5
|
||||
cssselect
|
||||
html5lib
|
||||
joblib
|
||||
lxml>=2.3.3
|
||||
numpy
|
||||
regex>=1
|
||||
scikit-learn==0.24.1 # pickled versions of classifier, else rebuild
|
||||
scipy
|
||||
six>=1.10.0
|
||||
4
run_tests.sh
Executable file
4
run_tests.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
set -ex
|
||||
REPORT_PATH="${REPORT_PATH:-./}"
|
||||
nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon .
|
||||
23
setup.py
23
setup.py
@@ -19,12 +19,12 @@ class InstallCommand(install):
|
||||
if self.no_ml:
|
||||
dist = self.distribution
|
||||
dist.packages=find_packages(exclude=[
|
||||
'tests',
|
||||
'tests.*',
|
||||
'talon.signature',
|
||||
'talon.signature.*',
|
||||
"tests",
|
||||
"tests.*",
|
||||
"talon.signature",
|
||||
"talon.signature.*",
|
||||
])
|
||||
for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']:
|
||||
for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]:
|
||||
dist.install_requires.remove(not_required)
|
||||
|
||||
|
||||
@@ -48,12 +48,13 @@ setup(name='talon',
|
||||
"regex>=1",
|
||||
"numpy",
|
||||
"scipy",
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
'cssselect',
|
||||
'six>=1.10.0',
|
||||
'html5lib'
|
||||
"scikit-learn==0.24.1", # pickled versions of classifier, else rebuild
|
||||
"chardet>=1.0.1",
|
||||
"cchardet>=0.3.5",
|
||||
"cssselect",
|
||||
"six>=1.10.0",
|
||||
"html5lib",
|
||||
"joblib",
|
||||
],
|
||||
tests_require=[
|
||||
"mock",
|
||||
|
||||
@@ -457,7 +457,7 @@ def _extract_from_html(msg_body):
|
||||
|
||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||
|
||||
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||
|
||||
html_tree = html_document_fromstring(msg_body)
|
||||
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ body belongs to the signature.
|
||||
from __future__ import absolute_import
|
||||
|
||||
from numpy import genfromtxt
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
|
||||
|
||||
3
test-requirements.txt
Normal file
3
test-requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
coverage
|
||||
mock
|
||||
nose>=1.2.1
|
||||
@@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p
|
||||
that this line is intact."""
|
||||
|
||||
parsed = quotations.extract_from_plain(msg_body)
|
||||
eq_(msg_body, parsed.decode('utf8'))
|
||||
eq_(msg_body, parsed)
|
||||
|
||||
|
||||
def test_appointment():
|
||||
def test_appointment_2():
|
||||
msg_body = """Invitation for an interview:
|
||||
|
||||
Date: Wednesday 3, October 2011
|
||||
@@ -838,4 +838,4 @@ Address: 130 Fox St
|
||||
|
||||
Please bring in your ID."""
|
||||
parsed = quotations.extract_from_plain(msg_body)
|
||||
eq_(msg_body, parsed.decode('utf8'))
|
||||
eq_(msg_body, parsed)
|
||||
|
||||
Reference in New Issue
Block a user