From d37c4fd551d49da3cd26fc029497d5172a49a55d Mon Sep 17 00:00:00 2001 From: Matt Dietz Date: Tue, 27 Apr 2021 09:27:24 -0500 Subject: [PATCH] Drops Python 2 support REP-1030 In addition to some python 2 => 3 fixes, this change bumps the scikit-learn version to latest. The previously pinned version of scikit-learn failed trying to compile all necessary C modules under python 3.7+ due to included header files that weren't compatible with the C API implemented in python 3.7+. Simultaneously, with the restrictive compatibility supported by scikit-learn, it seemed prudent to drop python 2 support altogether. Otherwise, we'd be stuck with python 3.4 as the newest possible version we could support. With this change, tests are currently passing under 3.9.2. Lastly, imports the original training data. At some point, a new version of the training data was committed to the repo but no classifier was trained from it. Using a classifier trained from this new data resulted in most of the tests failing. 
--- .build/Dockerfile | 20 + .gitignore | 3 + requirements.txt | 11 + run_tests.sh | 4 + setup.py | 23 +- talon/quotations.py | 2 +- talon/signature/data/classifier | Bin 608 -> 799 bytes talon/signature/data/train.data | 4995 ++++++++++++------------ talon/signature/learning/classifier.py | 2 +- test-requirements.txt | 3 + tests/text_quotations_test.py | 6 +- 11 files changed, 2652 insertions(+), 2417 deletions(-) create mode 100644 .build/Dockerfile create mode 100644 requirements.txt create mode 100755 run_tests.sh create mode 100644 test-requirements.txt diff --git a/.build/Dockerfile b/.build/Dockerfile new file mode 100644 index 0000000..a32dd51 --- /dev/null +++ b/.build/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.9-slim-buster AS deps + +RUN apt-get update && \ + apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev + +FROM deps AS testable +ARG REPORT_PATH + +VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] + +ADD . 
/app +WORKDIR /app +COPY wheel/* /wheel/ + +RUN mkdir -p ${REPORT_PATH} + +RUN python ./setup.py build bdist_wheel -d /wheel && \ + pip install --no-deps /wheel/* + +ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] diff --git a/.gitignore b/.gitignore index d1a3778..d2a4660 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,6 @@ _trial_temp # OSX .DS_Store + +# vim-backup +*.bak diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b383232 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +chardet>=1.0.1 +cchardet>=0.3.5 +cssselect +html5lib +joblib +lxml>=2.3.3 +numpy +regex>=1 +scikit-learn==0.24.1 # pickled versions of classifier, else rebuild +scipy +six>=1.10.0 diff --git a/run_tests.sh b/run_tests.sh new file mode 100755 index 0000000..19f1d59 --- /dev/null +++ b/run_tests.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -ex +REPORT_PATH="${REPORT_PATH:-./}" +nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . 
diff --git a/setup.py b/setup.py index f55fc3b..2824f65 100755 --- a/setup.py +++ b/setup.py @@ -19,12 +19,12 @@ class InstallCommand(install): if self.no_ml: dist = self.distribution dist.packages=find_packages(exclude=[ - 'tests', - 'tests.*', - 'talon.signature', - 'talon.signature.*', + "tests", + "tests.*", + "talon.signature", + "talon.signature.*", ]) - for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: + for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: dist.install_requires.remove(not_required) @@ -48,12 +48,13 @@ setup(name='talon', "regex>=1", "numpy", "scipy", - "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild - 'chardet>=1.0.1', - 'cchardet>=0.3.5', - 'cssselect', - 'six>=1.10.0', - 'html5lib' + "scikit-learn==0.24.1", # pickled versions of classifier, else rebuild + "chardet>=1.0.1", + "cchardet>=0.3.5", + "cssselect", + "six>=1.10.0", + "html5lib", + "joblib", ], tests_require=[ "mock", diff --git a/talon/quotations.py b/talon/quotations.py index 99eec95..aa215fe 100644 --- a/talon/quotations.py +++ b/talon/quotations.py @@ -457,7 +457,7 @@ def _extract_from_html(msg_body): msg_body = msg_body.replace(b'\r\n', b'\n') - msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) + msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) html_tree = html_document_fromstring(msg_body) diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier index 1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b..88a424b7434f1b967569108cf3d889bb605820cd 100644 GIT binary patch literal 799 zcmZWnOGs2v7{2pze2kSeL8KH}q>^1qOoEhK%tGkqLrrNqwLB2u)`VTU=ip&EW#}+`-$}4aq`CO`L;7;Fvh9>>2p*wIg=~VmTZ> zb+8WCVufa4)Nl|I&V+~1S~LP;*a7o&<^}?wZR0?EotlO{hH4BlQE@WKj4>Wy9?}!c z(;T0nDnZ7hcEPcTAJBk_w?>Oj5kny2$fpFPU1n1=C?UknK7^#{_~Hw#{`HW0tU$OZ zz!0|Cq{wIxddwGW2&x%qtpArF?7Ze^rlHApXt^cg8oH1Lp)FrMp7uOiy5v#U6$Lls zX!JuZ%11~=SqN#nK&!DL2vR<$t{@+u5qrlgh*X}2R3RvdTi(GtMNGVS0_W%M;|1(i 
z6A&B3d_I5XuNcv!N&#xnOik07B$h>8oC~#@&a0LBs+4x?MZTY>c_%-_dj!IZq05hZUw>B?n-@;5yp)y)+J7v{@Bb)o*`ukUcOR9jhaU{S zdb+Bt#j@G$r8TAdN$1FN_M0;Pe4I7i`>jl$v*-1@6(xTD#+8S|OO^F?W$b6m@z1}M za_>5$=1rfs(3<(5)U7vNe5(CRrF?TeIc-M<3u|^o>dLy~F*$0ln#RyjZ5~>6gdc#o q(Z4Ar@7gM4F1eJDB%Bpu!wypKdLnwvHyj(G;i%l#FCW8Dlm7u48#c)R literal 608 zcmZut%Z}496iu5B4QUIMH^cBApu;1A0kJ{?RHTszlf|G5OUp^!iNSGlZKq5Ym0$z& zReTI9Bt8OMchm*qE-d-#yzcqZ&h#h2sFIPM6;URr)=YFs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H4=1.2.1 diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py index 89a7974..0cf7d4b 100644 --- a/tests/text_quotations_test.py +++ b/tests/text_quotations_test.py @@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p that this line is intact.""" parsed = quotations.extract_from_plain(msg_body) - eq_(msg_body, parsed.decode('utf8')) + eq_(msg_body, parsed) -def test_appointment(): +def test_appointment_2(): msg_body = """Invitation for an interview: Date: Wednesday 3, October 2011 @@ -838,4 +838,4 @@ Address: 130 Fox St Please bring in your ID.""" parsed = quotations.extract_from_plain(msg_body) - eq_(msg_body, parsed.decode('utf8')) + eq_(msg_body, parsed)