1 Commits

Author SHA1 Message Date
Matt Dietz
d37c4fd551 Drops Python 2 support
REP-1030

In addition to some python 2 => 3 fixes, this change bumps the scikit-learn
version to latest. The previously pinned version of scikit-learn failed trying
to compile all necessary C modules under python 3.7+ due to included header files
that weren't compatible with the C API implemented in python 3.7+.

Simultaneously, with the restrictive compatibility supported by scikit-learn,
it seemed prudent to drop python 2 support altogether. Otherwise, we'd be stuck
with python 3.4 as the newest possible version we could support.

With this change, tests are currently passing under 3.9.2.

Lastly, this change imports the original training data. At some point, a new version
of the training data was committed to the repo but no classifier was
trained from it. Using a classifier trained from this new data resulted
in most of the tests failing.
2021-06-10 14:03:25 -05:00
21 changed files with 2701 additions and 2486 deletions

20
.build/Dockerfile Normal file
View File

@@ -0,0 +1,20 @@
FROM python:3.9-slim-buster AS deps
RUN apt-get update && \
apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev
FROM deps AS testable
ARG REPORT_PATH
VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}]
ADD . /app
WORKDIR /app
COPY wheel/* /wheel/
RUN mkdir -p ${REPORT_PATH}
RUN python ./setup.py build bdist_wheel -d /wheel && \
pip install --no-deps /wheel/*
ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"]

3
.gitignore vendored
View File

@@ -54,3 +54,6 @@ _trial_temp
# OSX
.DS_Store
# vim-backup
*.bak

11
requirements.txt Normal file
View File

@@ -0,0 +1,11 @@
chardet>=1.0.1
cchardet>=0.3.5
cssselect
html5lib
joblib
lxml>=2.3.3
numpy
regex>=1
scikit-learn==0.24.1 # pickled versions of classifier, else rebuild
scipy
six>=1.10.0

4
run_tests.sh Executable file
View File

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -ex
REPORT_PATH="${REPORT_PATH:-./}"
nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon .

View File

@@ -19,12 +19,12 @@ class InstallCommand(install):
if self.no_ml:
dist = self.distribution
dist.packages=find_packages(exclude=[
'tests',
'tests.*',
'talon.signature',
'talon.signature.*',
"tests",
"tests.*",
"talon.signature",
"talon.signature.*",
])
for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']:
for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]:
dist.install_requires.remove(not_required)
@@ -48,12 +48,13 @@ setup(name='talon',
"regex>=1",
"numpy",
"scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect',
'six>=1.10.0',
'html5lib'
"scikit-learn==0.24.1", # pickled versions of classifier, else rebuild
"chardet>=1.0.1",
"cchardet>=0.3.5",
"cssselect",
"six>=1.10.0",
"html5lib",
"joblib",
],
tests_require=[
"mock",

View File

@@ -457,15 +457,13 @@ def _extract_from_html(msg_body):
msg_body = msg_body.replace(b'\r\n', b'\n')
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None:
return msg_body
cut_quotations = False
try:
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
@@ -473,10 +471,6 @@ def _extract_from_html(msg_body):
html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree)
)
except Exception as e:
log.exception('during html quotations cut')
pass
html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)

View File

@@ -62,7 +62,7 @@ RE_SIGNATURE_CANDIDATE = re.compile(r'''
def extract_signature(msg_body):
"""
'''
Analyzes message for a presence of signature block (by common patterns)
and returns tuple with two elements: message text without signature block
and the signature itself.
@@ -72,7 +72,7 @@ def extract_signature(msg_body):
>>> extract_signature('Hey man!')
('Hey man!', None)
"""
'''
try:
# identify line delimiter first
delimiter = get_delimiter(msg_body)

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,7 @@ body belongs to the signature.
from __future__ import absolute_import
from numpy import genfromtxt
from sklearn.externals import joblib
import joblib
from sklearn.svm import LinearSVC

3
test-requirements.txt Normal file
View File

@@ -0,0 +1,3 @@
coverage
mock
nose>=1.2.1

View File

@@ -1,4 +1,6 @@
from __future__ import absolute_import
from nose.tools import *
from mock import *
import talon

View File

@@ -2,12 +2,14 @@
from __future__ import absolute_import
from tests.fixtures import REPLY_QUOTATIONS_SHARE_BLOCK, OLK_SRC_BODY_SECTION, REPLY_SEPARATED_BY_HR
from nose.tools import eq_, ok_, assert_false, assert_true
from talon import quotations, utils as u
from mock import Mock, patch
# noinspection PyUnresolvedReferences
import re
from talon import quotations, utils as u
from . import *
from .fixtures import *
from lxml import html
RE_WHITESPACE = re.compile("\s")
RE_DOUBLE_WHITESPACE = re.compile("\s")

View File

@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
from mock import Mock, patch
from talon import quotations
from nose.tools import eq_
@patch.object(quotations, 'extract_from_html')

View File

@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from nose.tools import eq_
from .. import *
from talon.signature import bruteforce
from mock import patch, Mock
def test_empty_body():

View File

@@ -2,14 +2,14 @@
from __future__ import absolute_import
import os
from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from nose.tools import eq_
from .. import STRIPPED, UNICODE_MSG
from six.moves import range
from mock import patch
import os
from .. import *
def test_message_shorter_SIGNATURE_MAX_LINES():

View File

@@ -1,14 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import EML_MSG_FILENAME, MSG_FILENAME_WITH_BODY_SUFFIX, TMP_DIR, EMAILS_DIR
from talon.signature.learning.featurespace import features
from talon.signature.learning import dataset as d
from nose.tools import eq_, assert_false, ok_
from numpy import genfromtxt
from ... import *
import os
from numpy import genfromtxt
from talon.signature.learning import dataset as d
from talon.signature.learning.featurespace import features
def test_is_sender_filename():
assert_false(d.is_sender_filename("foo/bar"))

View File

@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import *
from talon.signature.learning import featurespace as fs
from nose.tools import eq_, assert_false, ok_
from mock import patch
def test_apply_features():

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import *
import regex as re
from talon.signature.learning import helpers as h
from talon.signature.learning.helpers import RE_RELAX_PHONE, RE_NAME
from nose.tools import eq_, ok_, assert_false, assert_in
from mock import patch, Mock
from talon.signature.learning.helpers import *
from six.moves import range
import re
# First testing regex constants.
VALID = '''

View File

@@ -1,16 +1,17 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
from tests.fixtures import STANDARD_REPLIES
from talon import quotations
from six.moves import range
from nose.tools import eq_
from mock import patch
import email.iterators
import six
import os
import email.iterators
from talon import quotations
import six
from six.moves import range
from six import StringIO
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines():
@@ -34,7 +35,6 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_polymail():
msg_body = """Test reply
@@ -190,17 +190,14 @@ Test"""
eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message():
_check_pattern_original_message('Original Message')
_check_pattern_original_message('Reply Message')
def test_german_original_message():
_check_pattern_original_message(u'Ursprüngliche Nachricht')
_check_pattern_original_message('Antwort Nachricht')
def test_danish_original_message():
_check_pattern_original_message('Oprindelig meddelelse')
@@ -299,7 +296,6 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
> Hello"""
eq_("Hi", quotations.extract_from_plain(msg_body))
def test_with_indent():
msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
@@ -307,8 +303,7 @@ def test_with_indent():
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
"""
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.",
quotations.extract_from_plain(msg_body))
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
def test_short_quotation_with_newline():
@@ -348,7 +343,6 @@ Subject: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_german_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
"""Allo! Follow up MIME!
@@ -361,7 +355,6 @@ Betreff: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_french_multiline_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -374,7 +367,6 @@ Objet : Follow Up
Blah-blah-blah
"""))
def test_french_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -383,7 +375,6 @@ Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@x
Bonjour!"""))
def test_polish_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -394,7 +385,6 @@ napisał:
Blah!
"""))
def test_danish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
"""Allo! Follow up MIME!
@@ -407,7 +397,6 @@ Emne: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
@@ -419,7 +408,6 @@ Till: Isacson Leiff
Blah-blah-blah
"""))
def test_swedish_from_line():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
@@ -428,7 +416,6 @@ Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
@@ -437,7 +424,6 @@ På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud.
@@ -447,7 +433,6 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
"""))
def test_vietnamese_from_block():
eq_('Hello', quotations.extract_from_plain(
u"""Hello
@@ -457,7 +442,6 @@ Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết:
> Xin chào
"""))
def test_quotation_marker_false_positive():
msg_body = """Visit us now for assistance...
>>> >>> http://www.domain.com <<<
@@ -842,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p
that this line is intact."""
parsed = quotations.extract_from_plain(msg_body)
eq_(msg_body, parsed.decode('utf8'))
eq_(msg_body, parsed)
def test_appointment():
def test_appointment_2():
msg_body = """Invitation for an interview:
Date: Wednesday 3, October 2011
@@ -854,4 +838,4 @@ Address: 130 Fox St
Please bring in your ID."""
parsed = quotations.extract_from_plain(msg_body)
eq_(msg_body, parsed.decode('utf8'))
eq_(msg_body, parsed)

View File

@@ -2,13 +2,12 @@
from __future__ import absolute_import
from nose.tools import eq_, ok_, assert_false
from talon import utils as u
from mock import patch, Mock
import cchardet
import six
from talon import utils as u
from . import *
def test_get_delimiter():
eq_('\r\n', u.get_delimiter('abc\r\n123'))