Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
24d0f2d00a | ||
|
|
94007b0b92 | ||
|
|
1a5548f171 | ||
|
|
53c49b9121 | ||
|
|
bd50872043 | ||
|
|
d37c4fd551 | ||
|
|
d9ed7cc6d1 | ||
|
|
0a0808c0a8 | ||
|
|
2916351517 | ||
|
|
46d4b02c81 | ||
|
|
58eac88a10 | ||
|
|
2ef3d8dfbe | ||
|
|
7cf4c29340 |
20
.build/Dockerfile
Normal file
20
.build/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
||||
FROM python:3.9-slim-buster AS deps
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev
|
||||
|
||||
FROM deps AS testable
|
||||
ARG REPORT_PATH
|
||||
|
||||
VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}]
|
||||
|
||||
ADD . /app
|
||||
WORKDIR /app
|
||||
COPY wheel/* /wheel/
|
||||
|
||||
RUN mkdir -p ${REPORT_PATH}
|
||||
|
||||
RUN python ./setup.py build bdist_wheel -d /wheel && \
|
||||
pip install --no-deps /wheel/*
|
||||
|
||||
ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"]
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -54,3 +54,6 @@ _trial_temp
|
||||
|
||||
# OSX
|
||||
.DS_Store
|
||||
|
||||
# vim-backup
|
||||
*.bak
|
||||
|
||||
@@ -5,3 +5,10 @@ include classifier
|
||||
include LICENSE
|
||||
include MANIFEST.in
|
||||
include README.rst
|
||||
include talon/signature/data/train.data
|
||||
include talon/signature/data/classifier
|
||||
include talon/signature/data/classifier_01.npy
|
||||
include talon/signature/data/classifier_02.npy
|
||||
include talon/signature/data/classifier_03.npy
|
||||
include talon/signature/data/classifier_04.npy
|
||||
include talon/signature/data/classifier_05.npy
|
||||
|
||||
11
requirements.txt
Normal file
11
requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
chardet>=1.0.1
|
||||
cchardet>=0.3.5
|
||||
cssselect
|
||||
html5lib
|
||||
joblib
|
||||
lxml>=2.3.3
|
||||
numpy
|
||||
regex>=1
|
||||
scikit-learn>=1.0.0
|
||||
scipy
|
||||
six>=1.10.0
|
||||
4
run_tests.sh
Executable file
4
run_tests.sh
Executable file
@@ -0,0 +1,4 @@
|
||||
#!/usr/bin/env bash
|
||||
set -ex
|
||||
REPORT_PATH="${REPORT_PATH:-./}"
|
||||
nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon .
|
||||
31
setup.py
31
setup.py
@@ -19,17 +19,17 @@ class InstallCommand(install):
|
||||
if self.no_ml:
|
||||
dist = self.distribution
|
||||
dist.packages=find_packages(exclude=[
|
||||
'tests',
|
||||
'tests.*',
|
||||
'talon.signature',
|
||||
'talon.signature.*',
|
||||
"tests",
|
||||
"tests.*",
|
||||
"talon.signature",
|
||||
"talon.signature.*",
|
||||
])
|
||||
for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']:
|
||||
for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]:
|
||||
dist.install_requires.remove(not_required)
|
||||
|
||||
|
||||
setup(name='talon',
|
||||
version='1.4.8',
|
||||
version='1.4.10',
|
||||
description=("Mailgun library "
|
||||
"to extract message quotations and signatures."),
|
||||
long_description=open("README.rst").read(),
|
||||
@@ -44,20 +44,21 @@ setup(name='talon',
|
||||
include_package_data=True,
|
||||
zip_safe=True,
|
||||
install_requires=[
|
||||
"lxml>=2.3.3",
|
||||
"regex>=1",
|
||||
"lxml",
|
||||
"regex",
|
||||
"numpy",
|
||||
"scipy",
|
||||
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
|
||||
'chardet>=1.0.1',
|
||||
'cchardet>=0.3.5',
|
||||
'cssselect',
|
||||
'six>=1.10.0',
|
||||
'html5lib'
|
||||
"scikit-learn>=1.0.0",
|
||||
"chardet",
|
||||
"cchardet",
|
||||
"cssselect",
|
||||
"six",
|
||||
"html5lib",
|
||||
"joblib",
|
||||
],
|
||||
tests_require=[
|
||||
"mock",
|
||||
"nose>=1.2.1",
|
||||
"nose",
|
||||
"coverage"
|
||||
]
|
||||
)
|
||||
|
||||
@@ -457,7 +457,7 @@ def _extract_from_html(msg_body):
|
||||
|
||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||
|
||||
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||
|
||||
html_tree = html_document_fromstring(msg_body)
|
||||
|
||||
|
||||
@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
|
||||
from __future__ import absolute_import
|
||||
import os
|
||||
|
||||
from . import extraction
|
||||
from . extraction import extract #noqa
|
||||
from . learning import classifier
|
||||
|
||||
|
||||
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
||||
|
||||
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
|
||||
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
|
||||
from talon.signature import extraction
|
||||
from talon.signature.extraction import extract
|
||||
from talon.signature.learning import classifier
|
||||
|
||||
|
||||
def initialize():
|
||||
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
|
||||
EXTRACTOR_DATA)
|
||||
data_dir = os.path.join(os.path.dirname(__file__), 'data')
|
||||
extractor_filename = os.path.join(data_dir, 'classifier')
|
||||
extractor_data_filename = os.path.join(data_dir, 'train.data')
|
||||
extraction.EXTRACTOR = classifier.load(extractor_filename,
|
||||
extractor_data_filename)
|
||||
|
||||
1
talon/signature/data/__init__.py
Normal file
1
talon/signature/data/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ body belongs to the signature.
|
||||
from __future__ import absolute_import
|
||||
|
||||
from numpy import genfromtxt
|
||||
from sklearn.externals import joblib
|
||||
import joblib
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):
|
||||
|
||||
|
||||
def contains_sender_names(sender):
|
||||
'''Returns a function to search for the sender\'s name or a part of it.
|
||||
"""Returns a function to search for the sender\'s name or a part of it.
|
||||
|
||||
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
||||
>>> feature("Sergey Obukhov")
|
||||
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
|
||||
1
|
||||
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
||||
1
|
||||
'''
|
||||
"""
|
||||
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
||||
for e in extract_names(sender)]))
|
||||
names = names or sender
|
||||
@@ -140,10 +140,16 @@ def extract_names(sender):
|
||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||
# Remove too short words and words from "black" list i.e.
|
||||
# words like `ru`, `gmail`, `com`, `org`, etc.
|
||||
sender = [word for word in sender.split() if len(word) > 1 and
|
||||
not word in BAD_SENDER_NAMES]
|
||||
# Remove duplicates
|
||||
names = list(set(sender))
|
||||
names = list()
|
||||
for word in sender.split():
|
||||
if len(word) < 2:
|
||||
continue
|
||||
if word in BAD_SENDER_NAMES:
|
||||
continue
|
||||
if word in names:
|
||||
continue
|
||||
names.append(word)
|
||||
|
||||
return names
|
||||
|
||||
|
||||
@@ -208,20 +214,26 @@ def many_capitalized_words(s):
|
||||
|
||||
|
||||
def has_signature(body, sender):
|
||||
'''Checks if the body has signature. Returns True or False.'''
|
||||
"""Checks if the body has signature. Returns True or False."""
|
||||
non_empty = [line for line in body.splitlines() if line.strip()]
|
||||
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
||||
upvotes = 0
|
||||
sender_check = contains_sender_names(sender)
|
||||
for line in candidate:
|
||||
# we check lines for sender's name, phone, email and url,
|
||||
# those signature lines don't take more than 27 characters
|
||||
if len(line.strip()) > 27:
|
||||
continue
|
||||
elif contains_sender_names(sender)(line):
|
||||
|
||||
if sender_check(line):
|
||||
return True
|
||||
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
|
||||
|
||||
if (binary_regex_search(RE_RELAX_PHONE)(line) +
|
||||
binary_regex_search(RE_EMAIL)(line) +
|
||||
binary_regex_search(RE_URL)(line) == 1):
|
||||
upvotes += 1
|
||||
|
||||
if upvotes > 1:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
3
test-requirements.txt
Normal file
3
test-requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
coverage
|
||||
mock
|
||||
nose>=1.2.1
|
||||
@@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p
|
||||
that this line is intact."""
|
||||
|
||||
parsed = quotations.extract_from_plain(msg_body)
|
||||
eq_(msg_body, parsed.decode('utf8'))
|
||||
eq_(msg_body, parsed)
|
||||
|
||||
|
||||
def test_appointment():
|
||||
def test_appointment_2():
|
||||
msg_body = """Invitation for an interview:
|
||||
|
||||
Date: Wednesday 3, October 2011
|
||||
@@ -838,4 +838,4 @@ Address: 130 Fox St
|
||||
|
||||
Please bring in your ID."""
|
||||
parsed = quotations.extract_from_plain(msg_body)
|
||||
eq_(msg_body, parsed.decode('utf8'))
|
||||
eq_(msg_body, parsed)
|
||||
|
||||
Reference in New Issue
Block a user