Compare commits
15 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
24d0f2d00a | ||
|
|
94007b0b92 | ||
|
|
1a5548f171 | ||
|
|
53c49b9121 | ||
|
|
bd50872043 | ||
|
|
d37c4fd551 | ||
|
|
d9ed7cc6d1 | ||
|
|
0a0808c0a8 | ||
|
|
16354e3528 | ||
|
|
1018e88ec1 | ||
|
|
2916351517 | ||
|
|
46d4b02c81 | ||
|
|
58eac88a10 | ||
|
|
2ef3d8dfbe | ||
|
|
7cf4c29340 |
20
.build/Dockerfile
Normal file
20
.build/Dockerfile
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
FROM python:3.9-slim-buster AS deps
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev
|
||||||
|
|
||||||
|
FROM deps AS testable
|
||||||
|
ARG REPORT_PATH
|
||||||
|
|
||||||
|
VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}]
|
||||||
|
|
||||||
|
ADD . /app
|
||||||
|
WORKDIR /app
|
||||||
|
COPY wheel/* /wheel/
|
||||||
|
|
||||||
|
RUN mkdir -p ${REPORT_PATH}
|
||||||
|
|
||||||
|
RUN python ./setup.py build bdist_wheel -d /wheel && \
|
||||||
|
pip install --no-deps /wheel/*
|
||||||
|
|
||||||
|
ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"]
|
||||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -54,3 +54,6 @@ _trial_temp
|
|||||||
|
|
||||||
# OSX
|
# OSX
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
|
# vim-backup
|
||||||
|
*.bak
|
||||||
|
|||||||
@@ -5,3 +5,10 @@ include classifier
|
|||||||
include LICENSE
|
include LICENSE
|
||||||
include MANIFEST.in
|
include MANIFEST.in
|
||||||
include README.rst
|
include README.rst
|
||||||
|
include talon/signature/data/train.data
|
||||||
|
include talon/signature/data/classifier
|
||||||
|
include talon/signature/data/classifier_01.npy
|
||||||
|
include talon/signature/data/classifier_02.npy
|
||||||
|
include talon/signature/data/classifier_03.npy
|
||||||
|
include talon/signature/data/classifier_04.npy
|
||||||
|
include talon/signature/data/classifier_05.npy
|
||||||
|
|||||||
11
requirements.txt
Normal file
11
requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
chardet>=1.0.1
|
||||||
|
cchardet>=0.3.5
|
||||||
|
cssselect
|
||||||
|
html5lib
|
||||||
|
joblib
|
||||||
|
lxml>=2.3.3
|
||||||
|
numpy
|
||||||
|
regex>=1
|
||||||
|
scikit-learn>=1.0.0
|
||||||
|
scipy
|
||||||
|
six>=1.10.0
|
||||||
4
run_tests.sh
Executable file
4
run_tests.sh
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -ex
|
||||||
|
REPORT_PATH="${REPORT_PATH:-./}"
|
||||||
|
nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon .
|
||||||
31
setup.py
31
setup.py
@@ -19,17 +19,17 @@ class InstallCommand(install):
|
|||||||
if self.no_ml:
|
if self.no_ml:
|
||||||
dist = self.distribution
|
dist = self.distribution
|
||||||
dist.packages=find_packages(exclude=[
|
dist.packages=find_packages(exclude=[
|
||||||
'tests',
|
"tests",
|
||||||
'tests.*',
|
"tests.*",
|
||||||
'talon.signature',
|
"talon.signature",
|
||||||
'talon.signature.*',
|
"talon.signature.*",
|
||||||
])
|
])
|
||||||
for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']:
|
for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]:
|
||||||
dist.install_requires.remove(not_required)
|
dist.install_requires.remove(not_required)
|
||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.4.7',
|
version='1.4.10',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
@@ -44,20 +44,21 @@ setup(name='talon',
|
|||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
install_requires=[
|
install_requires=[
|
||||||
"lxml>=2.3.3",
|
"lxml",
|
||||||
"regex>=1",
|
"regex",
|
||||||
"numpy",
|
"numpy",
|
||||||
"scipy",
|
"scipy",
|
||||||
"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
|
"scikit-learn>=1.0.0",
|
||||||
'chardet>=1.0.1',
|
"chardet",
|
||||||
'cchardet>=0.3.5',
|
"cchardet",
|
||||||
'cssselect',
|
"cssselect",
|
||||||
'six>=1.10.0',
|
"six",
|
||||||
'html5lib'
|
"html5lib",
|
||||||
|
"joblib",
|
||||||
],
|
],
|
||||||
tests_require=[
|
tests_require=[
|
||||||
"mock",
|
"mock",
|
||||||
"nose>=1.2.1",
|
"nose",
|
||||||
"coverage"
|
"coverage"
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -457,7 +457,7 @@ def _extract_from_html(msg_body):
|
|||||||
|
|
||||||
msg_body = msg_body.replace(b'\r\n', b'\n')
|
msg_body = msg_body.replace(b'\r\n', b'\n')
|
||||||
|
|
||||||
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
|
||||||
|
|
||||||
html_tree = html_document_fromstring(msg_body)
|
html_tree = html_document_fromstring(msg_body)
|
||||||
|
|
||||||
@@ -516,9 +516,69 @@ def _extract_from_html(msg_body):
|
|||||||
if _readable_text_empty(html_tree_copy):
|
if _readable_text_empty(html_tree_copy):
|
||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
|
||||||
|
# parsers do not recognize namespaces in HTML tags. As such the rendered
|
||||||
|
# HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes
|
||||||
|
# <oU0003Ap>. When we port this to golang we should look into using an
|
||||||
|
# XML Parser, NOT an HTML5 Parser, since we do not know what input a
|
||||||
|
# customer will send us. Switching to a common XML parser in python
|
||||||
|
# opens us up to a host of vulnerabilities.
|
||||||
|
# See https://docs.python.org/3/library/xml.html#xml-vulnerabilities
|
||||||
|
#
|
||||||
|
# The downside to removing the namespaces is that customers might
|
||||||
|
# judge the XML namespaces important. If that is the case then support
|
||||||
|
# should encourage customers to perform XML parsing of the un-stripped
|
||||||
|
# body to get the full unmodified XML payload.
|
||||||
|
#
|
||||||
|
# Alternatives to this approach are
|
||||||
|
# 1. Ignore the U0003A in tag names and let the customer deal with it.
|
||||||
|
# This is not ideal, as most customers use stripped-html for viewing
|
||||||
|
# emails sent from a recipient, as such they cannot control the HTML
|
||||||
|
# provided by a recipient.
|
||||||
|
# 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML
|
||||||
|
# string. While this would solve the issue simply, it runs the risk
|
||||||
|
# of replacing data outside the <tag> which might be essential to
|
||||||
|
# the customer.
|
||||||
|
remove_namespaces(html_tree_copy)
|
||||||
return html.tostring(html_tree_copy)
|
return html.tostring(html_tree_copy)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_namespaces(root):
    """
    Strip namespace residue from every element below ``root``, in place.

    HTML start tags do NOT have namespaces; a COLON character in a tag or
    attribute name has no special meaning.  When such a name is left in
    the tree, the colon is rendered as the literal text ``U0003A``, for
    example <o:p> becomes <oU0003Ap>.  To undo that, for every node
    yielded by ``root.iter()``:

    * attributes whose name contains ``U0003A`` are removed;
    * a tag containing ``U0003A`` is shortened to the text that follows
      the last occurrence of the marker.

        <html xmlns:o="urn:schemas-microsoft-com:office:office">
    becomes
        <html>

        <o:p>Hi</o:p>
    becomes
        <p>Hi</p>

    Nodes whose ``tag`` is not a string (comments and processing
    instructions expose a callable there) keep their tag untouched.

    See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags

    :param root: root element of the parsed HTML document; only
        ``iter()``, ``attrib`` and ``tag`` are used.
    :return: the same ``root`` object, after modification.
    """
    marker = "U0003A"

    for child in root.iter():
        # Snapshot the attribute names first: popping while iterating a
        # live mapping view raises RuntimeError on dict-backed attribs.
        for key in list(child.attrib.keys()):
            # The attribute name included a colon.
            if marker in key:
                child.attrib.pop(key)

        tag = child.tag
        # Comment / processing-instruction nodes carry a callable tag,
        # which has no rfind(); there is nothing to rename on them.
        if not isinstance(tag, str):
            continue

        # The tag name included a colon: keep only the local part.
        idx = tag.rfind(marker)
        if idx != -1:
            child.tag = tag[idx + len(marker):]

    return root
|
||||||
|
|
||||||
|
|
||||||
def split_emails(msg):
|
def split_emails(msg):
|
||||||
"""
|
"""
|
||||||
Given a message (which may consist of an email conversation thread with
|
Given a message (which may consist of an email conversation thread with
|
||||||
|
|||||||
@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from . import extraction
|
from talon.signature import extraction
|
||||||
from . extraction import extract #noqa
|
from talon.signature.extraction import extract
|
||||||
from . learning import classifier
|
from talon.signature.learning import classifier
|
||||||
|
|
||||||
|
|
||||||
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
|
||||||
|
|
||||||
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
|
|
||||||
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
|
|
||||||
|
|
||||||
|
|
||||||
def initialize():
|
def initialize():
|
||||||
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
|
data_dir = os.path.join(os.path.dirname(__file__), 'data')
|
||||||
EXTRACTOR_DATA)
|
extractor_filename = os.path.join(data_dir, 'classifier')
|
||||||
|
extractor_data_filename = os.path.join(data_dir, 'train.data')
|
||||||
|
extraction.EXTRACTOR = classifier.load(extractor_filename,
|
||||||
|
extractor_data_filename)
|
||||||
|
|||||||
1
talon/signature/data/__init__.py
Normal file
1
talon/signature/data/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -8,7 +8,7 @@ body belongs to the signature.
|
|||||||
from __future__ import absolute_import
|
from __future__ import absolute_import
|
||||||
|
|
||||||
from numpy import genfromtxt
|
from numpy import genfromtxt
|
||||||
from sklearn.externals import joblib
|
import joblib
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):
|
|||||||
|
|
||||||
|
|
||||||
def contains_sender_names(sender):
|
def contains_sender_names(sender):
|
||||||
'''Returns a function to search sender\'s name or part of it.
|
"""Returns a function to search sender\'s name or part of it.
|
||||||
|
|
||||||
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
|
||||||
>>> feature("Sergey Obukhov")
|
>>> feature("Sergey Obukhov")
|
||||||
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
|
|||||||
1
|
1
|
||||||
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
|
||||||
1
|
1
|
||||||
'''
|
"""
|
||||||
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
|
||||||
for e in extract_names(sender)]))
|
for e in extract_names(sender)]))
|
||||||
names = names or sender
|
names = names or sender
|
||||||
@@ -140,10 +140,16 @@ def extract_names(sender):
|
|||||||
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
sender = "".join([char if char.isalpha() else ' ' for char in sender])
|
||||||
# Remove too short words and words from "black" list i.e.
|
# Remove too short words and words from "black" list i.e.
|
||||||
# words like `ru`, `gmail`, `com`, `org`, etc.
|
# words like `ru`, `gmail`, `com`, `org`, etc.
|
||||||
sender = [word for word in sender.split() if len(word) > 1 and
|
names = list()
|
||||||
not word in BAD_SENDER_NAMES]
|
for word in sender.split():
|
||||||
# Remove duplicates
|
if len(word) < 2:
|
||||||
names = list(set(sender))
|
continue
|
||||||
|
if word in BAD_SENDER_NAMES:
|
||||||
|
continue
|
||||||
|
if word in names:
|
||||||
|
continue
|
||||||
|
names.append(word)
|
||||||
|
|
||||||
return names
|
return names
|
||||||
|
|
||||||
|
|
||||||
@@ -208,20 +214,26 @@ def many_capitalized_words(s):
|
|||||||
|
|
||||||
|
|
||||||
def has_signature(body, sender):
|
def has_signature(body, sender):
|
||||||
'''Checks if the body has signature. Returns True or False.'''
|
"""Checks if the body has signature. Returns True or False."""
|
||||||
non_empty = [line for line in body.splitlines() if line.strip()]
|
non_empty = [line for line in body.splitlines() if line.strip()]
|
||||||
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
candidate = non_empty[-SIGNATURE_MAX_LINES:]
|
||||||
upvotes = 0
|
upvotes = 0
|
||||||
|
sender_check = contains_sender_names(sender)
|
||||||
for line in candidate:
|
for line in candidate:
|
||||||
# we check lines for sender's name, phone, email and url,
|
# we check lines for sender's name, phone, email and url,
|
||||||
# those signature lines don't take more than 27 characters
|
# those signature lines don't take more than 27 characters
|
||||||
if len(line.strip()) > 27:
|
if len(line.strip()) > 27:
|
||||||
continue
|
continue
|
||||||
elif contains_sender_names(sender)(line):
|
|
||||||
|
if sender_check(line):
|
||||||
return True
|
return True
|
||||||
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
|
|
||||||
|
if (binary_regex_search(RE_RELAX_PHONE)(line) +
|
||||||
binary_regex_search(RE_EMAIL)(line) +
|
binary_regex_search(RE_EMAIL)(line) +
|
||||||
binary_regex_search(RE_URL)(line) == 1):
|
binary_regex_search(RE_URL)(line) == 1):
|
||||||
upvotes += 1
|
upvotes += 1
|
||||||
|
|
||||||
if upvotes > 1:
|
if upvotes > 1:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|||||||
3
test-requirements.txt
Normal file
3
test-requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
coverage
|
||||||
|
mock
|
||||||
|
nose>=1.2.1
|
||||||
@@ -8,6 +8,7 @@ import re
|
|||||||
from talon import quotations, utils as u
|
from talon import quotations, utils as u
|
||||||
from . import *
|
from . import *
|
||||||
from .fixtures import *
|
from .fixtures import *
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
RE_WHITESPACE = re.compile("\s")
|
RE_WHITESPACE = re.compile("\s")
|
||||||
RE_DOUBLE_WHITESPACE = re.compile("\s")
|
RE_DOUBLE_WHITESPACE = re.compile("\s")
|
||||||
@@ -424,3 +425,23 @@ def test_readable_html_empty():
|
|||||||
def test_bad_html():
|
def test_bad_html():
|
||||||
bad_html = "<html></html>"
|
bad_html = "<html></html>"
|
||||||
eq_(bad_html, quotations.extract_from_html(bad_html))
|
eq_(bad_html, quotations.extract_from_html(bad_html))
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_namespaces():
|
||||||
|
msg_body = """
|
||||||
|
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
|
||||||
|
<body>
|
||||||
|
<o:p>Dear Sir,</o:p>
|
||||||
|
<o:p>Thank you for the email.</o:p>
|
||||||
|
<blockquote>thing</blockquote>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
rendered = quotations.extract_from_html(msg_body)
|
||||||
|
|
||||||
|
assert_true("<p>" in rendered)
|
||||||
|
assert_true("xmlns" in rendered)
|
||||||
|
|
||||||
|
assert_true("<o:p>" not in rendered)
|
||||||
|
assert_true("<xmlns:o>" not in rendered)
|
||||||
|
|||||||
@@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p
|
|||||||
that this line is intact."""
|
that this line is intact."""
|
||||||
|
|
||||||
parsed = quotations.extract_from_plain(msg_body)
|
parsed = quotations.extract_from_plain(msg_body)
|
||||||
eq_(msg_body, parsed.decode('utf8'))
|
eq_(msg_body, parsed)
|
||||||
|
|
||||||
|
|
||||||
def test_appointment():
|
def test_appointment_2():
|
||||||
msg_body = """Invitation for an interview:
|
msg_body = """Invitation for an interview:
|
||||||
|
|
||||||
Date: Wednesday 3, October 2011
|
Date: Wednesday 3, October 2011
|
||||||
@@ -838,4 +838,4 @@ Address: 130 Fox St
|
|||||||
|
|
||||||
Please bring in your ID."""
|
Please bring in your ID."""
|
||||||
parsed = quotations.extract_from_plain(msg_body)
|
parsed = quotations.extract_from_plain(msg_body)
|
||||||
eq_(msg_body, parsed.decode('utf8'))
|
eq_(msg_body, parsed)
|
||||||
|
|||||||
Reference in New Issue
Block a user