Compare commits
	
		
			13 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 1a5548f171 | ||
|  | 53c49b9121 | ||
|  | bd50872043 | ||
|  | d37c4fd551 | ||
|  | d9ed7cc6d1 | ||
|  | 0a0808c0a8 | ||
|  | 16354e3528 | ||
|  | 1018e88ec1 | ||
|  | 2916351517 | ||
|  | 46d4b02c81 | ||
|  | 58eac88a10 | ||
|  | 2ef3d8dfbe | ||
|  | 7cf4c29340 | 
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | FROM python:3.9-slim-buster AS deps | ||||||
|  |  | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||||
|  |  | ||||||
|  | FROM deps AS testable | ||||||
|  | ARG REPORT_PATH | ||||||
|  |  | ||||||
|  | VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||||
|  |  | ||||||
|  | ADD . /app | ||||||
|  | WORKDIR /app | ||||||
|  | COPY wheel/* /wheel/ | ||||||
|  |  | ||||||
|  | RUN mkdir -p ${REPORT_PATH} | ||||||
|  |  | ||||||
|  | RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||||
|  |     pip install --no-deps /wheel/* | ||||||
|  |  | ||||||
|  | ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||||
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -54,3 +54,6 @@ _trial_temp | |||||||
|  |  | ||||||
| # OSX | # OSX | ||||||
| .DS_Store | .DS_Store | ||||||
|  |  | ||||||
|  | # vim-backup | ||||||
|  | *.bak | ||||||
|   | |||||||
| @@ -5,3 +5,10 @@ include classifier | |||||||
| include LICENSE | include LICENSE | ||||||
| include MANIFEST.in | include MANIFEST.in | ||||||
| include README.rst | include README.rst | ||||||
|  | include talon/signature/data/train.data | ||||||
|  | include talon/signature/data/classifier | ||||||
|  | include talon/signature/data/classifier_01.npy | ||||||
|  | include talon/signature/data/classifier_02.npy | ||||||
|  | include talon/signature/data/classifier_03.npy | ||||||
|  | include talon/signature/data/classifier_04.npy | ||||||
|  | include talon/signature/data/classifier_05.npy | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | chardet>=1.0.1 | ||||||
|  | cchardet>=0.3.5 | ||||||
|  | cssselect | ||||||
|  | html5lib | ||||||
|  | joblib | ||||||
|  | lxml>=2.3.3 | ||||||
|  | numpy | ||||||
|  | regex>=1 | ||||||
|  | scikit-learn>=1.0.0 | ||||||
|  | scipy | ||||||
|  | six>=1.10.0 | ||||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | set -ex | ||||||
|  | REPORT_PATH="${REPORT_PATH:-./}" | ||||||
|  | nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||||
							
								
								
									
										31
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								setup.py
									
									
									
									
									
								
							| @@ -19,17 +19,17 @@ class InstallCommand(install): | |||||||
|         if self.no_ml: |         if self.no_ml: | ||||||
|             dist = self.distribution |             dist = self.distribution | ||||||
|             dist.packages=find_packages(exclude=[ |             dist.packages=find_packages(exclude=[ | ||||||
|                 'tests', |                 "tests", | ||||||
|                 'tests.*', |                 "tests.*", | ||||||
|                 'talon.signature', |                 "talon.signature", | ||||||
|                 'talon.signature.*', |                 "talon.signature.*", | ||||||
|             ]) |             ]) | ||||||
|             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: |             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||||
|                 dist.install_requires.remove(not_required) |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.7', |       version='1.4.9', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -44,20 +44,21 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml>=2.3.3", |           "lxml", | ||||||
|           "regex>=1", |           "regex", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=1.0.0", | ||||||
|           'chardet>=1.0.1', |           "chardet", | ||||||
|           'cchardet>=0.3.5', |           "cchardet", | ||||||
|           'cssselect', |           "cssselect", | ||||||
|           'six>=1.10.0', |           "six", | ||||||
|           'html5lib' |           "html5lib", | ||||||
|  |           "joblib", | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|           "nose>=1.2.1", |           "nose", | ||||||
|           "coverage" |           "coverage" | ||||||
|           ] |           ] | ||||||
|       ) |       ) | ||||||
|   | |||||||
| @@ -457,7 +457,7 @@ def _extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') |     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||||
|  |  | ||||||
|     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) |     msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||||
|  |  | ||||||
|     html_tree = html_document_fromstring(msg_body) |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |  | ||||||
| @@ -516,9 +516,69 @@ def _extract_from_html(msg_body): | |||||||
|     if _readable_text_empty(html_tree_copy): |     if _readable_text_empty(html_tree_copy): | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|  |     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||||
|  |     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||||
|  |     # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes | ||||||
|  |     # <oU0003Ap>. When we port this to golang we should look into using an | ||||||
|  |     # XML Parser NOT an HTML5 Parser since we do not know what input a | ||||||
|  |     # customer will send us. Switching to a common XML parser in python | ||||||
|  |     # opens us up to a host of vulnerabilities. | ||||||
|  |     # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities | ||||||
|  |     # | ||||||
|  |     # The downside to removing the namespaces is that customers might | ||||||
|  |     # judge the XML namespaces important. If that is the case then support | ||||||
|  |     # should encourage customers to perform XML parsing of the un-stripped | ||||||
|  |     # body to get the full unmodified XML payload. | ||||||
|  |     # | ||||||
|  |     # Alternatives to this approach are | ||||||
|  |     # 1. Ignore the U0003A in tag names and let the customer deal with it. | ||||||
|  |     #    This is not ideal, as most customers use stripped-html for viewing | ||||||
|  |     #    emails sent from a recipient, as such they cannot control the HTML | ||||||
|  |     #    provided by a recipient. | ||||||
|  |     # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML | ||||||
|  |     #    string. While this would solve the issue simply, it runs the risk | ||||||
|  |     #    of replacing data outside the <tag> which might be essential to | ||||||
|  |     #    the customer. | ||||||
|  |     remove_namespaces(html_tree_copy) | ||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def remove_namespaces(root): | ||||||
|  |     """ | ||||||
|  |     Given the root of an HTML document iterate through all the elements | ||||||
|  |     and remove any namespaces that might have been provided and remove | ||||||
|  |     any attributes that contain a namespace | ||||||
|  |  | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office"> | ||||||
|  |     becomes | ||||||
|  |     <html> | ||||||
|  |  | ||||||
|  |     <o:p>Hi</o:p> | ||||||
|  |     becomes | ||||||
|  |     <p>Hi</p> | ||||||
|  |  | ||||||
|  |     Start tags do NOT have a namespace; COLON characters have no special meaning. | ||||||
|  |     If we don't remove the namespace, the parser translates the tag name into a | ||||||
|  |     unicode representation. For example <o:p> becomes <oU0003Ap> | ||||||
|  |  | ||||||
|  |     See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     """ | ||||||
|  |     for child in root.iter(): | ||||||
|  |         for key, value in child.attrib.items(): | ||||||
|  |             # If the attribute includes a colon | ||||||
|  |             if key.rfind("U0003A") != -1: | ||||||
|  |                 child.attrib.pop(key) | ||||||
|  |  | ||||||
|  |         # If the tag includes a colon | ||||||
|  |         idx = child.tag.rfind("U0003A") | ||||||
|  |         if idx != -1: | ||||||
|  |             child.tag = child.tag[idx+6:] | ||||||
|  |  | ||||||
|  |     return root | ||||||
|  |  | ||||||
|  |  | ||||||
| def split_emails(msg): | def split_emails(msg): | ||||||
|     """ |     """ | ||||||
|     Given a message (which may consist of an email conversation thread with |     Given a message (which may consist of an email conversation thread with | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  |  | ||||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -8,7 +8,7 @@ body belongs to the signature. | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.externals import joblib | import joblib | ||||||
| from sklearn.svm import LinearSVC | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | coverage | ||||||
|  | mock | ||||||
|  | nose>=1.2.1 | ||||||
| @@ -8,6 +8,7 @@ import re | |||||||
| from talon import quotations, utils as u | from talon import quotations, utils as u | ||||||
| from . import * | from . import * | ||||||
| from .fixtures import * | from .fixtures import * | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") | RE_DOUBLE_WHITESPACE = re.compile("\s") | ||||||
| @@ -424,3 +425,23 @@ def test_readable_html_empty(): | |||||||
| def test_bad_html(): | def test_bad_html(): | ||||||
|     bad_html = "<html></html>" |     bad_html = "<html></html>" | ||||||
|     eq_(bad_html, quotations.extract_from_html(bad_html)) |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_remove_namespaces(): | ||||||
|  |     msg_body = """ | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> | ||||||
|  |         <body> | ||||||
|  |             <o:p>Dear Sir,</o:p> | ||||||
|  |             <o:p>Thank you for the email.</o:p> | ||||||
|  |             <blockquote>thing</blockquote> | ||||||
|  |         </body> | ||||||
|  |     </html> | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     rendered = quotations.extract_from_html(msg_body) | ||||||
|  |  | ||||||
|  |     assert_true("<p>" in rendered) | ||||||
|  |     assert_true("xmlns" in rendered) | ||||||
|  |  | ||||||
|  |     assert_true("<o:p>" not in rendered) | ||||||
|  |     assert_true("<xmlns:o>" not in rendered) | ||||||
|   | |||||||
| @@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p | |||||||
| that this line is intact.""" | that this line is intact.""" | ||||||
|  |  | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_appointment(): | def test_appointment_2(): | ||||||
|     msg_body = """Invitation for an interview: |     msg_body = """Invitation for an interview: | ||||||
|  |  | ||||||
| Date: Wednesday 3, October 2011  | Date: Wednesday 3, October 2011  | ||||||
| @@ -838,4 +838,4 @@ Address: 130 Fox St | |||||||
|  |  | ||||||
| Please bring in your ID.""" | Please bring in your ID.""" | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user