Compare commits
	
		
			247 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 6713699ebe | |||
|  | 71d9b6eb78 | ||
|  | 14f106ee76 | ||
|  | a8c7e6a972 | ||
|  | b30c375c5b | ||
|  | cec5acf58f | ||
|  | 24d0f2d00a | ||
|  | 94007b0b92 | ||
|  | 1a5548f171 | ||
|  | 53c49b9121 | ||
|  | bd50872043 | ||
|  | d37c4fd551 | ||
|  | d9ed7cc6d1 | ||
|  | 0a0808c0a8 | ||
|  | 16354e3528 | ||
|  | 1018e88ec1 | ||
|  | 2916351517 | ||
|  | 46d4b02c81 | ||
|  | 58eac88a10 | ||
|  | 2ef3d8dfbe | ||
|  | 7cf4c29340 | ||
|  | cdd84563dd | ||
|  | 8138ea9a60 | ||
|  | c171f9a875 | ||
|  | 3f97a8b8ff | ||
|  | 1147767ff3 | ||
|  | 6a304215c3 | ||
|  | 31714506bd | ||
|  | 403d80cf3b | ||
|  | 7cf20f2877 | ||
|  | afff08b017 | ||
|  | 685abb1905 | ||
|  | 41990727a3 | ||
|  | b113d8ab33 | ||
|  | 7bd0e9cc2f | ||
|  | 1e030a51d4 | ||
|  | 238a5de5cc | ||
|  | 53b24ffb3d | ||
|  | a7404afbcb | ||
|  | 0e6d5f993c | ||
|  | 60637ff13a | ||
|  | df8259e3fe | ||
|  | aab3b1cc75 | ||
|  | 9492b39f2d | ||
|  | b9ac866ea7 | ||
|  | 678517dd89 | ||
|  | 221774c6f8 | ||
|  | a2aa345712 | ||
|  | d998beaff3 | ||
|  | a379bc4e7c | ||
|  | b8e1894f3b | ||
|  | 0b5a44090f | ||
|  | b40835eca2 | ||
|  | b38562c7cc | ||
|  | 70e9fb415e | ||
|  | 64612099cd | ||
|  | 45c20f979d | ||
|  | 743c76f159 | ||
|  | bc5dad75d3 | ||
|  | 4acf05cf28 | ||
|  | f5f7264077 | ||
|  | 4364bebf38 | ||
|  | 15e61768f2 | ||
|  | dd0a0f5c4d | ||
|  | 086f5ba43b | ||
|  | e16dcf629e | ||
|  | f16ae5110b | ||
|  | ab5cbe5ec3 | ||
|  | be5da92f16 | ||
|  | 95954a65a0 | ||
|  | 0b55e8fa77 | ||
|  | 6f159e8959 | ||
|  | 5c413b4b00 | ||
|  | cca64d3ed1 | ||
|  | e11eaf6ff8 | ||
|  | 85a4c1d855 | ||
|  | 0f5e72623b | ||
|  | 061e549ad7 | ||
|  | 49d1a5d248 | ||
|  | 03d6b00db8 | ||
|  | a2eb0f7201 | ||
|  | 5c71a0ca07 | ||
|  | 489d16fad9 | ||
|  | a458707777 | ||
|  | a1d0a86305 | ||
|  | 29f1d21be7 | ||
|  | 34c5b526c3 | ||
|  | 3edb6578ba | ||
|  | 984c036b6e | ||
|  | a403ecb5c9 | ||
|  | a44713409c | ||
|  | 567467b8ed | ||
|  | 139edd6104 | ||
|  | e756d55abf | ||
|  | 015c8d2a78 | ||
|  | 5af846c13d | ||
|  | e69a9c7a54 | ||
|  | 23cb2a9a53 | ||
|  | b5e3397b88 | ||
|  | 5685a4055a | ||
|  | 97b72ef767 | ||
|  | 31489848be | ||
|  | e5988d447b | ||
|  | adfed748ce | ||
|  | 2444ba87c0 | ||
|  | 534457e713 | ||
|  | ea82a9730e | ||
|  | f04b872e14 | ||
|  | e61894e425 | ||
|  | 35fbdaadac | ||
|  | 8441bc7328 | ||
|  | 37c95ff97b | ||
|  | 5b1ca33c57 | ||
|  | ec8e09b34e | ||
|  | bcf97eccfa | ||
|  | f53b5cc7a6 | ||
|  | 27adde7aa7 | ||
|  | a9719833e0 | ||
|  | 7bf37090ca | ||
|  | 44fcef7123 | ||
|  | 69a44b10a1 | ||
|  | b085e3d049 | ||
|  | 4b953bcddc | ||
|  | 315eaa7080 | ||
|  | 5a9bc967f1 | ||
|  | a0d7236d0b | ||
|  | 21e9a31ffe | ||
|  | 4ee46c0a97 | ||
|  | 10d9a930f9 | ||
|  | a21ccdb21b | ||
|  | 7cdd7a8f35 | ||
|  | 01e03a47e0 | ||
|  | 1b9a71551a | ||
|  | 911efd1db4 | ||
|  | e61f0a68c4 | ||
|  | cefbcffd59 | ||
|  | 622a98d6d5 | ||
|  | 7901f5d1dc | ||
|  | 555c34d7a8 | ||
|  | dcc0d1de20 | ||
|  | 7bdf4d622b | ||
|  | 4a7207b0d0 | ||
|  | ad9c2ca0e8 | ||
|  | da998ddb60 | ||
|  | 07f68815df | ||
|  | 35645f9ade | ||
|  | 7c3d91301c | ||
|  | 5bcf7403ad | ||
|  | 2d6c092b65 | ||
|  | 6d0689cad6 | ||
|  | 3f80e93ee0 | ||
|  | 1b18abab1d | ||
|  | 03dd5af5ab | ||
|  | dfba82b07c | ||
|  | 08ca02c87f | ||
|  | b61f4ec095 | ||
|  | 9dbe6a494b | ||
|  | 44e70939d6 | ||
|  | ab6066eafa | ||
|  | 42258cdd36 | ||
|  | d3de9e6893 | ||
|  | 333beb94af | ||
|  | f3c0942c49 | ||
|  | 02adf53ab9 | ||
|  | 3497b5cab4 | ||
|  | 9c17dca17c | ||
|  | de342d3177 | ||
|  | 743b452daf | ||
|  | c762f3c337 | ||
|  | 31803d41bc | ||
|  | 2ecd9779fc | ||
|  | 5a7047233e | ||
|  | 999e9c3725 | ||
|  | f6940fe878 | ||
|  | ce65ff8fc8 | ||
|  | eed6784f25 | ||
|  | 3d9ae356ea | ||
|  | f688d074b5 | ||
|  | 41457d8fbd | ||
|  | 2c416ecc0e | ||
|  | 3ab33c557b | ||
|  | 8db05f4950 | ||
|  | 3d5bc82a03 | ||
|  | 14e3a0d80b | ||
|  | fcd9e2716a | ||
|  | d62d633215 | ||
|  | 3b0c9273c1 | ||
|  | e4c1c11845 | ||
|  | ae508fe0e5 | ||
|  | 2cb9b5399c | ||
|  | 134c47f515 | ||
|  | d328c9d128 | ||
|  | 77b62b0fef | ||
|  | ad09b18f3f | ||
|  | b5af9c03a5 | ||
|  | 176c7e7532 | ||
|  | 15976888a0 | ||
|  | 9bee502903 | ||
|  | e3cb8dc3e6 | ||
|  | 385285e5de | ||
|  | 127771dac9 | ||
|  | cc98befba5 | ||
|  | 567549cba4 | ||
|  | 76c4f49be8 | ||
|  | d9d89dc250 | ||
|  | 9358db6cee | ||
|  | 08c9d7db03 | ||
|  | 390b0a6dc9 | ||
|  | ed6b861a47 | ||
|  | 85c7ee980c | ||
|  | 7ea773e6a9 | ||
|  | e3c4ff38fe | ||
|  | 8b1f87b1c0 | ||
|  | c5e4cd9ab4 | ||
|  | 215e36e9ed | ||
|  | e3ef622031 | ||
|  | f16760c466 | ||
|  | b36287e573 | ||
|  | 4df7aa284b | ||
|  | 3a37d8b649 | ||
|  | f9f428f4c3 | ||
|  | 84a83e865e | ||
|  | b4c180b9ff | ||
|  | 072a440837 | ||
|  | 105d16644d | ||
|  | df3338192a | ||
|  | f0ed5d6c07 | ||
|  | 790463821f | ||
|  | 763d3b308e | ||
|  | 3c9ef4653f | ||
|  | b16060261a | ||
|  | 13dc43e960 | ||
|  | 3768d7ba31 | ||
|  | 613d1fc815 | ||
|  | 52505bba8a | ||
|  | 79cd4fcc52 | ||
|  | a4f156b174 | ||
|  | 1789ccf3c8 | ||
|  | 7a42ab3b28 | ||
|  | 12b0e88a01 | ||
|  | 8b78da5977 | ||
|  | b299feab1e | ||
|  | 95182dcfc4 | ||
|  | f9fe412fa4 | ||
|  | 00a8db2e3e | ||
|  | 71ae26ccd1 | ||
|  | b0851d5363 | 
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | FROM python:3.9-slim-buster AS deps | ||||||
|  |  | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||||
|  |  | ||||||
|  | FROM deps AS testable | ||||||
|  | ARG REPORT_PATH | ||||||
|  |  | ||||||
|  | VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||||
|  |  | ||||||
|  | ADD . /app | ||||||
|  | WORKDIR /app | ||||||
|  | COPY wheel/* /wheel/ | ||||||
|  |  | ||||||
|  | RUN mkdir -p ${REPORT_PATH} | ||||||
|  |  | ||||||
|  | RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||||
|  |     pip install --no-deps /wheel/* | ||||||
|  |  | ||||||
|  | ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||||
							
								
								
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										8
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -39,6 +39,8 @@ nosetests.xml | |||||||
| /.emacs.desktop | /.emacs.desktop | ||||||
| /.emacs.desktop.lock | /.emacs.desktop.lock | ||||||
| .elc | .elc | ||||||
|  | .idea | ||||||
|  | .cache | ||||||
| auto-save-list | auto-save-list | ||||||
| tramp | tramp | ||||||
| .\#* | .\#* | ||||||
| @@ -49,3 +51,9 @@ tramp | |||||||
|  |  | ||||||
| # Trial temp | # Trial temp | ||||||
| _trial_temp | _trial_temp | ||||||
|  |  | ||||||
|  | # OSX | ||||||
|  | .DS_Store | ||||||
|  |  | ||||||
|  | # vim-backup | ||||||
|  | *.bak | ||||||
|   | |||||||
							
								
								
									
										14
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								MANIFEST.in
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | |||||||
|  | recursive-exclude tests *.pyc *~ | ||||||
|  | recursive-exclude talon *.pyc *~ | ||||||
|  | include train.data | ||||||
|  | include classifier | ||||||
|  | include LICENSE | ||||||
|  | include MANIFEST.in | ||||||
|  | include README.rst | ||||||
|  | include talon/signature/data/train.data | ||||||
|  | include talon/signature/data/classifier | ||||||
|  | include talon/signature/data/classifier_01.npy | ||||||
|  | include talon/signature/data/classifier_02.npy | ||||||
|  | include talon/signature/data/classifier_03.npy | ||||||
|  | include talon/signature/data/classifier_04.npy | ||||||
|  | include talon/signature/data/classifier_05.npy | ||||||
							
								
								
									
										54
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										54
									
								
								README.rst
									
									
									
									
									
								
							| @@ -3,7 +3,7 @@ talon | |||||||
|  |  | ||||||
| Mailgun library to extract message quotations and signatures. | Mailgun library to extract message quotations and signatures. | ||||||
|  |  | ||||||
| If you ever tried to parse message quotations or signatures you know that absense of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much eathier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile: | If you ever tried to parse message quotations or signatures you know that absence of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. That’s what a good quotations and signature parser should be like :smile: | ||||||
|  |  | ||||||
| Usage | Usage | ||||||
| ----- | ----- | ||||||
| @@ -71,6 +71,11 @@ the power of machine learning algorithms: | |||||||
|  |  | ||||||
| .. code:: python | .. code:: python | ||||||
|  |  | ||||||
|  |     import talon | ||||||
|  |     # don't forget to init the library first | ||||||
|  |     # it loads machine learning classifiers | ||||||
|  |     talon.init() | ||||||
|  |  | ||||||
|     from talon import signature |     from talon import signature | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -84,21 +89,62 @@ the power of machine learning algorithms: | |||||||
|     # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." |     # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." | ||||||
|     # signature == "John Doe\nvia mobile" |     # signature == "John Doe\nvia mobile" | ||||||
|  |  | ||||||
| For machine learning talon currently uses `PyML`_ library to build SVM | For machine learning talon currently uses the `scikit-learn`_ library to build SVM | ||||||
| classifiers. The core of machine learning algorithm lays in | classifiers. The core of machine learning algorithm lays in | ||||||
| ``talon.signature.learning package``. It defines a set of features to | ``talon.signature.learning package``. It defines a set of features to | ||||||
| apply to a message (``featurespace.py``), how data sets are built | apply to a message (``featurespace.py``), how data sets are built | ||||||
| (``dataset.py``), classifier’s interface (``classifier.py``). | (``dataset.py``), classifier’s interface (``classifier.py``). | ||||||
|  |  | ||||||
| The data used for training is taken from our personal email | Currently the data used for training is taken from our personal email | ||||||
| conversations and from `ENRON`_ dataset. As a result of applying our set | conversations and from `ENRON`_ dataset. As a result of applying our set | ||||||
| of features to the dataset we provide files ``classifier`` and | of features to the dataset we provide files ``classifier`` and | ||||||
| ``train.data`` that don’t have any personal information but could be | ``train.data`` that don’t have any personal information but could be | ||||||
| used to load trained classifier. Those files should be regenerated every | used to load trained classifier. Those files should be regenerated every | ||||||
| time the feature/data set is changed. | time the feature/data set is changed. | ||||||
|  |  | ||||||
| .. _PyML: http://pyml.sourceforge.net/ | To regenerate the model files, you can run | ||||||
|  |  | ||||||
|  | .. code:: sh | ||||||
|  |  | ||||||
|  |     python train.py | ||||||
|  |  | ||||||
|  | or | ||||||
|  |  | ||||||
|  | .. code:: python | ||||||
|  |      | ||||||
|  |     from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | ||||||
|  |     from talon.signature.learning.classifier import train, init | ||||||
|  |     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) | ||||||
|  |  | ||||||
|  | Open-source Dataset | ||||||
|  | ------------------- | ||||||
|  |  | ||||||
|  | Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we | ||||||
|  | used a subset of `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190 | ||||||
|  | emails are annotated. Any contribution and collaboration on the project are welcome. Once the dataset is ready we plan to | ||||||
|  | start using it for talon. | ||||||
|  |  | ||||||
|  | .. _scikit-learn: http://scikit-learn.org | ||||||
| .. _ENRON: https://www.cs.cmu.edu/~enron/ | .. _ENRON: https://www.cs.cmu.edu/~enron/ | ||||||
|  | .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | ||||||
|  | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
|  | Training on your dataset | ||||||
|  | ------------------------ | ||||||
|  |  | ||||||
|  | talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: | ||||||
|  |  | ||||||
|  | .. code:: python | ||||||
|  |  | ||||||
|  |     from talon.signature.learning.dataset import build_extraction_dataset | ||||||
|  |     from talon.signature.learning import classifier as c  | ||||||
|  |      | ||||||
|  |     build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") | ||||||
|  |     c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") | ||||||
|  |  | ||||||
|  | Note that for signature extraction you need just the folder with the positive samples with annotated signature lines (P folder). | ||||||
|  |  | ||||||
|  | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
| Research | Research | ||||||
| -------- | -------- | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | chardet>=1.0.1 | ||||||
|  | # cchardet>=0.3.5 | ||||||
|  | cssselect | ||||||
|  | html5lib | ||||||
|  | joblib | ||||||
|  | lxml>=2.3.3 | ||||||
|  | numpy | ||||||
|  | regex>=1 | ||||||
|  | scikit-learn>=1.0.0 | ||||||
|  | scipy | ||||||
|  | six>=1.10.0 | ||||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | set -ex | ||||||
|  | REPORT_PATH="${REPORT_PATH:-./}" | ||||||
|  | nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||||
							
								
								
									
										153
									
								
								setup.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										153
									
								
								setup.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							| @@ -1,105 +1,64 @@ | |||||||
| import os | from __future__ import absolute_import | ||||||
| import sys |  | ||||||
| import contextlib |  | ||||||
|  |  | ||||||
| from distutils.spawn import find_executable |  | ||||||
| from setuptools import setup, find_packages | from setuptools import setup, find_packages | ||||||
|  | from setuptools.command.install import install | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | class InstallCommand(install): | ||||||
|       version='1.0', |     user_options = install.user_options + [ | ||||||
|       description=("Mailgun library " |         ("no-ml", None, "Don't install without Machine Learning modules."), | ||||||
|                    "to extract message quotations and signatures."), |     ] | ||||||
|  |  | ||||||
|  |     boolean_options = install.boolean_options + ["no-ml"] | ||||||
|  |  | ||||||
|  |     def initialize_options(self): | ||||||
|  |         install.initialize_options(self) | ||||||
|  |         self.no_ml = None | ||||||
|  |  | ||||||
|  |     def finalize_options(self): | ||||||
|  |         install.finalize_options(self) | ||||||
|  |         if self.no_ml: | ||||||
|  |             dist = self.distribution | ||||||
|  |             dist.packages = find_packages( | ||||||
|  |                 exclude=[ | ||||||
|  |                     "tests", | ||||||
|  |                     "tests.*", | ||||||
|  |                     "talon.signature", | ||||||
|  |                     "talon.signature.*", | ||||||
|  |                 ] | ||||||
|  |             ) | ||||||
|  |             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||||
|  |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | setup( | ||||||
|  |     name="talon-o2w", | ||||||
|  |     version="1.6.1", | ||||||
|  |     description=( | ||||||
|  |         "Mailgun library " "to extract message quotations and signatures." | ||||||
|  |     ), | ||||||
|     long_description=open("README.rst").read(), |     long_description=open("README.rst").read(), | ||||||
|       author='Mailgun Inc.', |     author="Mailgun Inc.", | ||||||
|       author_email='admin@mailgunhq.com', |     author_email="admin@mailgunhq.com", | ||||||
|       url='https://github.com/mailgun/talon', |     url="https://github.com/mailgun/talon", | ||||||
|       license='APACHE2', |     license="APACHE2", | ||||||
|       packages=find_packages(exclude=['tests']), |     cmdclass={ | ||||||
|  |         "install": InstallCommand, | ||||||
|  |     }, | ||||||
|  |     packages=find_packages(exclude=["tests", "tests.*"]), | ||||||
|     include_package_data=True, |     include_package_data=True, | ||||||
|     zip_safe=True, |     zip_safe=True, | ||||||
|     install_requires=[ |     install_requires=[ | ||||||
|           "lxml==2.3.3", |         "lxml", | ||||||
|           "regex==0.1.20110315", |         "regex", | ||||||
|           "chardet==1.0.1", |         "numpy", | ||||||
|           "dnspython==1.11.1", |         "scipy", | ||||||
|           "html2text", |         "scikit-learn>=1.0.0", | ||||||
|           "nose==1.2.1", |         "chardet", | ||||||
|           "mock", |         #          "cchardet", | ||||||
|           "coverage" |         "cssselect", | ||||||
|           ] |         "six", | ||||||
|  |         "html5lib", | ||||||
|  |         "joblib", | ||||||
|  |     ], | ||||||
|  |     tests_require=["mock", "nose", "coverage"], | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| def install_pyml(): |  | ||||||
|     ''' |  | ||||||
|     Downloads and installs PyML |  | ||||||
|     ''' |  | ||||||
|     try: |  | ||||||
|         import PyML |  | ||||||
|     except: |  | ||||||
|         pass |  | ||||||
|     else: |  | ||||||
|         return |  | ||||||
|  |  | ||||||
|     # install numpy first |  | ||||||
|     pip('install numpy==1.6.1 --upgrade') |  | ||||||
|  |  | ||||||
|     pyml_tarball = ( |  | ||||||
|         'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' |  | ||||||
|         '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') |  | ||||||
|     pyml_srcidr = 'PyML-0.7.9' |  | ||||||
|  |  | ||||||
|     # see if PyML tarball needs to be fetched: |  | ||||||
|     if not dir_exists(pyml_srcidr): |  | ||||||
|         run("curl %s | tar -xz" % pyml_tarball) |  | ||||||
|  |  | ||||||
|     # compile&install: |  | ||||||
|     with cd(pyml_srcidr): |  | ||||||
|         python('setup.py build') |  | ||||||
|         python('setup.py install') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run(command): |  | ||||||
|     if os.system(command) != 0: |  | ||||||
|         raise Exception("Failed '{}'".format(command)) |  | ||||||
|     else: |  | ||||||
|         return 0 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def python(command): |  | ||||||
|     command = '{} {}'.format(sys.executable, command) |  | ||||||
|     run(command) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def enforce_executable(name, install_info): |  | ||||||
|     if os.system("which {}".format(name)) != 0: |  | ||||||
|         raise Exception( |  | ||||||
|             '{} utility is missing.\nTo install, run:\n\n{}\n'.format( |  | ||||||
|                 name, install_info)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def pip(command): |  | ||||||
|     command = '{} {}'.format(find_executable('pip'), command) |  | ||||||
|     run(command) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def dir_exists(path): |  | ||||||
|     return os.path.isdir(path) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @contextlib.contextmanager |  | ||||||
| def cd(directory): |  | ||||||
|     curdir = os.getcwd() |  | ||||||
|     try: |  | ||||||
|         os.chdir(directory) |  | ||||||
|         yield {} |  | ||||||
|     finally: |  | ||||||
|         os.chdir(curdir) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: |  | ||||||
|         enforce_executable('curl', 'sudo aptitude install curl') |  | ||||||
|  |  | ||||||
|         install_pyml() |  | ||||||
|   | |||||||
| @@ -1,7 +1,13 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from talon.quotations import register_xpath_extensions | from talon.quotations import register_xpath_extensions | ||||||
|  | try: | ||||||
|     from talon import signature |     from talon import signature | ||||||
|  |     ML_ENABLED = True | ||||||
|  | except ImportError: | ||||||
|  |     ML_ENABLED = False | ||||||
|  |  | ||||||
|  |  | ||||||
| def init(): | def init(): | ||||||
|     register_xpath_extensions() |     register_xpath_extensions() | ||||||
|  |     if ML_ENABLED: | ||||||
|         signature.initialize() |         signature.initialize() | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,8 +3,10 @@ The module's functions operate on message bodies trying to extract original | |||||||
| messages (without quoted messages) from html | messages (without quoted messages) from html | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
|  | from talon.utils import cssselect  | ||||||
|  |  | ||||||
| CHECKPOINT_PREFIX = '#!%!' | CHECKPOINT_PREFIX = '#!%!' | ||||||
| CHECKPOINT_SUFFIX = '!%!#' | CHECKPOINT_SUFFIX = '!%!#' | ||||||
| @@ -12,6 +14,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) | |||||||
|  |  | ||||||
| # HTML quote indicators (tag ids) | # HTML quote indicators (tag ids) | ||||||
| QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | ||||||
|  | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
|  |  | ||||||
| def add_checkpoint(html_note, counter): | def add_checkpoint(html_note, counter): | ||||||
| @@ -76,22 +79,32 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('.gmail_quote') |     gmail_quote = cssselect('div.gmail_quote', html_message) | ||||||
|     if gmail_quote: |     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  |  | ||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|  |     #use EXSLT extensions to have a regex match() function with lxml | ||||||
|  |     ns = {"re": "http://exslt.org/regular-expressions"} | ||||||
|  |  | ||||||
|  |     #general pattern: @style='border:none;border-top:solid <color> 1.0pt;padding:3.0pt 0<unit> 0<unit> 0<unit>' | ||||||
|  |     #outlook 2007, 2010 (international) <color=#B5C4DF> <unit=cm> | ||||||
|  |     #outlook 2007, 2010 (american)      <color=#B5C4DF> <unit=pt> | ||||||
|  |     #outlook 2013       (international) <color=#E1E1E1> <unit=cm> | ||||||
|  |     #outlook 2013       (american)      <color=#E1E1E1> <unit=pt> | ||||||
|  |     #also handles a variant with a space after the semicolon | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 |         #outlook 2007, 2010, 2013 (international, american) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
|         "border-top-width: 1px; border-top-style: solid;']" |         "border-top-width: 1px; border-top-style: solid;']" | ||||||
|  |         , namespaces=ns | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     if splitter: |     if splitter: | ||||||
| @@ -130,7 +143,7 @@ def cut_microsoft_quote(html_message): | |||||||
| def cut_by_id(html_message): | def cut_by_id(html_message): | ||||||
|     found = False |     found = False | ||||||
|     for quote_id in QUOTE_IDS: |     for quote_id in QUOTE_IDS: | ||||||
|         quote = html_message.cssselect('#{}'.format(quote_id)) |         quote = cssselect('#{}'.format(quote_id), html_message) | ||||||
|         if quote: |         if quote: | ||||||
|             found = True |             found = True | ||||||
|             quote[0].getparent().remove(quote[0]) |             quote[0].getparent().remove(quote[0]) | ||||||
| @@ -138,9 +151,14 @@ def cut_by_id(html_message): | |||||||
|  |  | ||||||
|  |  | ||||||
| def cut_blockquote(html_message): | def cut_blockquote(html_message): | ||||||
|     ''' Cuts blockquote with wrapping elements. ''' |     ''' Cuts the last non-nested blockquote with wrapping elements.''' | ||||||
|     quote = html_message.find('.//blockquote') |     quote = html_message.xpath( | ||||||
|     if quote is not None: |         '(.//blockquote)' | ||||||
|  |         '[not(@class="gmail_quote") and not(ancestor::blockquote)]' | ||||||
|  |         '[last()]') | ||||||
|  |  | ||||||
|  |     if quote: | ||||||
|  |         quote = quote[0] | ||||||
|         quote.getparent().remove(quote) |         quote.getparent().remove(quote) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -154,13 +172,40 @@ def cut_from_block(html_message): | |||||||
|  |  | ||||||
|     if block: |     if block: | ||||||
|         block = block[-1] |         block = block[-1] | ||||||
|  |         parent_div = None | ||||||
|         while block.getparent() is not None: |         while block.getparent() is not None: | ||||||
|             if block.tag == 'div': |             if block.tag == 'div': | ||||||
|                 block.getparent().remove(block) |                 parent_div = block | ||||||
|  |                 break | ||||||
|  |             block = block.getparent() | ||||||
|  |         if parent_div is not None: | ||||||
|  |             maybe_body = parent_div.getparent() | ||||||
|  |             # In cases where removing this enclosing div will remove all | ||||||
|  |             # content, we should assume the quote is not enclosed in a tag. | ||||||
|  |             parent_div_is_all_content = ( | ||||||
|  |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|  |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|  |             if not parent_div_is_all_content: | ||||||
|  |                 parent = block.getparent() | ||||||
|  |                 next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove all tags after found From block | ||||||
|  |                 # (From block and quoted message are in separate divs) | ||||||
|  |                 while next_sibling is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |                     block = next_sibling | ||||||
|  |                     next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove the last sibling (or the | ||||||
|  |                 # From block if no siblings) | ||||||
|  |                 if block is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |  | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|                 block = block.getparent() |             return False | ||||||
|     else: |  | ||||||
|     # handle the case when From: block goes right after e.g. <hr> |     # handle the case when From: block goes right after e.g. <hr> | ||||||
|     # and not enclosed in some tag |     # and not enclosed in some tag | ||||||
|     block = html_message.xpath( |     block = html_message.xpath( | ||||||
| @@ -168,7 +213,17 @@ def cut_from_block(html_message): | |||||||
|          "//*[starts-with(mg:tail(), 'Date:')]")) |          "//*[starts-with(mg:tail(), 'Date:')]")) | ||||||
|     if block: |     if block: | ||||||
|         block = block[0] |         block = block[0] | ||||||
|  |  | ||||||
|  |         if RE_FWD.match(block.getparent().text or ''): | ||||||
|  |             return False | ||||||
|  |          | ||||||
|         while(block.getnext() is not None): |         while(block.getnext() is not None): | ||||||
|             block.getparent().remove(block.getnext()) |             block.getparent().remove(block.getnext()) | ||||||
|         block.getparent().remove(block) |         block.getparent().remove(block) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
def cut_zimbra_quote(html_message):
    """Cut the Zimbra quotation divider out of the HTML tree.

    Searches ``html_message`` for ``<hr data-marker="__DIVIDER__">`` elements
    and removes the first one found from its parent element.

    Returns True when a divider was found and removed, False otherwise.
    """
    dividers = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
    if not dividers:
        # Return an explicit boolean instead of falling off the end of the
        # function (which would yield None).
        return False

    divider = dividers[0]
    divider.getparent().remove(divider)
    return True
|   | |||||||
| @@ -5,35 +5,95 @@ The module's functions operate on message bodies trying to extract | |||||||
| original messages (without quoted messages) | original messages (without quoted messages) | ||||||
| """ | """ | ||||||
|  |  | ||||||
| import regex as re | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | import regex as re | ||||||
| import html2text | from lxml import etree, html | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER |  | ||||||
| from talon.utils import random_token, get_delimiter |  | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
|  | from talon.utils import (get_delimiter, html_document_fromstring, | ||||||
|  |                          html_tree_to_text) | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+\s*$", re.I | re.M) | ||||||
|  |  | ||||||
| RE_ON_DATE_SMB_WROTE = re.compile( | RE_ON_DATE_SMB_WROTE = re.compile( | ||||||
|     r''' |     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( | ||||||
|     ( |         # Beginning of the line | ||||||
|         -*  # could include dashes |         u'|'.join(( | ||||||
|         [ ]?On[ ].*,  # date part ends with comma |             # English | ||||||
|         (.*\n){0,2}  # splitter takes 4 lines at most |             'On', | ||||||
|         .*(wrote|sent): |             # French | ||||||
|  |             'Le', | ||||||
|  |             # Polish | ||||||
|  |             'W dniu', | ||||||
|  |             # Dutch | ||||||
|  |             'Op', | ||||||
|  |             # German | ||||||
|  |             'Am', | ||||||
|  |             # Portuguese | ||||||
|  |             'Em', | ||||||
|  |             # Norwegian | ||||||
|  |             u'På', | ||||||
|  |             # Swedish, Danish | ||||||
|  |             'Den', | ||||||
|  |             # Vietnamese | ||||||
|  |             u'Vào', | ||||||
|  |         )), | ||||||
|  |         # Date and sender separator | ||||||
|  |         u'|'.join(( | ||||||
|  |             # most languages separate date and sender address by comma | ||||||
|  |             ',', | ||||||
|  |             # polish date and sender address separator | ||||||
|  |             u'użytkownik' | ||||||
|  |         )), | ||||||
|  |         # Ending of the line | ||||||
|  |         u'|'.join(( | ||||||
|  |             # English | ||||||
|  |             'wrote', 'sent', | ||||||
|  |             # French | ||||||
|  |             u'a écrit', | ||||||
|  |             # Polish | ||||||
|  |             u'napisał', | ||||||
|  |             # Dutch | ||||||
|  |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb', | ||||||
|  |             # Portuguese | ||||||
|  |             'escreveu', | ||||||
|  |             # Norwegian, Swedish | ||||||
|  |             'skrev', | ||||||
|  |             # Vietnamese | ||||||
|  |             u'đã viết', | ||||||
|  |         )) | ||||||
|  |     )) | ||||||
|  | # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | ||||||
|  | RE_ON_DATE_WROTE_SMB = re.compile( | ||||||
|  |     u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( | ||||||
|  |         # Beginning of the line | ||||||
|  |         u'|'.join(( | ||||||
|  |         	'Op', | ||||||
|  |         	#German | ||||||
|  |         	'Am' | ||||||
|  |         )), | ||||||
|  |         # Ending of the line | ||||||
|  |         u'|'.join(( | ||||||
|  |             # Dutch | ||||||
|  |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb' | ||||||
|  |         )) | ||||||
|  |     ) | ||||||
|     ) |     ) | ||||||
|     ''', re.VERBOSE) |  | ||||||
|  |  | ||||||
| RE_QUOTATION = re.compile( | RE_QUOTATION = re.compile( | ||||||
|     r''' |     r""" | ||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
| @@ -51,44 +111,94 @@ RE_QUOTATION = re.compile( | |||||||
|  |  | ||||||
|     # after quotations should be text only or nothing at all |     # after quotations should be text only or nothing at all | ||||||
|     [te]*$ |     [te]*$ | ||||||
|     ''', re.VERBOSE) |     """, re.VERBOSE) | ||||||
|  |  | ||||||
| RE_EMPTY_QUOTATION = re.compile( | RE_EMPTY_QUOTATION = re.compile( | ||||||
|     r''' |     r""" | ||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
|             s |             (?:se*)+ | ||||||
|             | |             | | ||||||
|             (?:me*){2,} |             (?:me*){2,} | ||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
|     e* |     e* | ||||||
|     ''', re.VERBOSE) |     """, re.VERBOSE) | ||||||
|  |  | ||||||
# ------Original Message------ or ---- Reply Message ----
# With variations in other languages.
# NOTE: the backslash of the whitespace class is escaped explicitly because
# "\s" is not a valid string escape sequence; the compiled pattern is the
# same, and the literal stays a unicode string on both Python 2 and 3.
RE_ORIGINAL_MESSAGE = re.compile(u'[\\s]*[-]+[ ]*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        'Original Message', 'Reply Message',
        # German
        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
        # Danish
        'Oprindelig meddelelse',
    ))), re.I)
|  |  | ||||||
# A block of at least two consecutive header-like lines ("From:", "Date:",
# "Subject:", "To:" and their translations), optionally preceded by a line of
# underscores. Each header may span one or two lines.
# NOTE: "\r" and "\n" are intentionally left as real string escapes; only the
# backslash of the whitespace class is escaped, because "\s" is not a valid
# string escape sequence. The compiled pattern is unchanged.
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'((_+\r?\n)?[\\s]*:?[*]?({})[\\s]?:([^\n$]+\n){{1,2}}){{2,}}'.format(
    u'|'.join((
        # "From" in different languages.
        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
        # "Date" in different languages.
        'Date', '[S]ent', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Gesendet',
        # "Subject" in different languages.
        'Subject', 'Betreff', 'Objet', 'Emne', u'Ämne',
        # "To" in different languages.
        'To', 'An', 'Til', u'À', 'Till'
    ))), re.I | re.M)
|  |  | ||||||
# ---- John Smith wrote ----
# NOTE: the backslash of the whitespace class is escaped explicitly because
# "\s" is not a valid string escape sequence; the compiled pattern is the same.
RE_ANDROID_WROTE = re.compile(u'[\\s]*[-]+.*({})[ ]*[-]+'.format(
    u'|'.join((
        # English
        'wrote',
    ))), re.I)
|  |  | ||||||
# Support polymail.io reply format
# On Tue, Apr 11, 2017 at 10:07 PM John Smith
#
# <
# mailto:John Smith <johnsmith@gmail.com>
# > wrote:
# A raw string is used so that "\s" reaches the regex engine verbatim instead
# of relying on an invalid string escape sequence.
RE_POLYMAIL = re.compile(r'On.*\s{2}<\smailto:.*\s> wrote:', re.I)
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     # ------Original Message------ or ---- Reply Message ---- |     RE_ORIGINAL_MESSAGE, | ||||||
|     re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I), |  | ||||||
|     # <date> <person> |  | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), |  | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
|     re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'), |     RE_ON_DATE_WROTE_SMB, | ||||||
|  |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|  |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|  |     # bob@xxx.mailgun.org> написал: | ||||||
|  |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S), | ||||||
|  |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|  |     # bob@example.com>: | ||||||
|  |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S), | ||||||
|  |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:') |                '( \S+){3,6}@\S+:'), | ||||||
|  |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |     re.compile('Sent from Samsung.* \S+@\S+> wrote'), | ||||||
|  |     RE_ANDROID_WROTE, | ||||||
|  |     RE_POLYMAIL | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_LINK = re.compile('<(http://[^>]*)>') | RE_LINK = re.compile('<(http://[^>]*)>') | ||||||
| RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | ||||||
|  |  | ||||||
| RE_PARANTHESIS_LINK = re.compile("\(https?://") | RE_PARENTHESIS_LINK = re.compile("\(https?://") | ||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 4 | SPLITTER_MAX_LINES = 6 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
|  |  | ||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
|  |  | ||||||
# Regular expression to identify if a line is a header: it matches any line
# that contains a colon immediately followed by a space (e.g. "From: bob").
RE_HEADER = re.compile(": ")
|  |  | ||||||
|  |  | ||||||
| def extract_from(msg_body, content_type='text/plain'): | def extract_from(msg_body, content_type='text/plain'): | ||||||
|     try: |     try: | ||||||
| @@ -96,12 +206,25 @@ def extract_from(msg_body, content_type='text/plain'): | |||||||
|             return extract_from_plain(msg_body) |             return extract_from_plain(msg_body) | ||||||
|         elif content_type == 'text/html': |         elif content_type == 'text/html': | ||||||
|             return extract_from_html(msg_body) |             return extract_from_html(msg_body) | ||||||
|     except Exception, e: |     except Exception: | ||||||
|         log.exception('ERROR extracting message') |         log.exception('ERROR extracting message') | ||||||
|  |  | ||||||
|     return msg_body |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Strip leading spaces from every line, then mark the message lines.

    ``lines`` is modified in place: each entry is replaced by the same text
    without its leading ' ' characters (other whitespace is left alone), so
    that headers indented with spaces can still be identified.

    Returns whatever mark_message_lines() produces for the stripped lines.
    """
    for index, line in enumerate(lines):
        lines[index] = line.lstrip(' ')
    return mark_message_lines(lines)
|  |  | ||||||
|  |  | ||||||
| def mark_message_lines(lines): | def mark_message_lines(lines): | ||||||
|     """Mark message lines with markers to distinguish quotation lines. |     """Mark message lines with markers to distinguish quotation lines. | ||||||
|  |  | ||||||
| @@ -115,7 +238,7 @@ def mark_message_lines(lines): | |||||||
|     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) |     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) | ||||||
|     'tsem' |     'tsem' | ||||||
|     """ |     """ | ||||||
|     markers = bytearray(len(lines)) |     markers = ['e' for _ in lines] | ||||||
|     i = 0 |     i = 0 | ||||||
|     while i < len(lines): |     while i < len(lines): | ||||||
|         if not lines[i].strip(): |         if not lines[i].strip(): | ||||||
| @@ -127,10 +250,11 @@ def mark_message_lines(lines): | |||||||
|         else: |         else: | ||||||
|             # in case splitter is spread across several lines |             # in case splitter is spread across several lines | ||||||
|             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) |             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) | ||||||
|  |  | ||||||
|             if splitter: |             if splitter: | ||||||
|                 # append as many splitter markers as lines in splitter |                 # append as many splitter markers as lines in splitter | ||||||
|                 splitter_lines = splitter.group().splitlines() |                 splitter_lines = splitter.group().splitlines() | ||||||
|                 for j in xrange(len(splitter_lines)): |                 for j in range(len(splitter_lines)): | ||||||
|                     markers[i + j] = 's' |                     markers[i + j] = 's' | ||||||
|  |  | ||||||
|                 # skip splitter lines |                 # skip splitter lines | ||||||
| @@ -140,7 +264,7 @@ def mark_message_lines(lines): | |||||||
|                 markers[i] = 't' |                 markers[i] = 't' | ||||||
|         i += 1 |         i += 1 | ||||||
|  |  | ||||||
|     return markers |     return ''.join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | ||||||
| @@ -154,6 +278,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | |||||||
|     return_flags = [were_lines_deleted, first_deleted_line, |     return_flags = [were_lines_deleted, first_deleted_line, | ||||||
|                     last_deleted_line] |                     last_deleted_line] | ||||||
|     """ |     """ | ||||||
|  |     markers = ''.join(markers) | ||||||
|     # if there are no splitter there should be no markers |     # if there are no splitter there should be no markers | ||||||
|     if 's' not in markers and not re.search('(me*){3}', markers): |     if 's' not in markers and not re.search('(me*){3}', markers): | ||||||
|         markers = markers.replace('m', 't') |         markers = markers.replace('m', 't') | ||||||
| @@ -165,12 +290,12 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | |||||||
|     # inlined reply |     # inlined reply | ||||||
|     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' |     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' | ||||||
|     # both 't' entries should be found |     # both 't' entries should be found | ||||||
|     for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): |     for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers): | ||||||
|         # long links could break sequence of quotation lines but they shouldn't |         # long links could break sequence of quotation lines but they shouldn't | ||||||
|         # be considered an inline reply |         # be considered an inline reply | ||||||
|         links = ( |         links = ( | ||||||
|             RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or |             RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or | ||||||
|             RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip())) |             RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip())) | ||||||
|         if not links: |         if not links: | ||||||
|             return_flags[:] = [False, -1, -1] |             return_flags[:] = [False, -1, -1] | ||||||
|             return lines |             return lines | ||||||
| @@ -197,12 +322,26 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|     """Prepares msg_body for being stripped. |     """Prepares msg_body for being stripped. | ||||||
|  |  | ||||||
|     Replaces link brackets so that they couldn't be taken for quotation marker. |     Replaces link brackets so that they couldn't be taken for quotation marker. | ||||||
|     Splits line in two if splitter pattern preceeded by some text on the same |     Splits line in two if splitter pattern preceded by some text on the same | ||||||
|     line (done only for 'On <date> <person> wrote:' pattern). |     line (done only for 'On <date> <person> wrote:' pattern). | ||||||
|  |  | ||||||
|  |     Converts msg_body into a unicode. | ||||||
|  |     """ | ||||||
|  |     msg_body = _replace_link_brackets(msg_body) | ||||||
|  |  | ||||||
|  |     msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) | ||||||
|  |  | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _replace_link_brackets(msg_body): | ||||||
|  |     """ | ||||||
|  |     Normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||||
|  |     so that '>' closing the link couldn't be mistakenly taken for quotation | ||||||
|  |     marker. | ||||||
|  |  | ||||||
|  |     Converts msg_body into a unicode | ||||||
|     """ |     """ | ||||||
|     # normalize links i.e. replace '<', '>' wrapping the link with some symbols |  | ||||||
|     # so that '>' closing the link couldn't be mistakenly taken for quotation |  | ||||||
|     # marker. |  | ||||||
|     def link_wrapper(link): |     def link_wrapper(link): | ||||||
|         newline_index = msg_body[:link.start()].rfind("\n") |         newline_index = msg_body[:link.start()].rfind("\n") | ||||||
|         if msg_body[newline_index + 1] == ">": |         if msg_body[newline_index + 1] == ">": | ||||||
| @@ -211,9 +350,16 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|             return "@@%s@@" % link.group(1) |             return "@@%s@@" % link.group(1) | ||||||
|  |  | ||||||
|     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) |     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): | ||||||
|  |     """ | ||||||
|  |     Splits line in two if splitter pattern preceded by some text on the same | ||||||
|  |     line (done only for 'On <date> <person> wrote:' pattern. | ||||||
|  |     """ | ||||||
|     def splitter_wrapper(splitter): |     def splitter_wrapper(splitter): | ||||||
|         """Wrapps splitter with new line""" |         """Wraps splitter with new line""" | ||||||
|         if splitter.start() and msg_body[splitter.start() - 1] != '\n': |         if splitter.start() and msg_body[splitter.start() - 1] != '\n': | ||||||
|             return '%s%s' % (delimiter, splitter.group()) |             return '%s%s' % (delimiter, splitter.group()) | ||||||
|         else: |         else: | ||||||
| @@ -235,16 +381,10 @@ def postprocess(msg_body): | |||||||
|  |  | ||||||
| def extract_from_plain(msg_body): | def extract_from_plain(msg_body): | ||||||
|     """Extracts a non quoted message from provided plain text.""" |     """Extracts a non quoted message from provided plain text.""" | ||||||
|     stripped_text = msg_body |  | ||||||
|  |  | ||||||
|     delimiter = get_delimiter(msg_body) |     delimiter = get_delimiter(msg_body) | ||||||
|     msg_body = preprocess(msg_body, delimiter) |     msg_body = preprocess(msg_body, delimiter) | ||||||
|     lines = msg_body.splitlines() |  | ||||||
|  |  | ||||||
|     # don't process too long messages |     # don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||||
|         return stripped_text |  | ||||||
|  |  | ||||||
|     markers = mark_message_lines(lines) |     markers = mark_message_lines(lines) | ||||||
|     lines = process_marked_lines(lines, markers) |     lines = process_marked_lines(lines, markers) | ||||||
|  |  | ||||||
| @@ -268,50 +408,61 @@ def extract_from_html(msg_body): | |||||||
|     then converting html to text, |     then converting html to text, | ||||||
|     then extracting quotations from text, |     then extracting quotations from text, | ||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting neccessary tags. |     then deleting necessary tags. | ||||||
|     """ |  | ||||||
|  |  | ||||||
|     if msg_body.strip() == '': |     Returns a unicode string. | ||||||
|  |     """ | ||||||
|  |     if msg_body.strip() == "": | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|     html_tree = html.document_fromstring( |     msg_body = msg_body.replace("\r\n", "\n") | ||||||
|         msg_body, |     # Cut out xml and doctype tags to avoid conflict with unicode decoding. | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||||
|     ) |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |     if html_tree is None: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     result = extract_from_html_tree(html_tree) | ||||||
|  |     if not result: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     return result | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def extract_from_html_tree(html_tree): | ||||||
|  |     """ | ||||||
|  |     Extract not quoted message from provided parsed html tree using tags and | ||||||
|  |     plain text algorithm. | ||||||
|  |  | ||||||
|  |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|  |     Cut Microsoft quotations. | ||||||
|  |  | ||||||
|  |     Then use plain text algorithm to cut out splitter or | ||||||
|  |     leftover quotation. | ||||||
|  |     This works by adding checkpoint text to all html tags, | ||||||
|  |     then converting html to text, | ||||||
|  |     then extracting quotations from text, | ||||||
|  |     then checking deleted checkpoints, | ||||||
|  |     then deleting necessary tags. | ||||||
|  |     """ | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|  |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|                       html_quotations.cut_from_block(html_tree) |                       html_quotations.cut_from_block(html_tree) | ||||||
|                       ) |                       ) | ||||||
|  |  | ||||||
|     html_tree_copy = deepcopy(html_tree) |     html_tree_copy = deepcopy(html_tree) | ||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     plain_text = html_tree_to_text(html_tree) | ||||||
|  |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     h = html2text.HTML2Text() |  | ||||||
|     h.body_width = 0  # generate plain text without wrap |  | ||||||
|  |  | ||||||
|     # html2text adds unnecessary star symbols. Remove them. |  | ||||||
|     # Mask star symbols |  | ||||||
|     msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') |  | ||||||
|     plain_text = h.handle(msg_with_checkpoints) |  | ||||||
|     # Remove created star symbols |  | ||||||
|     plain_text = plain_text.replace('*', '') |  | ||||||
|     # Unmask saved star symbols |  | ||||||
|     plain_text = plain_text.replace('3423oorkg432', '*') |  | ||||||
|  |  | ||||||
|     delimiter = get_delimiter(plain_text) |  | ||||||
|  |  | ||||||
|     plain_text = preprocess(plain_text, delimiter, content_type='text/html') |  | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     if len(lines) > MAX_LINES_COUNT: | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     # Collect checkpoints on each line |     # Collect checkpoints on each line | ||||||
|     line_checkpoints = [ |     line_checkpoints = [ | ||||||
| @@ -329,30 +480,174 @@ def extract_from_html(msg_body): | |||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|  |     if not lines_were_deleted and not cut_quotations: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in xrange(first_deleted, last_deleted): |         for i in range(first_deleted, last_deleted): | ||||||
|             for checkpoint in line_checkpoints[i]: |             for checkpoint in line_checkpoints[i]: | ||||||
|                 quotation_checkpoints[checkpoint] = True |                 quotation_checkpoints[checkpoint] = True | ||||||
|     else: |  | ||||||
|         if cut_quotations: |  | ||||||
|             return html.tostring(html_tree_copy) |  | ||||||
|         else: |  | ||||||
|             return msg_body |  | ||||||
|  |  | ||||||
|         # Remove tags with quotation checkpoints |         # Remove tags with quotation checkpoints | ||||||
|         html_quotations.delete_quotation_tags( |         html_quotations.delete_quotation_tags( | ||||||
|             html_tree_copy, 0, quotation_checkpoints |             html_tree_copy, 0, quotation_checkpoints | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     return html.tostring(html_tree_copy) |     if _readable_text_empty(html_tree_copy): | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||||
|  |     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||||
|  |     # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes | ||||||
|  |     # <oU0003Ap>. When we port this to golang we should look into using an | ||||||
|  |     # XML Parser NOT an HTML5 Parser since we do not know what input a | ||||||
|  |     # customer will send us. Switching to a common XML parser in python | ||||||
|  |     # opens us up to a host of vulnerabilities. | ||||||
|  |     # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities | ||||||
|  |     # | ||||||
|  |     # The down sides to removing the namespaces is that customers might | ||||||
|  |     # judge the XML namespaces important. If that is the case then support | ||||||
|  |     # should encourage customers to perform XML parsing of the un-stripped | ||||||
|  |     # body to get the full unmodified XML payload. | ||||||
|  |     # | ||||||
|  |     # Alternatives to this approach are | ||||||
|  |     # 1. Ignore the U0003A in tag names and let the customer deal with it. | ||||||
|  |     #    This is not ideal, as most customers use stripped-html for viewing | ||||||
|  |     #    emails sent from a recipient, as such they cannot control the HTML | ||||||
|  |     #    provided by a recipient. | ||||||
|  |     # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML | ||||||
|  |     #    string. While this would solve the issue simply, it runs the risk | ||||||
|  |     #    of replacing data outside the <tag> which might be essential to | ||||||
|  |     #    the customer. | ||||||
|  |     remove_namespaces(html_tree_copy) | ||||||
|  |     s = html.tostring(html_tree_copy, encoding="ascii") | ||||||
|  |     if not s: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return s.decode("ascii") | ||||||
|  |  | ||||||
|  |  | ||||||
def remove_namespaces(root):
    """
    Given the root of an HTML document iterate through all the elements
    and remove any namespaces that might have been provided and remove
    any attributes that contain a namespace

    <html xmlns:o="urn:schemas-microsoft-com:office:office">
    becomes
    <html>

    <o:p>Hi</o:p>
    becomes
    <p>Hi</p>

    Start tags do NOT have a namespace; COLON characters have no special
    meaning. If we don't remove the namespace the parser translates the tag
    name into a unicode representation. For example <o:p> becomes <oU0003Ap>

    See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags

    The tree is modified in place and ``root`` is returned.
    """
    # How the parser spells a colon inside tag and attribute names.
    encoded_colon = "U0003A"

    for child in root.iter():
        # Snapshot the keys before removing anything: popping from a mapping
        # while iterating over it raises RuntimeError for dict-backed
        # attribute containers.
        for key in list(child.attrib.keys()):
            # If the attribute includes a colon
            if encoded_colon in key:
                child.attrib.pop(key)

        # If the tag includes a colon, keep only what follows the last one.
        # NOTE(review): assumes every node yielded by iter() has a string
        # tag -- confirm comments/processing instructions cannot reach here.
        idx = child.tag.rfind(encoded_colon)
        if idx != -1:
            child.tag = child.tag[idx + len(encoded_colon):]

    return root
|  |  | ||||||
|  |  | ||||||
def split_emails(msg):
    """
    Compute line markers for a message that may hold a whole email thread.

    The message (which may consist of an email conversation thread with
    multiple emails) has its link brackets normalized, is cut down to at most
    MAX_LINES_COUNT lines, and every line is then marked to identify split
    lines, content lines and empty lines.

    Split line markers inside header blocks are corrected afterwards. Header
    blocks are identified by the regular expression RE_HEADER.

    Return the corrected markers
    """
    # don't process too long messages
    lines = _replace_link_brackets(msg).splitlines()[:MAX_LINES_COUNT]

    initial_markers = remove_initial_spaces_and_mark_message_lines(lines)
    quoted_markers = _mark_quoted_email_splitlines(initial_markers, lines)

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(quoted_markers, lines)
|  |  | ||||||
|  |  | ||||||
def _mark_quoted_email_splitlines(markers, lines):
    """
    Promote quoted ('m') lines that look like splitters to 's'.

    When there are headers indented with '>' characters, each line currently
    marked 'm' is searched with every pattern in SPLITTER_PATTERNS; if any of
    them matches, the line's marker becomes 's'. All other markers are left
    untouched. The new markers are returned as a string.
    """
    # Work on a list so individual markers can be replaced
    marked = list(markers)
    for index, line in enumerate(lines):
        if marked[index] == 'm' and any(
                re.search(pattern, line) for pattern in SPLITTER_PATTERNS):
            marked[index] = 's'

    return "".join(marked)
|  |  | ||||||
|  |  | ||||||
|  | def _correct_splitlines_in_headers(markers, lines): | ||||||
|  |     """ | ||||||
|  |     Corrects markers by removing splitlines deemed to be inside header blocks. | ||||||
|  |     """ | ||||||
|  |     updated_markers = "" | ||||||
|  |     i = 0 | ||||||
|  |     in_header_block = False | ||||||
|  |     for m in markers: | ||||||
|  |         # Only set in_header_block flag when we hit an 's' and line is a header | ||||||
|  |         if m == 's': | ||||||
|  |             if not in_header_block: | ||||||
|  |                 if bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |                     in_header_block = True | ||||||
|  |             else: | ||||||
|  |                 if QUOT_PATTERN.match(lines[i]): | ||||||
|  |                     m = 'm' | ||||||
|  |                 else: | ||||||
|  |                     m = 't' | ||||||
|  |  | ||||||
|  |         # If the line is not a header line, set in_header_block false. | ||||||
|  |         if not bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |             in_header_block = False | ||||||
|  |  | ||||||
|  |         # Add the marker to the new updated markers string. | ||||||
|  |         updated_markers += m | ||||||
|  |         i += 1 | ||||||
|  |  | ||||||
|  |     return updated_markers | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _readable_text_empty(html_tree): | ||||||
|  |     return not bool(html_tree_to_text(html_tree).strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def is_splitter(line): | def is_splitter(line): | ||||||
|     ''' |     """ | ||||||
|     Returns Matcher object if provided string is a splitter and |     Returns Matcher object if provided string is a splitter and | ||||||
|     None otherwise. |     None otherwise. | ||||||
|     ''' |     """ | ||||||
|     for pattern in SPLITTER_PATTERNS: |     for pattern in SPLITTER_PATTERNS: | ||||||
|         matcher = re.match(pattern, line) |         matcher = re.match(pattern, line) | ||||||
|         if matcher: |         if matcher: | ||||||
| @@ -360,12 +655,12 @@ def is_splitter(line): | |||||||
|  |  | ||||||
|  |  | ||||||
| def text_content(context): | def text_content(context): | ||||||
|     '''XPath Extension function to return a node text content.''' |     """XPath Extension function to return a node text content.""" | ||||||
|     return context.context_node.text_content().strip() |     return context.context_node.xpath("string()").strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def tail(context): | def tail(context): | ||||||
|     '''XPath Extension function to return a node tail text.''' |     """XPath Extension function to return a node tail text.""" | ||||||
|     return context.context_node.tail or '' |     return context.context_node.tail or '' | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -20,29 +20,17 @@ trained against, don't forget to regenerate: | |||||||
| * signature/data/classifier | * signature/data/classifier | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import os | import os | ||||||
| import sys |  | ||||||
| from cStringIO import StringIO |  | ||||||
|  |  | ||||||
| from . import extraction | from talon.signature import extraction | ||||||
| from . extraction import extract | from talon.signature.extraction import extract | ||||||
| from . learning import classifier | from talon.signature.learning import classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') |  | ||||||
|  |  | ||||||
| EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier') |  | ||||||
| EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def initialize(): | def initialize(): | ||||||
|     try: |     data_dir = os.path.join(os.path.dirname(__file__), 'data') | ||||||
|         # redirect output |     extractor_filename = os.path.join(data_dir, 'classifier') | ||||||
|         so, sys.stdout = sys.stdout, StringIO() |     extractor_data_filename = os.path.join(data_dir, 'train.data') | ||||||
|  |     extraction.EXTRACTOR = classifier.load(extractor_filename, | ||||||
|         extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, |                                            extractor_data_filename) | ||||||
|                                                EXTRACTOR_DATA) |  | ||||||
|         sys.stdout = so |  | ||||||
|     except Exception, e: |  | ||||||
|         raise Exception( |  | ||||||
|             "Failed initializing signature parsing with classifiers", e) |  | ||||||
|   | |||||||
| @@ -1,14 +1,15 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter |  | ||||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, | from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||||
|                                        TOO_LONG_SIGNATURE_LINE) |                                        TOO_LONG_SIGNATURE_LINE) | ||||||
|  | from talon.utils import get_delimiter | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| # regex to fetch signature based on common signature words | # regex to fetch signature based on common signature words | ||||||
| RE_SIGNATURE = re.compile(r''' | RE_SIGNATURE = re.compile(r''' | ||||||
|                ( |                ( | ||||||
| @@ -27,7 +28,6 @@ RE_SIGNATURE = re.compile(r''' | |||||||
|                ) |                ) | ||||||
|                ''', re.I | re.X | re.M | re.S) |                ''', re.I | re.X | re.M | re.S) | ||||||
|  |  | ||||||
|  |  | ||||||
| # signatures appended by phone email clients | # signatures appended by phone email clients | ||||||
| RE_PHONE_SIGNATURE = re.compile(r''' | RE_PHONE_SIGNATURE = re.compile(r''' | ||||||
|                ( |                ( | ||||||
| @@ -44,12 +44,11 @@ RE_PHONE_SIGNATURE = re.compile(r''' | |||||||
|                ) |                ) | ||||||
|                ''', re.I | re.X | re.M | re.S) |                ''', re.I | re.X | re.M | re.S) | ||||||
|  |  | ||||||
|  |  | ||||||
| # see _mark_candidate_indexes() for details | # see _mark_candidate_indexes() for details | ||||||
| # c - could be signature line | # c - could be signature line | ||||||
| # d - line starts with dashes (could be signature or list item) | # d - line starts with dashes (could be signature or list item) | ||||||
| # l - long line | # l - long line | ||||||
| RE_SIGNATURE_CANDIDAATE = re.compile(r''' | RE_SIGNATURE_CANDIDATE = re.compile(r''' | ||||||
|     (?P<candidate>c+d)[^d] |     (?P<candidate>c+d)[^d] | ||||||
|     | |     | | ||||||
|     (?P<candidate>c+d)$ |     (?P<candidate>c+d)$ | ||||||
| @@ -111,7 +110,7 @@ def extract_signature(msg_body): | |||||||
|  |  | ||||||
|             return (stripped_body.strip(), |             return (stripped_body.strip(), | ||||||
|                     signature.strip()) |                     signature.strip()) | ||||||
|     except Exception, e: |     except Exception: | ||||||
|         log.exception('ERROR extracting signature') |         log.exception('ERROR extracting signature') | ||||||
|         return (msg_body, None) |         return (msg_body, None) | ||||||
|  |  | ||||||
| @@ -162,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate): | |||||||
|     'cdc' |     'cdc' | ||||||
|     """ |     """ | ||||||
|     # at first consider everything to be potential signature lines |     # at first consider everything to be potential signature lines | ||||||
|     markers = bytearray('c'*len(candidate)) |     markers = list('c' * len(candidate)) | ||||||
|  |  | ||||||
|     # mark lines starting from bottom up |     # mark lines starting from bottom up | ||||||
|     for i, line_idx in reversed(list(enumerate(candidate))): |     for i, line_idx in reversed(list(enumerate(candidate))): | ||||||
| @@ -173,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate): | |||||||
|             if line.startswith('-') and line.strip("-"): |             if line.startswith('-') and line.strip("-"): | ||||||
|                 markers[i] = 'd' |                 markers[i] = 'd' | ||||||
|  |  | ||||||
|     return markers |     return "".join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _process_marked_candidate_indexes(candidate, markers): | def _process_marked_candidate_indexes(candidate, markers): | ||||||
| @@ -184,5 +183,5 @@ def _process_marked_candidate_indexes(candidate, markers): | |||||||
|     >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') |     >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') | ||||||
|     [15, 17] |     [15, 17] | ||||||
|     """ |     """ | ||||||
|     match = RE_SIGNATURE_CANDIDAATE.match(markers[::-1]) |     match = RE_SIGNATURE_CANDIDATE.match(markers[::-1]) | ||||||
|     return candidate[-match.end('candidate'):] if match else [] |     return candidate[-match.end('candidate'):] if match else [] | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  |  | ||||||
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_01.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_01.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_02.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_02.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_03.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_03.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_04.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_04.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_05.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_05.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -1,19 +1,15 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| import os | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
|  | import numpy | ||||||
| import regex as re | import regex as re | ||||||
| from PyML import SparseDataSet |  | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER |  | ||||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, |  | ||||||
|                                        TOO_LONG_SIGNATURE_LINE) |  | ||||||
| from talon.signature.learning.featurespace import features, build_pattern |  | ||||||
| from talon.utils import get_delimiter |  | ||||||
| from talon.signature.bruteforce import get_signature_candidate | from talon.signature.bruteforce import get_signature_candidate | ||||||
|  | from talon.signature.learning.featurespace import features, build_pattern | ||||||
| from talon.signature.learning.helpers import has_signature | from talon.signature.learning.helpers import has_signature | ||||||
|  | from talon.utils import get_delimiter | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
| @@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r''' | |||||||
|  |  | ||||||
| def is_signature_line(line, sender, classifier): | def is_signature_line(line, sender, classifier): | ||||||
|     '''Checks if the line belongs to signature. Returns True or False.''' |     '''Checks if the line belongs to signature. Returns True or False.''' | ||||||
|     data = SparseDataSet([build_pattern(line, features(sender))]) |     data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1) | ||||||
|     return classifier.decisionFunc(data, 0) > 0 |     return classifier.predict(data) > 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract(body, sender): | def extract(body, sender): | ||||||
| @@ -61,7 +57,7 @@ def extract(body, sender): | |||||||
|                 text = delimiter.join(text) |                 text = delimiter.join(text) | ||||||
|                 if text.strip(): |                 if text.strip(): | ||||||
|                     return (text, delimiter.join(signature)) |                     return (text, delimiter.join(signature)) | ||||||
|     except Exception, e: |     except Exception as e: | ||||||
|         log.exception('ERROR when extracting signature with classifiers') |         log.exception('ERROR when extracting signature with classifiers') | ||||||
|  |  | ||||||
|     return (body, None) |     return (body, None) | ||||||
| @@ -84,7 +80,7 @@ def _mark_lines(lines, sender): | |||||||
|     candidate = get_signature_candidate(lines) |     candidate = get_signature_candidate(lines) | ||||||
|  |  | ||||||
|     # at first consider everything to be text no signature |     # at first consider everything to be text no signature | ||||||
|     markers = bytearray('t'*len(lines)) |     markers = list('t' * len(lines)) | ||||||
|  |  | ||||||
|     # mark lines starting from bottom up |     # mark lines starting from bottom up | ||||||
|     # mark only lines that belong to candidate |     # mark only lines that belong to candidate | ||||||
| @@ -99,7 +95,7 @@ def _mark_lines(lines, sender): | |||||||
|         elif is_signature_line(line, sender, EXTRACTOR): |         elif is_signature_line(line, sender, EXTRACTOR): | ||||||
|             markers[j] = 's' |             markers[j] = 's' | ||||||
|  |  | ||||||
|     return markers |     return "".join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _process_marked_lines(lines, markers): | def _process_marked_lines(lines, markers): | ||||||
| @@ -114,3 +110,4 @@ def _process_marked_lines(lines, markers): | |||||||
|         return (lines[:-signature.end()], lines[-signature.end():]) |         return (lines[:-signature.end()], lines[-signature.end():]) | ||||||
|  |  | ||||||
|     return (lines, None) |     return (lines, None) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -5,32 +5,65 @@ The classifier could be used to detect if a certain line of the message | |||||||
| body belongs to the signature. | body belongs to the signature. | ||||||
| """ | """ | ||||||
|  |  | ||||||
| import os | from __future__ import absolute_import | ||||||
| import sys |  | ||||||
|  |  | ||||||
| from PyML import SparseDataSet, SVM | from numpy import genfromtxt | ||||||
|  | import joblib | ||||||
|  | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
| def init(): | def init(): | ||||||
|     '''Inits classifier with optimal options.''' |     """Inits classifier with optimal options.""" | ||||||
|     return SVM(C=10, optimization='liblinear') |     return LinearSVC(C=10.0) | ||||||
|  |  | ||||||
|  |  | ||||||
| def train(classifier, train_data_filename, save_classifier_filename=None): | def train(classifier, train_data_filename, save_classifier_filename=None): | ||||||
|     '''Trains and saves classifier so that it could be easily loaded later.''' |     """Trains and saves classifier so that it could be easily loaded later.""" | ||||||
|     data = SparseDataSet(train_data_filename, labelsColumn=-1) |     file_data = genfromtxt(train_data_filename, delimiter=",") | ||||||
|     classifier.train(data) |     train_data, labels = file_data[:, :-1], file_data[:, -1] | ||||||
|  |     classifier.fit(train_data, labels) | ||||||
|  |  | ||||||
|     if save_classifier_filename: |     if save_classifier_filename: | ||||||
|         classifier.save(save_classifier_filename) |         joblib.dump(classifier, save_classifier_filename) | ||||||
|     return classifier |     return classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| def load(saved_classifier_filename, train_data_filename): | def load(saved_classifier_filename, train_data_filename): | ||||||
|     """Loads saved classifier. |     """Loads saved classifier. """ | ||||||
|  |     try: | ||||||
|  |         return joblib.load(saved_classifier_filename) | ||||||
|  |     except Exception: | ||||||
|  |         import sys | ||||||
|  |         if sys.version_info > (3, 0): | ||||||
|  |             return load_compat(saved_classifier_filename) | ||||||
|  |  | ||||||
|     Classifier should be loaded with the same data it was trained against |         raise | ||||||
|     """ |  | ||||||
|     train_data = SparseDataSet(train_data_filename, labelsColumn=-1) |  | ||||||
|     classifier = init() | def load_compat(saved_classifier_filename): | ||||||
|     classifier.load(saved_classifier_filename, train_data) |     import os | ||||||
|     return classifier |     import pickle | ||||||
|  |     import tempfile | ||||||
|  |  | ||||||
|  |     # we need to switch to the data path to properly load the related _xx.npy files | ||||||
|  |     cwd = os.getcwd() | ||||||
|  |     os.chdir(os.path.dirname(saved_classifier_filename)) | ||||||
|  |  | ||||||
|  |     # convert encoding using pick.load and write to temp file which we'll tell joblib to use | ||||||
|  |     pickle_file = open(saved_classifier_filename, 'rb') | ||||||
|  |     classifier = pickle.load(pickle_file, encoding='latin1') | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         # save our conversion if permissions allow | ||||||
|  |         joblib.dump(classifier, saved_classifier_filename) | ||||||
|  |     except Exception: | ||||||
|  |         # can't write to classifier, use a temp file | ||||||
|  |         tmp = tempfile.SpooledTemporaryFile() | ||||||
|  |         joblib.dump(classifier, tmp) | ||||||
|  |         saved_classifier_filename = tmp | ||||||
|  |  | ||||||
|  |     # important, use joblib.load before switching back to original cwd | ||||||
|  |     jb_classifier = joblib.load(saved_classifier_filename) | ||||||
|  |     os.chdir(cwd) | ||||||
|  |  | ||||||
|  |     return jb_classifier | ||||||
|   | |||||||
| @@ -16,13 +16,16 @@ suffix and the corresponding sender file has the same name except for the | |||||||
| suffix which should be `_sender`. | suffix which should be `_sender`. | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from talon.signature.constants import SIGNATURE_MAX_LINES | ||||||
| from talon.signature.learning.featurespace import build_pattern, features | from talon.signature.learning.featurespace import build_pattern, features | ||||||
|  |  | ||||||
|  |  | ||||||
| SENDER_SUFFIX = '_sender' | SENDER_SUFFIX = '_sender' | ||||||
| BODY_SUFFIX = '_body' | BODY_SUFFIX = '_body' | ||||||
|  |  | ||||||
| @@ -55,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True): | |||||||
|     algorithm: |     algorithm: | ||||||
|     >>> parse_msg_sender(filename, False) |     >>> parse_msg_sender(filename, False) | ||||||
|     """ |     """ | ||||||
|  |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|     sender, msg = None, None |     sender, msg = None, None | ||||||
|     if os.path.isfile(filename) and not is_sender_filename(filename): |     if os.path.isfile(filename) and not is_sender_filename(filename): | ||||||
|         with open(filename) as f: |         with open(filename, **kwargs) as f: | ||||||
|             msg = f.read() |             msg = f.read() | ||||||
|             sender = u'' |             sender = u'' | ||||||
|             if sender_known: |             if sender_known: | ||||||
| @@ -144,7 +152,7 @@ def build_extraction_dataset(folder, dataset_filename, | |||||||
|             if not sender or not msg: |             if not sender or not msg: | ||||||
|                 continue |                 continue | ||||||
|             lines = msg.splitlines() |             lines = msg.splitlines() | ||||||
|             for i in xrange(1, min(SIGNATURE_MAX_LINES, |             for i in range(1, min(SIGNATURE_MAX_LINES, | ||||||
|                                   len(lines)) + 1): |                                   len(lines)) + 1): | ||||||
|                 line = lines[-i] |                 line = lines[-i] | ||||||
|                 label = -1 |                 label = -1 | ||||||
|   | |||||||
| @@ -1,14 +1,18 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| """ The module provides functions for convertion of a message body/body lines | """ The module provides functions for conversion of a message body/body lines | ||||||
| into classifiers features space. | into classifiers features space. | ||||||
|  |  | ||||||
| The body and the message sender string are converted into unicode before | The body and the message sender string are converted into unicode before | ||||||
| applying features to them. | applying features to them. | ||||||
| """ | """ | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from __future__ import absolute_import | ||||||
|  | from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||||
|  |                                        TOO_LONG_SIGNATURE_LINE) | ||||||
| from talon.signature.learning.helpers import * | from talon.signature.learning.helpers import * | ||||||
|  | from six.moves import zip | ||||||
|  | from functools import reduce | ||||||
|  |  | ||||||
|  |  | ||||||
| def features(sender=''): | def features(sender=''): | ||||||
| @@ -20,7 +24,7 @@ def features(sender=''): | |||||||
|         # This one is not from paper. |         # This one is not from paper. | ||||||
|         # Line is too long. |         # Line is too long. | ||||||
|         # This one is less aggressive than `Line is too short` |         # This one is less aggressive than `Line is too short` | ||||||
|         lambda line: 1 if len(line) > 60 else 0, |         lambda line: 1 if len(line) > TOO_LONG_SIGNATURE_LINE else 0, | ||||||
|         # Line contains email pattern. |         # Line contains email pattern. | ||||||
|         binary_regex_search(RE_EMAIL), |         binary_regex_search(RE_EMAIL), | ||||||
|         # Line contains url. |         # Line contains url. | ||||||
| @@ -47,9 +51,9 @@ def apply_features(body, features): | |||||||
|     '''Applies features to message body lines. |     '''Applies features to message body lines. | ||||||
|  |  | ||||||
|     Returns list of lists. Each of the lists corresponds to the body line |     Returns list of lists. Each of the lists corresponds to the body line | ||||||
|     and is constituted by the numbers of features occurances (0 or 1). |     and is constituted by the numbers of features occurrences (0 or 1). | ||||||
|     E.g. if element j of list i equals 1 this means that |     E.g. if element j of list i equals 1 this means that | ||||||
|     feature j occured in line i (counting from the last line of the body). |     feature j occurred in line i (counting from the last line of the body). | ||||||
|     ''' |     ''' | ||||||
|     # collect all non empty lines |     # collect all non empty lines | ||||||
|     lines = [line for line in body.splitlines() if line.strip()] |     lines = [line for line in body.splitlines() if line.strip()] | ||||||
| @@ -66,7 +70,7 @@ def build_pattern(body, features): | |||||||
|     '''Converts body into a pattern i.e. a point in the features space. |     '''Converts body into a pattern i.e. a point in the features space. | ||||||
|  |  | ||||||
|     Applies features to the body lines and sums up the results. |     Applies features to the body lines and sums up the results. | ||||||
|     Elements of the pattern indicate how many times a certain feature occured |     Elements of the pattern indicate how many times a certain feature occurred | ||||||
|     in the last lines of the body. |     in the last lines of the body. | ||||||
|     ''' |     ''' | ||||||
|     line_patterns = apply_features(body, features) |     line_patterns = apply_features(body, features) | ||||||
|   | |||||||
| @@ -5,20 +5,17 @@ | |||||||
| * regexp's constants used when evaluating signature's features | * regexp's constants used when evaluating signature's features | ||||||
|  |  | ||||||
| """ | """ | ||||||
|  |  | ||||||
| import unicodedata | import unicodedata | ||||||
| import regex as re |  | ||||||
|  |  | ||||||
| from talon.utils import to_unicode | import regex as re | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from talon.signature.constants import SIGNATURE_MAX_LINES | ||||||
|  |  | ||||||
|  |  | ||||||
| rc = re.compile | rc = re.compile | ||||||
|  |  | ||||||
| RE_EMAIL = rc('@') | RE_EMAIL = rc('\S@\S') | ||||||
| RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}') | RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | ||||||
| RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""") | ||||||
|  |  | ||||||
| # Taken from: | # Taken from: | ||||||
| # http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf | # http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf | ||||||
| @@ -40,14 +37,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|' | |||||||
| # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. | # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. | ||||||
| RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') | RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') | ||||||
|  |  | ||||||
| # Pattern to match if e.g. 'Sender:' header field has sender names. |  | ||||||
| SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*' |  | ||||||
| RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN) |  | ||||||
|  |  | ||||||
| # Reply line clue line endings, as in regular expression: |  | ||||||
| # " wrote:$" or " writes:$" |  | ||||||
| RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$') |  | ||||||
|  |  | ||||||
| INVALID_WORD_START = rc('\(|\+|[\d]') | INVALID_WORD_START = rc('\(|\+|[\d]') | ||||||
|  |  | ||||||
| BAD_SENDER_NAMES = [ | BAD_SENDER_NAMES = [ | ||||||
| @@ -62,7 +51,7 @@ BAD_SENDER_NAMES = [ | |||||||
|  |  | ||||||
|  |  | ||||||
| def binary_regex_search(prog): | def binary_regex_search(prog): | ||||||
|     '''Returns a function that returns 1 or 0 depending on regex search result. |     """Returns a function that returns 1 or 0 depending on regex search result. | ||||||
|  |  | ||||||
|     If regular expression compiled into prog is present in a string |     If regular expression compiled into prog is present in a string | ||||||
|     the result of calling the returned function with the string will be 1 |     the result of calling the returned function with the string will be 1 | ||||||
| @@ -73,12 +62,12 @@ def binary_regex_search(prog): | |||||||
|     1 |     1 | ||||||
|     >>> binary_regex_search(re.compile("12"))("34") |     >>> binary_regex_search(re.compile("12"))("34") | ||||||
|     0 |     0 | ||||||
|     ''' |     """ | ||||||
|     return lambda s: 1 if prog.search(s) else 0 |     return lambda s: 1 if prog.search(s) else 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def binary_regex_match(prog): | def binary_regex_match(prog): | ||||||
|     '''Returns a function that returns 1 or 0 depending on regex match result. |     """Returns a function that returns 1 or 0 depending on regex match result. | ||||||
|  |  | ||||||
|     If a string matches regular expression compiled into prog |     If a string matches regular expression compiled into prog | ||||||
|     the result of calling the returned function with the string will be 1 |     the result of calling the returned function with the string will be 1 | ||||||
| @@ -89,12 +78,12 @@ def binary_regex_match(prog): | |||||||
|     1 |     1 | ||||||
|     >>> binary_regex_match(re.compile("12"))("3 12") |     >>> binary_regex_match(re.compile("12"))("3 12") | ||||||
|     0 |     0 | ||||||
|     ''' |     """ | ||||||
|     return lambda s: 1 if prog.match(s) else 0 |     return lambda s: 1 if prog.match(s) else 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def flatten_list(list_to_flatten): | def flatten_list(list_to_flatten): | ||||||
|     """Simple list comprehesion to flatten list. |     """Simple list comprehension to flatten list. | ||||||
|  |  | ||||||
|     >>> flatten_list([[1, 2], [3, 4, 5]]) |     >>> flatten_list([[1, 2], [3, 4, 5]]) | ||||||
|     [1, 2, 3, 4, 5] |     [1, 2, 3, 4, 5] | ||||||
| @@ -109,7 +98,7 @@ def flatten_list(list_to_flatten): | |||||||
|  |  | ||||||
|  |  | ||||||
| def contains_sender_names(sender): | def contains_sender_names(sender): | ||||||
|     '''Returns a functions to search sender\'s name or it\'s part. |     """Returns a function to search for the sender\'s name or a part of it. | ||||||
|  |  | ||||||
|     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") |     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") | ||||||
|     >>> feature("Sergey Obukhov") |     >>> feature("Sergey Obukhov") | ||||||
| @@ -122,13 +111,13 @@ def contains_sender_names(sender): | |||||||
|     1 |     1 | ||||||
|     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") |     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") | ||||||
|     1 |     1 | ||||||
|     ''' |     """ | ||||||
|     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] |     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] | ||||||
|                                         for e in extract_names(sender)])) |                                         for e in extract_names(sender)])) | ||||||
|     names = names or sender |     names = names or sender | ||||||
|     if names != '': |     if names != '': | ||||||
|         return binary_regex_search(re.compile(names)) |         return binary_regex_search(re.compile(names)) | ||||||
|     return lambda s: False |     return lambda s: 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_names(sender): | def extract_names(sender): | ||||||
| @@ -142,20 +131,25 @@ def extract_names(sender): | |||||||
|     >>> extract_names('') |     >>> extract_names('') | ||||||
|     [] |     [] | ||||||
|     """ |     """ | ||||||
|     sender = to_unicode(sender) |  | ||||||
|     # Remove non-alphabetical characters |     # Remove non-alphabetical characters | ||||||
|     sender = "".join([char if char.isalpha() else ' ' for char in sender]) |     sender = "".join([char if char.isalpha() else ' ' for char in sender]) | ||||||
|     # Remove too short words and words from "black" list i.e. |     # Remove too short words and words from "black" list i.e. | ||||||
|     # words like `ru`, `gmail`, `com`, `org`, etc. |     # words like `ru`, `gmail`, `com`, `org`, etc. | ||||||
|     sender = [word for word in sender.split() if len(word) > 1 and |     names = list() | ||||||
|               not word in BAD_SENDER_NAMES] |     for word in sender.split(): | ||||||
|     # Remove duplicates |         if len(word) < 2: | ||||||
|     names = list(set(sender)) |             continue | ||||||
|  |         if word in BAD_SENDER_NAMES: | ||||||
|  |             continue | ||||||
|  |         if word in names: | ||||||
|  |             continue | ||||||
|  |         names.append(word) | ||||||
|  |  | ||||||
|     return names |     return names | ||||||
|  |  | ||||||
|  |  | ||||||
| def categories_percent(s, categories): | def categories_percent(s, categories): | ||||||
|     '''Returns category characters persent. |     """Returns category characters percent. | ||||||
|  |  | ||||||
|     >>> categories_percent("qqq ggg hhh", ["Po"]) |     >>> categories_percent("qqq ggg hhh", ["Po"]) | ||||||
|     0.0 |     0.0 | ||||||
| @@ -167,9 +161,8 @@ def categories_percent(s, categories): | |||||||
|     50.0 |     50.0 | ||||||
|     >>> categories_percent("s.s,5s", ["Po", "Nd"]) |     >>> categories_percent("s.s,5s", ["Po", "Nd"]) | ||||||
|     50.0 |     50.0 | ||||||
|     ''' |     """ | ||||||
|     count = 0 |     count = 0 | ||||||
|     s = to_unicode(s) |  | ||||||
|     for c in s: |     for c in s: | ||||||
|         if unicodedata.category(c) in categories: |         if unicodedata.category(c) in categories: | ||||||
|             count += 1 |             count += 1 | ||||||
| @@ -177,27 +170,27 @@ def categories_percent(s, categories): | |||||||
|  |  | ||||||
|  |  | ||||||
| def punctuation_percent(s): | def punctuation_percent(s): | ||||||
|     '''Returns punctuation persent. |     """Returns punctuation percent. | ||||||
|  |  | ||||||
|     >>> punctuation_percent("qqq ggg hhh") |     >>> punctuation_percent("qqq ggg hhh") | ||||||
|     0.0 |     0.0 | ||||||
|     >>> punctuation_percent("q,w.") |     >>> punctuation_percent("q,w.") | ||||||
|     50.0 |     50.0 | ||||||
|     ''' |     """ | ||||||
|     return categories_percent(s, ['Po']) |     return categories_percent(s, ['Po']) | ||||||
|  |  | ||||||
|  |  | ||||||
| def capitalized_words_percent(s): | def capitalized_words_percent(s): | ||||||
|     '''Returns capitalized words percent.''' |     """Returns capitalized words percent.""" | ||||||
|     s = to_unicode(s) |  | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|  |     words = [w for w in words if len(w) > 2]     | ||||||
|     capitalized_words_counter = 0 |     capitalized_words_counter = 0 | ||||||
|     valid_words_counter = 0 |     valid_words_counter = 0 | ||||||
|     for word in words: |     for word in words: | ||||||
|         if not INVALID_WORD_START.match(word): |         if not INVALID_WORD_START.match(word): | ||||||
|             valid_words_counter += 1 |             valid_words_counter += 1 | ||||||
|             if word[0].isupper(): |             if word[0].isupper() and not word[1].isupper(): | ||||||
|                 capitalized_words_counter += 1 |                 capitalized_words_counter += 1 | ||||||
|     if valid_words_counter > 0 and len(words) > 1: |     if valid_words_counter > 0 and len(words) > 1: | ||||||
|         return 100 * float(capitalized_words_counter) / valid_words_counter |         return 100 * float(capitalized_words_counter) / valid_words_counter | ||||||
| @@ -214,20 +207,26 @@ def many_capitalized_words(s): | |||||||
|  |  | ||||||
|  |  | ||||||
| def has_signature(body, sender): | def has_signature(body, sender): | ||||||
|     '''Checks if the body has signature. Returns True or False.''' |     """Checks if the body has signature. Returns True or False.""" | ||||||
|     non_empty = [line for line in body.splitlines() if line.strip()] |     non_empty = [line for line in body.splitlines() if line.strip()] | ||||||
|     candidate = non_empty[-SIGNATURE_MAX_LINES:] |     candidate = non_empty[-SIGNATURE_MAX_LINES:] | ||||||
|     upvotes = 0 |     upvotes = 0 | ||||||
|  |     sender_check = contains_sender_names(sender) | ||||||
|     for line in candidate: |     for line in candidate: | ||||||
|         # we check lines for sender's name, phone, email and url, |         # we check lines for sender's name, phone, email and url, | ||||||
|         # those signature lines don't take more than 27 characters |         # those signature lines don't take more than 27 characters | ||||||
|         if len(line.strip()) > 27: |         if len(line.strip()) > 27: | ||||||
|             continue |             continue | ||||||
|         elif contains_sender_names(sender)(line): |  | ||||||
|  |         if sender_check(line): | ||||||
|             return True |             return True | ||||||
|         elif (binary_regex_search(RE_RELAX_PHONE)(line) + |  | ||||||
|  |         if (binary_regex_search(RE_RELAX_PHONE)(line) + | ||||||
|                 binary_regex_search(RE_EMAIL)(line) + |                 binary_regex_search(RE_EMAIL)(line) + | ||||||
|                 binary_regex_search(RE_URL)(line) == 1): |                 binary_regex_search(RE_URL)(line) == 1): | ||||||
|             upvotes += 1 |             upvotes += 1 | ||||||
|  |  | ||||||
|     if upvotes > 1: |     if upvotes > 1: | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  |     return False | ||||||
|   | |||||||
							
								
								
									
										185
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										185
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -1,72 +1,17 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  | from __future__ import annotations | ||||||
|  |  | ||||||
| import logging | import html5lib | ||||||
| from random import shuffle | import regex as re | ||||||
|  | from html5lib import HTMLParser | ||||||
|  | from lxml.cssselect import CSSSelector | ||||||
|  | from lxml.etree import _Element | ||||||
|  | from lxml.html import html5parser | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  |  | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | def get_delimiter(msg_body: str) -> str: | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): |  | ||||||
|     """ |  | ||||||
|     Helper: formats string with any combination of bytestrings/unicode |  | ||||||
|     strings without raising exceptions |  | ||||||
|     """ |  | ||||||
|     try: |  | ||||||
|         if not args and not kwargs: |  | ||||||
|             return format_string |  | ||||||
|         else: |  | ||||||
|             return format_string.format(*args, **kwargs) |  | ||||||
|  |  | ||||||
|     # catch encoding errors and transform everything into utf-8 string |  | ||||||
|     # before logging: |  | ||||||
|     except (UnicodeEncodeError, UnicodeDecodeError): |  | ||||||
|         format_string = to_utf8(format_string) |  | ||||||
|         args = [to_utf8(p) for p in args] |  | ||||||
|         kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()} |  | ||||||
|         return format_string.format(*args, **kwargs) |  | ||||||
|  |  | ||||||
|     # ignore other errors |  | ||||||
|     except: |  | ||||||
|         return u'' |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_unicode(str_or_unicode, precise=False): |  | ||||||
|     """ |  | ||||||
|     Safely returns a unicode version of a given string |  | ||||||
|     >>> utils.to_unicode('привет') |  | ||||||
|         u'привет' |  | ||||||
|     >>> utils.to_unicode(u'привет') |  | ||||||
|         u'привет' |  | ||||||
|     If `precise` flag is True, tries to guess the correct encoding first. |  | ||||||
|     """ |  | ||||||
|     encoding = detect_encoding(str_or_unicode) if precise else 'utf-8' |  | ||||||
|     if isinstance(str_or_unicode, str): |  | ||||||
|         return unicode(str_or_unicode, encoding, 'replace') |  | ||||||
|     return str_or_unicode |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_utf8(str_or_unicode): |  | ||||||
|     """ |  | ||||||
|     Safely returns a UTF-8 version of a given string |  | ||||||
|     >>> utils.to_utf8(u'hi') |  | ||||||
|         'hi' |  | ||||||
|     """ |  | ||||||
|     if isinstance(str_or_unicode, unicode): |  | ||||||
|         return str_or_unicode.encode("utf-8", "ignore") |  | ||||||
|     return str(str_or_unicode) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def random_token(length=7): |  | ||||||
|     vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z " |  | ||||||
|             "0 1 2 3 4 5 6 7 8 9").split(' ') |  | ||||||
|     shuffle(vals) |  | ||||||
|     return ''.join(vals[:length]) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_delimiter(msg_body): |  | ||||||
|     delimiter = RE_DELIMITER.search(msg_body) |     delimiter = RE_DELIMITER.search(msg_body) | ||||||
|     if delimiter: |     if delimiter: | ||||||
|         delimiter = delimiter.group() |         delimiter = delimiter.group() | ||||||
| @@ -74,3 +19,117 @@ def get_delimiter(msg_body): | |||||||
|         delimiter = '\n' |         delimiter = '\n' | ||||||
|  |  | ||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_tree_to_text(tree: _Element) -> str: | ||||||
|  |     for style in CSSSelector('style')(tree): | ||||||
|  |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|  |     for c in tree.xpath('//comment()'): | ||||||
|  |         parent = c.getparent() | ||||||
|  |  | ||||||
|  |         # comment with no parent does not impact produced text | ||||||
|  |         if parent is None: | ||||||
|  |             continue | ||||||
|  |  | ||||||
|  |         parent.remove(c) | ||||||
|  |  | ||||||
|  |     text = "" | ||||||
|  |     for el in tree.iter(): | ||||||
|  |         el_text = (el.text or '') + (el.tail or '') | ||||||
|  |         if len(el_text) > 1: | ||||||
|  |             if el.tag in _BLOCKTAGS + _HARDBREAKS: | ||||||
|  |                 text += "\n" | ||||||
|  |             if el.tag == 'li': | ||||||
|  |                 text += "  * " | ||||||
|  |             text += el_text.strip() + " " | ||||||
|  |  | ||||||
|  |             # add href to the output | ||||||
|  |             href = el.attrib.get('href') | ||||||
|  |             if href: | ||||||
|  |                 text += "(%s) " % href | ||||||
|  |  | ||||||
|  |         if (el.tag in _HARDBREAKS and text and | ||||||
|  |             not text.endswith("\n") and not el_text): | ||||||
|  |             text += "\n" | ||||||
|  |  | ||||||
|  |     text = _rm_excessive_newlines(text) | ||||||
|  |     return text | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(s: str) -> str | None: | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         <<< "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. if html can't be parsed returns None | ||||||
|  |     """ | ||||||
|  |     s = _prepend_utf8_declaration(s) | ||||||
|  |     s = s.replace("\n", "") | ||||||
|  |     tree = html_fromstring(s) | ||||||
|  |  | ||||||
|  |     if tree is None: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return html_tree_to_text(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_fromstring(s: str) -> _Element: | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_document_fromstring(s: str) -> _Element: | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def cssselect(expr: str, tree: str) -> list[_Element]: | ||||||
|  |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _contains_charset_spec(s: str) -> str: | ||||||
|  |     """Return True if the first 4KB contain charset spec | ||||||
|  |     """ | ||||||
|  |     return s.lower().find('html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _prepend_utf8_declaration(s: str) -> str: | ||||||
|  |     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any | ||||||
|  |     """ | ||||||
|  |     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _rm_excessive_newlines(s: str) -> str: | ||||||
|  |     """Remove excessive newlines that often happen due to tons of divs | ||||||
|  |     """ | ||||||
|  |     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _html5lib_parser() -> HTMLParser: | ||||||
|  |     """ | ||||||
|  |     html5lib is a pure-python library that conforms to the WHATWG HTML spec | ||||||
|  |     and is not vulnerable to certain attacks common for XML libraries | ||||||
|  |     """ | ||||||
|  |     return HTMLParser( | ||||||
|  |         # build lxml tree | ||||||
|  |         html5lib.treebuilders.getTreeBuilder("lxml"), | ||||||
|  |         # remove namespace value from inside lxml.html.html5parser element tag | ||||||
|  |         # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" | ||||||
|  |         # instead of "div", throwing the algo off | ||||||
|  |         namespaceHTMLElements=False | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | ||||||
|  |                      'charset=utf-8">') | ||||||
|  |  | ||||||
|  | _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
|  | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | coverage | ||||||
|  | mock | ||||||
|  | nose>=1.2.1 | ||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from nose.tools import * | from nose.tools import * | ||||||
| from mock import * | from mock import * | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							| @@ -1,3 +1,4 @@ | |||||||
|  | <?xml version="1.0" encoding="UTF-8"?> | ||||||
| <html> | <html> | ||||||
| <head> | <head> | ||||||
| <style><!-- | <style><!-- | ||||||
|   | |||||||
							
								
								
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,87 @@ | |||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  | <meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp"> | ||||||
|  | <meta name="Generator" content="Microsoft Word 14 (filtered medium)"> | ||||||
|  | <style><!-- | ||||||
|  | /* Font Definitions */ | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Calibri; | ||||||
|  | 	panose-1:2 15 5 2 2 2 4 3 2 4;} | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Tahoma; | ||||||
|  | 	panose-1:2 11 6 4 3 5 4 4 2 4;} | ||||||
|  | /* Style Definitions */ | ||||||
|  | p.MsoNormal, li.MsoNormal, div.MsoNormal | ||||||
|  | 	{margin:0in; | ||||||
|  | 	margin-bottom:.0001pt; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | h3 | ||||||
|  | 	{mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3 Char"; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:13.5pt; | ||||||
|  | 	font-family:"Times New Roman","serif"; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | a:link, span.MsoHyperlink | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:blue; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | a:visited, span.MsoHyperlinkFollowed | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:purple; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | p | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | span.Heading3Char | ||||||
|  | 	{mso-style-name:"Heading 3 Char"; | ||||||
|  | 	mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3"; | ||||||
|  | 	font-family:"Cambria","serif"; | ||||||
|  | 	color:#4F81BD; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | span.EmailStyle19 | ||||||
|  | 	{mso-style-type:personal-reply; | ||||||
|  | 	font-family:"Calibri","sans-serif"; | ||||||
|  | 	color:#1F497D;} | ||||||
|  | .MsoChpDefault | ||||||
|  | 	{mso-style-type:export-only; | ||||||
|  | 	font-family:"Calibri","sans-serif";} | ||||||
|  | @page WordSection1 | ||||||
|  | 	{size:8.5in 11.0in; | ||||||
|  | 	margin:1.0in 1.0in 1.0in 1.0in;} | ||||||
|  | div.WordSection1 | ||||||
|  | 	{page:WordSection1;} | ||||||
|  | --></style><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapedefaults v:ext="edit" spidmax="1026" /> | ||||||
|  | </xml><![endif]--><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapelayout v:ext="edit"> | ||||||
|  | <o:idmap v:ext="edit" data="1" /> | ||||||
|  | </o:shapelayout></xml><![endif]--> | ||||||
|  | </head> | ||||||
|  | <body lang="EN-US" link="blue" vlink="purple"> | ||||||
|  | <div class="WordSection1"> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Hi. I am fine.<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Thanks,<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Alex<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif"">From:</span></b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif""> Foo [mailto:foo@bar.com] | ||||||
|  | <b>On Behalf Of </b>baz@bar.com<br> | ||||||
|  | <b>Sent:</b> Monday, January 01, 2000 12:00 AM<br> | ||||||
|  | <b>To:</b> john@bar.com<br> | ||||||
|  | <b>Cc:</b> jane@bar.io<br> | ||||||
|  | <b>Subject:</b> Conversation<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | <p>Hello! How are you?<o:p></o:p></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | </div> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
							
								
								
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | |||||||
|  | Content-Type: text/plain; | ||||||
|  | 	charset=us-ascii | ||||||
|  | Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\)) | ||||||
|  | Subject: Re: Hello there | ||||||
|  | X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4 | ||||||
|  | From: Adam Renberg <adam@tictail.com> | ||||||
|  | In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | Date: Sat, 22 Aug 2015 19:22:20 +0200 | ||||||
|  | Content-Transfer-Encoding: 7bit | ||||||
|  | X-Smtp-Server: smtp.gmail.com:adam@tictail.com | ||||||
|  | Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com> | ||||||
|  | References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | To: Adam Renberg <tgwizard@gmail.com> | ||||||
|  |  | ||||||
|  | Hello | ||||||
|  | > On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote: | ||||||
|  | > | ||||||
|  | > Hi there! | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								tests/fixtures/standard_replies/iphone.eml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								tests/fixtures/standard_replies/iphone.eml
									
									
									
									
										vendored
									
									
								
							| @@ -9,7 +9,7 @@ To: bob <bob@example.com> | |||||||
| Content-Transfer-Encoding: quoted-printable | Content-Transfer-Encoding: quoted-printable | ||||||
| Mime-Version: 1.0 (1.0) | Mime-Version: 1.0 (1.0) | ||||||
|  |  | ||||||
| hello | Hello | ||||||
|  |  | ||||||
| Sent from my iPhone | Sent from my iPhone | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								tests/fixtures/standard_replies/iphone_reply_text
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								tests/fixtures/standard_replies/iphone_reply_text
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | Hello | ||||||
|  |  | ||||||
|  | Sent from my iPhone | ||||||
| @@ -1,18 +1,20 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from . import * | from __future__ import absolute_import | ||||||
| from . fixtures import * |  | ||||||
|  |  | ||||||
| import regex as re | # noinspection PyUnresolvedReferences | ||||||
| from flanker import mime | import re | ||||||
|  | from unittest.mock import Mock, patch | ||||||
|  |  | ||||||
| from talon import quotations | from nose.tools import assert_false, assert_true, eq_, ok_ | ||||||
|  |  | ||||||
| import html2text | from tests.fixtures import (OLK_SRC_BODY_SECTION, | ||||||
|  |                             REPLY_QUOTATIONS_SHARE_BLOCK, | ||||||
|  |                             REPLY_SEPARATED_BY_HR) | ||||||
|  | from talon import quotations, utils as u | ||||||
|  |  | ||||||
|  | RE_WHITESPACE = re.compile(r"\s") | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_DOUBLE_WHITESPACE = re.compile(r"\s") | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quotation_splitter_inside_blockquote(): | def test_quotation_splitter_inside_blockquote(): | ||||||
| @@ -29,7 +31,7 @@ def test_quotation_splitter_inside_blockquote(): | |||||||
|  |  | ||||||
| </blockquote>""" | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -46,7 +48,25 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><div></div></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_regular_blockquote(): | ||||||
|  |     msg_body = """Reply | ||||||
|  | <blockquote>Regular</blockquote> | ||||||
|  |  | ||||||
|  | <div> | ||||||
|  |   On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  | </div> | ||||||
|  |  | ||||||
|  | <blockquote> | ||||||
|  |   <div> | ||||||
|  |     <blockquote>Nested</blockquote> | ||||||
|  |   </div> | ||||||
|  | </blockquote> | ||||||
|  | """ | ||||||
|  |     eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -69,6 +89,7 @@ Reply | |||||||
|  |  | ||||||
|     reply = """ |     reply = """ | ||||||
| <html> | <html> | ||||||
|  | <head></head> | ||||||
| <body> | <body> | ||||||
| Reply | Reply | ||||||
|  |  | ||||||
| @@ -112,7 +133,30 @@ def test_gmail_quote(): | |||||||
|     </div> |     </div> | ||||||
|   </div> |   </div> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_compact(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_blockquote(): | ||||||
|  |     msg_body = """Message | ||||||
|  | <blockquote class="gmail_quote"> | ||||||
|  |   <div class="gmail_default"> | ||||||
|  |     My name is William Shakespeare. | ||||||
|  |     <br/> | ||||||
|  |   </div> | ||||||
|  | </blockquote>""" | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -123,11 +167,11 @@ def test_unicode_in_reply(): | |||||||
|   <br> |   <br> | ||||||
| </div> | </div> | ||||||
|  |  | ||||||
| <blockquote class="gmail_quote"> | <blockquote> | ||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply  Text<br></p><div><br></div>" |     eq_("<html><head></head><body>Reply  Text<br><div><br></div>" | ||||||
|         "</body></html>", |         "</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
| @@ -153,6 +197,7 @@ def test_blockquote_disclaimer(): | |||||||
|  |  | ||||||
|     stripped_html = """ |     stripped_html = """ | ||||||
| <html> | <html> | ||||||
|  |   <head></head> | ||||||
|   <body> |   <body> | ||||||
|   <div> |   <div> | ||||||
|     <div> |     <div> | ||||||
| @@ -184,7 +229,7 @@ def test_date_block(): | |||||||
|   </div> |   </div> | ||||||
| </div> | </div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -201,7 +246,7 @@ Subject: You Have New Mail From Mary!<br><br> | |||||||
| text | text | ||||||
| </div></div> | </div></div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -219,51 +264,62 @@ def test_reply_shares_div_with_from_block(): | |||||||
|  |  | ||||||
|   </div> |   </div> | ||||||
| </body>''' | </body>''' | ||||||
|     eq_('<html><body><div>Blah<br><br></div></body></html>', |     eq_('<html><head></head><body><div>Blah<br><br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_quotations_share_block(): | def test_reply_quotations_share_block(): | ||||||
|     msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) |     stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) | ||||||
|     html_part = list(msg.walk())[1] |  | ||||||
|     assert html_part.content_type == 'text/html' |  | ||||||
|     stripped_html = quotations.extract_from_html(html_part.body) |  | ||||||
|     ok_(stripped_html) |     ok_(stripped_html) | ||||||
|     ok_('From' not in stripped_html) |     ok_('From' not in stripped_html) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_OLK_SRC_BODY_SECTION_stripped(): | def test_OLK_SRC_BODY_SECTION_stripped(): | ||||||
|     eq_('<html><body><div>Reply</div></body></html>', |     eq_('<html><head></head><body><div>Reply</div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) |             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_separated_by_hr(): | def test_reply_separated_by_hr(): | ||||||
|     eq_('<html><body><div>Hi<div>there</div></div></body></html>', |     eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") | def test_from_block_and_quotations_in_separate_divs(): | ||||||
|  |     msg_body = ''' | ||||||
|  | Reply | ||||||
|  | <div> | ||||||
|  |   <hr/> | ||||||
|  |   <div> | ||||||
|  |     <font> | ||||||
|  |       <b>From: bob@example.com</b> | ||||||
|  |       <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> | ||||||
|  |     </font> | ||||||
|  |   </div> | ||||||
|  |   <div> | ||||||
|  |     Quoted message | ||||||
|  |   </div> | ||||||
|  | </div> | ||||||
|  | ''' | ||||||
|  |     eq_('<html><head></head><body>Reply<div><hr></div></body></html>', | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|     msg_body = f.read().decode("utf-8") |     f = open(filename, **kwargs) | ||||||
|  |  | ||||||
|  |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|  |     plain_reply = u.html_to_text(reply) | ||||||
|  |  | ||||||
|     h = html2text.HTML2Text() |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|     h.body_width = 0 |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
|     plain_reply = h.handle(reply) |  | ||||||
|  |  | ||||||
|     #remove &nbsp; spaces |  | ||||||
|     plain_reply = plain_reply.replace(u'\xa0', u' ') |  | ||||||
|  |  | ||||||
|     if RE_REPLY.match(plain_reply): |  | ||||||
|         eq_(1, 1) |  | ||||||
|     else: |  | ||||||
|         eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_reply(): | def test_gmail_reply(): | ||||||
| @@ -286,6 +342,10 @@ def test_ms_outlook_2007_reply(): | |||||||
|     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_ms_outlook_2010_reply(): | ||||||
|  |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html") | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_thunderbird_reply(): | def test_thunderbird_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") |     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") | ||||||
|  |  | ||||||
| @@ -296,3 +356,82 @@ def test_windows_mail_reply(): | |||||||
|  |  | ||||||
| def test_yandex_ru_reply(): | def test_yandex_ru_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") |     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_CRLF(): | ||||||
|  |     """CR is not converted to '&#13;' | ||||||
|  |     """ | ||||||
|  |     symbol = '&#13;' | ||||||
|  |     extracted = quotations.extract_from_html('<html>\r\n</html>') | ||||||
|  |     assert_false(symbol in extracted) | ||||||
|  |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |     msg_body = """My | ||||||
|  | reply | ||||||
|  | <blockquote> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     assert_false(symbol in extracted) | ||||||
|  |     # Keep new lines otherwise "My reply" becomes one word - "Myreply"  | ||||||
|  |     eq_("<html><head></head><body>My\nreply\n</body></html>", extracted) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_forwarded_msg(): | ||||||
|  |     msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div> | ||||||
|  | </div><br></div>""" | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_readable_html_empty(): | ||||||
|  |     msg_body = """ | ||||||
|  | <blockquote> | ||||||
|  |   Reply | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |  | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html(): | ||||||
|  |     bad_html = "<html></html>" | ||||||
|  |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_remove_namespaces(): | ||||||
|  |     msg_body = """ | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> | ||||||
|  |         <body> | ||||||
|  |             <o:p>Dear Sir,</o:p> | ||||||
|  |             <o:p>Thank you for the email.</o:p> | ||||||
|  |             <blockquote>thing</blockquote> | ||||||
|  |         </body> | ||||||
|  |     </html> | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     rendered = quotations.extract_from_html(msg_body) | ||||||
|  |  | ||||||
|  |     assert_true("<p>" in rendered) | ||||||
|  |     assert_true("xmlns" in rendered) | ||||||
|  |  | ||||||
|  |     assert_true("<o:p>" not in rendered) | ||||||
|  |     assert_true("<xmlns:o>" not in rendered) | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| from flanker import mime |  | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,11 +1,8 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from .. import * | from .. import * | ||||||
|  |  | ||||||
| import os |  | ||||||
|  |  | ||||||
| from flanker import mime |  | ||||||
|  |  | ||||||
| from talon.signature import bruteforce | from talon.signature import bruteforce | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,15 +1,15 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from .. import * | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from PyML import SparseDataSet | from six.moves import range | ||||||
|  |  | ||||||
| from talon.signature.learning import dataset | from talon.signature import bruteforce, extraction, extract | ||||||
| from talon import signature |  | ||||||
| from talon.signature import extraction as e | from talon.signature import extraction as e | ||||||
| from talon.signature import bruteforce | from talon.signature.learning import dataset | ||||||
|  | from .. import * | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_message_shorter_SIGNATURE_MAX_LINES(): | def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||||
| @@ -18,20 +18,25 @@ def test_message_shorter_SIGNATURE_MAX_LINES(): | |||||||
|  |  | ||||||
| Thanks in advance, | Thanks in advance, | ||||||
| Bob""" | Bob""" | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:2]), text) |     eq_('\n'.join(body.splitlines()[:2]), text) | ||||||
|     eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) |     eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_messages_longer_SIGNATURE_MAX_LINES(): | def test_messages_longer_SIGNATURE_MAX_LINES(): | ||||||
|  |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|     for filename in os.listdir(STRIPPED): |     for filename in os.listdir(STRIPPED): | ||||||
|         filename = os.path.join(STRIPPED, filename) |         filename = os.path.join(STRIPPED, filename) | ||||||
|         if not filename.endswith('_body'): |         if not filename.endswith('_body'): | ||||||
|             continue |             continue | ||||||
|         sender, body = dataset.parse_msg_sender(filename) |         sender, body = dataset.parse_msg_sender(filename) | ||||||
|         text, extracted_signature = signature.extract(body, sender) |         text, extracted_signature = extract(body, sender) | ||||||
|         extracted_signature = extracted_signature or '' |         extracted_signature = extracted_signature or '' | ||||||
|         with open(filename[:-len('body')] + 'signature') as ms: |         with open(filename[:-len('body')] + 'signature', **kwargs) as ms: | ||||||
|             msg_signature = ms.read() |             msg_signature = ms.read() | ||||||
|             eq_(msg_signature.strip(), extracted_signature.strip()) |             eq_(msg_signature.strip(), extracted_signature.strip()) | ||||||
|             stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] |             stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] | ||||||
| @@ -47,7 +52,7 @@ Thanks in advance, | |||||||
| some text which doesn't seem to be a signature at all | some text which doesn't seem to be a signature at all | ||||||
| Bob""" | Bob""" | ||||||
|  |  | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:2]), text) |     eq_('\n'.join(body.splitlines()[:2]), text) | ||||||
|     eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) |     eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) | ||||||
|  |  | ||||||
| @@ -60,7 +65,7 @@ Thanks in advance, | |||||||
| some long text here which doesn't seem to be a signature at all | some long text here which doesn't seem to be a signature at all | ||||||
| Bob""" | Bob""" | ||||||
|  |  | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:-1]), text) |     eq_('\n'.join(body.splitlines()[:-1]), text) | ||||||
|     eq_('Bob', extracted_signature) |     eq_('Bob', extracted_signature) | ||||||
|  |  | ||||||
| @@ -68,13 +73,38 @@ Bob""" | |||||||
|  |  | ||||||
|     some *long* text here which doesn't seem to be a signature at all |     some *long* text here which doesn't seem to be a signature at all | ||||||
|     """ |     """ | ||||||
|     ((body, None), signature.extract(body, "david@example.com")) |     ((body, None), extract(body, "david@example.com")) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_basic(): | def test_basic(): | ||||||
|     msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' |     msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' | ||||||
|     eq_(('Blah', '--\r\n\r\nSergey Obukhov'), |     eq_(('Blah', '--\r\n\r\nSergey Obukhov'), | ||||||
|         signature.extract(msg_body, 'Sergey')) |         extract(msg_body, 'Sergey')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_capitalized(): | ||||||
|  |     msg_body = """Hi Mary, | ||||||
|  |  | ||||||
|  | Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date. | ||||||
|  |  | ||||||
|  | DJ Doe  | ||||||
|  | http://example.com | ||||||
|  | Password: SUPERPASSWORD | ||||||
|  |  | ||||||
|  | Would you like to check out more? | ||||||
|  |  | ||||||
|  |  | ||||||
|  | At your service, | ||||||
|  |  | ||||||
|  | John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     sig = """John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     eq_(sig, extract(msg_body, 'Doe')[1]) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_over_2_text_lines_after_signature(): | def test_over_2_text_lines_after_signature(): | ||||||
| @@ -85,25 +115,25 @@ def test_over_2_text_lines_after_signature(): | |||||||
|     2 non signature lines in the end |     2 non signature lines in the end | ||||||
|     It's not signature |     It's not signature | ||||||
|     """ |     """ | ||||||
|     text, extracted_signature = signature.extract(body, "Bob") |     text, extracted_signature = extract(body, "Bob") | ||||||
|     eq_(extracted_signature, None) |     eq_(extracted_signature, None) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_no_signature(): | def test_no_signature(): | ||||||
|     sender, body = "bob@foo.bar", "Hello" |     sender, body = "bob@foo.bar", "Hello" | ||||||
|     eq_((body, None), signature.extract(body, sender)) |     eq_((body, None), extract(body, sender)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_handles_unicode(): | def test_handles_unicode(): | ||||||
|     sender, body = dataset.parse_msg_sender(UNICODE_MSG) |     sender, body = dataset.parse_msg_sender(UNICODE_MSG) | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(signature.extraction, 'has_signature') | @patch.object(extraction, 'has_signature') | ||||||
| def test_signature_extract_crash(has_signature): | def test_signature_extract_crash(has_signature): | ||||||
|     has_signature.side_effect = Exception('Bam!') |     has_signature.side_effect = Exception('Bam!') | ||||||
|     msg_body = u'Blah\r\n--\r\n\r\nСергей' |     msg_body = u'Blah\r\n--\r\n\r\nСергей' | ||||||
|     eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) |     eq_((msg_body, None), extract(msg_body, 'Сергей')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_mark_lines(): | def test_mark_lines(): | ||||||
| @@ -129,20 +159,20 @@ def test_mark_lines(): | |||||||
|  |  | ||||||
| def test_process_marked_lines(): | def test_process_marked_lines(): | ||||||
|     # no signature found |     # no signature found | ||||||
|     eq_((range(5), None), e._process_marked_lines(range(5), 'telt')) |     eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt')) | ||||||
|  |  | ||||||
|     # signature in the middle of the text |     # signature in the middle of the text | ||||||
|     eq_((range(9), None), e._process_marked_lines(range(9), 'tesestelt')) |     eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt')) | ||||||
|  |  | ||||||
|     # long line splits signature |     # long line splits signature | ||||||
|     eq_((range(7), [7, 8]), |     eq_((list(range(7)), [7, 8]), | ||||||
|         e._process_marked_lines(range(9), 'tsslsless')) |         e._process_marked_lines(list(range(9)), 'tsslsless')) | ||||||
|  |  | ||||||
|     eq_((range(20), [20]), |     eq_((list(range(20)), [20]), | ||||||
|         e._process_marked_lines(range(21), 'ttttttstttesllelelets')) |         e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets')) | ||||||
|  |  | ||||||
|     # some signature lines could be identified as text |     # some signature lines could be identified as text | ||||||
|     eq_(([0], range(1, 9)), e._process_marked_lines(range(9), 'tsetetest')) |     eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest')) | ||||||
|  |  | ||||||
|     eq_(([], range(5)), |     eq_(([], list(range(5))), | ||||||
|         e._process_marked_lines(range(5), "ststt")) |         e._process_marked_lines(list(range(5)), "ststt")) | ||||||
|   | |||||||
| @@ -1,11 +1,11 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from PyML import SparseDataSet | from numpy import genfromtxt | ||||||
|  |  | ||||||
| from talon.utils import to_unicode |  | ||||||
| from talon.signature.learning import dataset as d | from talon.signature.learning import dataset as d | ||||||
|  |  | ||||||
| from talon.signature.learning.featurespace import features | from talon.signature.learning.featurespace import features | ||||||
| @@ -42,10 +42,13 @@ def test_build_extraction_dataset(): | |||||||
|     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), |     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), | ||||||
|                                os.path.join(TMP_DIR, |                                os.path.join(TMP_DIR, | ||||||
|                                             'extraction.data'), 1) |                                             'extraction.data'), 1) | ||||||
|     test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), |  | ||||||
|                               labelsColumn=-1) |     filename = os.path.join(TMP_DIR, 'extraction.data') | ||||||
|  |     file_data = genfromtxt(filename, delimiter=",") | ||||||
|  |     test_data = file_data[:, :-1] | ||||||
|  |  | ||||||
|     # the result is a loadable signature extraction dataset |     # the result is a loadable signature extraction dataset | ||||||
|     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be |     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be | ||||||
|     # a signature, one email has only 10 lines |     # a signature, one email has only 10 lines | ||||||
|     eq_(test_data.size(), 32) |     eq_(test_data.shape[0], 32) | ||||||
|     eq_(len(features('')), test_data.numFeatures) |     eq_(len(features('')), test_data.shape[1]) | ||||||
|   | |||||||
| @@ -1,12 +1,15 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
|  |  | ||||||
| from talon.signature.learning import featurespace as fs | from talon.signature.learning import featurespace as fs | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_apply_features(): | def test_apply_features(): | ||||||
|     s = '''John Doe |     s = '''This is John Doe | ||||||
|  |  | ||||||
|  | Tuesday @3pm suits. I'll chat to you then. | ||||||
|  |  | ||||||
| VP Research and Development, Xxxx Xxxx Xxxxx | VP Research and Development, Xxxx Xxxx Xxxxx | ||||||
|  |  | ||||||
| @@ -19,11 +22,12 @@ john@example.com''' | |||||||
|     # note that we don't consider the first line because signatures don't |     # note that we don't consider the first line because signatures don't | ||||||
|     # usually take all the text, empty lines are not considered |     # usually take all the text, empty lines are not considered | ||||||
|     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], |     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], | ||||||
|  |                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], |                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], |                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) |                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) | ||||||
|  |  | ||||||
|     with patch.object(fs, 'SIGNATURE_MAX_LINES', 4): |     with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): | ||||||
|         features = fs.features(sender) |         features = fs.features(sender) | ||||||
|         new_result = fs.apply_features(s, features) |         new_result = fs.apply_features(s, features) | ||||||
|         # result remains the same because we don't consider empty lines |         # result remains the same because we don't consider empty lines | ||||||
|   | |||||||
| @@ -1,11 +1,13 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.signature.learning import helpers as h | from talon.signature.learning import helpers as h | ||||||
| from talon.signature.learning.helpers import * | from talon.signature.learning.helpers import * | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| # First testing regex constants. | # First testing regex constants. | ||||||
| VALID = ''' | VALID = ''' | ||||||
| @@ -43,7 +45,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()] | |||||||
|  |  | ||||||
| def test_match_phone_numbers(): | def test_match_phone_numbers(): | ||||||
|     for phone in VALID_PHONE_NUMBERS: |     for phone in VALID_PHONE_NUMBERS: | ||||||
|         ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone)) |         ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_match_names(): | def test_match_names(): | ||||||
| @@ -52,29 +54,6 @@ def test_match_names(): | |||||||
|         ok_(RE_NAME.match(name), "{} should be matched".format(name)) |         ok_(RE_NAME.match(name), "{} should be matched".format(name)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_sender_with_name(): |  | ||||||
|     ok_lines = ['Sergey Obukhov <serobnic@example.com>', |  | ||||||
|                 '\tSergey  <serobnic@example.com>', |  | ||||||
|                 ('"Doe, John (TX)"' |  | ||||||
|                  '<DowJ@example.com>@EXAMPLE' |  | ||||||
|                  '<IMCEANOTES-+22Doe+2C+20John+20' |  | ||||||
|                  '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E' |  | ||||||
|                  '+40EXAMPLE@EXAMPLE.com>'), |  | ||||||
|                 ('Company Sleuth <csleuth@email.xxx.com>' |  | ||||||
|                  '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth' |  | ||||||
|                  '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'), |  | ||||||
|                 ('Doe III, John ' |  | ||||||
|                  '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')] |  | ||||||
|     for line in ok_lines: |  | ||||||
|         ok_(RE_SENDER_WITH_NAME.match(line), |  | ||||||
|             '{} should be matched'.format(line)) |  | ||||||
|  |  | ||||||
|     nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru'] |  | ||||||
|     for line in nok_lines: |  | ||||||
|         assert_false(RE_SENDER_WITH_NAME.match(line), |  | ||||||
|                      '{} should not be matched'.format(line)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Now test helpers functions | # Now test helpers functions | ||||||
| def test_binary_regex_search(): | def test_binary_regex_search(): | ||||||
|     eq_(1, h.binary_regex_search(re.compile("12"))("12")) |     eq_(1, h.binary_regex_search(re.compile("12"))("12")) | ||||||
| @@ -177,7 +156,7 @@ def test_extract_names(): | |||||||
|         # check that extracted names could be compiled |         # check that extracted names could be compiled | ||||||
|         try: |         try: | ||||||
|             re.compile("|".join(extracted_names)) |             re.compile("|".join(extracted_names)) | ||||||
|         except Exception, e: |         except Exception as e: | ||||||
|             ok_(False, ("Failed to compile extracted names {}" |             ok_(False, ("Failed to compile extracted names {}" | ||||||
|                         "\n\nReason: {}").format(extracted_names, e)) |                         "\n\nReason: {}").format(extracted_names, e)) | ||||||
|         if expected_names: |         if expected_names: | ||||||
| @@ -213,10 +192,11 @@ def test_punctuation_percent(categories_percent): | |||||||
| def test_capitalized_words_percent(): | def test_capitalized_words_percent(): | ||||||
|     eq_(0.0, h.capitalized_words_percent('')) |     eq_(0.0, h.capitalized_words_percent('')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Example Corp')) |     eq_(100.0, h.capitalized_words_percent('Example Corp')) | ||||||
|     eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss')) |     eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) |     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('8th Floor')) |     eq_(100.0, h.capitalized_words_percent('8th Floor')) | ||||||
|     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) |     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) | ||||||
|  |     eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_has_signature(): | def test_has_signature(): | ||||||
| @@ -227,7 +207,7 @@ def test_has_signature(): | |||||||
|                         'sender@example.com')) |                         'sender@example.com')) | ||||||
|     assert_false(h.has_signature('http://www.example.com/555-555-5555', |     assert_false(h.has_signature('http://www.example.com/555-555-5555', | ||||||
|                                  'sender@example.com')) |                                  'sender@example.com')) | ||||||
|     long_line = ''.join(['q' for e in xrange(28)]) |     long_line = ''.join(['q' for e in range(28)]) | ||||||
|     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com')) |     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com')) | ||||||
|     # wont crash on an empty string |     # wont crash on an empty string | ||||||
|     assert_false(h.has_signature('', '')) |     assert_false(h.has_signature('', '')) | ||||||
|   | |||||||
| @@ -1,23 +1,26 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from flanker import mime | import email.iterators | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations | ||||||
|  | import six | ||||||
|  | from six.moves import range | ||||||
|  | from six import StringIO | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_LINES_COUNT', 1) | @patch.object(quotations, 'MAX_LINES_COUNT', 1) | ||||||
| def test_too_many_lines(): | def test_too_many_lines(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  | Hi | ||||||
| -----Original Message----- | -----Original Message----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_(msg_body, quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote(): | def test_pattern_on_date_somebody_wrote(): | ||||||
| @@ -32,6 +35,42 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  | def test_pattern_on_date_polymail(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||||
|  |  | ||||||
|  | < | ||||||
|  | mailto:John Smith <johnsmith@gmail.com> | ||||||
|  | > wrote: | ||||||
|  | Test quoted data | ||||||
|  | """ | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |  | ||||||
|  | > | ||||||
|  | > Test | ||||||
|  | > | ||||||
|  | > Roman""" | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_pattern_on_date_wrote_somebody(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     """Lorem | ||||||
|  |  | ||||||
|  | Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote_date_with_slashes(): | def test_pattern_on_date_somebody_wrote_date_with_slashes(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
| @@ -45,6 +84,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_date_time_email_splitter(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | 2014-10-17 11:28 GMT+03:00 Postmaster < | ||||||
|  | postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>: | ||||||
|  |  | ||||||
|  | > First from site | ||||||
|  | > | ||||||
|  |     """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | ||||||
|     msg_body = """Thanks Thanmai |     msg_body = """Thanks Thanmai | ||||||
|  On Mar 8, 2012 9:59 AM, "Example.com" < |  On Mar 8, 2012 9:59 AM, "Example.com" < | ||||||
| @@ -68,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_appointment(): | ||||||
|  |     msg_body = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697 | ||||||
|  |  | ||||||
|  | From: from@example.com [mailto:from@example.com] | ||||||
|  | Sent: Wednesday, October 18, 2017 2:05 PM | ||||||
|  | To: John Doer - SIU <jd@example.com> | ||||||
|  | Subject: RE: Claim # 5551188-1 | ||||||
|  |  | ||||||
|  | Text""" | ||||||
|  |  | ||||||
|  |     expected = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697""" | ||||||
|  |     eq_(expected, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_line_starts_with_on(): | def test_line_starts_with_on(): | ||||||
|     msg_body = """Blah-blah-blah |     msg_body = """Blah-blah-blah | ||||||
| On blah-blah-blah""" | On blah-blah-blah""" | ||||||
| @@ -98,22 +181,25 @@ bla-bla - bla""" | |||||||
|     eq_(reply, quotations.extract_from_plain(msg_body)) |     eq_(reply, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_original_message(): | def _check_pattern_original_message(original_message_indicator): | ||||||
|     msg_body = """Test reply |     msg_body = u"""Test reply | ||||||
|  |  | ||||||
| -----Original Message----- | -----{}----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|  |     eq_('Test reply', quotations.extract_from_plain( | ||||||
|  |         msg_body.format(six.text_type(original_message_indicator)))) | ||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | def test_english_original_message(): | ||||||
|  |     _check_pattern_original_message('Original Message') | ||||||
|  |     _check_pattern_original_message('Reply Message') | ||||||
|  |  | ||||||
|     msg_body = """Test reply | def test_german_original_message(): | ||||||
|  |     _check_pattern_original_message(u'Ursprüngliche Nachricht') | ||||||
|  |     _check_pattern_original_message('Antwort Nachricht') | ||||||
|  |  | ||||||
|  -----Original Message----- | def test_danish_original_message(): | ||||||
|  |     _check_pattern_original_message('Oprindelig meddelelse') | ||||||
| Test""" |  | ||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_after_quotations(): | def test_reply_after_quotations(): | ||||||
| @@ -125,6 +211,17 @@ Test reply""" | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_android_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | ---- John Smith wrote ---- | ||||||
|  |  | ||||||
|  | > quoted | ||||||
|  | > text | ||||||
|  | """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_wraps_quotations(): | def test_reply_wraps_quotations(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  |  | ||||||
| @@ -199,6 +296,33 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
| > Hello""" | > Hello""" | ||||||
|     eq_("Hi", quotations.extract_from_plain(msg_body)) |     eq_("Hi", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  | def test_with_indent(): | ||||||
|  |     msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. | ||||||
|  |  | ||||||
|  | ------On 12/29/1987 17:32 PM, Julius Caesar wrote----- | ||||||
|  |  | ||||||
|  | Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. | ||||||
|  |     """ | ||||||
|  |     eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_short_quotation_with_newline(): | ||||||
|  |     msg_body = """Btw blah blah... | ||||||
|  |  | ||||||
|  | On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote: | ||||||
|  |  | ||||||
|  | Hi Mark, | ||||||
|  | Blah blah?  | ||||||
|  | Thanks,Christine  | ||||||
|  |  | ||||||
|  | On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote: | ||||||
|  |  | ||||||
|  | Lorem ipsum? | ||||||
|  | Mark | ||||||
|  |  | ||||||
|  | Sent from Acompli""" | ||||||
|  |     eq_("Btw blah blah...", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_date_email_with_unicode(): | def test_pattern_date_email_with_unicode(): | ||||||
|     msg_body = """Replying ok |     msg_body = """Replying ok | ||||||
| @@ -208,8 +332,8 @@ def test_pattern_date_email_with_unicode(): | |||||||
|     eq_("Replying ok", quotations.extract_from_plain(msg_body)) |     eq_("Replying ok", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_from_block(): | def test_english_from_block(): | ||||||
|     msg_body = """Allo! Follow up MIME! |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain("""Allo! Follow up MIME! | ||||||
|  |  | ||||||
| From: somebody@example.com | From: somebody@example.com | ||||||
| Sent: March-19-11 5:42 PM | Sent: March-19-11 5:42 PM | ||||||
| @@ -217,9 +341,106 @@ To: Somebody | |||||||
| Subject: The manager has commented on your Loop | Subject: The manager has commented on your Loop | ||||||
|  |  | ||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """ | """)) | ||||||
|     eq_("Allo! Follow up MIME!", quotations.extract_from_plain(msg_body)) |  | ||||||
|  |  | ||||||
|  | def test_german_from_block(): | ||||||
|  |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|  |     """Allo! Follow up MIME! | ||||||
|  |  | ||||||
|  | Von: somebody@example.com | ||||||
|  | Gesendet: Dienstag, 25. November 2014 14:59 | ||||||
|  | An: Somebody | ||||||
|  | Betreff: The manager has commented on your Loop | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_french_multiline_from_block(): | ||||||
|  |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem ipsum | ||||||
|  |  | ||||||
|  | De : Brendan xxx [mailto:brendan.xxx@xxx.com] | ||||||
|  | Envoyé : vendredi 23 janvier 2015 16:39 | ||||||
|  | À : Camille XXX | ||||||
|  | Objet : Follow Up | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_french_from_block(): | ||||||
|  |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem ipsum | ||||||
|  |  | ||||||
|  | Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit: | ||||||
|  |  | ||||||
|  | Bonjour!""")) | ||||||
|  |  | ||||||
|  | def test_polish_from_block(): | ||||||
|  |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem ipsum | ||||||
|  |  | ||||||
|  | W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com> | ||||||
|  | napisał: | ||||||
|  |  | ||||||
|  | Blah! | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_danish_from_block(): | ||||||
|  |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|  |     """Allo! Follow up MIME! | ||||||
|  |  | ||||||
|  | Fra: somebody@example.com | ||||||
|  | Sendt: 19. march 2011 12:10 | ||||||
|  | Til: Somebody | ||||||
|  | Emne: The manager has commented on your Loop | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_block(): | ||||||
|  |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|  |     u"""Allo! Follow up MIME! | ||||||
|  | Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com] | ||||||
|  | Skickat: den 26 augusti 2015 14:45 | ||||||
|  | Till: Isacson Leiff | ||||||
|  | Ämne: RE: Week 36 | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     """Lorem | ||||||
|  | Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_norwegian_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem | ||||||
|  | På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_dutch_from_block(): | ||||||
|  |     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( | ||||||
|  |     """Gluten-free culpa lo-fi et nesciunt nostrud. | ||||||
|  |  | ||||||
|  | Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven: | ||||||
|  |  | ||||||
|  | Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_vietnamese_from_block(): | ||||||
|  |     eq_('Hello', quotations.extract_from_plain( | ||||||
|  |     u"""Hello | ||||||
|  |  | ||||||
|  | Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết: | ||||||
|  |  | ||||||
|  | > Xin chào | ||||||
|  | """)) | ||||||
|  |  | ||||||
| def test_quotation_marker_false_positive(): | def test_quotation_marker_false_positive(): | ||||||
|     msg_body = """Visit us now for assistance... |     msg_body = """Visit us now for assistance... | ||||||
| @@ -232,6 +453,7 @@ def test_link_closed_with_quotation_marker_on_new_line(): | |||||||
|     msg_body = '''8.45am-1pm |     msg_body = '''8.45am-1pm | ||||||
|  |  | ||||||
| From: somebody@example.com | From: somebody@example.com | ||||||
|  | Date: Wed, 16 May 2012 00:15:02 -0600 | ||||||
|   |   | ||||||
| <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT | <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT | ||||||
| >  <bob@example.com <mailto:bob@example.com> > | >  <bob@example.com <mailto:bob@example.com> > | ||||||
| @@ -273,7 +495,9 @@ def test_from_block_starts_with_date(): | |||||||
|     msg_body = """Blah |     msg_body = """Blah | ||||||
|  |  | ||||||
| Date: Wed, 16 May 2012 00:15:02 -0600 | Date: Wed, 16 May 2012 00:15:02 -0600 | ||||||
| To: klizhentas@example.com""" | To: klizhentas@example.com | ||||||
|  |  | ||||||
|  | """ | ||||||
|     eq_('Blah', quotations.extract_from_plain(msg_body)) |     eq_('Blah', quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -343,11 +567,12 @@ def test_mark_message_lines(): | |||||||
|              # next line should be marked as splitter |              # next line should be marked as splitter | ||||||
|              '_____________', |              '_____________', | ||||||
|              'From: foo@bar.com', |              'From: foo@bar.com', | ||||||
|  |              'Date: Wed, 16 May 2012 00:15:02 -0600', | ||||||
|              '', |              '', | ||||||
|              '> Hi', |              '> Hi', | ||||||
|              '', |              '', | ||||||
|              'Signature'] |              'Signature'] | ||||||
|     eq_('tessemet', quotations.mark_message_lines(lines)) |     eq_('tesssemet', quotations.mark_message_lines(lines)) | ||||||
|  |  | ||||||
|     lines = ['Just testing the email reply', |     lines = ['Just testing the email reply', | ||||||
|              '', |              '', | ||||||
| @@ -510,25 +735,107 @@ def test_preprocess_postprocess_2_links(): | |||||||
|     eq_(msg_body, quotations.extract_from_plain(msg_body)) |     eq_(msg_body, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def body_iterator(msg, decode=False): | ||||||
|  |     for subpart in msg.walk(): | ||||||
|  |         payload = subpart.get_payload(decode=decode) | ||||||
|  |         if isinstance(payload, six.text_type): | ||||||
|  |             yield payload | ||||||
|  |         else: | ||||||
|  |             yield payload.decode('utf8') | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_standard_replies(): | def test_standard_replies(): | ||||||
|     for filename in os.listdir(STANDARD_REPLIES): |     for filename in os.listdir(STANDARD_REPLIES): | ||||||
|         filename = os.path.join(STANDARD_REPLIES, filename) |         filename = os.path.join(STANDARD_REPLIES, filename) | ||||||
|         if os.path.isdir(filename): |         if not filename.endswith('.eml') or os.path.isdir(filename): | ||||||
|             continue |             continue | ||||||
|         with open(filename) as f: |         with open(filename) as f: | ||||||
|             msg = f.read() |             message = email.message_from_file(f) | ||||||
|             m = mime.from_string(msg) |             body = next(email.iterators.typed_subpart_iterator(message, subtype='plain')) | ||||||
|             for part in m.walk(): |             text = ''.join(body_iterator(body, True)) | ||||||
|                 if part.content_type == 'text/plain': |  | ||||||
|                     text = part.body |  | ||||||
|             stripped_text = quotations.extract_from_plain(text) |             stripped_text = quotations.extract_from_plain(text) | ||||||
|             reply_text_fn = filename[:-4] + '_reply_text' |             reply_text_fn = filename[:-4] + '_reply_text' | ||||||
|             if os.path.isfile(reply_text_fn): |             if os.path.isfile(reply_text_fn): | ||||||
|                 with open(reply_text_fn) as f: |                 with open(reply_text_fn) as f: | ||||||
|                             reply_text = f.read() |                     reply_text = f.read().strip() | ||||||
|             else: |             else: | ||||||
|                 reply_text = 'Hello' |                 reply_text = 'Hello' | ||||||
|                     eq_(reply_text, stripped_text, |             yield eq_, reply_text, stripped_text, \ | ||||||
|                         "'%(reply)s' != %(stripped)s for %(fn)s" % |                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ | ||||||
|                 {'reply': reply_text, 'stripped': stripped_text, |                 {'reply': reply_text, 'stripped': stripped_text, | ||||||
|                          'fn': filename}) |                  'fn': filename} | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_split_email(): | ||||||
|  |     msg = """From: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     To: Mr. Y | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |     Goodbye. | ||||||
|  |     From: Mr. Y | ||||||
|  |     To: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |  | ||||||
|  |     Hello. | ||||||
|  |  | ||||||
|  |         On 24th February 2016 at 09.32am, Conal wrote: | ||||||
|  |  | ||||||
|  |         Hey! | ||||||
|  |  | ||||||
|  |         On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: | ||||||
|  |         > Mohan, | ||||||
|  |         > | ||||||
|  |         > We have not yet migrated the systems. | ||||||
|  |         > | ||||||
|  |         > Dan | ||||||
|  |         > | ||||||
|  |         > > -----Original Message----- | ||||||
|  |         > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > Subject: Test | ||||||
|  |         > > From: bob@xxx.mailgun.org | ||||||
|  |         > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > | ||||||
|  |         > > Hi | ||||||
|  |         > > | ||||||
|  |         > > > From: bob@xxx.mailgun.org | ||||||
|  |         > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > > Subject: Test | ||||||
|  |         > > > Hi | ||||||
|  |         > > > | ||||||
|  |         > > | ||||||
|  |         > | ||||||
|  |         > | ||||||
|  | """ | ||||||
|  |     expected_markers = "stttttsttttetesetesmmmmmmsmmmmmmmmmmmmmmmm" | ||||||
|  |     markers = quotations.split_emails(msg) | ||||||
|  |     eq_(markers, expected_markers) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_feedback_below_left_unparsed(): | ||||||
|  |     msg_body = """Please enter your feedback below. Thank you. | ||||||
|  |  | ||||||
|  | ------------------------------------- Enter Feedback Below ------------------------------------- | ||||||
|  |  | ||||||
|  | The user experience was unparallelled. Please continue production. I'm sending payment to ensure | ||||||
|  | that this line is intact.""" | ||||||
|  |  | ||||||
|  |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|  |     eq_(msg_body, parsed) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_appointment_2(): | ||||||
|  |     msg_body = """Invitation for an interview: | ||||||
|  |  | ||||||
|  | Date: Wednesday 3, October 2011  | ||||||
|  | Time: 7 : 00am  | ||||||
|  | Address: 130 Fox St | ||||||
|  |  | ||||||
|  | Please bring in your ID.""" | ||||||
|  |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|  |     eq_(msg_body, parsed) | ||||||
|   | |||||||
| @@ -1,9 +1,71 @@ | |||||||
| from . import * | # coding:utf-8 | ||||||
|  |  | ||||||
| from talon import utils | from __future__ import absolute_import | ||||||
|  |  | ||||||
|  | from talon import utils as u | ||||||
|  | from . import * | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
|     eq_('\r\n', utils.get_delimiter('abc\r\n123')) |     eq_('\r\n', u.get_delimiter('abc\r\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc\n123')) |     eq_('\n', u.get_delimiter('abc\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc')) |     eq_('\n', u.get_delimiter('abc')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_html_to_text(): | ||||||
|  |     html = """<body> | ||||||
|  | <p>Hello world!</p> | ||||||
|  | <br> | ||||||
|  | <ul> | ||||||
|  | <li>One!</li> | ||||||
|  | <li>Two</li> | ||||||
|  | </ul> | ||||||
|  | <p> | ||||||
|  | Haha | ||||||
|  | </p> | ||||||
|  | </body>""" | ||||||
|  |     text = u.html_to_text(html) | ||||||
|  |     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|  |     eq_(u"привет!", u.html_to_text("<b>привет!</b>")) | ||||||
|  |  | ||||||
|  |     html = '<body><br/><br/>Hi</body>' | ||||||
|  |     eq_('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """Hi | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | div, p, li { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style> | ||||||
|  |  | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | h1 { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style>""" | ||||||
|  |     eq_('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """<div> | ||||||
|  | <!-- COMMENT 1 --> | ||||||
|  | <span>TEXT 1</span> | ||||||
|  | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
|  | </div>""" | ||||||
|  |     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_comment_no_parent(): | ||||||
|  |     s = '<!-- COMMENT 1 --> no comment' | ||||||
|  |     d = u.html_document_fromstring(s) | ||||||
|  |     eq_("no comment", u.html_tree_to_text(d)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html_to_text(): | ||||||
|  |     bad_html = "one<br>two<br>three" | ||||||
|  |     eq_(None, u.html_to_text(bad_html)) | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								train.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								train.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
|  | from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | ||||||
|  | from talon.signature.learning.classifier import train, init | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def train_model(): | ||||||
|  |     """ retrain model and persist """ | ||||||
|  |     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     train_model() | ||||||
		Reference in New Issue
	
	Block a user