Compare commits
	
		
			10 Commits
		
	
	
		
			thrawn/dev
			...
			maxim/deve
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 14f106ee76 | ||
|  | a8c7e6a972 | ||
|  | b30c375c5b | ||
|  | cec5acf58f | ||
|  | 24d0f2d00a | ||
|  | 94007b0b92 | ||
|  | 1a5548f171 | ||
|  | 53c49b9121 | ||
|  | bd50872043 | ||
|  | d37c4fd551 | 
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | FROM python:3.9-slim-buster AS deps | ||||||
|  |  | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||||
|  |  | ||||||
|  | FROM deps AS testable | ||||||
|  | ARG REPORT_PATH | ||||||
|  |  | ||||||
|  | VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||||
|  |  | ||||||
|  | ADD . /app | ||||||
|  | WORKDIR /app | ||||||
|  | COPY wheel/* /wheel/ | ||||||
|  |  | ||||||
|  | RUN mkdir -p ${REPORT_PATH} | ||||||
|  |  | ||||||
|  | RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||||
|  |     pip install --no-deps /wheel/* | ||||||
|  |  | ||||||
|  | ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||||
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -54,3 +54,6 @@ _trial_temp | |||||||
|  |  | ||||||
| # OSX | # OSX | ||||||
| .DS_Store | .DS_Store | ||||||
|  |  | ||||||
|  | # vim-backup | ||||||
|  | *.bak | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | chardet>=1.0.1 | ||||||
|  | cchardet>=0.3.5 | ||||||
|  | cssselect | ||||||
|  | html5lib | ||||||
|  | joblib | ||||||
|  | lxml>=2.3.3 | ||||||
|  | numpy | ||||||
|  | regex>=1 | ||||||
|  | scikit-learn>=1.0.0 | ||||||
|  | scipy | ||||||
|  | six>=1.10.0 | ||||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | set -ex | ||||||
|  | REPORT_PATH="${REPORT_PATH:-./}" | ||||||
|  | nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||||
							
								
								
									
										31
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								setup.py
									
									
									
									
									
								
							| @@ -19,17 +19,17 @@ class InstallCommand(install): | |||||||
|         if self.no_ml: |         if self.no_ml: | ||||||
|             dist = self.distribution |             dist = self.distribution | ||||||
|             dist.packages=find_packages(exclude=[ |             dist.packages=find_packages(exclude=[ | ||||||
|                 'tests', |                 "tests", | ||||||
|                 'tests.*', |                 "tests.*", | ||||||
|                 'talon.signature', |                 "talon.signature", | ||||||
|                 'talon.signature.*', |                 "talon.signature.*", | ||||||
|             ]) |             ]) | ||||||
|             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: |             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||||
|                 dist.install_requires.remove(not_required) |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.8', |       version='1.6.0', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -44,20 +44,21 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml>=2.3.3", |           "lxml", | ||||||
|           "regex>=1", |           "regex", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=1.0.0", | ||||||
|           'chardet>=1.0.1', |           "chardet", | ||||||
|           'cchardet>=0.3.5', |           "cchardet", | ||||||
|           'cssselect', |           "cssselect", | ||||||
|           'six>=1.10.0', |           "six", | ||||||
|           'html5lib' |           "html5lib", | ||||||
|  |           "joblib", | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|           "nose>=1.2.1", |           "nose", | ||||||
|           "coverage" |           "coverage" | ||||||
|           ] |           ] | ||||||
|       ) |       ) | ||||||
|   | |||||||
| @@ -6,18 +6,17 @@ original messages (without quoted messages) | |||||||
| """ | """ | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| import regex as re |  | ||||||
| import logging | import logging | ||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | import regex as re | ||||||
|  | from lxml import etree, html | ||||||
| from talon.utils import (get_delimiter, html_tree_to_text, |  | ||||||
|                          html_document_fromstring) |  | ||||||
| from talon import html_quotations |  | ||||||
| from six.moves import range | from six.moves import range | ||||||
| import six |  | ||||||
|  |  | ||||||
|  | from talon import html_quotations | ||||||
|  | from talon.utils import (get_delimiter, html_document_fromstring, | ||||||
|  |                          html_tree_to_text) | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
| @@ -94,7 +93,7 @@ RE_ON_DATE_WROTE_SMB = re.compile( | |||||||
|     ) |     ) | ||||||
|  |  | ||||||
| RE_QUOTATION = re.compile( | RE_QUOTATION = re.compile( | ||||||
|     r''' |     r""" | ||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
| @@ -112,10 +111,10 @@ RE_QUOTATION = re.compile( | |||||||
|  |  | ||||||
|     # after quotations should be text only or nothing at all |     # after quotations should be text only or nothing at all | ||||||
|     [te]*$ |     [te]*$ | ||||||
|     ''', re.VERBOSE) |     """, re.VERBOSE) | ||||||
|  |  | ||||||
| RE_EMPTY_QUOTATION = re.compile( | RE_EMPTY_QUOTATION = re.compile( | ||||||
|     r''' |     r""" | ||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
| @@ -125,7 +124,7 @@ RE_EMPTY_QUOTATION = re.compile( | |||||||
|         ) |         ) | ||||||
|     ) |     ) | ||||||
|     e* |     e* | ||||||
|     ''', re.VERBOSE) |     """, re.VERBOSE) | ||||||
|  |  | ||||||
| # ------Original Message------ or ---- Reply Message ---- | # ------Original Message------ or ---- Reply Message ---- | ||||||
| # With variations in other languages. | # With variations in other languages. | ||||||
| @@ -193,9 +192,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") | |||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 6 | SPLITTER_MAX_LINES = 6 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
| # an extensive research shows that exceeding this limit |  | ||||||
| # leads to excessive processing time |  | ||||||
| MAX_HTML_LEN = 2794202 |  | ||||||
|  |  | ||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
| @@ -346,9 +342,6 @@ def _replace_link_brackets(msg_body): | |||||||
|  |  | ||||||
|     Converts msg_body into a unicode |     Converts msg_body into a unicode | ||||||
|     """ |     """ | ||||||
|     if isinstance(msg_body, bytes): |  | ||||||
|         msg_body = msg_body.decode('utf8') |  | ||||||
|  |  | ||||||
|     def link_wrapper(link): |     def link_wrapper(link): | ||||||
|         newline_index = msg_body[:link.start()].rfind("\n") |         newline_index = msg_body[:link.start()].rfind("\n") | ||||||
|         if msg_body[newline_index + 1] == ">": |         if msg_body[newline_index + 1] == ">": | ||||||
| @@ -388,8 +381,6 @@ def postprocess(msg_body): | |||||||
|  |  | ||||||
| def extract_from_plain(msg_body): | def extract_from_plain(msg_body): | ||||||
|     """Extracts a non quoted message from provided plain text.""" |     """Extracts a non quoted message from provided plain text.""" | ||||||
|     stripped_text = msg_body |  | ||||||
|  |  | ||||||
|     delimiter = get_delimiter(msg_body) |     delimiter = get_delimiter(msg_body) | ||||||
|     msg_body = preprocess(msg_body, delimiter) |     msg_body = preprocess(msg_body, delimiter) | ||||||
|     # don't process too long messages |     # don't process too long messages | ||||||
| @@ -421,25 +412,27 @@ def extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     Returns a unicode string. |     Returns a unicode string. | ||||||
|     """ |     """ | ||||||
|     if isinstance(msg_body, six.text_type): |     if msg_body.strip() == "": | ||||||
|         msg_body = msg_body.encode('utf8') |         return msg_body | ||||||
|     elif not isinstance(msg_body, bytes): |  | ||||||
|         msg_body = msg_body.encode('ascii') |  | ||||||
|  |  | ||||||
|     result = _extract_from_html(msg_body) |     msg_body = msg_body.replace("\r\n", "\n") | ||||||
|     if isinstance(result, bytes): |     # Cut out xml and doctype tags to avoid conflict with unicode decoding. | ||||||
|         result = result.decode('utf8') |     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||||
|  |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |     if html_tree is None: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     result = extract_from_html_tree(html_tree) | ||||||
|  |     if not result: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     return result |     return result | ||||||
|  |  | ||||||
|  |  | ||||||
| def _extract_from_html(msg_body): | def extract_from_html_tree(html_tree): | ||||||
|     """ |     """ | ||||||
|     Extract not quoted message from provided html message body |     Extract not quoted message from provided parsed html tree using tags and | ||||||
|     using tags and plain text algorithm. |     plain text algorithm. | ||||||
|  |  | ||||||
|     Cut out first some encoding html tags such as xml and doctype |  | ||||||
|     for avoiding conflict with unicode decoding |  | ||||||
|  |  | ||||||
|     Cut out the 'blockquote', 'gmail_quote' tags. |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|     Cut Microsoft quotations. |     Cut Microsoft quotations. | ||||||
| @@ -452,20 +445,6 @@ def _extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|     if msg_body.strip() == b'': |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') |  | ||||||
|  |  | ||||||
|     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) |  | ||||||
|  |  | ||||||
|     html_tree = html_document_fromstring(msg_body) |  | ||||||
|  |  | ||||||
|     if html_tree is None: |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     cut_quotations = False |  | ||||||
|     try: |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|                       html_quotations.cut_zimbra_quote(html_tree) or |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
| @@ -473,10 +452,6 @@ def _extract_from_html(msg_body): | |||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|                       html_quotations.cut_from_block(html_tree) |                       html_quotations.cut_from_block(html_tree) | ||||||
|                       ) |                       ) | ||||||
|     except Exception as e: |  | ||||||
|         log.exception('during html quotations cut') |  | ||||||
|         pass |  | ||||||
|  |  | ||||||
|     html_tree_copy = deepcopy(html_tree) |     html_tree_copy = deepcopy(html_tree) | ||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
| @@ -487,7 +462,7 @@ def _extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     if len(lines) > MAX_LINES_COUNT: | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     # Collect checkpoints on each line |     # Collect checkpoints on each line | ||||||
|     line_checkpoints = [ |     line_checkpoints = [ | ||||||
| @@ -506,7 +481,7 @@ def _extract_from_html(msg_body): | |||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|     if not lines_were_deleted and not cut_quotations: |     if not lines_were_deleted and not cut_quotations: | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
| @@ -520,7 +495,7 @@ def _extract_from_html(msg_body): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     if _readable_text_empty(html_tree_copy): |     if _readable_text_empty(html_tree_copy): | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML |     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||||
|     # parsers do not recognize namespaces in HTML tags. As such the rendered |     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||||
| @@ -546,7 +521,11 @@ def _extract_from_html(msg_body): | |||||||
|     #    of replacing data outside the <tag> which might be essential to |     #    of replacing data outside the <tag> which might be essential to | ||||||
|     #    the customer. |     #    the customer. | ||||||
|     remove_namespaces(html_tree_copy) |     remove_namespaces(html_tree_copy) | ||||||
|     return html.tostring(html_tree_copy) |     s = html.tostring(html_tree_copy, encoding="ascii") | ||||||
|  |     if not s: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return s.decode("ascii") | ||||||
|  |  | ||||||
|  |  | ||||||
| def remove_namespaces(root): | def remove_namespaces(root): | ||||||
| @@ -665,10 +644,10 @@ def _readable_text_empty(html_tree): | |||||||
|  |  | ||||||
|  |  | ||||||
| def is_splitter(line): | def is_splitter(line): | ||||||
|     ''' |     """ | ||||||
|     Returns Matcher object if provided string is a splitter and |     Returns Matcher object if provided string is a splitter and | ||||||
|     None otherwise. |     None otherwise. | ||||||
|     ''' |     """ | ||||||
|     for pattern in SPLITTER_PATTERNS: |     for pattern in SPLITTER_PATTERNS: | ||||||
|         matcher = re.match(pattern, line) |         matcher = re.match(pattern, line) | ||||||
|         if matcher: |         if matcher: | ||||||
| @@ -676,12 +655,12 @@ def is_splitter(line): | |||||||
|  |  | ||||||
|  |  | ||||||
| def text_content(context): | def text_content(context): | ||||||
|     '''XPath Extension function to return a node text content.''' |     """XPath Extension function to return a node text content.""" | ||||||
|     return context.context_node.xpath("string()").strip() |     return context.context_node.xpath("string()").strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def tail(context): | def tail(context): | ||||||
|     '''XPath Extension function to return a node tail text.''' |     """XPath Extension function to return a node tail text.""" | ||||||
|     return context.context_node.tail or '' |     return context.context_node.tail or '' | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -23,17 +23,14 @@ trained against, don't forget to regenerate: | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from . import extraction | from talon.signature import extraction | ||||||
| from . extraction import extract  #noqa | from talon.signature.extraction import extract | ||||||
| from . learning import classifier | from talon.signature.learning import classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') |  | ||||||
|  |  | ||||||
| EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier') |  | ||||||
| EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def initialize(): | def initialize(): | ||||||
|     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, |     data_dir = os.path.join(os.path.dirname(__file__), 'data') | ||||||
|                                            EXTRACTOR_DATA) |     extractor_filename = os.path.join(data_dir, 'classifier') | ||||||
|  |     extractor_data_filename = os.path.join(data_dir, 'train.data') | ||||||
|  |     extraction.EXTRACTOR = classifier.load(extractor_filename, | ||||||
|  |                                            extractor_data_filename) | ||||||
|   | |||||||
| @@ -62,7 +62,7 @@ RE_SIGNATURE_CANDIDATE = re.compile(r''' | |||||||
|  |  | ||||||
|  |  | ||||||
| def extract_signature(msg_body): | def extract_signature(msg_body): | ||||||
|     """ |     ''' | ||||||
|     Analyzes message for a presence of signature block (by common patterns) |     Analyzes message for a presence of signature block (by common patterns) | ||||||
|     and returns tuple with two elements: message text without signature block |     and returns tuple with two elements: message text without signature block | ||||||
|     and the signature itself. |     and the signature itself. | ||||||
| @@ -72,7 +72,7 @@ def extract_signature(msg_body): | |||||||
|  |  | ||||||
|     >>> extract_signature('Hey man!') |     >>> extract_signature('Hey man!') | ||||||
|     ('Hey man!', None) |     ('Hey man!', None) | ||||||
|     """ |     ''' | ||||||
|     try: |     try: | ||||||
|         # identify line delimiter first |         # identify line delimiter first | ||||||
|         delimiter = get_delimiter(msg_body) |         delimiter = get_delimiter(msg_body) | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -8,7 +8,7 @@ body belongs to the signature. | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.externals import joblib | import joblib | ||||||
| from sklearn.svm import LinearSVC | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -5,21 +5,17 @@ | |||||||
| * regexp's constants used when evaluating signature's features | * regexp's constants used when evaluating signature's features | ||||||
|  |  | ||||||
| """ | """ | ||||||
|  |  | ||||||
| from __future__ import absolute_import |  | ||||||
| import unicodedata | import unicodedata | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.utils import to_unicode |  | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from talon.signature.constants import SIGNATURE_MAX_LINES | ||||||
|  |  | ||||||
|  |  | ||||||
| rc = re.compile | rc = re.compile | ||||||
|  |  | ||||||
| RE_EMAIL = rc('\S@\S') | RE_EMAIL = rc('\S@\S') | ||||||
| RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | ||||||
| RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | RE_URL = rc(r"""https?://|www\.[\S]+\.[\S]""") | ||||||
|  |  | ||||||
| # Taken from: | # Taken from: | ||||||
| # http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf | # http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf | ||||||
| @@ -55,7 +51,7 @@ BAD_SENDER_NAMES = [ | |||||||
|  |  | ||||||
|  |  | ||||||
| def binary_regex_search(prog): | def binary_regex_search(prog): | ||||||
|     '''Returns a function that returns 1 or 0 depending on regex search result. |     """Returns a function that returns 1 or 0 depending on regex search result. | ||||||
|  |  | ||||||
|     If regular expression compiled into prog is present in a string |     If regular expression compiled into prog is present in a string | ||||||
|     the result of calling the returned function with the string will be 1 |     the result of calling the returned function with the string will be 1 | ||||||
| @@ -66,12 +62,12 @@ def binary_regex_search(prog): | |||||||
|     1 |     1 | ||||||
|     >>> binary_regex_search(re.compile("12"))("34") |     >>> binary_regex_search(re.compile("12"))("34") | ||||||
|     0 |     0 | ||||||
|     ''' |     """ | ||||||
|     return lambda s: 1 if prog.search(s) else 0 |     return lambda s: 1 if prog.search(s) else 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def binary_regex_match(prog): | def binary_regex_match(prog): | ||||||
|     '''Returns a function that returns 1 or 0 depending on regex match result. |     """Returns a function that returns 1 or 0 depending on regex match result. | ||||||
|  |  | ||||||
|     If a string matches regular expression compiled into prog |     If a string matches regular expression compiled into prog | ||||||
|     the result of calling the returned function with the string will be 1 |     the result of calling the returned function with the string will be 1 | ||||||
| @@ -82,7 +78,7 @@ def binary_regex_match(prog): | |||||||
|     1 |     1 | ||||||
|     >>> binary_regex_match(re.compile("12"))("3 12") |     >>> binary_regex_match(re.compile("12"))("3 12") | ||||||
|     0 |     0 | ||||||
|     ''' |     """ | ||||||
|     return lambda s: 1 if prog.match(s) else 0 |     return lambda s: 1 if prog.match(s) else 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -102,7 +98,7 @@ def flatten_list(list_to_flatten): | |||||||
|  |  | ||||||
|  |  | ||||||
| def contains_sender_names(sender): | def contains_sender_names(sender): | ||||||
|     '''Returns a functions to search sender\'s name or it\'s part. |     """Returns a functions to search sender\'s name or it\'s part. | ||||||
|  |  | ||||||
|     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") |     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") | ||||||
|     >>> feature("Sergey Obukhov") |     >>> feature("Sergey Obukhov") | ||||||
| @@ -115,7 +111,7 @@ def contains_sender_names(sender): | |||||||
|     1 |     1 | ||||||
|     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") |     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") | ||||||
|     1 |     1 | ||||||
|     ''' |     """ | ||||||
|     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] |     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] | ||||||
|                                         for e in extract_names(sender)])) |                                         for e in extract_names(sender)])) | ||||||
|     names = names or sender |     names = names or sender | ||||||
| @@ -135,20 +131,25 @@ def extract_names(sender): | |||||||
|     >>> extract_names('') |     >>> extract_names('') | ||||||
|     [] |     [] | ||||||
|     """ |     """ | ||||||
|     sender = to_unicode(sender, precise=True) |  | ||||||
|     # Remove non-alphabetical characters |     # Remove non-alphabetical characters | ||||||
|     sender = "".join([char if char.isalpha() else ' ' for char in sender]) |     sender = "".join([char if char.isalpha() else ' ' for char in sender]) | ||||||
|     # Remove too short words and words from "black" list i.e. |     # Remove too short words and words from "black" list i.e. | ||||||
|     # words like `ru`, `gmail`, `com`, `org`, etc. |     # words like `ru`, `gmail`, `com`, `org`, etc. | ||||||
|     sender = [word for word in sender.split() if len(word) > 1 and |     names = list() | ||||||
|               not word in BAD_SENDER_NAMES] |     for word in sender.split(): | ||||||
|     # Remove duplicates |         if len(word) < 2: | ||||||
|     names = list(set(sender)) |             continue | ||||||
|  |         if word in BAD_SENDER_NAMES: | ||||||
|  |             continue | ||||||
|  |         if word in names: | ||||||
|  |             continue | ||||||
|  |         names.append(word) | ||||||
|  |  | ||||||
|     return names |     return names | ||||||
|  |  | ||||||
|  |  | ||||||
| def categories_percent(s, categories): | def categories_percent(s, categories): | ||||||
|     '''Returns category characters percent. |     """Returns category characters percent. | ||||||
|  |  | ||||||
|     >>> categories_percent("qqq ggg hhh", ["Po"]) |     >>> categories_percent("qqq ggg hhh", ["Po"]) | ||||||
|     0.0 |     0.0 | ||||||
| @@ -160,9 +161,8 @@ def categories_percent(s, categories): | |||||||
|     50.0 |     50.0 | ||||||
|     >>> categories_percent("s.s,5s", ["Po", "Nd"]) |     >>> categories_percent("s.s,5s", ["Po", "Nd"]) | ||||||
|     50.0 |     50.0 | ||||||
|     ''' |     """ | ||||||
|     count = 0 |     count = 0 | ||||||
|     s = to_unicode(s, precise=True) |  | ||||||
|     for c in s: |     for c in s: | ||||||
|         if unicodedata.category(c) in categories: |         if unicodedata.category(c) in categories: | ||||||
|             count += 1 |             count += 1 | ||||||
| @@ -170,19 +170,18 @@ def categories_percent(s, categories): | |||||||
|  |  | ||||||
|  |  | ||||||
| def punctuation_percent(s): | def punctuation_percent(s): | ||||||
|     '''Returns punctuation percent. |     """Returns punctuation percent. | ||||||
|  |  | ||||||
|     >>> punctuation_percent("qqq ggg hhh") |     >>> punctuation_percent("qqq ggg hhh") | ||||||
|     0.0 |     0.0 | ||||||
|     >>> punctuation_percent("q,w.") |     >>> punctuation_percent("q,w.") | ||||||
|     50.0 |     50.0 | ||||||
|     ''' |     """ | ||||||
|     return categories_percent(s, ['Po']) |     return categories_percent(s, ['Po']) | ||||||
|  |  | ||||||
|  |  | ||||||
| def capitalized_words_percent(s): | def capitalized_words_percent(s): | ||||||
|     '''Returns capitalized words percent.''' |     """Returns capitalized words percent.""" | ||||||
|     s = to_unicode(s, precise=True) |  | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|     words = [w for w in words if len(w) > 2]     |     words = [w for w in words if len(w) > 2]     | ||||||
| @@ -208,20 +207,26 @@ def many_capitalized_words(s): | |||||||
|  |  | ||||||
|  |  | ||||||
| def has_signature(body, sender): | def has_signature(body, sender): | ||||||
|     '''Checks if the body has signature. Returns True or False.''' |     """Checks if the body has signature. Returns True or False.""" | ||||||
|     non_empty = [line for line in body.splitlines() if line.strip()] |     non_empty = [line for line in body.splitlines() if line.strip()] | ||||||
|     candidate = non_empty[-SIGNATURE_MAX_LINES:] |     candidate = non_empty[-SIGNATURE_MAX_LINES:] | ||||||
|     upvotes = 0 |     upvotes = 0 | ||||||
|  |     sender_check = contains_sender_names(sender) | ||||||
|     for line in candidate: |     for line in candidate: | ||||||
|         # we check lines for sender's name, phone, email and url, |         # we check lines for sender's name, phone, email and url, | ||||||
|         # those signature lines don't take more then 27 lines |         # those signature lines don't take more then 27 lines | ||||||
|         if len(line.strip()) > 27: |         if len(line.strip()) > 27: | ||||||
|             continue |             continue | ||||||
|         elif contains_sender_names(sender)(line): |  | ||||||
|  |         if sender_check(line): | ||||||
|             return True |             return True | ||||||
|         elif (binary_regex_search(RE_RELAX_PHONE)(line) + |  | ||||||
|  |         if (binary_regex_search(RE_RELAX_PHONE)(line) + | ||||||
|                 binary_regex_search(RE_EMAIL)(line) + |                 binary_regex_search(RE_EMAIL)(line) + | ||||||
|                 binary_regex_search(RE_URL)(line) == 1): |                 binary_regex_search(RE_URL)(line) == 1): | ||||||
|             upvotes += 1 |             upvotes += 1 | ||||||
|  |  | ||||||
|     if upvotes > 1: |     if upvotes > 1: | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  |     return False | ||||||
|   | |||||||
							
								
								
									
										173
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										173
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -1,110 +1,17 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  | from __future__ import annotations | ||||||
|  |  | ||||||
| from __future__ import absolute_import |  | ||||||
|  |  | ||||||
| from random import shuffle |  | ||||||
|  |  | ||||||
| import cchardet |  | ||||||
| import chardet |  | ||||||
| import html5lib | import html5lib | ||||||
| import regex as re | import regex as re | ||||||
| import six | from html5lib import HTMLParser | ||||||
| from lxml.cssselect import CSSSelector | from lxml.cssselect import CSSSelector | ||||||
|  | from lxml.etree import _Element | ||||||
| from lxml.html import html5parser | from lxml.html import html5parser | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): | def get_delimiter(msg_body: str) -> str: | ||||||
|     """ |  | ||||||
|     Helper: formats string with any combination of bytestrings/unicode |  | ||||||
|     strings without raising exceptions |  | ||||||
|     """ |  | ||||||
|     try: |  | ||||||
|         if not args and not kwargs: |  | ||||||
|             return format_string |  | ||||||
|         else: |  | ||||||
|             return format_string.format(*args, **kwargs) |  | ||||||
|  |  | ||||||
|     # catch encoding errors and transform everything into utf-8 string |  | ||||||
|     # before logging: |  | ||||||
|     except (UnicodeEncodeError, UnicodeDecodeError): |  | ||||||
|         format_string = to_utf8(format_string) |  | ||||||
|         args = [to_utf8(p) for p in args] |  | ||||||
|         kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)} |  | ||||||
|         return format_string.format(*args, **kwargs) |  | ||||||
|  |  | ||||||
|     # ignore other errors |  | ||||||
|     except: |  | ||||||
|         return u'' |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_unicode(str_or_unicode, precise=False): |  | ||||||
|     """ |  | ||||||
|     Safely returns a unicode version of a given string |  | ||||||
|     >>> utils.to_unicode('привет') |  | ||||||
|         u'привет' |  | ||||||
|     >>> utils.to_unicode(u'привет') |  | ||||||
|         u'привет' |  | ||||||
|     If `precise` flag is True, tries to guess the correct encoding first. |  | ||||||
|     """ |  | ||||||
|     if not isinstance(str_or_unicode, six.text_type): |  | ||||||
|         encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' |  | ||||||
|         return six.text_type(str_or_unicode, encoding, 'replace') |  | ||||||
|     return str_or_unicode |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def detect_encoding(string): |  | ||||||
|     """ |  | ||||||
|     Tries to detect the encoding of the passed string. |  | ||||||
|  |  | ||||||
|     Defaults to UTF-8. |  | ||||||
|     """ |  | ||||||
|     assert isinstance(string, bytes) |  | ||||||
|     try: |  | ||||||
|         detected = chardet.detect(string) |  | ||||||
|         if detected: |  | ||||||
|             return detected.get('encoding') or 'utf-8' |  | ||||||
|     except Exception as e: |  | ||||||
|         pass |  | ||||||
|     return 'utf-8' |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def quick_detect_encoding(string): |  | ||||||
|     """ |  | ||||||
|     Tries to detect the encoding of the passed string. |  | ||||||
|  |  | ||||||
|     Uses cchardet. Falls back to detect_encoding. |  | ||||||
|     """ |  | ||||||
|     assert isinstance(string, bytes) |  | ||||||
|     try: |  | ||||||
|         detected = cchardet.detect(string) |  | ||||||
|         if detected: |  | ||||||
|             return detected.get('encoding') or detect_encoding(string) |  | ||||||
|     except Exception as e: |  | ||||||
|         pass |  | ||||||
|     return detect_encoding(string) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_utf8(str_or_unicode): |  | ||||||
|     """ |  | ||||||
|     Safely returns a UTF-8 version of a given string |  | ||||||
|     >>> utils.to_utf8(u'hi') |  | ||||||
|         'hi' |  | ||||||
|     """ |  | ||||||
|     if not isinstance(str_or_unicode, six.text_type): |  | ||||||
|         return str_or_unicode.encode("utf-8", "ignore") |  | ||||||
|     return str(str_or_unicode) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def random_token(length=7): |  | ||||||
|     vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z " |  | ||||||
|             "0 1 2 3 4 5 6 7 8 9").split(' ') |  | ||||||
|     shuffle(vals) |  | ||||||
|     return ''.join(vals[:length]) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_delimiter(msg_body): |  | ||||||
|     delimiter = RE_DELIMITER.search(msg_body) |     delimiter = RE_DELIMITER.search(msg_body) | ||||||
|     if delimiter: |     if delimiter: | ||||||
|         delimiter = delimiter.group() |         delimiter = delimiter.group() | ||||||
| @@ -114,7 +21,7 @@ def get_delimiter(msg_body): | |||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_tree_to_text(tree): | def html_tree_to_text(tree: _Element) -> str: | ||||||
|     for style in CSSSelector('style')(tree): |     for style in CSSSelector('style')(tree): | ||||||
|         style.getparent().remove(style) |         style.getparent().remove(style) | ||||||
|  |  | ||||||
| @@ -146,26 +53,22 @@ def html_tree_to_text(tree): | |||||||
|             not text.endswith("\n") and not el_text): |             not text.endswith("\n") and not el_text): | ||||||
|             text += "\n" |             text += "\n" | ||||||
|  |  | ||||||
|     retval = _rm_excessive_newlines(text) |     text = _rm_excessive_newlines(text) | ||||||
|     return _encode_utf8(retval) |     return text | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_to_text(string): | def html_to_text(s: str) -> str | None: | ||||||
|     """ |     """ | ||||||
|     Dead-simple HTML-to-text converter: |     Dead-simple HTML-to-text converter: | ||||||
|         >>> html_to_text("one<br>two<br>three") |         >>> html_to_text("one<br>two<br>three") | ||||||
|         >>> "one\ntwo\nthree" |         <<< "one\ntwo\nthree" | ||||||
|  |  | ||||||
|     NOTES: |     NOTES: | ||||||
|         1. the string is expected to contain UTF-8 encoded HTML! |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|         2. returns utf-8 encoded str (not unicode) |  | ||||||
|         3. if html can't be parsed returns None |         3. if html can't be parsed returns None | ||||||
|     """ |     """ | ||||||
|     if isinstance(string, six.text_type): |     s = _prepend_utf8_declaration(s) | ||||||
|         string = string.encode('utf8') |     s = s.replace("\n", "") | ||||||
|  |  | ||||||
|     s = _prepend_utf8_declaration(string) |  | ||||||
|     s = s.replace(b"\n", b"") |  | ||||||
|     tree = html_fromstring(s) |     tree = html_fromstring(s) | ||||||
|  |  | ||||||
|     if tree is None: |     if tree is None: | ||||||
| @@ -174,74 +77,46 @@ def html_to_text(string): | |||||||
|     return html_tree_to_text(tree) |     return html_tree_to_text(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_fromstring(s): | def html_fromstring(s: str) -> _Element: | ||||||
|     """Parse html tree from string. Return None if the string can't be parsed. |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|     """ |     """ | ||||||
|     if isinstance(s, six.text_type): |  | ||||||
|         s = s.encode('utf8') |  | ||||||
|     try: |  | ||||||
|         if html_too_big(s): |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     return html5parser.fromstring(s, parser=_html5lib_parser()) |     return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |  | ||||||
|         pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_document_fromstring(s): | def html_document_fromstring(s: str) -> _Element: | ||||||
|     """Parse html tree from string. Return None if the string can't be parsed. |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|     """ |     """ | ||||||
|     if isinstance(s, six.text_type): |  | ||||||
|         s = s.encode('utf8') |  | ||||||
|     try: |  | ||||||
|         if html_too_big(s): |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     return html5parser.document_fromstring(s, parser=_html5lib_parser()) |     return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |  | ||||||
|         pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def cssselect(expr, tree): | def cssselect(expr: str, tree: str) -> list[_Element]: | ||||||
|     return CSSSelector(expr)(tree) |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_too_big(s): | def _contains_charset_spec(s: str) -> str: | ||||||
|     if isinstance(s, six.text_type): |  | ||||||
|         s = s.encode('utf8') |  | ||||||
|     return s.count(b'<') > _MAX_TAGS_COUNT |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _contains_charset_spec(s): |  | ||||||
|     """Return True if the first 4KB contain charset spec |     """Return True if the first 4KB contain charset spec | ||||||
|     """ |     """ | ||||||
|     return s.lower().find(b'html; charset=', 0, 4096) != -1 |     return s.lower().find('html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
| def _prepend_utf8_declaration(s): | def _prepend_utf8_declaration(s: str) -> str: | ||||||
|     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any |     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any | ||||||
|     """ |     """ | ||||||
|     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s |     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s | ||||||
|  |  | ||||||
|  |  | ||||||
| def _rm_excessive_newlines(s): | def _rm_excessive_newlines(s: str) -> str: | ||||||
|     """Remove excessive newlines that often happen due to tons of divs |     """Remove excessive newlines that often happen due to tons of divs | ||||||
|     """ |     """ | ||||||
|     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() |     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def _encode_utf8(s): | def _html5lib_parser() -> HTMLParser: | ||||||
|     """Encode in 'utf-8' if unicode |  | ||||||
|     """ |  | ||||||
|     return s.encode('utf-8') if isinstance(s, six.text_type) else s |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _html5lib_parser(): |  | ||||||
|     """ |     """ | ||||||
|     html5lib is a pure-python library that conforms to the WHATWG HTML spec |     html5lib is a pure-python library that conforms to the WHATWG HTML spec | ||||||
|     and is not vulnerable to certain attacks common for XML libraries |     and is not vulnerable to certain attacks common for XML libraries | ||||||
|     """ |     """ | ||||||
|     return html5lib.HTMLParser( |     return HTMLParser( | ||||||
|         # build lxml tree |         # build lxml tree | ||||||
|         html5lib.treebuilders.getTreeBuilder("lxml"), |         html5lib.treebuilders.getTreeBuilder("lxml"), | ||||||
|         # remove namespace value from inside lxml.html.html5parser element tag |         # remove namespace value from inside lxml.html.html5parser element tag | ||||||
| @@ -251,14 +126,10 @@ def _html5lib_parser(): | |||||||
|     ) |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
| _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | ||||||
|                      b'charset=utf-8">') |                      'charset=utf-8">') | ||||||
|  |  | ||||||
| _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|  |  | ||||||
| # an extensive research shows that exceeding this limit |  | ||||||
| # might lead to excessive processing time |  | ||||||
| _MAX_TAGS_COUNT = 419 |  | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | coverage | ||||||
|  | mock | ||||||
|  | nose>=1.2.1 | ||||||
| @@ -1,4 +1,6 @@ | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from nose.tools import * | ||||||
|  | from mock import * | ||||||
|  |  | ||||||
| import talon | import talon | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,14 +2,19 @@ | |||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from tests.fixtures import REPLY_QUOTATIONS_SHARE_BLOCK, OLK_SRC_BODY_SECTION, REPLY_SEPARATED_BY_HR | # noinspection PyUnresolvedReferences | ||||||
| from nose.tools import eq_, ok_, assert_false, assert_true |  | ||||||
| from talon import quotations, utils as u |  | ||||||
| from mock import Mock, patch |  | ||||||
| import re | import re | ||||||
|  | from unittest.mock import Mock, patch | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | from nose.tools import assert_false, assert_true, eq_, ok_ | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") |  | ||||||
|  | from tests.fixtures import (OLK_SRC_BODY_SECTION, | ||||||
|  |                             REPLY_QUOTATIONS_SHARE_BLOCK, | ||||||
|  |                             REPLY_SEPARATED_BY_HR) | ||||||
|  | from talon import quotations, utils as u | ||||||
|  |  | ||||||
|  | RE_WHITESPACE = re.compile(r"\s") | ||||||
|  | RE_DOUBLE_WHITESPACE = re.compile(r"\s") | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quotation_splitter_inside_blockquote(): | def test_quotation_splitter_inside_blockquote(): | ||||||
| @@ -164,7 +169,7 @@ def test_unicode_in_reply(): | |||||||
|  |  | ||||||
| <blockquote> | <blockquote> | ||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><head></head><body>Reply  Text<br><div><br></div>" |     eq_("<html><head></head><body>Reply  Text<br><div><br></div>" | ||||||
|         "</body></html>", |         "</body></html>", | ||||||
| @@ -312,7 +317,6 @@ def extract_reply_and_check(filename): | |||||||
|     msg_body = f.read() |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|     plain_reply = u.html_to_text(reply) |     plain_reply = u.html_to_text(reply) | ||||||
|     plain_reply = plain_reply.decode('utf8') |  | ||||||
|  |  | ||||||
|     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|         RE_WHITESPACE.sub('', plain_reply)) |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
| @@ -389,18 +393,6 @@ def test_gmail_forwarded_msg(): | |||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 4) |  | ||||||
| def test_too_large_html(): |  | ||||||
|     msg_body = 'Reply' \ |  | ||||||
|                '<div class="gmail_quote">' \ |  | ||||||
|                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ |  | ||||||
|                '<div>Test</div>' \ |  | ||||||
|                '</div>' \ |  | ||||||
|                '</div>' |  | ||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), |  | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_readable_html_empty(): | def test_readable_html_empty(): | ||||||
|     msg_body = """ |     msg_body = """ | ||||||
| <blockquote> | <blockquote> | ||||||
|   | |||||||
| @@ -1,10 +1,10 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from . import * | ||||||
|  | from . fixtures import * | ||||||
|  |  | ||||||
| from mock import Mock, patch |  | ||||||
| from talon import quotations | from talon import quotations | ||||||
| from nose.tools import eq_ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'extract_from_html') | @patch.object(quotations, 'extract_from_html') | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| from nose.tools import eq_ | from .. import * | ||||||
|  |  | ||||||
| from talon.signature import bruteforce | from talon.signature import bruteforce | ||||||
| from mock import patch, Mock |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_empty_body(): | def test_empty_body(): | ||||||
|   | |||||||
| @@ -2,14 +2,14 @@ | |||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
|  | import os | ||||||
|  |  | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| from talon.signature import bruteforce, extraction, extract | from talon.signature import bruteforce, extraction, extract | ||||||
| from talon.signature import extraction as e | from talon.signature import extraction as e | ||||||
| from talon.signature.learning import dataset | from talon.signature.learning import dataset | ||||||
| from nose.tools import eq_ | from .. import * | ||||||
| from .. import STRIPPED, UNICODE_MSG |  | ||||||
| from six.moves import range |  | ||||||
| from mock import patch |  | ||||||
| import os |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_message_shorter_SIGNATURE_MAX_LINES(): | def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||||
|   | |||||||
| @@ -1,14 +1,15 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from ... import * | ||||||
| from ... import EML_MSG_FILENAME, MSG_FILENAME_WITH_BODY_SUFFIX, TMP_DIR, EMAILS_DIR |  | ||||||
| from talon.signature.learning.featurespace import features |  | ||||||
| from talon.signature.learning import dataset as d |  | ||||||
| from nose.tools import eq_, assert_false, ok_ |  | ||||||
| from numpy import genfromtxt |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
|  | from numpy import genfromtxt | ||||||
|  |  | ||||||
|  | from talon.signature.learning import dataset as d | ||||||
|  |  | ||||||
|  | from talon.signature.learning.featurespace import features | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_is_sender_filename(): | def test_is_sender_filename(): | ||||||
|     assert_false(d.is_sender_filename("foo/bar")) |     assert_false(d.is_sender_filename("foo/bar")) | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from ... import * | ||||||
|  |  | ||||||
| from talon.signature.learning import featurespace as fs | from talon.signature.learning import featurespace as fs | ||||||
| from nose.tools import eq_, assert_false, ok_ |  | ||||||
| from mock import patch |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_apply_features(): | def test_apply_features(): | ||||||
|   | |||||||
| @@ -1,13 +1,13 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from ... import * | ||||||
|  |  | ||||||
|  | import regex as re | ||||||
|  |  | ||||||
| from talon.signature.learning import helpers as h | from talon.signature.learning import helpers as h | ||||||
| from talon.signature.learning.helpers import RE_RELAX_PHONE, RE_NAME | from talon.signature.learning.helpers import * | ||||||
| from nose.tools import eq_, ok_, assert_false, assert_in |  | ||||||
| from mock import patch, Mock |  | ||||||
| from six.moves import range | from six.moves import range | ||||||
| import re |  | ||||||
|  |  | ||||||
| # First testing regex constants. | # First testing regex constants. | ||||||
| VALID = ''' | VALID = ''' | ||||||
|   | |||||||
| @@ -1,16 +1,17 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  | from . import * | ||||||
|  | from . fixtures import * | ||||||
|  |  | ||||||
| from tests.fixtures import STANDARD_REPLIES |  | ||||||
| from talon import quotations |  | ||||||
| from six.moves import range |  | ||||||
| from nose.tools import eq_ |  | ||||||
| from mock import patch |  | ||||||
| import email.iterators |  | ||||||
| import six |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
|  | import email.iterators | ||||||
|  | from talon import quotations | ||||||
|  | import six | ||||||
|  | from six.moves import range | ||||||
|  | from six import StringIO | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_LINES_COUNT', 1) | @patch.object(quotations, 'MAX_LINES_COUNT', 1) | ||||||
| def test_too_many_lines(): | def test_too_many_lines(): | ||||||
| @@ -34,7 +35,6 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_polymail(): | def test_pattern_on_date_polymail(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  |  | ||||||
| @@ -190,17 +190,14 @@ Test""" | |||||||
|     eq_('Test reply', quotations.extract_from_plain( |     eq_('Test reply', quotations.extract_from_plain( | ||||||
|         msg_body.format(six.text_type(original_message_indicator)))) |         msg_body.format(six.text_type(original_message_indicator)))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_english_original_message(): | def test_english_original_message(): | ||||||
|     _check_pattern_original_message('Original Message') |     _check_pattern_original_message('Original Message') | ||||||
|     _check_pattern_original_message('Reply Message') |     _check_pattern_original_message('Reply Message') | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_german_original_message(): | def test_german_original_message(): | ||||||
|     _check_pattern_original_message(u'Ursprüngliche Nachricht') |     _check_pattern_original_message(u'Ursprüngliche Nachricht') | ||||||
|     _check_pattern_original_message('Antwort Nachricht') |     _check_pattern_original_message('Antwort Nachricht') | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_danish_original_message(): | def test_danish_original_message(): | ||||||
|     _check_pattern_original_message('Oprindelig meddelelse') |     _check_pattern_original_message('Oprindelig meddelelse') | ||||||
|  |  | ||||||
| @@ -299,7 +296,6 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
| > Hello""" | > Hello""" | ||||||
|     eq_("Hi", quotations.extract_from_plain(msg_body)) |     eq_("Hi", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_with_indent(): | def test_with_indent(): | ||||||
|     msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. |     msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin. | ||||||
|  |  | ||||||
| @@ -307,8 +303,7 @@ def test_with_indent(): | |||||||
|  |  | ||||||
| Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. | Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. | ||||||
|     """ |     """ | ||||||
|     eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", |     eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body)) | ||||||
|         quotations.extract_from_plain(msg_body)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_short_quotation_with_newline(): | def test_short_quotation_with_newline(): | ||||||
| @@ -348,7 +343,6 @@ Subject: The manager has commented on your Loop | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_german_from_block(): | def test_german_from_block(): | ||||||
|     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|     """Allo! Follow up MIME! |     """Allo! Follow up MIME! | ||||||
| @@ -361,7 +355,6 @@ Betreff: The manager has commented on your Loop | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_french_multiline_from_block(): | def test_french_multiline_from_block(): | ||||||
|     eq_('Lorem ipsum', quotations.extract_from_plain( |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|     u"""Lorem ipsum |     u"""Lorem ipsum | ||||||
| @@ -374,7 +367,6 @@ Objet : Follow Up | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_french_from_block(): | def test_french_from_block(): | ||||||
|     eq_('Lorem ipsum', quotations.extract_from_plain( |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|     u"""Lorem ipsum |     u"""Lorem ipsum | ||||||
| @@ -383,7 +375,6 @@ Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@x | |||||||
|  |  | ||||||
| Bonjour!""")) | Bonjour!""")) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_polish_from_block(): | def test_polish_from_block(): | ||||||
|     eq_('Lorem ipsum', quotations.extract_from_plain( |     eq_('Lorem ipsum', quotations.extract_from_plain( | ||||||
|     u"""Lorem ipsum |     u"""Lorem ipsum | ||||||
| @@ -394,7 +385,6 @@ napisał: | |||||||
| Blah! | Blah! | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_danish_from_block(): | def test_danish_from_block(): | ||||||
|     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|     """Allo! Follow up MIME! |     """Allo! Follow up MIME! | ||||||
| @@ -407,7 +397,6 @@ Emne: The manager has commented on your Loop | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_swedish_from_block(): | def test_swedish_from_block(): | ||||||
|     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|     u"""Allo! Follow up MIME! |     u"""Allo! Follow up MIME! | ||||||
| @@ -419,7 +408,6 @@ Till: Isacson Leiff | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_swedish_from_line(): | def test_swedish_from_line(): | ||||||
|     eq_('Lorem', quotations.extract_from_plain( |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|     """Lorem |     """Lorem | ||||||
| @@ -428,7 +416,6 @@ Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | |||||||
| Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_norwegian_from_line(): | def test_norwegian_from_line(): | ||||||
|     eq_('Lorem', quotations.extract_from_plain( |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|     u"""Lorem |     u"""Lorem | ||||||
| @@ -437,7 +424,6 @@ På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | |||||||
| Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_dutch_from_block(): | def test_dutch_from_block(): | ||||||
|     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( |     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( | ||||||
|     """Gluten-free culpa lo-fi et nesciunt nostrud. |     """Gluten-free culpa lo-fi et nesciunt nostrud. | ||||||
| @@ -447,7 +433,6 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g | |||||||
| Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_vietnamese_from_block(): | def test_vietnamese_from_block(): | ||||||
|     eq_('Hello', quotations.extract_from_plain( |     eq_('Hello', quotations.extract_from_plain( | ||||||
|     u"""Hello |     u"""Hello | ||||||
| @@ -457,7 +442,6 @@ Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết: | |||||||
| > Xin chào | > Xin chào | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quotation_marker_false_positive(): | def test_quotation_marker_false_positive(): | ||||||
|     msg_body = """Visit us now for assistance... |     msg_body = """Visit us now for assistance... | ||||||
| >>> >>>  http://www.domain.com <<< | >>> >>>  http://www.domain.com <<< | ||||||
| @@ -842,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p | |||||||
| that this line is intact.""" | that this line is intact.""" | ||||||
|  |  | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_appointment(): | def test_appointment_2(): | ||||||
|     msg_body = """Invitation for an interview: |     msg_body = """Invitation for an interview: | ||||||
|  |  | ||||||
| Date: Wednesday 3, October 2011  | Date: Wednesday 3, October 2011  | ||||||
| @@ -854,4 +838,4 @@ Address: 130 Fox St | |||||||
|  |  | ||||||
| Please bring in your ID.""" | Please bring in your ID.""" | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|   | |||||||
| @@ -2,12 +2,8 @@ | |||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
|  |  | ||||||
| from nose.tools import eq_, ok_, assert_false |  | ||||||
| from talon import utils as u | from talon import utils as u | ||||||
| from mock import patch, Mock | from . import * | ||||||
| import cchardet |  | ||||||
| import six |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
| @@ -16,58 +12,6 @@ def test_get_delimiter(): | |||||||
|     eq_('\n', u.get_delimiter('abc')) |     eq_('\n', u.get_delimiter('abc')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_unicode(): |  | ||||||
|     eq_(u'hi', u.to_unicode('hi')) |  | ||||||
|     eq_(type(u.to_unicode('hi')), six.text_type) |  | ||||||
|     eq_(type(u.to_unicode(u'hi')), six.text_type) |  | ||||||
|     eq_(type(u.to_unicode('привет')), six.text_type) |  | ||||||
|     eq_(type(u.to_unicode(u'привет')), six.text_type) |  | ||||||
|     eq_(u"привет", u.to_unicode('привет')) |  | ||||||
|     eq_(u"привет", u.to_unicode(u'привет')) |  | ||||||
|     # some latin1 stuff |  | ||||||
|     eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_detect_encoding(): |  | ||||||
|     eq_('ascii', u.detect_encoding(b'qwe').lower()) |  | ||||||
|     ok_(u.detect_encoding( |  | ||||||
|         u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ |  | ||||||
|             'iso-8859-1', 'iso-8859-2']) |  | ||||||
|     eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) |  | ||||||
|     # fallback to utf-8 |  | ||||||
|     with patch.object(u.chardet, 'detect') as detect: |  | ||||||
|         detect.side_effect = Exception |  | ||||||
|         eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quick_detect_encoding(): |  | ||||||
|     eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) |  | ||||||
|     ok_(u.quick_detect_encoding( |  | ||||||
|         u'Versi\xf3n'.encode('windows-1252')).lower() in [ |  | ||||||
|             'windows-1252', 'windows-1250']) |  | ||||||
|     eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(cchardet, 'detect') |  | ||||||
| @patch.object(u, 'detect_encoding') |  | ||||||
| def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): |  | ||||||
|     cchardet_detect.return_value = {'encoding': 'ascii'} |  | ||||||
|     eq_('ascii', u.quick_detect_encoding(b"qwe")) |  | ||||||
|     cchardet_detect.assert_called_once_with(b"qwe") |  | ||||||
|  |  | ||||||
|     # fallback to detect_encoding |  | ||||||
|     cchardet_detect.return_value = {} |  | ||||||
|     detect_encoding.return_value = 'utf-8' |  | ||||||
|     eq_('utf-8', u.quick_detect_encoding(b"qwe")) |  | ||||||
|  |  | ||||||
|     # exception |  | ||||||
|     detect_encoding.reset_mock() |  | ||||||
|     cchardet_detect.side_effect = Exception() |  | ||||||
|     detect_encoding.return_value = 'utf-8' |  | ||||||
|     eq_('utf-8', u.quick_detect_encoding(b"qwe")) |  | ||||||
|     ok_(detect_encoding.called) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_html_to_text(): | def test_html_to_text(): | ||||||
|     html = """<body> |     html = """<body> | ||||||
| <p>Hello world!</p> | <p>Hello world!</p> | ||||||
| @@ -81,11 +25,11 @@ Haha | |||||||
| </p> | </p> | ||||||
| </body>""" | </body>""" | ||||||
|     text = u.html_to_text(html) |     text = u.html_to_text(html) | ||||||
|     eq_(b"Hello world! \n\n  * One! \n  * Two \nHaha", text) |     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|     eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8')) |     eq_(u"привет!", u.html_to_text("<b>привет!</b>")) | ||||||
|  |  | ||||||
|     html = '<body><br/><br/>Hi</body>' |     html = '<body><br/><br/>Hi</body>' | ||||||
|     eq_(b'Hi', u.html_to_text(html)) |     eq_('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """Hi |     html = """Hi | ||||||
| <style type="text/css"> | <style type="text/css"> | ||||||
| @@ -105,60 +49,23 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | |||||||
|  |  | ||||||
| } | } | ||||||
| </style>""" | </style>""" | ||||||
|     eq_(b'Hi', u.html_to_text(html)) |     eq_('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """<div> |     html = """<div> | ||||||
| <!-- COMMENT 1 --> | <!-- COMMENT 1 --> | ||||||
| <span>TEXT 1</span> | <span>TEXT 1</span> | ||||||
| <p>TEXT 2 <!-- COMMENT 2 --></p> | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) |     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_comment_no_parent(): | def test_comment_no_parent(): | ||||||
|     s = b'<!-- COMMENT 1 --> no comment' |     s = '<!-- COMMENT 1 --> no comment' | ||||||
|     d = u.html_document_fromstring(s) |     d = u.html_document_fromstring(s) | ||||||
|     eq_(b"no comment", u.html_tree_to_text(d)) |     eq_("no comment", u.html_tree_to_text(d)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) |  | ||||||
| def test_html_fromstring_exception(): |  | ||||||
|     eq_(None, u.html_fromstring("<html></html>")) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_too_big', Mock()) |  | ||||||
| @patch.object(u.html5parser, 'fromstring') |  | ||||||
| def test_html_fromstring_too_big(fromstring): |  | ||||||
|     eq_(None, u.html_fromstring("<html></html>")) |  | ||||||
|     assert_false(fromstring.called) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u.html5parser, 'document_fromstring') |  | ||||||
| def test_html_document_fromstring_exception(document_fromstring): |  | ||||||
|     document_fromstring.side_effect = Exception() |  | ||||||
|     eq_(None, u.html_document_fromstring("<html></html>")) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_too_big', Mock()) |  | ||||||
| @patch.object(u.html5parser, 'document_fromstring') |  | ||||||
| def test_html_document_fromstring_too_big(document_fromstring): |  | ||||||
|     eq_(None, u.html_document_fromstring("<html></html>")) |  | ||||||
|     assert_false(document_fromstring.called) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_fromstring', Mock(return_value=None)) | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
| def test_bad_html_to_text(): | def test_bad_html_to_text(): | ||||||
|     bad_html = "one<br>two<br>three" |     bad_html = "one<br>two<br>three" | ||||||
|     eq_(None, u.html_to_text(bad_html)) |     eq_(None, u.html_to_text(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 3) |  | ||||||
| def test_html_too_big(): |  | ||||||
|     eq_(False, u.html_too_big("<div></div>")) |  | ||||||
|     eq_(True, u.html_too_big("<div><span>Hi</span></div>")) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 3) |  | ||||||
| def test_html_to_text(): |  | ||||||
|     eq_(b"Hello", u.html_to_text("<div>Hello</div>")) |  | ||||||
|     eq_(None, u.html_to_text("<div><span>Hi</span></div>")) |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user