Compare commits
	
		
			9 Commits
		
	
	
		
			thrawn/dev
			...
			v1.5.0
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | a8c7e6a972 | ||
|  | b30c375c5b | ||
|  | cec5acf58f | ||
|  | 24d0f2d00a | ||
|  | 94007b0b92 | ||
|  | 1a5548f171 | ||
|  | 53c49b9121 | ||
|  | bd50872043 | ||
|  | d37c4fd551 | ||
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | FROM python:3.9-slim-buster AS deps | ||||||
|  |  | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||||
|  |  | ||||||
|  | FROM deps AS testable | ||||||
|  | ARG REPORT_PATH | ||||||
|  |  | ||||||
|  | VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||||
|  |  | ||||||
|  | ADD . /app | ||||||
|  | WORKDIR /app | ||||||
|  | COPY wheel/* /wheel/ | ||||||
|  |  | ||||||
|  | RUN mkdir -p ${REPORT_PATH} | ||||||
|  |  | ||||||
|  | RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||||
|  |     pip install --no-deps /wheel/* | ||||||
|  |  | ||||||
|  | ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||||
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -54,3 +54,6 @@ _trial_temp | |||||||
|  |  | ||||||
| # OSX | # OSX | ||||||
| .DS_Store | .DS_Store | ||||||
|  |  | ||||||
|  | # vim-backup | ||||||
|  | *.bak | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | chardet>=1.0.1 | ||||||
|  | cchardet>=0.3.5 | ||||||
|  | cssselect | ||||||
|  | html5lib | ||||||
|  | joblib | ||||||
|  | lxml>=2.3.3 | ||||||
|  | numpy | ||||||
|  | regex>=1 | ||||||
|  | scikit-learn>=1.0.0 | ||||||
|  | scipy | ||||||
|  | six>=1.10.0 | ||||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | set -ex | ||||||
|  | REPORT_PATH="${REPORT_PATH:-./}" | ||||||
|  | nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||||
							
								
								
									
										31
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								setup.py
									
									
									
									
									
								
							| @@ -19,17 +19,17 @@ class InstallCommand(install): | |||||||
|         if self.no_ml: |         if self.no_ml: | ||||||
|             dist = self.distribution |             dist = self.distribution | ||||||
|             dist.packages=find_packages(exclude=[ |             dist.packages=find_packages(exclude=[ | ||||||
|                 'tests', |                 "tests", | ||||||
|                 'tests.*', |                 "tests.*", | ||||||
|                 'talon.signature', |                 "talon.signature", | ||||||
|                 'talon.signature.*', |                 "talon.signature.*", | ||||||
|             ]) |             ]) | ||||||
|             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: |             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||||
|                 dist.install_requires.remove(not_required) |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.8', |       version='1.5.0', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -44,20 +44,21 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml>=2.3.3", |           "lxml", | ||||||
|           "regex>=1", |           "regex", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=1.0.0", | ||||||
|           'chardet>=1.0.1', |           "chardet", | ||||||
|           'cchardet>=0.3.5', |           "cchardet", | ||||||
|           'cssselect', |           "cssselect", | ||||||
|           'six>=1.10.0', |           "six", | ||||||
|           'html5lib' |           "html5lib", | ||||||
|  |           "joblib", | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|           "nose>=1.2.1", |           "nose", | ||||||
|           "coverage" |           "coverage" | ||||||
|           ] |           ] | ||||||
|       ) |       ) | ||||||
|   | |||||||
| @@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") | |||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 6 | SPLITTER_MAX_LINES = 6 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
| # an extensive research shows that exceeding this limit |  | ||||||
| # leads to excessive processing time |  | ||||||
| MAX_HTML_LEN = 2794202 |  | ||||||
|  |  | ||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
| @@ -421,25 +418,31 @@ def extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     Returns a unicode string. |     Returns a unicode string. | ||||||
|     """ |     """ | ||||||
|  |     msg_body_bytes = msg_body | ||||||
|     if isinstance(msg_body, six.text_type): |     if isinstance(msg_body, six.text_type): | ||||||
|         msg_body = msg_body.encode('utf8') |         msg_body_bytes = msg_body.encode('utf8') | ||||||
|     elif not isinstance(msg_body, bytes): |  | ||||||
|         msg_body = msg_body.encode('ascii') |  | ||||||
|  |  | ||||||
|     result = _extract_from_html(msg_body) |     if msg_body_bytes.strip() == b'': | ||||||
|     if isinstance(result, bytes): |         return msg_body | ||||||
|         result = result.decode('utf8') |  | ||||||
|  |     msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n') | ||||||
|  |     # Cut out xml and doctype tags to avoid conflict with unicode decoding. | ||||||
|  |     msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes) | ||||||
|  |     html_tree = html_document_fromstring(msg_body_bytes) | ||||||
|  |     if html_tree is None: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     result = extract_from_html_tree(html_tree) | ||||||
|  |     if not result: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     return result |     return result | ||||||
|  |  | ||||||
|  |  | ||||||
| def _extract_from_html(msg_body): | def extract_from_html_tree(html_tree): | ||||||
|     """ |     """ | ||||||
|     Extract not quoted message from provided html message body |     Extract not quoted message from provided parsed html tree using tags and | ||||||
|     using tags and plain text algorithm. |     plain text algorithm. | ||||||
|  |  | ||||||
|     Cut out first some encoding html tags such as xml and doctype |  | ||||||
|     for avoiding conflict with unicode decoding |  | ||||||
|  |  | ||||||
|     Cut out the 'blockquote', 'gmail_quote' tags. |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|     Cut Microsoft quotations. |     Cut Microsoft quotations. | ||||||
| @@ -452,18 +455,6 @@ def _extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|     if msg_body.strip() == b'': |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') |  | ||||||
|  |  | ||||||
|     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) |  | ||||||
|  |  | ||||||
|     html_tree = html_document_fromstring(msg_body) |  | ||||||
|  |  | ||||||
|     if html_tree is None: |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|                       html_quotations.cut_zimbra_quote(html_tree) or |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
| @@ -481,7 +472,7 @@ def _extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     if len(lines) > MAX_LINES_COUNT: | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     # Collect checkpoints on each line |     # Collect checkpoints on each line | ||||||
|     line_checkpoints = [ |     line_checkpoints = [ | ||||||
| @@ -500,7 +491,7 @@ def _extract_from_html(msg_body): | |||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|     if not lines_were_deleted and not cut_quotations: |     if not lines_were_deleted and not cut_quotations: | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
| @@ -514,7 +505,7 @@ def _extract_from_html(msg_body): | |||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     if _readable_text_empty(html_tree_copy): |     if _readable_text_empty(html_tree_copy): | ||||||
|         return msg_body |         return None | ||||||
|  |  | ||||||
|     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML |     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||||
|     # parsers do not recognize namespaces in HTML tags. As such the rendered |     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||||
| @@ -540,7 +531,11 @@ def _extract_from_html(msg_body): | |||||||
|     #    of replacing data outside the <tag> which might be essential to |     #    of replacing data outside the <tag> which might be essential to | ||||||
|     #    the customer. |     #    the customer. | ||||||
|     remove_namespaces(html_tree_copy) |     remove_namespaces(html_tree_copy) | ||||||
|     return html.tostring(html_tree_copy) |     s = html.tostring(html_tree_copy) | ||||||
|  |     if not s: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return s.decode('utf-8') | ||||||
|  |  | ||||||
|  |  | ||||||
| def remove_namespaces(root): | def remove_namespaces(root): | ||||||
|   | |||||||
| @@ -23,17 +23,14 @@ trained against, don't forget to regenerate: | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from . import extraction | from talon.signature import extraction | ||||||
| from . extraction import extract  #noqa | from talon.signature.extraction import extract | ||||||
| from . learning import classifier | from talon.signature.learning import classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') |  | ||||||
|  |  | ||||||
| EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier') |  | ||||||
| EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def initialize(): | def initialize(): | ||||||
|     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, |     data_dir = os.path.join(os.path.dirname(__file__), 'data') | ||||||
|                                            EXTRACTOR_DATA) |     extractor_filename = os.path.join(data_dir, 'classifier') | ||||||
|  |     extractor_data_filename = os.path.join(data_dir, 'train.data') | ||||||
|  |     extraction.EXTRACTOR = classifier.load(extractor_filename, | ||||||
|  |                                            extractor_data_filename) | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -8,7 +8,7 @@ body belongs to the signature. | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.externals import joblib | import joblib | ||||||
| from sklearn.svm import LinearSVC | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -102,7 +102,7 @@ def flatten_list(list_to_flatten): | |||||||
|  |  | ||||||
|  |  | ||||||
| def contains_sender_names(sender): | def contains_sender_names(sender): | ||||||
|     '''Returns a functions to search sender\'s name or it\'s part. |     """Returns a functions to search sender\'s name or it\'s part. | ||||||
|  |  | ||||||
|     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") |     >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>") | ||||||
|     >>> feature("Sergey Obukhov") |     >>> feature("Sergey Obukhov") | ||||||
| @@ -115,7 +115,7 @@ def contains_sender_names(sender): | |||||||
|     1 |     1 | ||||||
|     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") |     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic") | ||||||
|     1 |     1 | ||||||
|     ''' |     """ | ||||||
|     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] |     names = '( |$)|'.join(flatten_list([[e, e.capitalize()] | ||||||
|                                         for e in extract_names(sender)])) |                                         for e in extract_names(sender)])) | ||||||
|     names = names or sender |     names = names or sender | ||||||
| @@ -140,10 +140,16 @@ def extract_names(sender): | |||||||
|     sender = "".join([char if char.isalpha() else ' ' for char in sender]) |     sender = "".join([char if char.isalpha() else ' ' for char in sender]) | ||||||
|     # Remove too short words and words from "black" list i.e. |     # Remove too short words and words from "black" list i.e. | ||||||
|     # words like `ru`, `gmail`, `com`, `org`, etc. |     # words like `ru`, `gmail`, `com`, `org`, etc. | ||||||
|     sender = [word for word in sender.split() if len(word) > 1 and |     names = list() | ||||||
|               not word in BAD_SENDER_NAMES] |     for word in sender.split(): | ||||||
|     # Remove duplicates |         if len(word) < 2: | ||||||
|     names = list(set(sender)) |             continue | ||||||
|  |         if word in BAD_SENDER_NAMES: | ||||||
|  |             continue | ||||||
|  |         if word in names: | ||||||
|  |             continue | ||||||
|  |         names.append(word) | ||||||
|  |  | ||||||
|     return names |     return names | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -208,20 +214,26 @@ def many_capitalized_words(s): | |||||||
|  |  | ||||||
|  |  | ||||||
| def has_signature(body, sender): | def has_signature(body, sender): | ||||||
|     '''Checks if the body has signature. Returns True or False.''' |     """Checks if the body has signature. Returns True or False.""" | ||||||
|     non_empty = [line for line in body.splitlines() if line.strip()] |     non_empty = [line for line in body.splitlines() if line.strip()] | ||||||
|     candidate = non_empty[-SIGNATURE_MAX_LINES:] |     candidate = non_empty[-SIGNATURE_MAX_LINES:] | ||||||
|     upvotes = 0 |     upvotes = 0 | ||||||
|  |     sender_check = contains_sender_names(sender) | ||||||
|     for line in candidate: |     for line in candidate: | ||||||
|         # we check lines for sender's name, phone, email and url, |         # we check lines for sender's name, phone, email and url, | ||||||
|         # those signature lines don't take more then 27 lines |         # those signature lines don't take more then 27 lines | ||||||
|         if len(line.strip()) > 27: |         if len(line.strip()) > 27: | ||||||
|             continue |             continue | ||||||
|         elif contains_sender_names(sender)(line): |  | ||||||
|  |         if sender_check(line): | ||||||
|             return True |             return True | ||||||
|         elif (binary_regex_search(RE_RELAX_PHONE)(line) + |  | ||||||
|  |         if (binary_regex_search(RE_RELAX_PHONE)(line) + | ||||||
|                 binary_regex_search(RE_EMAIL)(line) + |                 binary_regex_search(RE_EMAIL)(line) + | ||||||
|                 binary_regex_search(RE_URL)(line) == 1): |                 binary_regex_search(RE_URL)(line) == 1): | ||||||
|             upvotes += 1 |             upvotes += 1 | ||||||
|  |  | ||||||
|     if upvotes > 1: |     if upvotes > 1: | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  |     return False | ||||||
|   | |||||||
| @@ -180,9 +180,6 @@ def html_fromstring(s): | |||||||
|     if isinstance(s, six.text_type): |     if isinstance(s, six.text_type): | ||||||
|         s = s.encode('utf8') |         s = s.encode('utf8') | ||||||
|     try: |     try: | ||||||
|         if html_too_big(s): |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         return html5parser.fromstring(s, parser=_html5lib_parser()) |         return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |     except Exception: | ||||||
|         pass |         pass | ||||||
| @@ -194,9 +191,6 @@ def html_document_fromstring(s): | |||||||
|     if isinstance(s, six.text_type): |     if isinstance(s, six.text_type): | ||||||
|         s = s.encode('utf8') |         s = s.encode('utf8') | ||||||
|     try: |     try: | ||||||
|         if html_too_big(s): |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|         return html5parser.document_fromstring(s, parser=_html5lib_parser()) |         return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |     except Exception: | ||||||
|         pass |         pass | ||||||
| @@ -206,12 +200,6 @@ def cssselect(expr, tree): | |||||||
|     return CSSSelector(expr)(tree) |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_too_big(s): |  | ||||||
|     if isinstance(s, six.text_type): |  | ||||||
|         s = s.encode('utf8') |  | ||||||
|     return s.count(b'<') > _MAX_TAGS_COUNT |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _contains_charset_spec(s): | def _contains_charset_spec(s): | ||||||
|     """Return True if the first 4KB contain charset spec |     """Return True if the first 4KB contain charset spec | ||||||
|     """ |     """ | ||||||
| @@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | |||||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|  |  | ||||||
| # an extensive research shows that exceeding this limit |  | ||||||
| # might lead to excessive processing time |  | ||||||
| _MAX_TAGS_COUNT = 419 |  | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | coverage | ||||||
|  | mock | ||||||
|  | nose>=1.2.1 | ||||||
| @@ -391,18 +391,6 @@ def test_gmail_forwarded_msg(): | |||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 4) |  | ||||||
| def test_too_large_html(): |  | ||||||
|     msg_body = 'Reply' \ |  | ||||||
|                '<div class="gmail_quote">' \ |  | ||||||
|                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ |  | ||||||
|                '<div>Test</div>' \ |  | ||||||
|                '</div>' \ |  | ||||||
|                '</div>' |  | ||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), |  | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_readable_html_empty(): | def test_readable_html_empty(): | ||||||
|     msg_body = """ |     msg_body = """ | ||||||
| <blockquote> | <blockquote> | ||||||
|   | |||||||
| @@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p | |||||||
| that this line is intact.""" | that this line is intact.""" | ||||||
|  |  | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_appointment(): | def test_appointment_2(): | ||||||
|     msg_body = """Invitation for an interview: |     msg_body = """Invitation for an interview: | ||||||
|  |  | ||||||
| Date: Wednesday 3, October 2011  | Date: Wednesday 3, October 2011  | ||||||
| @@ -838,4 +838,4 @@ Address: 130 Fox St | |||||||
|  |  | ||||||
| Please bring in your ID.""" | Please bring in your ID.""" | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|   | |||||||
| @@ -125,39 +125,13 @@ def test_html_fromstring_exception(): | |||||||
|     eq_(None, u.html_fromstring("<html></html>")) |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_too_big', Mock()) |  | ||||||
| @patch.object(u.html5parser, 'fromstring') |  | ||||||
| def test_html_fromstring_too_big(fromstring): |  | ||||||
|     eq_(None, u.html_fromstring("<html></html>")) |  | ||||||
|     assert_false(fromstring.called) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u.html5parser, 'document_fromstring') | @patch.object(u.html5parser, 'document_fromstring') | ||||||
| def test_html_document_fromstring_exception(document_fromstring): | def test_html_document_fromstring_exception(document_fromstring): | ||||||
|     document_fromstring.side_effect = Exception() |     document_fromstring.side_effect = Exception() | ||||||
|     eq_(None, u.html_document_fromstring("<html></html>")) |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_too_big', Mock()) |  | ||||||
| @patch.object(u.html5parser, 'document_fromstring') |  | ||||||
| def test_html_document_fromstring_too_big(document_fromstring): |  | ||||||
|     eq_(None, u.html_document_fromstring("<html></html>")) |  | ||||||
|     assert_false(document_fromstring.called) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_fromstring', Mock(return_value=None)) | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
| def test_bad_html_to_text(): | def test_bad_html_to_text(): | ||||||
|     bad_html = "one<br>two<br>three" |     bad_html = "one<br>two<br>three" | ||||||
|     eq_(None, u.html_to_text(bad_html)) |     eq_(None, u.html_to_text(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 3) |  | ||||||
| def test_html_too_big(): |  | ||||||
|     eq_(False, u.html_too_big("<div></div>")) |  | ||||||
|     eq_(True, u.html_too_big("<div><span>Hi</span></div>")) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, '_MAX_TAGS_COUNT', 3) |  | ||||||
| def test_html_to_text(): |  | ||||||
|     eq_(b"Hello", u.html_to_text("<div>Hello</div>")) |  | ||||||
|     eq_(None, u.html_to_text("<div><span>Hi</span></div>")) |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user