Compare commits
	
		
			58 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | f16ae5110b | ||
|  | ab5cbe5ec3 | ||
|  | be5da92f16 | ||
|  | 95954a65a0 | ||
|  | 0b55e8fa77 | ||
|  | 6f159e8959 | ||
|  | 5c413b4b00 | ||
|  | cca64d3ed1 | ||
|  | e11eaf6ff8 | ||
|  | 85a4c1d855 | ||
|  | 0f5e72623b | ||
|  | 061e549ad7 | ||
|  | 49d1a5d248 | ||
|  | 03d6b00db8 | ||
|  | a2eb0f7201 | ||
|  | 5c71a0ca07 | ||
|  | 489d16fad9 | ||
|  | a458707777 | ||
|  | a1d0a86305 | ||
|  | 29f1d21be7 | ||
|  | 34c5b526c3 | ||
|  | 3edb6578ba | ||
|  | 984c036b6e | ||
|  | a403ecb5c9 | ||
|  | a44713409c | ||
|  | 567467b8ed | ||
|  | 139edd6104 | ||
|  | e756d55abf | ||
|  | 015c8d2a78 | ||
|  | 5af846c13d | ||
|  | e69a9c7a54 | ||
|  | 23cb2a9a53 | ||
|  | b5e3397b88 | ||
|  | 5685a4055a | ||
|  | 97b72ef767 | ||
|  | 31489848be | ||
|  | e5988d447b | ||
|  | adfed748ce | ||
|  | 2444ba87c0 | ||
|  | 534457e713 | ||
|  | ea82a9730e | ||
|  | f04b872e14 | ||
|  | e61894e425 | ||
|  | 35fbdaadac | ||
|  | 8441bc7328 | ||
|  | 37c95ff97b | ||
|  | 5b1ca33c57 | ||
|  | ec8e09b34e | ||
|  | bcf97eccfa | ||
|  | f53b5cc7a6 | ||
|  | 27adde7aa7 | ||
|  | a9719833e0 | ||
|  | 7bf37090ca | ||
|  | 44fcef7123 | ||
|  | 69a44b10a1 | ||
|  | b085e3d049 | ||
|  | 4b953bcddc | ||
|  | 315eaa7080 | 
							
								
								
									
										5
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										5
									
								
								setup.py
									
									
									
									
									
								
							| @@ -29,7 +29,7 @@ class InstallCommand(install): | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.14', |       version='1.3.7', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -48,11 +48,12 @@ setup(name='talon', | |||||||
|           "regex>=1", |           "regex>=1", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild | ||||||
|           'chardet>=1.0.1', |           'chardet>=1.0.1', | ||||||
|           'cchardet>=0.3.5', |           'cchardet>=0.3.5', | ||||||
|           'cssselect', |           'cssselect', | ||||||
|           'six>=1.10.0', |           'six>=1.10.0', | ||||||
|  |           'html5lib' | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ messages (without quoted messages) from html | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
|  | from talon.utils import cssselect  | ||||||
|  |  | ||||||
| CHECKPOINT_PREFIX = '#!%!' | CHECKPOINT_PREFIX = '#!%!' | ||||||
| CHECKPOINT_SUFFIX = '!%!#' | CHECKPOINT_SUFFIX = '!%!#' | ||||||
| @@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('div.gmail_quote') |     gmail_quote = cssselect('div.gmail_quote', html_message) | ||||||
|     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): |     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
| @@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message): | |||||||
| def cut_by_id(html_message): | def cut_by_id(html_message): | ||||||
|     found = False |     found = False | ||||||
|     for quote_id in QUOTE_IDS: |     for quote_id in QUOTE_IDS: | ||||||
|         quote = html_message.cssselect('#{}'.format(quote_id)) |         quote = cssselect('#{}'.format(quote_id), html_message) | ||||||
|         if quote: |         if quote: | ||||||
|             found = True |             found = True | ||||||
|             quote[0].getparent().remove(quote[0]) |             quote[0].getparent().remove(quote[0]) | ||||||
|   | |||||||
| @@ -12,7 +12,8 @@ from copy import deepcopy | |||||||
|  |  | ||||||
| from lxml import html, etree | from lxml import html, etree | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter, html_to_text | from talon.utils import (get_delimiter, html_tree_to_text, | ||||||
|  |                          html_document_fromstring) | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
| from six.moves import range | from six.moves import range | ||||||
| import six | import six | ||||||
| @@ -138,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? . | |||||||
|         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', |         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
|  | # ---- John Smith wrote ---- | ||||||
|  | RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | ||||||
|  |     u'|'.join(( | ||||||
|  |         # English | ||||||
|  |         'wrote' | ||||||
|  |     ))), re.I) | ||||||
|  |  | ||||||
|  | # Support polymail.io reply format | ||||||
|  | # On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||||
|  | # | ||||||
|  | # < | ||||||
|  | # mailto:John Smith <johnsmith@gmail.com> | ||||||
|  | # > wrote: | ||||||
|  | RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I) | ||||||
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
| @@ -153,16 +169,17 @@ SPLITTER_PATTERNS = [ | |||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:'), |                '( \S+){3,6}@\S+:'), | ||||||
|     # Sent from Samsung MobileName <address@example.com> wrote: |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|     re.compile('Sent from Samsung .*@.*> wrote') |     re.compile('Sent from Samsung .*@.*> wrote'), | ||||||
|  |     RE_ANDROID_WROTE, | ||||||
|  |     RE_POLYMAIL | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_LINK = re.compile('<(http://[^>]*)>') | RE_LINK = re.compile('<(http://[^>]*)>') | ||||||
| RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | ||||||
|  |  | ||||||
| RE_PARENTHESIS_LINK = re.compile("\(https?://") | RE_PARENTHESIS_LINK = re.compile("\(https?://") | ||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 4 | SPLITTER_MAX_LINES = 6 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
| # an extensive research shows that exceeding this limit | # an extensive research shows that exceeding this limit | ||||||
| # leads to excessive processing time | # leads to excessive processing time | ||||||
| @@ -171,6 +188,9 @@ MAX_HTML_LEN = 2794202 | |||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
|  |  | ||||||
|  | # Regular expression to identify if a line is a header. | ||||||
|  | RE_HEADER = re.compile(": ") | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_from(msg_body, content_type='text/plain'): | def extract_from(msg_body, content_type='text/plain'): | ||||||
|     try: |     try: | ||||||
| @@ -184,6 +204,19 @@ def extract_from(msg_body, content_type='text/plain'): | |||||||
|     return msg_body |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def remove_initial_spaces_and_mark_message_lines(lines): | ||||||
|  |     """ | ||||||
|  |     Removes the initial spaces in each line before marking message lines. | ||||||
|  |  | ||||||
|  |     This ensures headers can be identified if they are indented with spaces. | ||||||
|  |     """ | ||||||
|  |     i = 0 | ||||||
|  |     while i < len(lines): | ||||||
|  |         lines[i] = lines[i].lstrip(' ') | ||||||
|  |         i += 1 | ||||||
|  |     return mark_message_lines(lines) | ||||||
|  |  | ||||||
|  |  | ||||||
| def mark_message_lines(lines): | def mark_message_lines(lines): | ||||||
|     """Mark message lines with markers to distinguish quotation lines. |     """Mark message lines with markers to distinguish quotation lines. | ||||||
|  |  | ||||||
| @@ -286,9 +319,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|  |  | ||||||
|     Converts msg_body into a unicode. |     Converts msg_body into a unicode. | ||||||
|     """ |     """ | ||||||
|     # normalize links i.e. replace '<', '>' wrapping the link with some symbols |     msg_body = _replace_link_brackets(msg_body) | ||||||
|     # so that '>' closing the link couldn't be mistakenly taken for quotation |  | ||||||
|     # marker. |     msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) | ||||||
|  |  | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _replace_link_brackets(msg_body): | ||||||
|  |     """ | ||||||
|  |     Normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||||
|  |     so that '>' closing the link couldn't be mistakenly taken for quotation | ||||||
|  |     marker. | ||||||
|  |  | ||||||
|  |     Converts msg_body into a unicode | ||||||
|  |     """ | ||||||
|     if isinstance(msg_body, bytes): |     if isinstance(msg_body, bytes): | ||||||
|         msg_body = msg_body.decode('utf8') |         msg_body = msg_body.decode('utf8') | ||||||
|  |  | ||||||
| @@ -300,7 +345,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|             return "@@%s@@" % link.group(1) |             return "@@%s@@" % link.group(1) | ||||||
|  |  | ||||||
|     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) |     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): | ||||||
|  |     """ | ||||||
|  |     Splits line in two if splitter pattern preceded by some text on the same | ||||||
|  |     line (done only for 'On <date> <person> wrote:' pattern). | ||||||
|  |     """ | ||||||
|     def splitter_wrapper(splitter): |     def splitter_wrapper(splitter): | ||||||
|         """Wraps splitter with new line""" |         """Wraps splitter with new line""" | ||||||
|         if splitter.start() and msg_body[splitter.start() - 1] != '\n': |         if splitter.start() and msg_body[splitter.start() - 1] != '\n': | ||||||
| @@ -385,17 +437,15 @@ def _extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|     if len(msg_body) > MAX_HTML_LEN: |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     if msg_body.strip() == b'': |     if msg_body.strip() == b'': | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'') |     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||||
|     html_tree = html.document_fromstring( |     html_tree = html_document_fromstring(msg_body) | ||||||
|         msg_body, |  | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |     if html_tree is None: | ||||||
|     ) |         return msg_body | ||||||
|  |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|                       html_quotations.cut_zimbra_quote(html_tree) or |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
| @@ -407,8 +457,7 @@ def _extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False] * number_of_checkpoints |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     plain_text = html_tree_to_text(html_tree) | ||||||
|     plain_text = html_to_text(msg_with_checkpoints) |  | ||||||
|     plain_text = preprocess(plain_text, '\n', content_type='text/html') |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
| @@ -431,25 +480,107 @@ def _extract_from_html(msg_body): | |||||||
|     return_flags = [] |     return_flags = [] | ||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|  |     if not lines_were_deleted and not cut_quotations: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in range(first_deleted, last_deleted): |         for i in range(first_deleted, last_deleted): | ||||||
|             for checkpoint in line_checkpoints[i]: |             for checkpoint in line_checkpoints[i]: | ||||||
|                 quotation_checkpoints[checkpoint] = True |                 quotation_checkpoints[checkpoint] = True | ||||||
|     else: |  | ||||||
|         if cut_quotations: |  | ||||||
|             return html.tostring(html_tree_copy) |  | ||||||
|         else: |  | ||||||
|             return msg_body |  | ||||||
|  |  | ||||||
|         # Remove tags with quotation checkpoints |         # Remove tags with quotation checkpoints | ||||||
|         html_quotations.delete_quotation_tags( |         html_quotations.delete_quotation_tags( | ||||||
|             html_tree_copy, 0, quotation_checkpoints |             html_tree_copy, 0, quotation_checkpoints | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     if _readable_text_empty(html_tree_copy): | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def split_emails(msg): | ||||||
|  |     """ | ||||||
|  |     Given a message (which may consist of an email conversation thread with | ||||||
|  |     multiple emails), mark the lines to identify split lines, content lines and | ||||||
|  |     empty lines. | ||||||
|  |  | ||||||
|  |     Correct the split line markers inside header blocks. Header blocks are | ||||||
|  |     identified by the regular expression RE_HEADER. | ||||||
|  |  | ||||||
|  |     Return the corrected markers | ||||||
|  |     """ | ||||||
|  |     msg_body = _replace_link_brackets(msg) | ||||||
|  |  | ||||||
|  |     # don't process too long messages | ||||||
|  |     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||||
|  |     markers = remove_initial_spaces_and_mark_message_lines(lines) | ||||||
|  |  | ||||||
|  |     markers = _mark_quoted_email_splitlines(markers, lines) | ||||||
|  |  | ||||||
|  |     # we don't want splitlines in header blocks | ||||||
|  |     markers = _correct_splitlines_in_headers(markers, lines) | ||||||
|  |  | ||||||
|  |     return markers | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _mark_quoted_email_splitlines(markers, lines): | ||||||
|  |     """ | ||||||
|  |     When there are headers indented with '>' characters, this method will | ||||||
|  |     attempt to identify if the header is a splitline header. If it is, then we | ||||||
|  |     mark it with 's' instead of leaving it as 'm' and return the new markers. | ||||||
|  |     """ | ||||||
|  |     # Create a list of markers to easily alter specific characters | ||||||
|  |     markerlist = list(markers) | ||||||
|  |     for i, line in enumerate(lines): | ||||||
|  |         if markerlist[i] != 'm': | ||||||
|  |             continue | ||||||
|  |         for pattern in SPLITTER_PATTERNS: | ||||||
|  |             matcher = re.search(pattern, line) | ||||||
|  |             if matcher: | ||||||
|  |                 markerlist[i] = 's' | ||||||
|  |                 break | ||||||
|  |  | ||||||
|  |     return "".join(markerlist) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _correct_splitlines_in_headers(markers, lines): | ||||||
|  |     """ | ||||||
|  |     Corrects markers by removing splitlines deemed to be inside header blocks. | ||||||
|  |     """ | ||||||
|  |     updated_markers = "" | ||||||
|  |     i = 0 | ||||||
|  |     in_header_block = False | ||||||
|  |  | ||||||
|  |     for m in markers: | ||||||
|  |         # Only set in_header_block flag when we hit an 's' and line is a header | ||||||
|  |         if m == 's': | ||||||
|  |             if not in_header_block: | ||||||
|  |                 if bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |                     in_header_block = True | ||||||
|  |             else: | ||||||
|  |                 if QUOT_PATTERN.match(lines[i]): | ||||||
|  |                     m = 'm' | ||||||
|  |                 else: | ||||||
|  |                     m = 't' | ||||||
|  |  | ||||||
|  |         # If the line is not a header line, set in_header_block false. | ||||||
|  |         if not bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |             in_header_block = False | ||||||
|  |  | ||||||
|  |         # Add the marker to the new updated markers string. | ||||||
|  |         updated_markers += m | ||||||
|  |         i += 1 | ||||||
|  |  | ||||||
|  |     return updated_markers | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _readable_text_empty(html_tree): | ||||||
|  |     return not bool(html_tree_to_text(html_tree).strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def is_splitter(line): | def is_splitter(line): | ||||||
|     ''' |     ''' | ||||||
|     Returns Matcher object if provided string is a splitter and |     Returns Matcher object if provided string is a splitter and | ||||||
| @@ -463,7 +594,7 @@ def is_splitter(line): | |||||||
|  |  | ||||||
| def text_content(context): | def text_content(context): | ||||||
|     '''XPath Extension function to return a node text content.''' |     '''XPath Extension function to return a node text content.''' | ||||||
|     return context.context_node.text_content().strip() |     return context.context_node.xpath("string()").strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def tail(context): | def tail(context): | ||||||
|   | |||||||
							
								
								
									
										107
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										107
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -7,9 +7,11 @@ import chardet | |||||||
| import cchardet | import cchardet | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from lxml import html | from lxml.html import html5parser | ||||||
| from lxml.cssselect import CSSSelector | from lxml.cssselect import CSSSelector | ||||||
|  |  | ||||||
|  | import html5lib | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
| import six | import six | ||||||
|  |  | ||||||
| @@ -113,29 +115,18 @@ def get_delimiter(msg_body): | |||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_to_text(string): | def html_tree_to_text(tree): | ||||||
|     """ |  | ||||||
|     Dead-simple HTML-to-text converter: |  | ||||||
|         >>> html_to_text("one<br>two<br>three") |  | ||||||
|         >>> "one\ntwo\nthree" |  | ||||||
|  |  | ||||||
|     NOTES: |  | ||||||
|         1. the string is expected to contain UTF-8 encoded HTML! |  | ||||||
|         2. returns utf-8 encoded str (not unicode) |  | ||||||
|     """ |  | ||||||
|     if isinstance(string, six.text_type): |  | ||||||
|         string = string.encode('utf8') |  | ||||||
|  |  | ||||||
|     s = _prepend_utf8_declaration(string) |  | ||||||
|     s = s.replace(b"\n", b"") |  | ||||||
|  |  | ||||||
|     tree = html.fromstring(s) |  | ||||||
|  |  | ||||||
|     for style in CSSSelector('style')(tree): |     for style in CSSSelector('style')(tree): | ||||||
|         style.getparent().remove(style) |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|     for c in tree.xpath('//comment()'): |     for c in tree.xpath('//comment()'): | ||||||
|         c.getparent().remove(c) |         parent = c.getparent() | ||||||
|  |  | ||||||
|  |         # comment with no parent does not impact produced text | ||||||
|  |         if parent is None: | ||||||
|  |             continue | ||||||
|  |  | ||||||
|  |         parent.remove(c) | ||||||
|  |  | ||||||
|     text   = "" |     text   = "" | ||||||
|     for el in tree.iter(): |     for el in tree.iter(): | ||||||
| @@ -159,6 +150,62 @@ def html_to_text(string): | |||||||
|     return _encode_utf8(retval) |     return _encode_utf8(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(string): | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         >>> "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. returns utf-8 encoded str (not unicode) | ||||||
|  |         3. if html can't be parsed returns None | ||||||
|  |     """ | ||||||
|  |     if isinstance(string, six.text_type): | ||||||
|  |         string = string.encode('utf8') | ||||||
|  |  | ||||||
|  |     s = _prepend_utf8_declaration(string) | ||||||
|  |     s = s.replace(b"\n", b"") | ||||||
|  |     tree = html_fromstring(s) | ||||||
|  |  | ||||||
|  |     if tree is None: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return html_tree_to_text(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_fromstring(s): | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         if html_too_big(s): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|  |     except Exception: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_document_fromstring(s): | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         if html_too_big(s): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|  |     except Exception: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def cssselect(expr, tree): | ||||||
|  |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_too_big(s): | ||||||
|  |     return s.count('<') > _MAX_TAGS_COUNT | ||||||
|  |  | ||||||
|  |  | ||||||
| def _contains_charset_spec(s): | def _contains_charset_spec(s): | ||||||
|     """Return True if the first 4KB contain charset spec |     """Return True if the first 4KB contain charset spec | ||||||
|     """ |     """ | ||||||
| @@ -183,6 +230,21 @@ def _encode_utf8(s): | |||||||
|     return s.encode('utf-8') if isinstance(s, six.text_type) else s |     return s.encode('utf-8') if isinstance(s, six.text_type) else s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _html5lib_parser(): | ||||||
|  |     """ | ||||||
|  |     html5lib is a pure-python library that conforms to the WHATWG HTML spec | ||||||
|  |     and is not vulnerable to certain attacks common for XML libraries | ||||||
|  |     """ | ||||||
|  |     return html5lib.HTMLParser( | ||||||
|  |         # build lxml tree | ||||||
|  |         html5lib.treebuilders.getTreeBuilder("lxml"), | ||||||
|  |         # remove namespace value from inside lxml.html.html5parser element tag | ||||||
|  |         # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" | ||||||
|  |         # instead of "div", throwing the algo off | ||||||
|  |         namespaceHTMLElements=False | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
| _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | ||||||
|                      b'charset=utf-8">') |                      b'charset=utf-8">') | ||||||
|  |  | ||||||
| @@ -190,5 +252,8 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | |||||||
| _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  |  | ||||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|  |  | ||||||
|  | # an extensive research shows that exceeding this limit | ||||||
|  | # might lead to excessive processing time | ||||||
|  | _MAX_TAGS_COUNT = 419 | ||||||
|   | |||||||
| @@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote(): | |||||||
|  |  | ||||||
| </blockquote>""" | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -62,7 +62,7 @@ def test_regular_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", |     eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -85,6 +85,7 @@ Reply | |||||||
|  |  | ||||||
|     reply = """ |     reply = """ | ||||||
| <html> | <html> | ||||||
|  | <head></head> | ||||||
| <body> | <body> | ||||||
| Reply | Reply | ||||||
|  |  | ||||||
| @@ -128,7 +129,7 @@ def test_gmail_quote(): | |||||||
|     </div> |     </div> | ||||||
|   </div> |   </div> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -139,7 +140,7 @@ def test_gmail_quote_compact(): | |||||||
|                '<div>Test</div>' \ |                '<div>Test</div>' \ | ||||||
|                '</div>' \ |                '</div>' \ | ||||||
|                '</div>' |                '</div>' | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -166,7 +167,7 @@ def test_unicode_in_reply(): | |||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""".encode("utf-8") | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply  Text<br></p><div><br></div>" |     eq_("<html><head></head><body>Reply  Text<br><div><br></div>" | ||||||
|         "</body></html>", |         "</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
| @@ -192,6 +193,7 @@ def test_blockquote_disclaimer(): | |||||||
|  |  | ||||||
|     stripped_html = """ |     stripped_html = """ | ||||||
| <html> | <html> | ||||||
|  |   <head></head> | ||||||
|   <body> |   <body> | ||||||
|   <div> |   <div> | ||||||
|     <div> |     <div> | ||||||
| @@ -223,7 +225,7 @@ def test_date_block(): | |||||||
|   </div> |   </div> | ||||||
| </div> | </div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br> | |||||||
| text | text | ||||||
| </div></div> | </div></div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block(): | |||||||
|  |  | ||||||
|   </div> |   </div> | ||||||
| </body>''' | </body>''' | ||||||
|     eq_('<html><body><div>Blah<br><br></div></body></html>', |     eq_('<html><head></head><body><div>Blah<br><br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -269,13 +271,13 @@ def test_reply_quotations_share_block(): | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_OLK_SRC_BODY_SECTION_stripped(): | def test_OLK_SRC_BODY_SECTION_stripped(): | ||||||
|     eq_('<html><body><div>Reply</div></body></html>', |     eq_('<html><head></head><body><div>Reply</div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) |             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_separated_by_hr(): | def test_reply_separated_by_hr(): | ||||||
|     eq_('<html><body><div>Hi<div>there</div></div></body></html>', |     eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
| @@ -296,7 +298,7 @@ Reply | |||||||
|   </div> |   </div> | ||||||
| </div> | </div> | ||||||
| ''' | ''' | ||||||
|     eq_('<html><body><p>Reply</p><div><hr></div></body></html>', |     eq_('<html><head></head><body>Reply<div><hr></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -356,7 +358,8 @@ def test_CRLF(): | |||||||
|     assert_false(symbol in extracted) |     assert_false(symbol in extracted) | ||||||
|     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|     msg_body = """Reply |     msg_body = """My | ||||||
|  | reply | ||||||
| <blockquote> | <blockquote> | ||||||
|  |  | ||||||
|   <div> |   <div> | ||||||
| @@ -371,8 +374,8 @@ def test_CRLF(): | |||||||
|     msg_body = msg_body.replace('\n', '\r\n') |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|     extracted = quotations.extract_from_html(msg_body) |     extracted = quotations.extract_from_html(msg_body) | ||||||
|     assert_false(symbol in extracted)     |     assert_false(symbol in extracted)     | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     # Keep new lines otherwise "My reply" becomes one word - "Myreply"  | ||||||
|         RE_WHITESPACE.sub('', extracted)) |     eq_("<html><head></head><body>My\nreply\n</body></html>", extracted) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_forwarded_msg(): | def test_gmail_forwarded_msg(): | ||||||
| @@ -382,7 +385,7 @@ def test_gmail_forwarded_msg(): | |||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_HTML_LEN', 1) | @patch.object(u, '_MAX_TAGS_COUNT', 4) | ||||||
| def test_too_large_html(): | def test_too_large_html(): | ||||||
|     msg_body = 'Reply' \ |     msg_body = 'Reply' \ | ||||||
|                '<div class="gmail_quote">' \ |                '<div class="gmail_quote">' \ | ||||||
| @@ -392,3 +395,27 @@ def test_too_large_html(): | |||||||
|                '</div>' |                '</div>' | ||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_readable_html_empty(): | ||||||
|  |     msg_body = """ | ||||||
|  | <blockquote> | ||||||
|  |   Reply | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |  | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html(): | ||||||
|  |     bad_html = "<html></html>" | ||||||
|  |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|   | |||||||
| @@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  | def test_pattern_on_date_polymail(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||||
|  |  | ||||||
|  | < | ||||||
|  | mailto:John Smith <johnsmith@gmail.com> | ||||||
|  | > wrote: | ||||||
|  | Test quoted data | ||||||
|  | """ | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_sent_from_samsung_smb_wrote(): | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
| @@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator): | |||||||
| -----{}----- | -----{}----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) |     eq_('Test reply', quotations.extract_from_plain( | ||||||
|  |         msg_body.format(six.text_type(original_message_indicator)))) | ||||||
|  |  | ||||||
| def test_english_original_message(): | def test_english_original_message(): | ||||||
|     _check_pattern_original_message('Original Message') |     _check_pattern_original_message('Original Message') | ||||||
| @@ -165,6 +179,17 @@ Test reply""" | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_android_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | ---- John Smith wrote ---- | ||||||
|  |  | ||||||
|  | > quoted | ||||||
|  | > text | ||||||
|  | """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_wraps_quotations(): | def test_reply_wraps_quotations(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  |  | ||||||
| @@ -696,3 +721,52 @@ def test_standard_replies(): | |||||||
|                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ |                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ | ||||||
|                 {'reply': reply_text, 'stripped': stripped_text, |                 {'reply': reply_text, 'stripped': stripped_text, | ||||||
|                  'fn': filename} |                  'fn': filename} | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_split_email(): | ||||||
|  |     msg = """From: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     To: Mr. Y | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |     Goodbye. | ||||||
|  |     From: Mr. Y | ||||||
|  |     To: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |  | ||||||
|  |     Hello. | ||||||
|  |  | ||||||
|  |         On 24th February 2016 at 09.32am, Conal wrote: | ||||||
|  |  | ||||||
|  |         Hey! | ||||||
|  |  | ||||||
|  |         On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: | ||||||
|  |         > Mohan, | ||||||
|  |         > | ||||||
|  |         > We have not yet migrated the systems. | ||||||
|  |         > | ||||||
|  |         > Dan | ||||||
|  |         > | ||||||
|  |         > > -----Original Message----- | ||||||
|  |         > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > Subject: Test | ||||||
|  |         > > From: bob@xxx.mailgun.org | ||||||
|  |         > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > | ||||||
|  |         > > Hi | ||||||
|  |         > > | ||||||
|  |         > > > From: bob@xxx.mailgun.org | ||||||
|  |         > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > > Subject: Test | ||||||
|  |         > > > Hi | ||||||
|  |         > > > | ||||||
|  |         > > | ||||||
|  |         > | ||||||
|  |         > | ||||||
|  | """ | ||||||
|  |     expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" | ||||||
|  |     markers = quotations.split_emails(msg) | ||||||
|  |     eq_(markers, expected_markers) | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ from . import * | |||||||
| from talon import utils as u | from talon import utils as u | ||||||
| import cchardet | import cchardet | ||||||
| import six | import six | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
| @@ -28,7 +29,9 @@ def test_unicode(): | |||||||
|  |  | ||||||
| def test_detect_encoding(): | def test_detect_encoding(): | ||||||
|     eq_ ('ascii', u.detect_encoding(b'qwe').lower()) |     eq_ ('ascii', u.detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) |     ok_ (u.detect_encoding( | ||||||
|  |         u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ | ||||||
|  |             'iso-8859-1', 'iso-8859-2']) | ||||||
|     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) |     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|     # fallback to utf-8 |     # fallback to utf-8 | ||||||
|     with patch.object(u.chardet, 'detect') as detect: |     with patch.object(u.chardet, 'detect') as detect: | ||||||
| @@ -38,7 +41,9 @@ def test_detect_encoding(): | |||||||
|  |  | ||||||
| def test_quick_detect_encoding(): | def test_quick_detect_encoding(): | ||||||
|     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) |     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) |     ok_ (u.quick_detect_encoding( | ||||||
|  |         u'Versi\xf3n'.encode('windows-1252')).lower() in [ | ||||||
|  |             'windows-1252', 'windows-1250']) | ||||||
|     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) |     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -107,3 +112,51 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | |||||||
| <p>TEXT 2 <!-- COMMENT 2 --></p> | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) |     eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_comment_no_parent(): | ||||||
|  |     s = "<!-- COMMENT 1 --> no comment" | ||||||
|  |     d = u.html_document_fromstring(s) | ||||||
|  |     eq_("no comment", u.html_tree_to_text(d)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) | ||||||
|  | def test_html_fromstring_exception(): | ||||||
|  |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_too_big', Mock()) | ||||||
|  | @patch.object(u.html5parser, 'fromstring') | ||||||
|  | def test_html_fromstring_too_big(fromstring): | ||||||
|  |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |     assert_false(fromstring.called) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u.html5parser, 'document_fromstring') | ||||||
|  | def test_html_document_fromstring_exception(document_fromstring): | ||||||
|  |     document_fromstring.side_effect = Exception() | ||||||
|  |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_too_big', Mock()) | ||||||
|  | @patch.object(u.html5parser, 'document_fromstring') | ||||||
|  | def test_html_document_fromstring_too_big(document_fromstring): | ||||||
|  |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |     assert_false(document_fromstring.called) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html_to_text(): | ||||||
|  |     bad_html = "one<br>two<br>three" | ||||||
|  |     eq_(None, u.html_to_text(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, '_MAX_TAGS_COUNT', 3) | ||||||
|  | def test_html_too_big(): | ||||||
|  |     eq_(False, u.html_too_big("<div></div>")) | ||||||
|  |     eq_(True, u.html_too_big("<div><span>Hi</span></div>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, '_MAX_TAGS_COUNT', 3) | ||||||
|  | def test_html_to_text(): | ||||||
|  |     eq_("Hello", u.html_to_text("<div>Hello</div>")) | ||||||
|  |     eq_(None, u.html_to_text("<div><span>Hi</span></div>")) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user