Compare commits
	
		
			55 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | b40835eca2 | ||
|  | b38562c7cc | ||
|  | 70e9fb415e | ||
|  | 64612099cd | ||
|  | 45c20f979d | ||
|  | 743c76f159 | ||
|  | bc5dad75d3 | ||
|  | 4acf05cf28 | ||
|  | f5f7264077 | ||
|  | 4364bebf38 | ||
|  | 15e61768f2 | ||
|  | dd0a0f5c4d | ||
|  | 086f5ba43b | ||
|  | e16dcf629e | ||
|  | f16ae5110b | ||
|  | ab5cbe5ec3 | ||
|  | be5da92f16 | ||
|  | 95954a65a0 | ||
|  | 0b55e8fa77 | ||
|  | 6f159e8959 | ||
|  | 5c413b4b00 | ||
|  | cca64d3ed1 | ||
|  | e11eaf6ff8 | ||
|  | 85a4c1d855 | ||
|  | 0f5e72623b | ||
|  | 061e549ad7 | ||
|  | 49d1a5d248 | ||
|  | 03d6b00db8 | ||
|  | a2eb0f7201 | ||
|  | 5c71a0ca07 | ||
|  | 489d16fad9 | ||
|  | a458707777 | ||
|  | a1d0a86305 | ||
|  | 29f1d21be7 | ||
|  | 34c5b526c3 | ||
|  | 3edb6578ba | ||
|  | 984c036b6e | ||
|  | a403ecb5c9 | ||
|  | a44713409c | ||
|  | 567467b8ed | ||
|  | 139edd6104 | ||
|  | e756d55abf | ||
|  | 015c8d2a78 | ||
|  | 5af846c13d | ||
|  | e69a9c7a54 | ||
|  | 23cb2a9a53 | ||
|  | b5e3397b88 | ||
|  | 5685a4055a | ||
|  | 97b72ef767 | ||
|  | 31489848be | ||
|  | e5988d447b | ||
|  | adfed748ce | ||
|  | 2444ba87c0 | ||
|  | 534457e713 | ||
|  | ea82a9730e | 
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -39,6 +39,8 @@ nosetests.xml | |||||||
| /.emacs.desktop | /.emacs.desktop | ||||||
| /.emacs.desktop.lock | /.emacs.desktop.lock | ||||||
| .elc | .elc | ||||||
|  | .idea | ||||||
|  | .cache | ||||||
| auto-save-list | auto-save-list | ||||||
| tramp | tramp | ||||||
| .\#* | .\#* | ||||||
|   | |||||||
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| @@ -29,7 +29,7 @@ class InstallCommand(install): | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.3.1', |       version='1.4.1', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -48,7 +48,7 @@ setup(name='talon', | |||||||
|           "regex>=1", |           "regex>=1", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild | ||||||
|           'chardet>=1.0.1', |           'chardet>=1.0.1', | ||||||
|           'cchardet>=0.3.5', |           'cchardet>=0.3.5', | ||||||
|           'cssselect', |           'cssselect', | ||||||
|   | |||||||
| @@ -94,6 +94,12 @@ def cut_microsoft_quote(html_message): | |||||||
|         #outlook 2007, 2010 (american) |         #outlook 2007, 2010 (american) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0in 0in 0in']|" |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|  |         #outlook 2013 (international) | ||||||
|  |         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" | ||||||
|  |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2013 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
|   | |||||||
| @@ -131,7 +131,7 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( | |||||||
|         'Oprindelig meddelelse', |         'Oprindelig meddelelse', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format( | ||||||
|     u'|'.join(( |     u'|'.join(( | ||||||
|         # "From" in different languages. |         # "From" in different languages. | ||||||
|         'From', 'Van', 'De', 'Von', 'Fra', u'Från', |         'From', 'Van', 'De', 'Von', 'Fra', u'Från', | ||||||
| @@ -139,6 +139,21 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? . | |||||||
|         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', |         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
|  | # ---- John Smith wrote ---- | ||||||
|  | RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | ||||||
|  |     u'|'.join(( | ||||||
|  |         # English | ||||||
|  |         'wrote' | ||||||
|  |     ))), re.I) | ||||||
|  |  | ||||||
|  | # Support polymail.io reply format | ||||||
|  | # On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||||
|  | # | ||||||
|  | # < | ||||||
|  | # mailto:John Smith <johnsmith@gmail.com> | ||||||
|  | # > wrote: | ||||||
|  | RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I) | ||||||
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
| @@ -154,16 +169,17 @@ SPLITTER_PATTERNS = [ | |||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:'), |                '( \S+){3,6}@\S+:'), | ||||||
|     # Sent from Samsung MobileName <address@example.com> wrote: |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|     re.compile('Sent from Samsung .*@.*> wrote') |     re.compile('Sent from Samsung .*@.*> wrote'), | ||||||
|  |     RE_ANDROID_WROTE, | ||||||
|  |     RE_POLYMAIL | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_LINK = re.compile('<(http://[^>]*)>') | RE_LINK = re.compile('<(http://[^>]*)>') | ||||||
| RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | ||||||
|  |  | ||||||
| RE_PARENTHESIS_LINK = re.compile("\(https?://") | RE_PARENTHESIS_LINK = re.compile("\(https?://") | ||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 4 | SPLITTER_MAX_LINES = 6 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
| # an extensive research shows that exceeding this limit | # an extensive research shows that exceeding this limit | ||||||
| # leads to excessive processing time | # leads to excessive processing time | ||||||
| @@ -172,6 +188,9 @@ MAX_HTML_LEN = 2794202 | |||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
|  |  | ||||||
|  | # Regular expression to identify if a line is a header. | ||||||
|  | RE_HEADER = re.compile(": ") | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_from(msg_body, content_type='text/plain'): | def extract_from(msg_body, content_type='text/plain'): | ||||||
|     try: |     try: | ||||||
| @@ -185,6 +204,19 @@ def extract_from(msg_body, content_type='text/plain'): | |||||||
|     return msg_body |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def remove_initial_spaces_and_mark_message_lines(lines): | ||||||
|  |     """ | ||||||
|  |     Removes the initial spaces in each line before marking message lines. | ||||||
|  |  | ||||||
|  |     This ensures headers can be identified if they are indented with spaces. | ||||||
|  |     """ | ||||||
|  |     i = 0 | ||||||
|  |     while i < len(lines): | ||||||
|  |         lines[i] = lines[i].lstrip(' ') | ||||||
|  |         i += 1 | ||||||
|  |     return mark_message_lines(lines) | ||||||
|  |  | ||||||
|  |  | ||||||
| def mark_message_lines(lines): | def mark_message_lines(lines): | ||||||
|     """Mark message lines with markers to distinguish quotation lines. |     """Mark message lines with markers to distinguish quotation lines. | ||||||
|  |  | ||||||
| @@ -287,9 +319,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|  |  | ||||||
|     Converts msg_body into a unicode. |     Converts msg_body into a unicode. | ||||||
|     """ |     """ | ||||||
|     # normalize links i.e. replace '<', '>' wrapping the link with some symbols |     msg_body = _replace_link_brackets(msg_body) | ||||||
|     # so that '>' closing the link couldn't be mistakenly taken for quotation |  | ||||||
|     # marker. |     msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) | ||||||
|  |  | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _replace_link_brackets(msg_body): | ||||||
|  |     """ | ||||||
|  |     Normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||||
|  |     so that '>' closing the link couldn't be mistakenly taken for quotation | ||||||
|  |     marker. | ||||||
|  |  | ||||||
|  |     Converts msg_body into a unicode | ||||||
|  |     """ | ||||||
|     if isinstance(msg_body, bytes): |     if isinstance(msg_body, bytes): | ||||||
|         msg_body = msg_body.decode('utf8') |         msg_body = msg_body.decode('utf8') | ||||||
|  |  | ||||||
| @@ -301,7 +345,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|             return "@@%s@@" % link.group(1) |             return "@@%s@@" % link.group(1) | ||||||
|  |  | ||||||
|     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) |     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) | ||||||
|  |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): | ||||||
|  |     """ | ||||||
|  |     Splits line in two if splitter pattern preceded by some text on the same | ||||||
|  |     line (done only for 'On <date> <person> wrote:' pattern. | ||||||
|  |     """ | ||||||
|     def splitter_wrapper(splitter): |     def splitter_wrapper(splitter): | ||||||
|         """Wraps splitter with new line""" |         """Wraps splitter with new line""" | ||||||
|         if splitter.start() and msg_body[splitter.start() - 1] != '\n': |         if splitter.start() and msg_body[splitter.start() - 1] != '\n': | ||||||
| @@ -386,9 +437,6 @@ def _extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|     if len(msg_body) > MAX_HTML_LEN: |  | ||||||
|         return msg_body |  | ||||||
|  |  | ||||||
|     if msg_body.strip() == b'': |     if msg_body.strip() == b'': | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
| @@ -453,6 +501,82 @@ def _extract_from_html(msg_body): | |||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def split_emails(msg): | ||||||
|  |     """ | ||||||
|  |     Given a message (which may consist of an email conversation thread with | ||||||
|  |     multiple emails), mark the lines to identify split lines, content lines and | ||||||
|  |     empty lines. | ||||||
|  |  | ||||||
|  |     Correct the split line markers inside header blocks. Header blocks are | ||||||
|  |     identified by the regular expression RE_HEADER. | ||||||
|  |  | ||||||
|  |     Return the corrected markers | ||||||
|  |     """ | ||||||
|  |     msg_body = _replace_link_brackets(msg) | ||||||
|  |  | ||||||
|  |     # don't process too long messages | ||||||
|  |     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||||
|  |     markers = remove_initial_spaces_and_mark_message_lines(lines) | ||||||
|  |  | ||||||
|  |     markers = _mark_quoted_email_splitlines(markers, lines) | ||||||
|  |  | ||||||
|  |     # we don't want splitlines in header blocks | ||||||
|  |     markers = _correct_splitlines_in_headers(markers, lines) | ||||||
|  |  | ||||||
|  |     return markers | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _mark_quoted_email_splitlines(markers, lines): | ||||||
|  |     """ | ||||||
|  |     When there are headers indented with '>' characters, this method will | ||||||
|  |     attempt to identify if the header is a splitline header. If it is, then we | ||||||
|  |     mark it with 's' instead of leaving it as 'm' and return the new markers. | ||||||
|  |     """ | ||||||
|  |     # Create a list of markers to easily alter specific characters | ||||||
|  |     markerlist = list(markers) | ||||||
|  |     for i, line in enumerate(lines): | ||||||
|  |         if markerlist[i] != 'm': | ||||||
|  |             continue | ||||||
|  |         for pattern in SPLITTER_PATTERNS: | ||||||
|  |             matcher = re.search(pattern, line) | ||||||
|  |             if matcher: | ||||||
|  |                 markerlist[i] = 's' | ||||||
|  |                 break | ||||||
|  |  | ||||||
|  |     return "".join(markerlist) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _correct_splitlines_in_headers(markers, lines): | ||||||
|  |     """ | ||||||
|  |     Corrects markers by removing splitlines deemed to be inside header blocks. | ||||||
|  |     """ | ||||||
|  |     updated_markers = "" | ||||||
|  |     i = 0 | ||||||
|  |     in_header_block = False | ||||||
|  |  | ||||||
|  |     for m in markers: | ||||||
|  |         # Only set in_header_block flag when we hit an 's' and line is a header | ||||||
|  |         if m == 's': | ||||||
|  |             if not in_header_block: | ||||||
|  |                 if bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |                     in_header_block = True | ||||||
|  |             else: | ||||||
|  |                 if QUOT_PATTERN.match(lines[i]): | ||||||
|  |                     m = 'm' | ||||||
|  |                 else: | ||||||
|  |                     m = 't' | ||||||
|  |  | ||||||
|  |         # If the line is not a header line, set in_header_block false. | ||||||
|  |         if not bool(re.search(RE_HEADER, lines[i])): | ||||||
|  |             in_header_block = False | ||||||
|  |  | ||||||
|  |         # Add the marker to the new updated markers string. | ||||||
|  |         updated_markers += m | ||||||
|  |         i += 1 | ||||||
|  |  | ||||||
|  |     return updated_markers | ||||||
|  |  | ||||||
|  |  | ||||||
| def _readable_text_empty(html_tree): | def _readable_text_empty(html_tree): | ||||||
|     return not bool(html_tree_to_text(html_tree).strip()) |     return not bool(html_tree_to_text(html_tree).strip()) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,15 +1,15 @@ | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter |  | ||||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, | from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||||
|                                        TOO_LONG_SIGNATURE_LINE) |                                        TOO_LONG_SIGNATURE_LINE) | ||||||
|  | from talon.utils import get_delimiter | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| # regex to fetch signature based on common signature words | # regex to fetch signature based on common signature words | ||||||
| RE_SIGNATURE = re.compile(r''' | RE_SIGNATURE = re.compile(r''' | ||||||
|                ( |                ( | ||||||
| @@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r''' | |||||||
|                ) |                ) | ||||||
|                ''', re.I | re.X | re.M | re.S) |                ''', re.I | re.X | re.M | re.S) | ||||||
|  |  | ||||||
|  |  | ||||||
| # signatures appended by phone email clients | # signatures appended by phone email clients | ||||||
| RE_PHONE_SIGNATURE = re.compile(r''' | RE_PHONE_SIGNATURE = re.compile(r''' | ||||||
|                ( |                ( | ||||||
| @@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r''' | |||||||
|                ) |                ) | ||||||
|                ''', re.I | re.X | re.M | re.S) |                ''', re.I | re.X | re.M | re.S) | ||||||
|  |  | ||||||
|  |  | ||||||
| # see _mark_candidate_indexes() for details | # see _mark_candidate_indexes() for details | ||||||
| # c - could be signature line | # c - could be signature line | ||||||
| # d - line starts with dashes (could be signature or list item) | # d - line starts with dashes (could be signature or list item) | ||||||
| @@ -112,7 +110,7 @@ def extract_signature(msg_body): | |||||||
|  |  | ||||||
|             return (stripped_body.strip(), |             return (stripped_body.strip(), | ||||||
|                     signature.strip()) |                     signature.strip()) | ||||||
|     except Exception as e: |     except Exception: | ||||||
|         log.exception('ERROR extracting signature') |         log.exception('ERROR extracting signature') | ||||||
|         return (msg_body, None) |         return (msg_body, None) | ||||||
|  |  | ||||||
| @@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate): | |||||||
|     'cdc' |     'cdc' | ||||||
|     """ |     """ | ||||||
|     # at first consider everything to be potential signature lines |     # at first consider everything to be potential signature lines | ||||||
|     markers = bytearray('c'*len(candidate)) |     markers = list('c' * len(candidate)) | ||||||
|  |  | ||||||
|     # mark lines starting from bottom up |     # mark lines starting from bottom up | ||||||
|     for i, line_idx in reversed(list(enumerate(candidate))): |     for i, line_idx in reversed(list(enumerate(candidate))): | ||||||
| @@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate): | |||||||
|             if line.startswith('-') and line.strip("-"): |             if line.startswith('-') and line.strip("-"): | ||||||
|                 markers[i] = 'd' |                 markers[i] = 'd' | ||||||
|  |  | ||||||
|     return markers |     return "".join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _process_marked_candidate_indexes(candidate, markers): | def _process_marked_candidate_indexes(candidate, markers): | ||||||
|   | |||||||
| @@ -1,16 +1,15 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re |  | ||||||
| import numpy | import numpy | ||||||
|  | import regex as re | ||||||
| from talon.signature.learning.featurespace import features, build_pattern |  | ||||||
| from talon.utils import get_delimiter |  | ||||||
| from talon.signature.bruteforce import get_signature_candidate | from talon.signature.bruteforce import get_signature_candidate | ||||||
|  | from talon.signature.learning.featurespace import features, build_pattern | ||||||
| from talon.signature.learning.helpers import has_signature | from talon.signature.learning.helpers import has_signature | ||||||
|  | from talon.utils import get_delimiter | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
| @@ -58,7 +57,7 @@ def extract(body, sender): | |||||||
|                 text = delimiter.join(text) |                 text = delimiter.join(text) | ||||||
|                 if text.strip(): |                 if text.strip(): | ||||||
|                     return (text, delimiter.join(signature)) |                     return (text, delimiter.join(signature)) | ||||||
|     except Exception: |     except Exception as e: | ||||||
|         log.exception('ERROR when extracting signature with classifiers') |         log.exception('ERROR when extracting signature with classifiers') | ||||||
|  |  | ||||||
|     return (body, None) |     return (body, None) | ||||||
| @@ -81,7 +80,7 @@ def _mark_lines(lines, sender): | |||||||
|     candidate = get_signature_candidate(lines) |     candidate = get_signature_candidate(lines) | ||||||
|  |  | ||||||
|     # at first consider everything to be text no signature |     # at first consider everything to be text no signature | ||||||
|     markers = bytearray('t'*len(lines)) |     markers = list('t' * len(lines)) | ||||||
|  |  | ||||||
|     # mark lines starting from bottom up |     # mark lines starting from bottom up | ||||||
|     # mark only lines that belong to candidate |     # mark only lines that belong to candidate | ||||||
| @@ -96,7 +95,7 @@ def _mark_lines(lines, sender): | |||||||
|         elif is_signature_line(line, sender, EXTRACTOR): |         elif is_signature_line(line, sender, EXTRACTOR): | ||||||
|             markers[j] = 's' |             markers[j] = 's' | ||||||
|  |  | ||||||
|     return markers |     return "".join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _process_marked_lines(lines, markers): | def _process_marked_lines(lines, markers): | ||||||
| @@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers): | |||||||
|         return (lines[:-signature.end()], lines[-signature.end():]) |         return (lines[:-signature.end()], lines[-signature.end():]) | ||||||
|  |  | ||||||
|     return (lines, None) |     return (lines, None) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -6,9 +6,10 @@ body belongs to the signature. | |||||||
| """ | """ | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.svm import LinearSVC |  | ||||||
| from sklearn.externals import joblib | from sklearn.externals import joblib | ||||||
|  | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
| def init(): | def init(): | ||||||
| @@ -29,4 +30,40 @@ def train(classifier, train_data_filename, save_classifier_filename=None): | |||||||
|  |  | ||||||
| def load(saved_classifier_filename, train_data_filename): | def load(saved_classifier_filename, train_data_filename): | ||||||
|     """Loads saved classifier. """ |     """Loads saved classifier. """ | ||||||
|  |     try: | ||||||
|         return joblib.load(saved_classifier_filename) |         return joblib.load(saved_classifier_filename) | ||||||
|  |     except Exception: | ||||||
|  |         import sys | ||||||
|  |         if sys.version_info > (3, 0): | ||||||
|  |             return load_compat(saved_classifier_filename) | ||||||
|  |  | ||||||
|  |         raise | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def load_compat(saved_classifier_filename): | ||||||
|  |     import os | ||||||
|  |     import pickle | ||||||
|  |     import tempfile | ||||||
|  |  | ||||||
|  |     # we need to switch to the data path to properly load the related _xx.npy files | ||||||
|  |     cwd = os.getcwd() | ||||||
|  |     os.chdir(os.path.dirname(saved_classifier_filename)) | ||||||
|  |  | ||||||
|  |     # convert encoding using pick.load and write to temp file which we'll tell joblib to use | ||||||
|  |     pickle_file = open(saved_classifier_filename, 'rb') | ||||||
|  |     classifier = pickle.load(pickle_file, encoding='latin1') | ||||||
|  |  | ||||||
|  |     try: | ||||||
|  |         # save our conversion if permissions allow | ||||||
|  |         joblib.dump(classifier, saved_classifier_filename) | ||||||
|  |     except Exception: | ||||||
|  |         # can't write to classifier, use a temp file | ||||||
|  |         tmp = tempfile.SpooledTemporaryFile() | ||||||
|  |         joblib.dump(classifier, tmp) | ||||||
|  |         saved_classifier_filename = tmp | ||||||
|  |  | ||||||
|  |     # important, use joblib.load before switching back to original cwd | ||||||
|  |     jb_classifier = joblib.load(saved_classifier_filename) | ||||||
|  |     os.chdir(cwd) | ||||||
|  |  | ||||||
|  |     return jb_classifier | ||||||
|   | |||||||
| @@ -17,13 +17,14 @@ suffix which should be `_sender`. | |||||||
| """ | """ | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from talon.signature.constants import SIGNATURE_MAX_LINES | ||||||
| from talon.signature.learning.featurespace import build_pattern, features | from talon.signature.learning.featurespace import build_pattern, features | ||||||
| from six.moves import range |  | ||||||
|  |  | ||||||
|  |  | ||||||
| SENDER_SUFFIX = '_sender' | SENDER_SUFFIX = '_sender' | ||||||
| BODY_SUFFIX = '_body' | BODY_SUFFIX = '_body' | ||||||
| @@ -57,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True): | |||||||
|     algorithm: |     algorithm: | ||||||
|     >>> parse_msg_sender(filename, False) |     >>> parse_msg_sender(filename, False) | ||||||
|     """ |     """ | ||||||
|  |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|     sender, msg = None, None |     sender, msg = None, None | ||||||
|     if os.path.isfile(filename) and not is_sender_filename(filename): |     if os.path.isfile(filename) and not is_sender_filename(filename): | ||||||
|         with open(filename) as f: |         with open(filename, **kwargs) as f: | ||||||
|             msg = f.read() |             msg = f.read() | ||||||
|             sender = u'' |             sender = u'' | ||||||
|             if sender_known: |             if sender_known: | ||||||
|   | |||||||
| @@ -1,19 +1,18 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| import logging |  | ||||||
| from random import shuffle | from random import shuffle | ||||||
| import chardet |  | ||||||
| import cchardet | import cchardet | ||||||
| import regex as re | import chardet | ||||||
|  |  | ||||||
| from lxml.html import html5parser |  | ||||||
| from lxml.cssselect import CSSSelector |  | ||||||
|  |  | ||||||
| import html5lib | import html5lib | ||||||
|  | import regex as re | ||||||
|  | import six | ||||||
|  | from lxml.cssselect import CSSSelector | ||||||
|  | from lxml.html import html5parser | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
| import six |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): | def safe_format(format_string, *args, **kwargs): | ||||||
| @@ -177,7 +176,12 @@ def html_to_text(string): | |||||||
| def html_fromstring(s): | def html_fromstring(s): | ||||||
|     """Parse html tree from string. Return None if the string can't be parsed. |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|     """ |     """ | ||||||
|  |     if isinstance(s, six.text_type): | ||||||
|  |         s = s.encode('utf8') | ||||||
|     try: |     try: | ||||||
|  |         if html_too_big(s): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|         return html5parser.fromstring(s, parser=_html5lib_parser()) |         return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |     except Exception: | ||||||
|         pass |         pass | ||||||
| @@ -186,7 +190,12 @@ def html_fromstring(s): | |||||||
| def html_document_fromstring(s): | def html_document_fromstring(s): | ||||||
|     """Parse html tree from string. Return None if the string can't be parsed. |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|     """ |     """ | ||||||
|  |     if isinstance(s, six.text_type): | ||||||
|  |         s = s.encode('utf8') | ||||||
|     try: |     try: | ||||||
|  |         if html_too_big(s): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|         return html5parser.document_fromstring(s, parser=_html5lib_parser()) |         return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|     except Exception: |     except Exception: | ||||||
|         pass |         pass | ||||||
| @@ -196,6 +205,12 @@ def cssselect(expr, tree): | |||||||
|     return CSSSelector(expr)(tree) |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_too_big(s): | ||||||
|  |     if isinstance(s, six.text_type): | ||||||
|  |         s = s.encode('utf8') | ||||||
|  |     return s.count(b'<') > _MAX_TAGS_COUNT | ||||||
|  |  | ||||||
|  |  | ||||||
| def _contains_charset_spec(s): | def _contains_charset_spec(s): | ||||||
|     """Return True if the first 4KB contain charset spec |     """Return True if the first 4KB contain charset spec | ||||||
|     """ |     """ | ||||||
| @@ -238,8 +253,11 @@ def _html5lib_parser(): | |||||||
| _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | ||||||
|                      b'charset=utf-8">') |                      b'charset=utf-8">') | ||||||
|  |  | ||||||
|  |  | ||||||
| _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|  |  | ||||||
|  | # an extensive research shows that exceeding this limit | ||||||
|  | # might lead to excessive processing time | ||||||
|  | _MAX_TAGS_COUNT = 419 | ||||||
|   | |||||||
| @@ -1,13 +1,13 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| from . import * |  | ||||||
| from . fixtures import * |  | ||||||
|  |  | ||||||
| import regex as re | # noinspection PyUnresolvedReferences | ||||||
|  | import re | ||||||
|  |  | ||||||
| from talon import quotations, utils as u | from talon import quotations, utils as u | ||||||
|  | from . import * | ||||||
|  | from .fixtures import * | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") | RE_DOUBLE_WHITESPACE = re.compile("\s") | ||||||
| @@ -303,7 +303,12 @@ Reply | |||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|  |     f = open(filename, **kwargs) | ||||||
|  |  | ||||||
|     msg_body = f.read() |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
| @@ -385,7 +390,7 @@ def test_gmail_forwarded_msg(): | |||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_HTML_LEN', 1) | @patch.object(u, '_MAX_TAGS_COUNT', 4) | ||||||
| def test_too_large_html(): | def test_too_large_html(): | ||||||
|     msg_body = 'Reply' \ |     msg_body = 'Reply' \ | ||||||
|                '<div class="gmail_quote">' \ |                '<div class="gmail_quote">' \ | ||||||
|   | |||||||
| @@ -1,16 +1,16 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| from .. import * |  | ||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from talon.signature.learning import dataset |  | ||||||
| from talon import signature |  | ||||||
| from talon.signature import extraction as e |  | ||||||
| from talon.signature import bruteforce |  | ||||||
| from six.moves import range | from six.moves import range | ||||||
|  |  | ||||||
|  | from talon.signature import bruteforce, extraction, extract | ||||||
|  | from talon.signature import extraction as e | ||||||
|  | from talon.signature.learning import dataset | ||||||
|  | from .. import * | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_message_shorter_SIGNATURE_MAX_LINES(): | def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||||
|     sender = "bob@foo.bar" |     sender = "bob@foo.bar" | ||||||
| @@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES(): | |||||||
|  |  | ||||||
| Thanks in advance, | Thanks in advance, | ||||||
| Bob""" | Bob""" | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:2]), text) |     eq_('\n'.join(body.splitlines()[:2]), text) | ||||||
|     eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) |     eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_messages_longer_SIGNATURE_MAX_LINES(): | def test_messages_longer_SIGNATURE_MAX_LINES(): | ||||||
|  |     import sys | ||||||
|  |     kwargs = {} | ||||||
|  |     if sys.version_info > (3, 0): | ||||||
|  |         kwargs["encoding"] = "utf8" | ||||||
|  |  | ||||||
|     for filename in os.listdir(STRIPPED): |     for filename in os.listdir(STRIPPED): | ||||||
|         filename = os.path.join(STRIPPED, filename) |         filename = os.path.join(STRIPPED, filename) | ||||||
|         if not filename.endswith('_body'): |         if not filename.endswith('_body'): | ||||||
|             continue |             continue | ||||||
|         sender, body = dataset.parse_msg_sender(filename) |         sender, body = dataset.parse_msg_sender(filename) | ||||||
|         text, extracted_signature = signature.extract(body, sender) |         text, extracted_signature = extract(body, sender) | ||||||
|         extracted_signature = extracted_signature or '' |         extracted_signature = extracted_signature or '' | ||||||
|         with open(filename[:-len('body')] + 'signature') as ms: |         with open(filename[:-len('body')] + 'signature', **kwargs) as ms: | ||||||
|             msg_signature = ms.read() |             msg_signature = ms.read() | ||||||
|             eq_(msg_signature.strip(), extracted_signature.strip()) |             eq_(msg_signature.strip(), extracted_signature.strip()) | ||||||
|             stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] |             stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] | ||||||
|             eq_(stripped_msg.strip(), text.strip()) |             eq_(stripped_msg.strip(), text.strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -47,7 +52,7 @@ Thanks in advance, | |||||||
| some text which doesn't seem to be a signature at all | some text which doesn't seem to be a signature at all | ||||||
| Bob""" | Bob""" | ||||||
|  |  | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:2]), text) |     eq_('\n'.join(body.splitlines()[:2]), text) | ||||||
|     eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) |     eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) | ||||||
|  |  | ||||||
| @@ -60,7 +65,7 @@ Thanks in advance, | |||||||
| some long text here which doesn't seem to be a signature at all | some long text here which doesn't seem to be a signature at all | ||||||
| Bob""" | Bob""" | ||||||
|  |  | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|     eq_('\n'.join(body.splitlines()[:-1]), text) |     eq_('\n'.join(body.splitlines()[:-1]), text) | ||||||
|     eq_('Bob', extracted_signature) |     eq_('Bob', extracted_signature) | ||||||
|  |  | ||||||
| @@ -68,13 +73,13 @@ Bob""" | |||||||
|  |  | ||||||
|     some *long* text here which doesn't seem to be a signature at all |     some *long* text here which doesn't seem to be a signature at all | ||||||
|     """ |     """ | ||||||
|     ((body, None), signature.extract(body, "david@example.com")) |     ((body, None), extract(body, "david@example.com")) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_basic(): | def test_basic(): | ||||||
|     msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' |     msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' | ||||||
|     eq_(('Blah', '--\r\n\r\nSergey Obukhov'), |     eq_(('Blah', '--\r\n\r\nSergey Obukhov'), | ||||||
|         signature.extract(msg_body, 'Sergey')) |         extract(msg_body, 'Sergey')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_capitalized(): | def test_capitalized(): | ||||||
| @@ -99,7 +104,7 @@ Doe Inc | |||||||
| Doe Inc | Doe Inc | ||||||
| 555-531-7967""" | 555-531-7967""" | ||||||
|  |  | ||||||
|     eq_(sig, signature.extract(msg_body, 'Doe')[1]) |     eq_(sig, extract(msg_body, 'Doe')[1]) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_over_2_text_lines_after_signature(): | def test_over_2_text_lines_after_signature(): | ||||||
| @@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature(): | |||||||
|     2 non signature lines in the end |     2 non signature lines in the end | ||||||
|     It's not signature |     It's not signature | ||||||
|     """ |     """ | ||||||
|     text, extracted_signature = signature.extract(body, "Bob") |     text, extracted_signature = extract(body, "Bob") | ||||||
|     eq_(extracted_signature, None) |     eq_(extracted_signature, None) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_no_signature(): | def test_no_signature(): | ||||||
|     sender, body = "bob@foo.bar", "Hello" |     sender, body = "bob@foo.bar", "Hello" | ||||||
|     eq_((body, None), signature.extract(body, sender)) |     eq_((body, None), extract(body, sender)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_handles_unicode(): | def test_handles_unicode(): | ||||||
|     sender, body = dataset.parse_msg_sender(UNICODE_MSG) |     sender, body = dataset.parse_msg_sender(UNICODE_MSG) | ||||||
|     text, extracted_signature = signature.extract(body, sender) |     text, extracted_signature = extract(body, sender) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(signature.extraction, 'has_signature') | @patch.object(extraction, 'has_signature') | ||||||
| def test_signature_extract_crash(has_signature): | def test_signature_extract_crash(has_signature): | ||||||
|     has_signature.side_effect = Exception('Bam!') |     has_signature.side_effect = Exception('Bam!') | ||||||
|     msg_body = u'Blah\r\n--\r\n\r\nСергей' |     msg_body = u'Blah\r\n--\r\n\r\nСергей' | ||||||
|     eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) |     eq_((msg_body, None), extract(msg_body, 'Сергей')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_mark_lines(): | def test_mark_lines(): | ||||||
|   | |||||||
| @@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|  |  | ||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  | def test_pattern_on_date_polymail(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||||
|  |  | ||||||
|  | < | ||||||
|  | mailto:John Smith <johnsmith@gmail.com> | ||||||
|  | > wrote: | ||||||
|  | Test quoted data | ||||||
|  | """ | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_sent_from_samsung_smb_wrote(): | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
| @@ -142,7 +155,8 @@ def _check_pattern_original_message(original_message_indicator): | |||||||
| -----{}----- | -----{}----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) |     eq_('Test reply', quotations.extract_from_plain( | ||||||
|  |         msg_body.format(six.text_type(original_message_indicator)))) | ||||||
|  |  | ||||||
| def test_english_original_message(): | def test_english_original_message(): | ||||||
|     _check_pattern_original_message('Original Message') |     _check_pattern_original_message('Original Message') | ||||||
| @@ -165,6 +179,17 @@ Test reply""" | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_android_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | ---- John Smith wrote ---- | ||||||
|  |  | ||||||
|  | > quoted | ||||||
|  | > text | ||||||
|  | """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_wraps_quotations(): | def test_reply_wraps_quotations(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  |  | ||||||
| @@ -696,3 +721,52 @@ def test_standard_replies(): | |||||||
|                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ |                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ | ||||||
|                 {'reply': reply_text, 'stripped': stripped_text, |                 {'reply': reply_text, 'stripped': stripped_text, | ||||||
|                  'fn': filename} |                  'fn': filename} | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_split_email(): | ||||||
|  |     msg = """From: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     To: Mr. Y | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |     Goodbye. | ||||||
|  |     From: Mr. Y | ||||||
|  |     To: Mr. X | ||||||
|  |     Date: 24 February 2016 | ||||||
|  |     Subject: Hi | ||||||
|  |     Attachments: none | ||||||
|  |  | ||||||
|  |     Hello. | ||||||
|  |  | ||||||
|  |         On 24th February 2016 at 09.32am, Conal wrote: | ||||||
|  |  | ||||||
|  |         Hey! | ||||||
|  |  | ||||||
|  |         On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: | ||||||
|  |         > Mohan, | ||||||
|  |         > | ||||||
|  |         > We have not yet migrated the systems. | ||||||
|  |         > | ||||||
|  |         > Dan | ||||||
|  |         > | ||||||
|  |         > > -----Original Message----- | ||||||
|  |         > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > Subject: Test | ||||||
|  |         > > From: bob@xxx.mailgun.org | ||||||
|  |         > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > | ||||||
|  |         > > Hi | ||||||
|  |         > > | ||||||
|  |         > > > From: bob@xxx.mailgun.org | ||||||
|  |         > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||||
|  |         > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||||
|  |         > > > Subject: Test | ||||||
|  |         > > > Hi | ||||||
|  |         > > > | ||||||
|  |         > > | ||||||
|  |         > | ||||||
|  |         > | ||||||
|  | """ | ||||||
|  |     expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" | ||||||
|  |     markers = quotations.split_emails(msg) | ||||||
|  |     eq_(markers, expected_markers) | ||||||
|   | |||||||
| @@ -1,12 +1,12 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  |  | ||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
| from . import * |  | ||||||
|  |  | ||||||
| from talon import utils as u |  | ||||||
| import cchardet | import cchardet | ||||||
| import six | import six | ||||||
| from lxml import html |  | ||||||
|  | from talon import utils as u | ||||||
|  | from . import * | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
| @@ -16,31 +16,35 @@ def test_get_delimiter(): | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_unicode(): | def test_unicode(): | ||||||
|     eq_ (u'hi', u.to_unicode('hi')) |     eq_(u'hi', u.to_unicode('hi')) | ||||||
|     eq_ (type(u.to_unicode('hi')), six.text_type ) |     eq_(type(u.to_unicode('hi')), six.text_type) | ||||||
|     eq_ (type(u.to_unicode(u'hi')), six.text_type ) |     eq_(type(u.to_unicode(u'hi')), six.text_type) | ||||||
|     eq_ (type(u.to_unicode('привет')), six.text_type ) |     eq_(type(u.to_unicode('привет')), six.text_type) | ||||||
|     eq_ (type(u.to_unicode(u'привет')), six.text_type ) |     eq_(type(u.to_unicode(u'привет')), six.text_type) | ||||||
|     eq_ (u"привет", u.to_unicode('привет')) |     eq_(u"привет", u.to_unicode('привет')) | ||||||
|     eq_ (u"привет", u.to_unicode(u'привет')) |     eq_(u"привет", u.to_unicode(u'привет')) | ||||||
|     # some latin1 stuff |     # some latin1 stuff | ||||||
|     eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) |     eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_detect_encoding(): | def test_detect_encoding(): | ||||||
|     eq_ ('ascii', u.detect_encoding(b'qwe').lower()) |     eq_('ascii', u.detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) |     ok_(u.detect_encoding( | ||||||
|     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) |         u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ | ||||||
|  |             'iso-8859-1', 'iso-8859-2']) | ||||||
|  |     eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|     # fallback to utf-8 |     # fallback to utf-8 | ||||||
|     with patch.object(u.chardet, 'detect') as detect: |     with patch.object(u.chardet, 'detect') as detect: | ||||||
|         detect.side_effect = Exception |         detect.side_effect = Exception | ||||||
|         eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) |         eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quick_detect_encoding(): | def test_quick_detect_encoding(): | ||||||
|     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) |     eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) |     ok_(u.quick_detect_encoding( | ||||||
|     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) |         u'Versi\xf3n'.encode('windows-1252')).lower() in [ | ||||||
|  |             'windows-1252', 'windows-1250']) | ||||||
|  |     eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(cchardet, 'detect') | @patch.object(cchardet, 'detect') | ||||||
| @@ -80,7 +84,7 @@ Haha | |||||||
|     eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8')) |     eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8')) | ||||||
|  |  | ||||||
|     html = '<body><br/><br/>Hi</body>' |     html = '<body><br/><br/>Hi</body>' | ||||||
|     eq_ (b'Hi', u.html_to_text(html)) |     eq_(b'Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """Hi |     html = """Hi | ||||||
| <style type="text/css"> | <style type="text/css"> | ||||||
| @@ -100,7 +104,7 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | |||||||
|  |  | ||||||
| } | } | ||||||
| </style>""" | </style>""" | ||||||
|     eq_ (b'Hi', u.html_to_text(html)) |     eq_(b'Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """<div> |     html = """<div> | ||||||
| <!-- COMMENT 1 --> | <!-- COMMENT 1 --> | ||||||
| @@ -111,9 +115,9 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_comment_no_parent(): | def test_comment_no_parent(): | ||||||
|     s = "<!-- COMMENT 1 --> no comment" |     s = b'<!-- COMMENT 1 --> no comment' | ||||||
|     d = u.html_document_fromstring(s) |     d = u.html_document_fromstring(s) | ||||||
|     eq_("no comment", u.html_tree_to_text(d)) |     eq_(b"no comment", u.html_tree_to_text(d)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) | @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) | ||||||
| @@ -121,13 +125,39 @@ def test_html_fromstring_exception(): | |||||||
|     eq_(None, u.html_fromstring("<html></html>")) |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_too_big', Mock()) | ||||||
|  | @patch.object(u.html5parser, 'fromstring') | ||||||
|  | def test_html_fromstring_too_big(fromstring): | ||||||
|  |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |     assert_false(fromstring.called) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u.html5parser, 'document_fromstring') | @patch.object(u.html5parser, 'document_fromstring') | ||||||
| def test_html_document_fromstring_exception(document_fromstring): | def test_html_document_fromstring_exception(document_fromstring): | ||||||
|     document_fromstring.side_effect = Exception() |     document_fromstring.side_effect = Exception() | ||||||
|     eq_(None, u.html_document_fromstring("<html></html>")) |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_too_big', Mock()) | ||||||
|  | @patch.object(u.html5parser, 'document_fromstring') | ||||||
|  | def test_html_document_fromstring_too_big(document_fromstring): | ||||||
|  |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |     assert_false(document_fromstring.called) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(u, 'html_fromstring', Mock(return_value=None)) | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
| def test_bad_html_to_text(): | def test_bad_html_to_text(): | ||||||
|     bad_html = "one<br>two<br>three" |     bad_html = "one<br>two<br>three" | ||||||
|     eq_(None, u.html_to_text(bad_html)) |     eq_(None, u.html_to_text(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, '_MAX_TAGS_COUNT', 3) | ||||||
|  | def test_html_too_big(): | ||||||
|  |     eq_(False, u.html_too_big("<div></div>")) | ||||||
|  |     eq_(True, u.html_too_big("<div><span>Hi</span></div>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, '_MAX_TAGS_COUNT', 3) | ||||||
|  | def test_html_to_text(): | ||||||
|  |     eq_(b"Hello", u.html_to_text("<div>Hello</div>")) | ||||||
|  |     eq_(None, u.html_to_text("<div><span>Hi</span></div>")) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user