Compare commits
	
		
			37 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 2ecd9779fc | ||
|  | 5a7047233e | ||
|  | 999e9c3725 | ||
|  | f6940fe878 | ||
|  | ce65ff8fc8 | ||
|  | eed6784f25 | ||
|  | 3d9ae356ea | ||
|  | f688d074b5 | ||
|  | 41457d8fbd | ||
|  | 2c416ecc0e | ||
|  | 3ab33c557b | ||
|  | 8db05f4950 | ||
|  | 3d5bc82a03 | ||
|  | 14e3a0d80b | ||
|  | fcd9e2716a | ||
|  | d62d633215 | ||
|  | 3b0c9273c1 | ||
|  | e4c1c11845 | ||
|  | ae508fe0e5 | ||
|  | 2cb9b5399c | ||
|  | 134c47f515 | ||
|  | d328c9d128 | ||
|  | 77b62b0fef | ||
|  | ad09b18f3f | ||
|  | b5af9c03a5 | ||
|  | 176c7e7532 | ||
|  | 15976888a0 | ||
|  | 9bee502903 | ||
|  | e3cb8dc3e6 | ||
|  | 385285e5de | ||
|  | 127771dac9 | ||
|  | cc98befba5 | ||
|  | 567549cba4 | ||
|  | 76c4f49be8 | ||
|  | d9d89dc250 | ||
|  | 390b0a6dc9 | ||
|  | ed6b861a47 | 
							
								
								
									
										8
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										8
									
								
								setup.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from setuptools import setup, find_packages | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.0.3', |       version='1.2.2', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -14,12 +14,14 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml==2.3.3", |           "lxml>=2.3.3", | ||||||
|           "regex>=1", |           "regex>=1", | ||||||
|           "html2text", |  | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||||
|  |           'chardet>=1.0.1', | ||||||
|  |           'cchardet>=0.3.5', | ||||||
|  |           'cssselect' | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|   | |||||||
| @@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('.gmail_quote') |     gmail_quote = html_message.cssselect('div.gmail_quote') | ||||||
|     if gmail_quote: |     if gmail_quote: | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
| @@ -138,9 +138,14 @@ def cut_by_id(html_message): | |||||||
|  |  | ||||||
|  |  | ||||||
| def cut_blockquote(html_message): | def cut_blockquote(html_message): | ||||||
|     ''' Cuts blockquote with wrapping elements. ''' |     ''' Cuts the last non-nested blockquote with wrapping elements.''' | ||||||
|     quote = html_message.find('.//blockquote') |     quote = html_message.xpath( | ||||||
|     if quote is not None: |         '(.//blockquote)' | ||||||
|  |         '[not(@class="gmail_quote") and not(ancestor::blockquote)]' | ||||||
|  |         '[last()]') | ||||||
|  |  | ||||||
|  |     if quote: | ||||||
|  |         quote = quote[0] | ||||||
|         quote.getparent().remove(quote) |         quote.getparent().remove(quote) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -154,13 +159,25 @@ def cut_from_block(html_message): | |||||||
|  |  | ||||||
|     if block: |     if block: | ||||||
|         block = block[-1] |         block = block[-1] | ||||||
|  |         parent_div = None | ||||||
|         while block.getparent() is not None: |         while block.getparent() is not None: | ||||||
|             if block.tag == 'div': |             if block.tag == 'div': | ||||||
|  |                 parent_div = block | ||||||
|  |                 break | ||||||
|  |             block = block.getparent() | ||||||
|  |         if parent_div is not None: | ||||||
|  |             maybe_body = parent_div.getparent() | ||||||
|  |             # In cases where removing this enclosing div will remove all | ||||||
|  |             # content, we should assume the quote is not enclosed in a tag. | ||||||
|  |             parent_div_is_all_content = ( | ||||||
|  |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|  |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 block.getparent().remove(block) | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|                 block = block.getparent() |             return False | ||||||
|     else: |  | ||||||
|     # handle the case when From: block goes right after e.g. <hr> |     # handle the case when From: block goes right after e.g. <hr> | ||||||
|     # and not enclosed in some tag |     # and not enclosed in some tag | ||||||
|     block = html_message.xpath( |     block = html_message.xpath( | ||||||
|   | |||||||
| @@ -10,9 +10,8 @@ import logging | |||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | from lxml import html, etree | ||||||
| import html2text |  | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter | from talon.utils import get_delimiter, html_to_text | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -22,7 +21,7 @@ log = logging.getLogger(__name__) | |||||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
| RE_ON_DATE_SMB_WROTE = re.compile( | RE_ON_DATE_SMB_WROTE = re.compile( | ||||||
|     u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( |     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( | ||||||
|         # Beginning of the line |         # Beginning of the line | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
|             # English |             # English | ||||||
| @@ -32,7 +31,13 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             # Polish |             # Polish | ||||||
|             'W dniu', |             'W dniu', | ||||||
|             # Dutch |             # Dutch | ||||||
|             'Op' |             'Op', | ||||||
|  |             # German | ||||||
|  |             'Am', | ||||||
|  |             # Norwegian | ||||||
|  |             u'På', | ||||||
|  |             # Swedish, Danish | ||||||
|  |             'Den', | ||||||
|         )), |         )), | ||||||
|         # Date and sender separator |         # Date and sender separator | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
| @@ -50,18 +55,28 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             # Polish |             # Polish | ||||||
|             u'napisał', |             u'napisał', | ||||||
|             # Dutch |             # Dutch | ||||||
|             'schreef','verzond','geschreven' |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb', | ||||||
|  |             # Norwegian, Swedish | ||||||
|  |             'skrev', | ||||||
|         )) |         )) | ||||||
|     )) |     )) | ||||||
| # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | ||||||
| RE_ON_DATE_WROTE_SMB = re.compile( | RE_ON_DATE_WROTE_SMB = re.compile( | ||||||
|     u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format( |     u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( | ||||||
|         # Beginning of the line |         # Beginning of the line | ||||||
|  |         u'|'.join(( | ||||||
|         	'Op', |         	'Op', | ||||||
|  |         	#German | ||||||
|  |         	'Am' | ||||||
|  |         )), | ||||||
|         # Ending of the line |         # Ending of the line | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
|             # Dutch |             # Dutch | ||||||
|             'schreef','verzond','geschreven' |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb' | ||||||
|         )) |         )) | ||||||
|     ) |     ) | ||||||
|     ) |     ) | ||||||
| @@ -92,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile( | |||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
|             s |             (?:se*)+ | ||||||
|             | |             | | ||||||
|             (?:me*){2,} |             (?:me*){2,} | ||||||
|         ) |         ) | ||||||
| @@ -115,18 +130,23 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( | |||||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | ||||||
|     u'|'.join(( |     u'|'.join(( | ||||||
|         # "From" in different languages. |         # "From" in different languages. | ||||||
|         'From', 'Van', 'De', 'Von', 'Fra', |         'From', 'Van', 'De', 'Von', 'Fra', u'Från', | ||||||
|         # "Date" in different languages. |         # "Date" in different languages. | ||||||
|         'Date', 'Datum', u'Envoyé' |         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     # <date> <person> |  | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), |  | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
|     RE_ON_DATE_WROTE_SMB, |     RE_ON_DATE_WROTE_SMB, | ||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|  |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|  |     # bob@xxx.mailgun.org> написал: | ||||||
|  |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), | ||||||
|  |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|  |     # bob@example.com>: | ||||||
|  |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), | ||||||
|  |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:') |                '( \S+){3,6}@\S+:') | ||||||
|     ] |     ] | ||||||
| @@ -181,6 +201,7 @@ def mark_message_lines(lines): | |||||||
|         else: |         else: | ||||||
|             # in case splitter is spread across several lines |             # in case splitter is spread across several lines | ||||||
|             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) |             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) | ||||||
|  |  | ||||||
|             if splitter: |             if splitter: | ||||||
|                 # append as many splitter markers as lines in splitter |                 # append as many splitter markers as lines in splitter | ||||||
|                 splitter_lines = splitter.group().splitlines() |                 splitter_lines = splitter.group().splitlines() | ||||||
| @@ -293,12 +314,8 @@ def extract_from_plain(msg_body): | |||||||
|  |  | ||||||
|     delimiter = get_delimiter(msg_body) |     delimiter = get_delimiter(msg_body) | ||||||
|     msg_body = preprocess(msg_body, delimiter) |     msg_body = preprocess(msg_body, delimiter) | ||||||
|     lines = msg_body.splitlines() |  | ||||||
|  |  | ||||||
|     # don't process too long messages |     # don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||||
|         return stripped_text |  | ||||||
|  |  | ||||||
|     markers = mark_message_lines(lines) |     markers = mark_message_lines(lines) | ||||||
|     lines = process_marked_lines(lines, markers) |     lines = process_marked_lines(lines, markers) | ||||||
|  |  | ||||||
| @@ -324,43 +341,27 @@ def extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     if msg_body.strip() == '': |     if msg_body.strip() == '': | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|  |     msg_body = msg_body.replace('\r\n', '').replace('\n', '') | ||||||
|     html_tree = html.document_fromstring( |     html_tree = html.document_fromstring( | ||||||
|         msg_body, |         msg_body, | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |         parser=html.HTMLParser(encoding="utf-8") | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|                       html_quotations.cut_from_block(html_tree) |                       html_quotations.cut_from_block(html_tree) | ||||||
|                       ) |                       ) | ||||||
|  |  | ||||||
|     html_tree_copy = deepcopy(html_tree) |     html_tree_copy = deepcopy(html_tree) | ||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False] * number_of_checkpoints |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     msg_with_checkpoints = html.tostring(html_tree) | ||||||
|  |     plain_text = html_to_text(msg_with_checkpoints) | ||||||
|     h = html2text.HTML2Text() |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     h.body_width = 0  # generate plain text without wrap |  | ||||||
|  |  | ||||||
|     # html2text adds unnecessary star symbols. Remove them. |  | ||||||
|     # Mask star symbols |  | ||||||
|     msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') |  | ||||||
|     plain_text = h.handle(msg_with_checkpoints) |  | ||||||
|     # Remove created star symbols |  | ||||||
|     plain_text = plain_text.replace('*', '') |  | ||||||
|     # Unmask saved star symbols |  | ||||||
|     plain_text = plain_text.replace('3423oorkg432', '*') |  | ||||||
|  |  | ||||||
|     delimiter = get_delimiter(plain_text) |  | ||||||
|  |  | ||||||
|     plain_text = preprocess(plain_text, delimiter, content_type='text/html') |  | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
| @@ -382,7 +383,6 @@ def extract_from_html(msg_body): | |||||||
|     return_flags = [] |     return_flags = [] | ||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in xrange(first_deleted, last_deleted): |         for i in xrange(first_deleted, last_deleted): | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES | |||||||
|  |  | ||||||
| rc = re.compile | rc = re.compile | ||||||
|  |  | ||||||
| RE_EMAIL = rc('@') | RE_EMAIL = rc('\S@\S') | ||||||
| RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | ||||||
| RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | ||||||
|  |  | ||||||
| @@ -120,7 +120,7 @@ def contains_sender_names(sender): | |||||||
|     names = names or sender |     names = names or sender | ||||||
|     if names != '': |     if names != '': | ||||||
|         return binary_regex_search(re.compile(names)) |         return binary_regex_search(re.compile(names)) | ||||||
|     return lambda s: False |     return lambda s: 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_names(sender): | def extract_names(sender): | ||||||
| @@ -134,7 +134,7 @@ def extract_names(sender): | |||||||
|     >>> extract_names('') |     >>> extract_names('') | ||||||
|     [] |     [] | ||||||
|     """ |     """ | ||||||
|     sender = to_unicode(sender) |     sender = to_unicode(sender, precise=True) | ||||||
|     # Remove non-alphabetical characters |     # Remove non-alphabetical characters | ||||||
|     sender = "".join([char if char.isalpha() else ' ' for char in sender]) |     sender = "".join([char if char.isalpha() else ' ' for char in sender]) | ||||||
|     # Remove too short words and words from "black" list i.e. |     # Remove too short words and words from "black" list i.e. | ||||||
| @@ -161,7 +161,7 @@ def categories_percent(s, categories): | |||||||
|     50.0 |     50.0 | ||||||
|     ''' |     ''' | ||||||
|     count = 0 |     count = 0 | ||||||
|     s = to_unicode(s) |     s = to_unicode(s, precise=True) | ||||||
|     for c in s: |     for c in s: | ||||||
|         if unicodedata.category(c) in categories: |         if unicodedata.category(c) in categories: | ||||||
|             count += 1 |             count += 1 | ||||||
| @@ -181,7 +181,7 @@ def punctuation_percent(s): | |||||||
|  |  | ||||||
| def capitalized_words_percent(s): | def capitalized_words_percent(s): | ||||||
|     '''Returns capitalized words percent.''' |     '''Returns capitalized words percent.''' | ||||||
|     s = to_unicode(s) |     s = to_unicode(s, precise=True) | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|     capitalized_words_counter = 0 |     capitalized_words_counter = 0 | ||||||
|   | |||||||
							
								
								
									
										119
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										119
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -2,13 +2,16 @@ | |||||||
|  |  | ||||||
| import logging | import logging | ||||||
| from random import shuffle | from random import shuffle | ||||||
|  | import chardet | ||||||
|  | import cchardet | ||||||
|  | import regex as re | ||||||
|  |  | ||||||
|  | from lxml import html | ||||||
|  | from lxml.cssselect import CSSSelector | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  |  | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): | def safe_format(format_string, *args, **kwargs): | ||||||
|     """ |     """ | ||||||
|     Helper: formats string with any combination of bytestrings/unicode |     Helper: formats string with any combination of bytestrings/unicode | ||||||
| @@ -42,12 +45,42 @@ def to_unicode(str_or_unicode, precise=False): | |||||||
|         u'привет' |         u'привет' | ||||||
|     If `precise` flag is True, tries to guess the correct encoding first. |     If `precise` flag is True, tries to guess the correct encoding first. | ||||||
|     """ |     """ | ||||||
|     encoding = detect_encoding(str_or_unicode) if precise else 'utf-8' |     encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' | ||||||
|     if isinstance(str_or_unicode, str): |     if isinstance(str_or_unicode, str): | ||||||
|         return unicode(str_or_unicode, encoding, 'replace') |         return unicode(str_or_unicode, encoding, 'replace') | ||||||
|     return str_or_unicode |     return str_or_unicode | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def detect_encoding(string): | ||||||
|  |     """ | ||||||
|  |     Tries to detect the encoding of the passed string. | ||||||
|  |  | ||||||
|  |     Defaults to UTF-8. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         detected = chardet.detect(string) | ||||||
|  |         if detected: | ||||||
|  |             return detected.get('encoding') or 'utf-8' | ||||||
|  |     except Exception, e: | ||||||
|  |         pass | ||||||
|  |     return 'utf-8' | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def quick_detect_encoding(string): | ||||||
|  |     """ | ||||||
|  |     Tries to detect the encoding of the passed string. | ||||||
|  |  | ||||||
|  |     Uses cchardet. Fallbacks to detect_encoding. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         detected = cchardet.detect(string) | ||||||
|  |         if detected: | ||||||
|  |             return detected.get('encoding') or detect_encoding(string) | ||||||
|  |     except Exception, e: | ||||||
|  |         pass | ||||||
|  |     return detect_encoding(string) | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_utf8(str_or_unicode): | def to_utf8(str_or_unicode): | ||||||
|     """ |     """ | ||||||
|     Safely returns a UTF-8 version of a given string |     Safely returns a UTF-8 version of a given string | ||||||
| @@ -74,3 +107,81 @@ def get_delimiter(msg_body): | |||||||
|         delimiter = '\n' |         delimiter = '\n' | ||||||
|  |  | ||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(string): | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         >>> "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. returns utf-8 encoded str (not unicode) | ||||||
|  |     """ | ||||||
|  |     s = _prepend_utf8_declaration(string) | ||||||
|  |     s = s.replace("\n", "") | ||||||
|  |  | ||||||
|  |     tree = html.fromstring(s) | ||||||
|  |  | ||||||
|  |     for style in CSSSelector('style')(tree): | ||||||
|  |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|  |     for c in tree.xpath('//comment()'): | ||||||
|  |         c.getparent().remove(c) | ||||||
|  |  | ||||||
|  |     text   = "" | ||||||
|  |     for el in tree.iter(): | ||||||
|  |         el_text = (el.text or '') + (el.tail or '') | ||||||
|  |         if len(el_text) > 1: | ||||||
|  |             if el.tag in _BLOCKTAGS: | ||||||
|  |                 text += "\n" | ||||||
|  |             if el.tag == 'li': | ||||||
|  |                 text += "  * " | ||||||
|  |             text += el_text.strip() + " " | ||||||
|  |  | ||||||
|  |             # add href to the output | ||||||
|  |             href = el.attrib.get('href') | ||||||
|  |             if href: | ||||||
|  |                 text += "(%s) " % href | ||||||
|  |  | ||||||
|  |         if el.tag in _HARDBREAKS and text and not text.endswith("\n"): | ||||||
|  |             text += "\n" | ||||||
|  |  | ||||||
|  |     retval = _rm_excessive_newlines(text) | ||||||
|  |     return _encode_utf8(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _contains_charset_spec(s): | ||||||
|  |     """Return True if the first 4KB contain charset spec | ||||||
|  |     """ | ||||||
|  |     return s.lower().find('html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _prepend_utf8_declaration(s): | ||||||
|  |     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any | ||||||
|  |     """ | ||||||
|  |     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _rm_excessive_newlines(s): | ||||||
|  |     """Remove excessive newlines that often happen due to tons of divs | ||||||
|  |     """ | ||||||
|  |     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _encode_utf8(s): | ||||||
|  |     """Encode in 'utf-8' if unicode | ||||||
|  |     """ | ||||||
|  |     return s.encode('utf-8') if isinstance(s, unicode) else s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | ||||||
|  |                      'charset=utf-8">') | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
|  | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							| @@ -1,3 +1,4 @@ | |||||||
|  | <?xml version="1.0" encoding="UTF-8"?> | ||||||
| <html> | <html> | ||||||
| <head> | <head> | ||||||
| <style><!-- | <style><!-- | ||||||
|   | |||||||
							
								
								
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,87 @@ | |||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  | <meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp"> | ||||||
|  | <meta name="Generator" content="Microsoft Word 14 (filtered medium)"> | ||||||
|  | <style><!-- | ||||||
|  | /* Font Definitions */ | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Calibri; | ||||||
|  | 	panose-1:2 15 5 2 2 2 4 3 2 4;} | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Tahoma; | ||||||
|  | 	panose-1:2 11 6 4 3 5 4 4 2 4;} | ||||||
|  | /* Style Definitions */ | ||||||
|  | p.MsoNormal, li.MsoNormal, div.MsoNormal | ||||||
|  | 	{margin:0in; | ||||||
|  | 	margin-bottom:.0001pt; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | h3 | ||||||
|  | 	{mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3 Char"; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:13.5pt; | ||||||
|  | 	font-family:"Times New Roman","serif"; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | a:link, span.MsoHyperlink | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:blue; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | a:visited, span.MsoHyperlinkFollowed | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:purple; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | p | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | span.Heading3Char | ||||||
|  | 	{mso-style-name:"Heading 3 Char"; | ||||||
|  | 	mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3"; | ||||||
|  | 	font-family:"Cambria","serif"; | ||||||
|  | 	color:#4F81BD; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | span.EmailStyle19 | ||||||
|  | 	{mso-style-type:personal-reply; | ||||||
|  | 	font-family:"Calibri","sans-serif"; | ||||||
|  | 	color:#1F497D;} | ||||||
|  | .MsoChpDefault | ||||||
|  | 	{mso-style-type:export-only; | ||||||
|  | 	font-family:"Calibri","sans-serif";} | ||||||
|  | @page WordSection1 | ||||||
|  | 	{size:8.5in 11.0in; | ||||||
|  | 	margin:1.0in 1.0in 1.0in 1.0in;} | ||||||
|  | div.WordSection1 | ||||||
|  | 	{page:WordSection1;} | ||||||
|  | --></style><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapedefaults v:ext="edit" spidmax="1026" /> | ||||||
|  | </xml><![endif]--><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapelayout v:ext="edit"> | ||||||
|  | <o:idmap v:ext="edit" data="1" /> | ||||||
|  | </o:shapelayout></xml><![endif]--> | ||||||
|  | </head> | ||||||
|  | <body lang="EN-US" link="blue" vlink="purple"> | ||||||
|  | <div class="WordSection1"> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Hi. I am fine.<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Thanks,<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Alex<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif"">From:</span></b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif""> Foo [mailto:foo@bar.com] | ||||||
|  | <b>On Behalf Of </b>baz@bar.com<br> | ||||||
|  | <b>Sent:</b> Monday, January 01, 2000 12:00 AM<br> | ||||||
|  | <b>To:</b> john@bar.com<br> | ||||||
|  | <b>Cc:</b> jane@bar.io<br> | ||||||
|  | <b>Subject:</b> Conversation<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | <p>Hello! How are you?<o:p></o:p></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | </div> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
							
								
								
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | |||||||
|  | Content-Type: text/plain; | ||||||
|  | 	charset=us-ascii | ||||||
|  | Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\)) | ||||||
|  | Subject: Re: Hello there | ||||||
|  | X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4 | ||||||
|  | From: Adam Renberg <adam@tictail.com> | ||||||
|  | In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | Date: Sat, 22 Aug 2015 19:22:20 +0200 | ||||||
|  | Content-Transfer-Encoding: 7bit | ||||||
|  | X-Smtp-Server: smtp.gmail.com:adam@tictail.com | ||||||
|  | Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com> | ||||||
|  | References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | To: Adam Renberg <tgwizard@gmail.com> | ||||||
|  |  | ||||||
|  | Hello | ||||||
|  | > On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote: | ||||||
|  | > | ||||||
|  | > Hi there! | ||||||
|  |  | ||||||
| @@ -5,9 +5,7 @@ from . fixtures import * | |||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations, utils as u | ||||||
|  |  | ||||||
| import html2text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| @@ -45,7 +43,25 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><div></div></body></html>", |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_regular_blockquote(): | ||||||
|  |     msg_body = """Reply | ||||||
|  | <blockquote>Regular</blockquote> | ||||||
|  |  | ||||||
|  | <div> | ||||||
|  |   On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  | </div> | ||||||
|  |  | ||||||
|  | <blockquote> | ||||||
|  |   <div> | ||||||
|  |     <blockquote>Nested</blockquote> | ||||||
|  |   </div> | ||||||
|  | </blockquote> | ||||||
|  | """ | ||||||
|  |     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -115,6 +131,18 @@ def test_gmail_quote(): | |||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_blockquote(): | ||||||
|  |     msg_body = """Message | ||||||
|  | <blockquote class="gmail_quote"> | ||||||
|  |   <div class="gmail_default"> | ||||||
|  |     My name is William Shakespeare. | ||||||
|  |     <br/> | ||||||
|  |   </div> | ||||||
|  | </blockquote>""" | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_unicode_in_reply(): | def test_unicode_in_reply(): | ||||||
|     msg_body = u"""Reply \xa0 \xa0 Text<br> |     msg_body = u"""Reply \xa0 \xa0 Text<br> | ||||||
|  |  | ||||||
| @@ -122,7 +150,7 @@ def test_unicode_in_reply(): | |||||||
|   <br> |   <br> | ||||||
| </div> | </div> | ||||||
|  |  | ||||||
| <blockquote class="gmail_quote"> | <blockquote> | ||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""".encode("utf-8") | ||||||
|  |  | ||||||
| @@ -240,26 +268,15 @@ def test_reply_separated_by_hr(): | |||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|     msg_body = f.read().decode("utf-8") |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|  |     plain_reply = u.html_to_text(reply) | ||||||
|  |  | ||||||
|     h = html2text.HTML2Text() |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|     h.body_width = 0 |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
|     plain_reply = h.handle(reply) |  | ||||||
|  |  | ||||||
|     #remove &nbsp; spaces |  | ||||||
|     plain_reply = plain_reply.replace(u'\xa0', u' ') |  | ||||||
|  |  | ||||||
|     if RE_REPLY.match(plain_reply): |  | ||||||
|         eq_(1, 1) |  | ||||||
|     else: |  | ||||||
|         eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_reply(): | def test_gmail_reply(): | ||||||
| @@ -282,6 +299,10 @@ def test_ms_outlook_2007_reply(): | |||||||
|     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_ms_outlook_2010_reply(): | ||||||
|  |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html") | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_thunderbird_reply(): | def test_thunderbird_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") |     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") | ||||||
|  |  | ||||||
| @@ -292,3 +313,30 @@ def test_windows_mail_reply(): | |||||||
|  |  | ||||||
| def test_yandex_ru_reply(): | def test_yandex_ru_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") |     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_CRLF(): | ||||||
|  |     """CR is not converted to '&#13;' | ||||||
|  |     """ | ||||||
|  |     symbol = '&#13;' | ||||||
|  |     extracted = quotations.extract_from_html('<html>\r\n</html>') | ||||||
|  |     assert_false(symbol in extracted) | ||||||
|  |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |     msg_body = """Reply | ||||||
|  | <blockquote> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     assert_false(symbol in extracted)     | ||||||
|  |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', extracted)) | ||||||
|   | |||||||
| @@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_apply_features(): | def test_apply_features(): | ||||||
|     s = '''John Doe |     s = '''This is John Doe | ||||||
|  |  | ||||||
|  | Tuesday @3pm suits. I'll chat to you then. | ||||||
|  |  | ||||||
| VP Research and Development, Xxxx Xxxx Xxxxx | VP Research and Development, Xxxx Xxxx Xxxxx | ||||||
|  |  | ||||||
| @@ -19,11 +21,12 @@ john@example.com''' | |||||||
|     # note that we don't consider the first line because signatures don't |     # note that we don't consider the first line because signatures don't | ||||||
|     # usually take all the text, empty lines are not considered |     # usually take all the text, empty lines are not considered | ||||||
|     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], |     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], | ||||||
|  |                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], |                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], |                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) |                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) | ||||||
|  |  | ||||||
|     with patch.object(fs, 'SIGNATURE_MAX_LINES', 4): |     with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): | ||||||
|         features = fs.features(sender) |         features = fs.features(sender) | ||||||
|         new_result = fs.apply_features(s, features) |         new_result = fs.apply_features(s, features) | ||||||
|         # result remains the same because we don't consider empty lines |         # result remains the same because we don't consider empty lines | ||||||
|   | |||||||
| @@ -12,11 +12,11 @@ from talon import quotations | |||||||
| @patch.object(quotations, 'MAX_LINES_COUNT', 1) | @patch.object(quotations, 'MAX_LINES_COUNT', 1) | ||||||
| def test_too_many_lines(): | def test_too_many_lines(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  | Hi | ||||||
| -----Original Message----- | -----Original Message----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_(msg_body, quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote(): | def test_pattern_on_date_somebody_wrote(): | ||||||
| @@ -54,6 +54,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_date_time_email_splitter(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | 2014-10-17 11:28 GMT+03:00 Postmaster < | ||||||
|  | postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>: | ||||||
|  |  | ||||||
|  | > First from site | ||||||
|  | > | ||||||
|  |     """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | ||||||
|     msg_body = """Thanks Thanmai |     msg_body = """Thanks Thanmai | ||||||
|  On Mar 8, 2012 9:59 AM, "Example.com" < |  On Mar 8, 2012 9:59 AM, "Example.com" < | ||||||
| @@ -311,6 +323,33 @@ Emne: The manager has commented on your Loop | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_block(): | ||||||
|  |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|  |     u"""Allo! Follow up MIME! | ||||||
|  | Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com] | ||||||
|  | Skickat: den 26 augusti 2015 14:45 | ||||||
|  | Till: Isacson Leiff | ||||||
|  | Ämne: RE: Week 36 | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     """Lorem | ||||||
|  | Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_norwegian_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem | ||||||
|  | På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
| def test_dutch_from_block(): | def test_dutch_from_block(): | ||||||
|     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( |     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( | ||||||
|     """Gluten-free culpa lo-fi et nesciunt nostrud.  |     """Gluten-free culpa lo-fi et nesciunt nostrud.  | ||||||
|   | |||||||
| @@ -1,9 +1,107 @@ | |||||||
|  | # coding:utf-8 | ||||||
|  |  | ||||||
| from . import * | from . import * | ||||||
|  |  | ||||||
| from talon import utils | from talon import utils as u | ||||||
|  | import cchardet | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
|     eq_('\r\n', utils.get_delimiter('abc\r\n123')) |     eq_('\r\n', u.get_delimiter('abc\r\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc\n123')) |     eq_('\n', u.get_delimiter('abc\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc')) |     eq_('\n', u.get_delimiter('abc')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_unicode(): | ||||||
|  |     eq_ (u'hi', u.to_unicode('hi')) | ||||||
|  |     eq_ (type(u.to_unicode('hi')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode(u'hi')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode('привет')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode(u'привет')), unicode ) | ||||||
|  |     eq_ (u"привет", u.to_unicode('привет')) | ||||||
|  |     eq_ (u"привет", u.to_unicode(u'привет')) | ||||||
|  |     # some latin1 stuff | ||||||
|  |     eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_detect_encoding(): | ||||||
|  |     eq_ ('ascii', u.detect_encoding('qwe').lower()) | ||||||
|  |     eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower()) | ||||||
|  |     eq_ ('utf-8', u.detect_encoding('привет').lower()) | ||||||
|  |     # fallback to utf-8 | ||||||
|  |     with patch.object(u.chardet, 'detect') as detect: | ||||||
|  |         detect.side_effect = Exception | ||||||
|  |         eq_ ('utf-8', u.detect_encoding('qwe').lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_quick_detect_encoding(): | ||||||
|  |     eq_ ('ascii', u.quick_detect_encoding('qwe').lower()) | ||||||
|  |     eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower()) | ||||||
|  |     eq_ ('utf-8', u.quick_detect_encoding('привет').lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(cchardet, 'detect') | ||||||
|  | @patch.object(u, 'detect_encoding') | ||||||
|  | def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): | ||||||
|  |     cchardet_detect.return_value = {'encoding': 'ascii'} | ||||||
|  |     eq_('ascii', u.quick_detect_encoding("qwe")) | ||||||
|  |     cchardet_detect.assert_called_once_with("qwe") | ||||||
|  |  | ||||||
|  |     # fallback to detect_encoding | ||||||
|  |     cchardet_detect.return_value = {} | ||||||
|  |     detect_encoding.return_value = 'utf-8' | ||||||
|  |     eq_('utf-8', u.quick_detect_encoding("qwe")) | ||||||
|  |  | ||||||
|  |     # exception | ||||||
|  |     detect_encoding.reset_mock() | ||||||
|  |     cchardet_detect.side_effect = Exception() | ||||||
|  |     detect_encoding.return_value = 'utf-8' | ||||||
|  |     eq_('utf-8', u.quick_detect_encoding("qwe")) | ||||||
|  |     ok_(detect_encoding.called) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_html_to_text(): | ||||||
|  |     html = """<body> | ||||||
|  | <p>Hello world!</p> | ||||||
|  | <br> | ||||||
|  | <ul> | ||||||
|  | <li>One!</li> | ||||||
|  | <li>Two</li> | ||||||
|  | </ul> | ||||||
|  | <p> | ||||||
|  | Haha | ||||||
|  | </p> | ||||||
|  | </body>""" | ||||||
|  |     text = u.html_to_text(html) | ||||||
|  |     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|  |     eq_("привет!", u.html_to_text("<b>привет!</b>")) | ||||||
|  |  | ||||||
|  |     html = '<body><br/><br/>Hi</body>' | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """Hi | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | div, p, li { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style> | ||||||
|  |  | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | h1 { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style>""" | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """<div> | ||||||
|  | <!-- COMMENT 1 --> | ||||||
|  | <span>TEXT 1</span> | ||||||
|  | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
|  | </div>""" | ||||||
|  |     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user