Compare commits
	
		
			6 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | f6940fe878 | ||
|  | ce65ff8fc8 | ||
|  | eed6784f25 | ||
|  | 3d9ae356ea | ||
|  | f688d074b5 | ||
|  | 41457d8fbd | 
							
								
								
									
										3
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								setup.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from setuptools import setup, find_packages | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.0.9', |       version='1.2.1', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -16,7 +16,6 @@ setup(name='talon', | |||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml>=2.3.3", |           "lxml>=2.3.3", | ||||||
|           "regex>=1", |           "regex>=1", | ||||||
|           "html2text", |  | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||||
|   | |||||||
| @@ -76,7 +76,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('.gmail_quote') |     gmail_quote = html_message.cssselect('div.gmail_quote') | ||||||
|     if gmail_quote: |     if gmail_quote: | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
| @@ -139,7 +139,11 @@ def cut_by_id(html_message): | |||||||
|  |  | ||||||
| def cut_blockquote(html_message): | def cut_blockquote(html_message): | ||||||
|     ''' Cuts the last non-nested blockquote with wrapping elements.''' |     ''' Cuts the last non-nested blockquote with wrapping elements.''' | ||||||
|     quote = html_message.xpath('(.//blockquote)[not(ancestor::blockquote)][last()]') |     quote = html_message.xpath( | ||||||
|  |         '(.//blockquote)' | ||||||
|  |         '[not(@class="gmail_quote") and not(ancestor::blockquote)]' | ||||||
|  |         '[last()]') | ||||||
|  |  | ||||||
|     if quote: |     if quote: | ||||||
|         quote = quote[0] |         quote = quote[0] | ||||||
|         quote.getparent().remove(quote) |         quote.getparent().remove(quote) | ||||||
| @@ -155,13 +159,25 @@ def cut_from_block(html_message): | |||||||
|  |  | ||||||
|     if block: |     if block: | ||||||
|         block = block[-1] |         block = block[-1] | ||||||
|  |         parent_div = None | ||||||
|         while block.getparent() is not None: |         while block.getparent() is not None: | ||||||
|             if block.tag == 'div': |             if block.tag == 'div': | ||||||
|  |                 parent_div = block | ||||||
|  |                 break | ||||||
|  |             block = block.getparent() | ||||||
|  |         if parent_div is not None: | ||||||
|  |             maybe_body = parent_div.getparent() | ||||||
|  |             # In cases where removing this enclosing div will remove all | ||||||
|  |             # content, we should assume the quote is not enclosed in a tag. | ||||||
|  |             parent_div_is_all_content = ( | ||||||
|  |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|  |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 block.getparent().remove(block) | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|                 block = block.getparent() |             return False | ||||||
|     else: |  | ||||||
|     # handle the case when From: block goes right after e.g. <hr> |     # handle the case when From: block goes right after e.g. <hr> | ||||||
|     # and not enclosed in some tag |     # and not enclosed in some tag | ||||||
|     block = html_message.xpath( |     block = html_message.xpath( | ||||||
|   | |||||||
| @@ -10,9 +10,8 @@ import logging | |||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | from lxml import html, etree | ||||||
| import html2text |  | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter | from talon.utils import get_delimiter, html_to_text | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -108,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile( | |||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
|             s |             (?:se*)+ | ||||||
|             | |             | | ||||||
|             (?:me*){2,} |             (?:me*){2,} | ||||||
|         ) |         ) | ||||||
| @@ -139,7 +138,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? . | |||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     # <date> <person> |     # <date> <person> | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
|     RE_ON_DATE_WROTE_SMB, |     RE_ON_DATE_WROTE_SMB, | ||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
| @@ -321,7 +320,7 @@ def extract_from_plain(msg_body): | |||||||
|     return msg_body |     return msg_body | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_from_html(s): | def extract_from_html(msg_body): | ||||||
|     """ |     """ | ||||||
|     Extract not quoted message from provided html message body |     Extract not quoted message from provided html message body | ||||||
|     using tags and plain text algorithm. |     using tags and plain text algorithm. | ||||||
| @@ -337,49 +336,32 @@ def extract_from_html(s): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|  |     if msg_body.strip() == '': | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     if s.strip() == '': |     msg_body = msg_body.replace('\r\n', '').replace('\n', '') | ||||||
|         return s |  | ||||||
|  |  | ||||||
|     # replace CRLF with LF temporaraly otherwise CR will be converted to '
' |  | ||||||
|     # when doing deepcopy on html tree |  | ||||||
|     msg_body, replaced = _CRLF_to_LF(s) |  | ||||||
|  |  | ||||||
|     html_tree = html.document_fromstring( |     html_tree = html.document_fromstring( | ||||||
|         msg_body, |         msg_body, | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |         parser=html.HTMLParser(encoding="utf-8") | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|                       html_quotations.cut_from_block(html_tree) |                       html_quotations.cut_from_block(html_tree) | ||||||
|                       ) |                       ) | ||||||
|  |  | ||||||
|     html_tree_copy = deepcopy(html_tree) |     html_tree_copy = deepcopy(html_tree) | ||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False] * number_of_checkpoints |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     msg_with_checkpoints = html.tostring(html_tree) | ||||||
|  |     plain_text = html_to_text(msg_with_checkpoints) | ||||||
|     h = html2text.HTML2Text() |  | ||||||
|     h.body_width = 0  # generate plain text without wrap |  | ||||||
|  |  | ||||||
|     # html2text adds unnecessary star symbols. Remove them. |  | ||||||
|     # Mask star symbols |  | ||||||
|     msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') |  | ||||||
|     plain_text = h.handle(msg_with_checkpoints) |  | ||||||
|     # Remove created star symbols |  | ||||||
|     plain_text = plain_text.replace('*', '') |  | ||||||
|     # Unmask saved star symbols |  | ||||||
|     plain_text = plain_text.replace('3423oorkg432', '*') |  | ||||||
|     plain_text = preprocess(plain_text, '\n', content_type='text/html') |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     if len(lines) > MAX_LINES_COUNT: | ||||||
|         return s |         return msg_body | ||||||
|  |  | ||||||
|     # Collect checkpoints on each line |     # Collect checkpoints on each line | ||||||
|     line_checkpoints = [ |     line_checkpoints = [ | ||||||
| @@ -396,7 +378,6 @@ def extract_from_html(s): | |||||||
|     return_flags = [] |     return_flags = [] | ||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in xrange(first_deleted, last_deleted): |         for i in xrange(first_deleted, last_deleted): | ||||||
| @@ -404,9 +385,9 @@ def extract_from_html(s): | |||||||
|                 quotation_checkpoints[checkpoint] = True |                 quotation_checkpoints[checkpoint] = True | ||||||
|     else: |     else: | ||||||
|         if cut_quotations: |         if cut_quotations: | ||||||
|             return _restore_CRLF(html.tostring(html_tree_copy), replaced) |             return html.tostring(html_tree_copy) | ||||||
|         else: |         else: | ||||||
|             return s |             return msg_body | ||||||
|  |  | ||||||
|     # Remove tags with quotation checkpoints |     # Remove tags with quotation checkpoints | ||||||
|     html_quotations.delete_quotation_tags( |     html_quotations.delete_quotation_tags( | ||||||
| @@ -442,37 +423,3 @@ def register_xpath_extensions(): | |||||||
|     ns.prefix = 'mg' |     ns.prefix = 'mg' | ||||||
|     ns['text_content'] = text_content |     ns['text_content'] = text_content | ||||||
|     ns['tail'] = tail |     ns['tail'] = tail | ||||||
|  |  | ||||||
|  |  | ||||||
| def _restore_CRLF(s, replaced=True): |  | ||||||
|     """Restore CRLF if previously CRLF was replaced with LF |  | ||||||
|  |  | ||||||
|     >>> _restore_CRLF('a\nb') |  | ||||||
|     'a\r\nb' |  | ||||||
|     >>> _restore_CRLF('a\nb', replaced=False) |  | ||||||
|     'a\nb' |  | ||||||
|     """ |  | ||||||
|     if replaced: |  | ||||||
|         return s.replace('\n', '\r\n') |  | ||||||
|     return s |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _CRLF_to_LF(s): |  | ||||||
|     """Replace CRLF with LF |  | ||||||
|  |  | ||||||
|     >>> s, changed = _CRLF_to_LF('a\r\n'b) |  | ||||||
|     >>> s |  | ||||||
|     'a\nb' |  | ||||||
|     >>> changed |  | ||||||
|     True |  | ||||||
|  |  | ||||||
|     >>> s, changed = _CRLF_to_LF('a\n'b) |  | ||||||
|     >>> s |  | ||||||
|     'a\nb' |  | ||||||
|     >>> changed |  | ||||||
|     False |  | ||||||
|     """ |  | ||||||
|     delimiter = get_delimiter(s) |  | ||||||
|     if delimiter == '\r\n': |  | ||||||
|         return s.replace(delimiter, '\n'), True |  | ||||||
|     return s, False |  | ||||||
|   | |||||||
| @@ -4,6 +4,10 @@ import logging | |||||||
| from random import shuffle | from random import shuffle | ||||||
| import chardet | import chardet | ||||||
| import cchardet | import cchardet | ||||||
|  | import regex as re | ||||||
|  |  | ||||||
|  | from lxml import html | ||||||
|  | from lxml.cssselect import CSSSelector | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  |  | ||||||
| @@ -58,7 +62,6 @@ def detect_encoding(string): | |||||||
|         if detected: |         if detected: | ||||||
|             return detected.get('encoding') or 'utf-8' |             return detected.get('encoding') or 'utf-8' | ||||||
|     except Exception, e: |     except Exception, e: | ||||||
|         print 11111111111, e |  | ||||||
|         pass |         pass | ||||||
|     return 'utf-8' |     return 'utf-8' | ||||||
|  |  | ||||||
| @@ -74,7 +77,6 @@ def quick_detect_encoding(string): | |||||||
|         if detected: |         if detected: | ||||||
|             return detected.get('encoding') or detect_encoding(string) |             return detected.get('encoding') or detect_encoding(string) | ||||||
|     except Exception, e: |     except Exception, e: | ||||||
|         print 222222222222, e |  | ||||||
|         pass |         pass | ||||||
|     return detect_encoding(string) |     return detect_encoding(string) | ||||||
|  |  | ||||||
| @@ -105,3 +107,81 @@ def get_delimiter(msg_body): | |||||||
|         delimiter = '\n' |         delimiter = '\n' | ||||||
|  |  | ||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(string): | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         >>> "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. returns utf-8 encoded str (not unicode) | ||||||
|  |     """ | ||||||
|  |     s = _prepend_utf8_declaration(string) | ||||||
|  |     s = s.replace("\n", "") | ||||||
|  |  | ||||||
|  |     tree = html.fromstring(s) | ||||||
|  |  | ||||||
|  |     for style in CSSSelector('style')(tree): | ||||||
|  |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|  |     for c in tree.xpath('//comment()'): | ||||||
|  |         c.getparent().remove(c) | ||||||
|  |  | ||||||
|  |     text   = "" | ||||||
|  |     for el in tree.iter(): | ||||||
|  |         el_text = (el.text or '') + (el.tail or '') | ||||||
|  |         if len(el_text) > 1: | ||||||
|  |             if el.tag in _BLOCKTAGS: | ||||||
|  |                 text += "\n" | ||||||
|  |             if el.tag == 'li': | ||||||
|  |                 text += "  * " | ||||||
|  |             text += el_text.strip() + " " | ||||||
|  |  | ||||||
|  |             # add href to the output | ||||||
|  |             href = el.attrib.get('href') | ||||||
|  |             if href: | ||||||
|  |                 text += "(%s) " % href | ||||||
|  |  | ||||||
|  |         if el.tag in _HARDBREAKS and text and not text.endswith("\n"): | ||||||
|  |             text += "\n" | ||||||
|  |  | ||||||
|  |     retval = _rm_excessive_newlines(text) | ||||||
|  |     return _encode_utf8(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _contains_charset_spec(s): | ||||||
|  |     """Return True if the first 4KB contain charset spec | ||||||
|  |     """ | ||||||
|  |     return s.lower().find('html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _prepend_utf8_declaration(s): | ||||||
|  |     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any | ||||||
|  |     """ | ||||||
|  |     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _rm_excessive_newlines(s): | ||||||
|  |     """Remove excessive newlines that often happen due to tons of divs | ||||||
|  |     """ | ||||||
|  |     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _encode_utf8(s): | ||||||
|  |     """Encode in 'utf-8' if unicode | ||||||
|  |     """ | ||||||
|  |     return s.encode('utf-8') if isinstance(s, unicode) else s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | ||||||
|  |                      'charset=utf-8">') | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
|  | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|   | |||||||
							
								
								
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,87 @@ | |||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  | <meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp"> | ||||||
|  | <meta name="Generator" content="Microsoft Word 14 (filtered medium)"> | ||||||
|  | <style><!-- | ||||||
|  | /* Font Definitions */ | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Calibri; | ||||||
|  | 	panose-1:2 15 5 2 2 2 4 3 2 4;} | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Tahoma; | ||||||
|  | 	panose-1:2 11 6 4 3 5 4 4 2 4;} | ||||||
|  | /* Style Definitions */ | ||||||
|  | p.MsoNormal, li.MsoNormal, div.MsoNormal | ||||||
|  | 	{margin:0in; | ||||||
|  | 	margin-bottom:.0001pt; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | h3 | ||||||
|  | 	{mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3 Char"; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:13.5pt; | ||||||
|  | 	font-family:"Times New Roman","serif"; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | a:link, span.MsoHyperlink | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:blue; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | a:visited, span.MsoHyperlinkFollowed | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:purple; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | p | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | span.Heading3Char | ||||||
|  | 	{mso-style-name:"Heading 3 Char"; | ||||||
|  | 	mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3"; | ||||||
|  | 	font-family:"Cambria","serif"; | ||||||
|  | 	color:#4F81BD; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | span.EmailStyle19 | ||||||
|  | 	{mso-style-type:personal-reply; | ||||||
|  | 	font-family:"Calibri","sans-serif"; | ||||||
|  | 	color:#1F497D;} | ||||||
|  | .MsoChpDefault | ||||||
|  | 	{mso-style-type:export-only; | ||||||
|  | 	font-family:"Calibri","sans-serif";} | ||||||
|  | @page WordSection1 | ||||||
|  | 	{size:8.5in 11.0in; | ||||||
|  | 	margin:1.0in 1.0in 1.0in 1.0in;} | ||||||
|  | div.WordSection1 | ||||||
|  | 	{page:WordSection1;} | ||||||
|  | --></style><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapedefaults v:ext="edit" spidmax="1026" /> | ||||||
|  | </xml><![endif]--><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapelayout v:ext="edit"> | ||||||
|  | <o:idmap v:ext="edit" data="1" /> | ||||||
|  | </o:shapelayout></xml><![endif]--> | ||||||
|  | </head> | ||||||
|  | <body lang="EN-US" link="blue" vlink="purple"> | ||||||
|  | <div class="WordSection1"> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Hi. I am fine.<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Thanks,<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Alex<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif"">From:</span></b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif""> Foo [mailto:foo@bar.com] | ||||||
|  | <b>On Behalf Of </b>baz@bar.com<br> | ||||||
|  | <b>Sent:</b> Monday, January 01, 2000 12:00 AM<br> | ||||||
|  | <b>To:</b> john@bar.com<br> | ||||||
|  | <b>Cc:</b> jane@bar.io<br> | ||||||
|  | <b>Subject:</b> Conversation<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | <p>Hello! How are you?<o:p></o:p></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | </div> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
| @@ -5,9 +5,7 @@ from . fixtures import * | |||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations, utils as u | ||||||
|  |  | ||||||
| import html2text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| @@ -28,8 +26,8 @@ def test_quotation_splitter_inside_blockquote(): | |||||||
|  |  | ||||||
| </blockquote>""" | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply\n</p></body></html>", |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|         quotations.extract_from_html(msg_body)) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quotation_splitter_outside_blockquote(): | def test_quotation_splitter_outside_blockquote(): | ||||||
| @@ -45,7 +43,7 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><div></div></body></html>", |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -63,7 +61,7 @@ def test_regular_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote><div></div></body></html>", |     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -133,6 +131,18 @@ def test_gmail_quote(): | |||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_blockquote(): | ||||||
|  |     msg_body = """Message | ||||||
|  | <blockquote class="gmail_quote"> | ||||||
|  |   <div class="gmail_default"> | ||||||
|  |     My name is William Shakespeare. | ||||||
|  |     <br/> | ||||||
|  |   </div> | ||||||
|  | </blockquote>""" | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_unicode_in_reply(): | def test_unicode_in_reply(): | ||||||
|     msg_body = u"""Reply \xa0 \xa0 Text<br> |     msg_body = u"""Reply \xa0 \xa0 Text<br> | ||||||
|  |  | ||||||
| @@ -140,7 +150,7 @@ def test_unicode_in_reply(): | |||||||
|   <br> |   <br> | ||||||
| </div> | </div> | ||||||
|  |  | ||||||
| <blockquote class="gmail_quote"> | <blockquote> | ||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""".encode("utf-8") | ||||||
|  |  | ||||||
| @@ -258,26 +268,15 @@ def test_reply_separated_by_hr(): | |||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|     msg_body = f.read() |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|  |     plain_reply = u.html_to_text(reply) | ||||||
|  |  | ||||||
|     h = html2text.HTML2Text() |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|     h.body_width = 0 |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
|     plain_reply = h.handle(reply) |  | ||||||
|  |  | ||||||
|     #remove   spaces |  | ||||||
|     plain_reply = plain_reply.replace(u'\xa0', u' ') |  | ||||||
|  |  | ||||||
|     if RE_REPLY.match(plain_reply): |  | ||||||
|         eq_(1, 1) |  | ||||||
|     else: |  | ||||||
|         eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_reply(): | def test_gmail_reply(): | ||||||
| @@ -300,6 +299,10 @@ def test_ms_outlook_2007_reply(): | |||||||
|     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_ms_outlook_2010_reply(): | ||||||
|  |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html") | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_thunderbird_reply(): | def test_thunderbird_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") |     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") | ||||||
|  |  | ||||||
| @@ -315,7 +318,10 @@ def test_yandex_ru_reply(): | |||||||
| def test_CRLF(): | def test_CRLF(): | ||||||
|     """CR is not converted to '
' |     """CR is not converted to '
' | ||||||
|     """ |     """ | ||||||
|     eq_('<html>\r\n</html>', quotations.extract_from_html('<html>\r\n</html>')) |     symbol = '
' | ||||||
|  |     extracted = quotations.extract_from_html('<html>\r\n</html>') | ||||||
|  |     assert_false(symbol in extracted) | ||||||
|  |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|     msg_body = """Reply |     msg_body = """Reply | ||||||
| <blockquote> | <blockquote> | ||||||
| @@ -330,5 +336,7 @@ def test_CRLF(): | |||||||
|  |  | ||||||
| </blockquote>""" | </blockquote>""" | ||||||
|     msg_body = msg_body.replace('\n', '\r\n') |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|     eq_("<html><body><p>Reply\r\n</p></body></html>", |     extracted = quotations.extract_from_html(msg_body) | ||||||
|         quotations.extract_from_html(msg_body)) |     assert_false(symbol in extracted)     | ||||||
|  |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', extracted)) | ||||||
|   | |||||||
| @@ -29,15 +29,3 @@ def test_crash_inside_extract_from(): | |||||||
|  |  | ||||||
| def test_empty_body(): | def test_empty_body(): | ||||||
|     eq_('', quotations.extract_from_plain('')) |     eq_('', quotations.extract_from_plain('')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test__CRLF_to_LF(): |  | ||||||
|     eq_(('\n\r', True), quotations._CRLF_to_LF('\r\n\r')) |  | ||||||
|     eq_(('\n\r', False), quotations._CRLF_to_LF('\n\r')) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test__restore_CRLF(): |  | ||||||
|     eq_('\n', quotations._restore_CRLF('\n', replaced=False)) |  | ||||||
|     eq_('\r\n', quotations._restore_CRLF('\n', replaced=True))     |  | ||||||
|     # default |  | ||||||
|     eq_('\r\n', quotations._restore_CRLF('\n')) |  | ||||||
|   | |||||||
| @@ -58,3 +58,50 @@ def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): | |||||||
|     detect_encoding.return_value = 'utf-8' |     detect_encoding.return_value = 'utf-8' | ||||||
|     eq_('utf-8', u.quick_detect_encoding("qwe")) |     eq_('utf-8', u.quick_detect_encoding("qwe")) | ||||||
|     ok_(detect_encoding.called) |     ok_(detect_encoding.called) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_html_to_text(): | ||||||
|  |     html = """<body> | ||||||
|  | <p>Hello world!</p> | ||||||
|  | <br> | ||||||
|  | <ul> | ||||||
|  | <li>One!</li> | ||||||
|  | <li>Two</li> | ||||||
|  | </ul> | ||||||
|  | <p> | ||||||
|  | Haha | ||||||
|  | </p> | ||||||
|  | </body>""" | ||||||
|  |     text = u.html_to_text(html) | ||||||
|  |     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|  |     eq_("привет!", u.html_to_text("<b>привет!</b>")) | ||||||
|  |  | ||||||
|  |     html = '<body><br/><br/>Hi</body>' | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """Hi | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | div, p, li { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style> | ||||||
|  |  | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | h1 { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style>""" | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """<div> | ||||||
|  | <!-- COMMENT 1 --> | ||||||
|  | <span>TEXT 1</span> | ||||||
|  | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
|  | </div>""" | ||||||
|  |     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user