Compare commits
	
		
			2 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 16354e3528 | ||
|  | 1018e88ec1 | 
							
								
								
									
										4
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										4
									
								
								setup.py
									
									
									
									
									
								
							| @@ -29,7 +29,7 @@ class InstallCommand(install): | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.7', |       version='1.4.8', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -48,7 +48,7 @@ setup(name='talon', | |||||||
|           "regex>=1", |           "regex>=1", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||||
|           'chardet>=1.0.1', |           'chardet>=1.0.1', | ||||||
|           'cchardet>=0.3.5', |           'cchardet>=0.3.5', | ||||||
|           'cssselect', |           'cssselect', | ||||||
|   | |||||||
| @@ -516,9 +516,69 @@ def _extract_from_html(msg_body): | |||||||
|     if _readable_text_empty(html_tree_copy): |     if _readable_text_empty(html_tree_copy): | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|  |     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||||
|  |     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||||
|  |     # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes | ||||||
|  |     # <oU0003Ap>. When we port this to golang we should look into using an | ||||||
|  |     # XML Parser, NOT an HTML5 Parser, since we do not know what input a | ||||||
|  |     # customer will send us. Switching to a common XML parser in python | ||||||
|  |     # opens us up to a host of vulnerabilities. | ||||||
|  |     # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities | ||||||
|  |     # | ||||||
|  |     # The downside to removing the namespaces is that customers might | ||||||
|  |     # judge the XML namespaces important. If that is the case then support | ||||||
|  |     # should encourage customers to perform XML parsing of the un-stripped | ||||||
|  |     # body to get the full unmodified XML payload. | ||||||
|  |     # | ||||||
|  |     # Alternatives to this approach are | ||||||
|  |     # 1. Ignore the U0003A in tag names and let the customer deal with it. | ||||||
|  |     #    This is not ideal, as most customers use stripped-html for viewing | ||||||
|  |     #    emails sent from a recipient, as such they cannot control the HTML | ||||||
|  |     #    provided by a recipient. | ||||||
|  |     # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML | ||||||
|  |     #    string. While this would solve the issue simply, it runs the risk | ||||||
|  |     #    of replacing data outside the <tag> which might be essential to | ||||||
|  |     #    the customer. | ||||||
|  |     remove_namespaces(html_tree_copy) | ||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def remove_namespaces(root): | ||||||
|  |     """ | ||||||
|  |     Given the root of an HTML document iterate through all the elements | ||||||
|  |     and remove any namespaces that might have been provided and remove | ||||||
|  |     any attributes that contain a namespace | ||||||
|  |  | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office"> | ||||||
|  |     becomes | ||||||
|  |     <html> | ||||||
|  |  | ||||||
|  |     <o:p>Hi</o:p> | ||||||
|  |     becomes | ||||||
|  |     <p>Hi</p> | ||||||
|  |  | ||||||
|  |     Start tags do NOT have a namespace; COLON characters have no special meaning. | ||||||
|  |     If we don't remove the namespace, the parser translates the tag name into a | ||||||
|  |     unicode representation. For example <o:p> becomes <oU0003Ap> | ||||||
|  |  | ||||||
|  |     See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     """ | ||||||
|  |     for child in root.iter(): | ||||||
|  |         for key, value in child.attrib.items(): | ||||||
|  |             # If the attribute includes a colon | ||||||
|  |             if key.rfind("U0003A") != -1: | ||||||
|  |                 child.attrib.pop(key) | ||||||
|  |  | ||||||
|  |         # If the tag includes a colon | ||||||
|  |         idx = child.tag.rfind("U0003A") | ||||||
|  |         if idx != -1: | ||||||
|  |             child.tag = child.tag[idx+6:] | ||||||
|  |  | ||||||
|  |     return root | ||||||
|  |  | ||||||
|  |  | ||||||
| def split_emails(msg): | def split_emails(msg): | ||||||
|     """ |     """ | ||||||
|     Given a message (which may consist of an email conversation thread with |     Given a message (which may consist of an email conversation thread with | ||||||
|   | |||||||
| @@ -8,6 +8,7 @@ import re | |||||||
| from talon import quotations, utils as u | from talon import quotations, utils as u | ||||||
| from . import * | from . import * | ||||||
| from .fixtures import * | from .fixtures import * | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") | RE_DOUBLE_WHITESPACE = re.compile("\s") | ||||||
| @@ -424,3 +425,23 @@ def test_readable_html_empty(): | |||||||
| def test_bad_html(): | def test_bad_html(): | ||||||
|     bad_html = "<html></html>" |     bad_html = "<html></html>" | ||||||
|     eq_(bad_html, quotations.extract_from_html(bad_html)) |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_remove_namespaces(): | ||||||
|  |     msg_body = """ | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> | ||||||
|  |         <body> | ||||||
|  |             <o:p>Dear Sir,</o:p> | ||||||
|  |             <o:p>Thank you for the email.</o:p> | ||||||
|  |             <blockquote>thing</blockquote> | ||||||
|  |         </body> | ||||||
|  |     </html> | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     rendered = quotations.extract_from_html(msg_body) | ||||||
|  |  | ||||||
|  |     assert_true("<p>" in rendered) | ||||||
|  |     assert_true("xmlns" in rendered) | ||||||
|  |  | ||||||
|  |     assert_true("<o:p>" not in rendered) | ||||||
|  |     assert_true("<xmlns:o>" not in rendered) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user