Merge pull request #191 from mailgun/thrawn/develop

PIP-423: Now removing namespaces from parsed HTML
Now removing namespaces from parsed HTML
2019-05-12 11:54:17 +03:00 · 2019-05-10 11:16:12 -05:00
3 changed files with 83 additions and 2 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.4.7',
+      version='1.4.8',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
          "regex>=1",
          "numpy",
          "scipy",
-          "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
+          "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
          'chardet>=1.0.1',
          'cchardet>=0.3.5',
          'cssselect',
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -516,9 +516,69 @@ def _extract_from_html(msg_body):
    if _readable_text_empty(html_tree_copy):
        return msg_body
    # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
    # parsers do not recognize namespaces in HTML tags. As such the rendered
    # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes
    # <oU0003Ap>. When we port this to golang we should look into using an
    # XML Parser, NOT an HTML5 Parser, since we do not know what input a
    # customer will send us. Switching to a common XML parser in python
    # opens us up to a host of vulnerabilities.
    # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities
    #
    # The downside to removing the namespaces is that customers might
    # judge the XML namespaces important. If that is the case then support
    # should encourage customers to perform XML parsing of the un-stripped
    # body to get the full unmodified XML payload.
    #
    # Alternatives to this approach are
    # 1. Ignore the U0003A in tag names and let the customer deal with it.
    #    This is not ideal, as most customers use stripped-html for viewing
    #    emails sent from a recipient, as such they cannot control the HTML
    #    provided by a recipient.
    # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML
    #    string. While this would solve the issue simply, it runs the risk
    #    of replacing data outside the <tag> which might be essential to
    #    the customer.
    remove_namespaces(html_tree_copy)
    return html.tostring(html_tree_copy)
 def remove_namespaces(root):
    """
    Given the root of an HTML document, iterate through all the elements
    (the root included), strip any namespace prefix from their tag names
    and remove any attributes whose name contains a namespace.

    HTML start tags do NOT have a namespace; COLON characters have no
    special meaning. If we don't remove the namespace, the parser renders
    the colon of a prefixed name as the literal text ``U0003A``.
    For example <o:p> becomes <oU0003Ap>.
    See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags

    Attributes whose name contains ``U0003A`` are dropped:

    <html xmlns:o="urn:schemas-microsoft-com:office:office">
    becomes
    <html>

    Tag names keep only what follows the LAST ``U0003A``:

    <o:p>Hi</o:p>
    becomes
    <p>Hi</p>

    Nodes whose ``tag`` is a callable instead of a string (ElementTree
    style APIs use this for comments and processing instructions) are
    left untouched, since they have no tag name to rewrite.

    The tree is modified in place and the same ``root`` object is returned.
    """
    marker = "U0003A"

    for child in root.iter():
        tag = child.tag
        # Comments / processing instructions expose a factory function as
        # their tag; calling string methods on it would raise.
        if callable(tag):
            continue

        # Snapshot the attribute names before removing anything: popping
        # while iterating a live mapping view raises RuntimeError on
        # dict-backed ``attrib`` implementations.
        for key in list(child.attrib.keys()):
            # If the attribute includes a (rendered) colon
            if marker in key:
                child.attrib.pop(key)

        # If the tag includes a (rendered) colon keep only the local name
        idx = tag.rfind(marker)
        if idx != -1:
            child.tag = tag[idx + len(marker):]

    return root
 def split_emails(msg):
    """
    Given a message (which may consist of an email conversation thread with
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -8,6 +8,7 @@ import re
 from talon import quotations, utils as u
 from . import *
 from .fixtures import *
 from lxml import html
 # Raw strings keep the regex escape ``\s`` away from Python's own string
 # escape handling ("\s" in a plain literal is an invalid escape sequence).
 RE_WHITESPACE = re.compile(r"\s")
 # NOTE(review): same pattern as RE_WHITESPACE despite the name - confirm
 # whether a multi-whitespace pattern was intended.
 RE_DOUBLE_WHITESPACE = re.compile(r"\s")
@@ -424,3 +425,23 @@ def test_readable_html_empty():
 def test_bad_html():
    """A bare ``<html></html>`` document must come back unchanged."""
    empty_document = "<html></html>"
    extracted = quotations.extract_from_html(empty_document)
    eq_(empty_document, extracted)
 def test_remove_namespaces():
    """
    Run a document using the ``o:`` namespace prefix through
    extract_from_html() and check which substrings appear in the output.
    """
    namespaced_html = """
    <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
        <body>
            <o:p>Dear Sir,</o:p>
            <o:p>Thank you for the email.</o:p>
            <blockquote>thing</blockquote>
        </body>
    </html>
    """
    output = quotations.extract_from_html(namespaced_html)

    # Substrings that have to be present in the rendered result.
    for expected in ("<p>", "xmlns"):
        assert_true(expected in output)

    # Substrings that must not be present in the rendered result.
    for forbidden in ("<o:p>", "<xmlns:o>"):
        assert_true(forbidden not in output)
Author	SHA1	Message	Date
Sergey Obukhov	16354e3528	Merge pull request #191 from mailgun/thrawn/develop PIP-423: Now removing namespaces from parsed HTML	2019-05-12 11:54:17 +03:00
Derrick J. Wippler	1018e88ec1	Now removing namespaces from parsed HTML	2019-05-10 11:16:12 -05:00