Merge pull request #226 from mailgun/maxim/develop

PIP-1562: Remove max tags limit [python3]
Expose extract_from_html_tree
2022-01-06 15:24:57 +03:00 · 2022-01-06 15:16:43 +03:00 · 2022-01-06 14:18:11 +03:00 · 2021-11-19 13:11:29 +03:00 · 2021-11-19 11:12:26 +03:00 · 2021-11-11 16:29:30 +03:00
9 changed files with 66 additions and 116 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,6 +6,6 @@ joblib
 lxml>=2.3.3
 numpy
 regex>=1
-scikit-learn==0.24.1 # pickled versions of classifier, else rebuild
+scikit-learn>=1.0.0
 scipy
 six>=1.10.0
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):


 setup(name='talon',
-      version='1.4.8',
+      version='1.5.0',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -44,21 +44,21 @@ setup(name='talon',
      include_package_data=True,
      zip_safe=True,
      install_requires=[
-          "lxml>=2.3.3",
-          "regex>=1",
+          "lxml",
+          "regex",
          "numpy",
          "scipy",
-          "scikit-learn==0.24.1", # pickled versions of classifier, else rebuild
-          "chardet>=1.0.1",
-          "cchardet>=0.3.5",
+          "scikit-learn>=1.0.0",
+          "chardet",
+          "cchardet",
          "cssselect",
-          "six>=1.10.0",
+          "six",
          "html5lib",
          "joblib",
          ],
      tests_require=[
          "mock",
-          "nose>=1.2.1",
+          "nose",
          "coverage"
          ]
      )
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")

 SPLITTER_MAX_LINES = 6
 MAX_LINES_COUNT = 1000
-# an extensive research shows that exceeding this limit
-# leads to excessive processing time
-MAX_HTML_LEN = 2794202

 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -421,25 +418,31 @@ def extract_from_html(msg_body):

    Returns a unicode string.
    """
+    msg_body_bytes = msg_body
    if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
-    elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
+        msg_body_bytes = msg_body.encode('utf8')

-    result = _extract_from_html(msg_body)
-    if isinstance(result, bytes):
-        result = result.decode('utf8')
+    if msg_body_bytes.strip() == b'':
+        return msg_body
+
+    msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
+    # Cut out xml and doctype tags to avoid conflict with unicode decoding.
+    msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
+    html_tree = html_document_fromstring(msg_body_bytes)
+    if html_tree is None:
+        return msg_body
+
+    result = extract_from_html_tree(html_tree)
+    if not result:
+        return msg_body

    return result


-def _extract_from_html(msg_body):
+def extract_from_html_tree(html_tree):
    """
-    Extract not quoted message from provided html message body
-    using tags and plain text algorithm.
-
-    Cut out first some encoding html tags such as xml and doctype
-    for avoiding conflict with unicode decoding
+    Extract not quoted message from provided parsed html tree using tags and
+    plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.
@@ -452,18 +455,6 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
-    if msg_body.strip() == b'':
-        return msg_body
-
-    msg_body = msg_body.replace(b'\r\n', b'\n')
-
-    msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
-
-    html_tree = html_document_fromstring(msg_body)
-
-    if html_tree is None:
-        return msg_body
-
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
@@ -481,7 +472,7 @@ def _extract_from_html(msg_body):

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return None

    # Collect checkpoints on each line
    line_checkpoints = [
@@ -500,7 +491,7 @@ def _extract_from_html(msg_body):
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted and not cut_quotations:
-        return msg_body
+        return None

    if lines_were_deleted:
        #collect checkpoints from deleted lines
@@ -514,7 +505,7 @@ def _extract_from_html(msg_body):
        )

    if _readable_text_empty(html_tree_copy):
-        return msg_body
+        return None

    # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
    # parsers do not recognize namespaces in HTML tags. As such the rendered
@@ -540,7 +531,11 @@ def _extract_from_html(msg_body):
    #    of replacing data outside the <tag> which might be essential to
    #    the customer.
    remove_namespaces(html_tree_copy)
-    return html.tostring(html_tree_copy)
+    s = html.tostring(html_tree_copy)
+    if not s:
+        return None
+
+    return s.decode('utf-8')


 def remove_namespaces(root):
--- a/talon/signature/init.py
+++ b/talon/signature/init.py
@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
 from __future__ import absolute_import
 import os

-from . import extraction
-from . extraction import extract  #noqa
-from . learning import classifier
-
-
-DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
-
-EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
-EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
+from talon.signature import extraction
+from talon.signature.extraction import extract
+from talon.signature.learning import classifier


 def initialize():
-    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-                                           EXTRACTOR_DATA)
+    data_dir = os.path.join(os.path.dirname(__file__), 'data')
+    extractor_filename = os.path.join(data_dir, 'classifier')
+    extractor_data_filename = os.path.join(data_dir, 'train.data')
+    extraction.EXTRACTOR = classifier.load(extractor_filename,
+                                           extractor_data_filename)
--- a/talon/signature/data/classifier
+++ b/talon/signature/data/classifier
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):


 def contains_sender_names(sender):
-    '''Returns a functions to search sender\'s name or it\'s part.
+    """Returns a functions to search sender\'s name or it\'s part.

    >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
-    '''
+    """
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    names = names or sender
@@ -140,10 +140,16 @@ def extract_names(sender):
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
-    sender = [word for word in sender.split() if len(word) > 1 and
-              not word in BAD_SENDER_NAMES]
-    # Remove duplicates
-    names = list(set(sender))
+    names = list()
+    for word in sender.split():
+        if len(word) < 2:
+            continue
+        if word in BAD_SENDER_NAMES:
+            continue
+        if word in names:
+            continue
+        names.append(word)
+
    return names


@@ -208,20 +214,26 @@ def many_capitalized_words(s):


 def has_signature(body, sender):
-    '''Checks if the body has signature. Returns True or False.'''
+    """Checks if the body has signature. Returns True or False."""
    non_empty = [line for line in body.splitlines() if line.strip()]
    candidate = non_empty[-SIGNATURE_MAX_LINES:]
    upvotes = 0
+    sender_check = contains_sender_names(sender)
    for line in candidate:
        # we check lines for sender's name, phone, email and url,
        # those signature lines don't take more then 27 lines
        if len(line.strip()) > 27:
            continue
-        elif contains_sender_names(sender)(line):
+
+        if sender_check(line):
            return True
-        elif (binary_regex_search(RE_RELAX_PHONE)(line) +
-              binary_regex_search(RE_EMAIL)(line) +
-              binary_regex_search(RE_URL)(line) == 1):
+
+        if (binary_regex_search(RE_RELAX_PHONE)(line) +
+                binary_regex_search(RE_EMAIL)(line) +
+                binary_regex_search(RE_URL)(line) == 1):
            upvotes += 1
+
    if upvotes > 1:
        return True
+
+    return False
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -180,9 +180,6 @@ def html_fromstring(s):
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
-        if html_too_big(s):
-            return None
-
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -194,9 +191,6 @@ def html_document_fromstring(s):
    if isinstance(s, six.text_type):
        s = s.encode('utf8')
    try:
-        if html_too_big(s):
-            return None
-
        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -206,12 +200,6 @@ def cssselect(expr, tree):
    return CSSSelector(expr)(tree)


-def html_too_big(s):
-    if isinstance(s, six.text_type):
-        s = s.encode('utf8')
-    return s.count(b'<') > _MAX_TAGS_COUNT
-
-
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']

 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
-
-# an extensive research shows that exceeding this limit
-# might lead to excessive processing time
-_MAX_TAGS_COUNT = 419
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -391,18 +391,6 @@ def test_gmail_forwarded_msg():
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))


-@patch.object(u, '_MAX_TAGS_COUNT', 4)
-def test_too_large_html():
-    msg_body = 'Reply' \
-               '<div class="gmail_quote">' \
-               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
-               '<div>Test</div>' \
-               '</div>' \
-               '</div>'
-    eq_(RE_WHITESPACE.sub('', msg_body),
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_readable_html_empty():
    msg_body = """
 <blockquote>
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -125,39 +125,13 @@ def test_html_fromstring_exception():
    eq_(None, u.html_fromstring("<html></html>"))


-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'fromstring')
-def test_html_fromstring_too_big(fromstring):
-    eq_(None, u.html_fromstring("<html></html>"))
-    assert_false(fromstring.called)
-
-
@patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
    document_fromstring.side_effect = Exception()
    eq_(None, u.html_document_fromstring("<html></html>"))


-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'document_fromstring')
-def test_html_document_fromstring_too_big(document_fromstring):
-    eq_(None, u.html_document_fromstring("<html></html>"))
-    assert_false(document_fromstring.called)
-
-
@patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
    bad_html = "one<br>two<br>three"
    eq_(None, u.html_to_text(bad_html))
-
-
-@patch.object(u, '_MAX_TAGS_COUNT', 3)
-def test_html_too_big():
-    eq_(False, u.html_too_big("<div></div>"))
-    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
-
-
-@patch.object(u, '_MAX_TAGS_COUNT', 3)
-def test_html_to_text():
-    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
-    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
Author	SHA1	Message	Date
Maxim Vladimirskiy	a8c7e6a972	Merge pull request #226 from mailgun/maxim/develop PIP-1562: Remove max tags limit [python3]	2022-01-06 15:24:57 +03:00
Maxim Vladimirskiy	b30c375c5b	Expose extract_from_html_tree	2022-01-06 15:16:43 +03:00
Maxim Vladimirskiy	cec5acf58f	Remove max tags limit	2022-01-06 14:18:11 +03:00
Maxim Vladimirskiy	24d0f2d00a	Merge pull request #223 from mailgun/maxim/develop PIP-1509: Optimise sender name check [python3]	2021-11-19 13:11:29 +03:00
Maxim Vladimirskiy	94007b0b92	Optimise sender name check	2021-11-19 11:12:26 +03:00
Maxim Vladimirskiy	1a5548f171	Merge pull request #222 from mailgun/maxim/develop PIP-1409: Remove version pins from setup.py [python3]	2021-11-11 16:29:30 +03:00
Maxim Vladimirskiy	53c49b9121	Remove version pins from setup.py	2021-11-11 15:36:50 +03:00
Matt Dietz	bd50872043	Merge pull request #217 from mailgun/dietz/REP-1030 Drops Python 2 support [python3]	2021-06-15 09:46:29 -05:00