9 Commits

Author SHA1 Message Date
Maxim Vladimirskiy
a8c7e6a972 Merge pull request #226 from mailgun/maxim/develop
PIP-1562: Remove max tags limit [python3]
2022-01-06 15:24:57 +03:00
Maxim Vladimirskiy
b30c375c5b Expose extract_from_html_tree 2022-01-06 15:16:43 +03:00
Maxim Vladimirskiy
cec5acf58f Remove max tags limit 2022-01-06 14:18:11 +03:00
Maxim Vladimirskiy
24d0f2d00a Merge pull request #223 from mailgun/maxim/develop
PIP-1509: Optimise sender name check [python3]
2021-11-19 13:11:29 +03:00
Maxim Vladimirskiy
94007b0b92 Optimise sender name check 2021-11-19 11:12:26 +03:00
Maxim Vladimirskiy
1a5548f171 Merge pull request #222 from mailgun/maxim/develop
PIP-1409: Remove version pins from setup.py [python3]
2021-11-11 16:29:30 +03:00
Maxim Vladimirskiy
53c49b9121 Remove version pins from setup.py 2021-11-11 15:36:50 +03:00
Matt Dietz
bd50872043 Merge pull request #217 from mailgun/dietz/REP-1030
Drops Python 2 support [python3]
2021-06-15 09:46:29 -05:00
Matt Dietz
d37c4fd551 Drops Python 2 support
REP-1030

In addition to some Python 2 => 3 fixes, this change bumps the scikit-learn
version to the latest release. The previously pinned version of scikit-learn
failed to compile the necessary C modules under Python 3.7+ because its
bundled header files weren't compatible with the C API implemented in
Python 3.7+.

Given the restrictive Python compatibility of the pinned scikit-learn
version, it seemed prudent to drop Python 2 support altogether. Otherwise,
we'd be stuck with Python 3.4 as the newest version we could support.

With this change, tests are currently passing under Python 3.9.2.

Lastly, this change restores the original training data. At some point, a
new version of the training data was committed to the repo, but no
classifier was ever trained from it. Using a classifier trained from that
newer data resulted in most of the tests failing.
2021-06-10 14:03:25 -05:00
16 changed files with 2712 additions and 2527 deletions
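Because the old pickle was produced with scikit-learn 0.16.1, upgrading the library means regenerating the classifier rather than loading the stale artifact. A minimal sketch of that regeneration, assuming the init()/train() helpers in talon.signature.learning.classifier keep their documented signatures (paths are the repo's data files):

from talon.signature.learning import classifier

clf = classifier.init()  # a fresh LinearSVC with talon's settings
classifier.train(
    clf,
    "talon/signature/data/train.data",  # the original training data
    "talon/signature/data/classifier",  # destination for the new pickle
)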

.build/Dockerfile (new file, +20)

@@ -0,0 +1,20 @@
FROM python:3.9-slim-buster AS deps
RUN apt-get update && \
apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev
FROM deps AS testable
ARG REPORT_PATH
VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}]
ADD . /app
WORKDIR /app
COPY wheel/* /wheel/
RUN mkdir -p ${REPORT_PATH}
RUN python ./setup.py build bdist_wheel -d /wheel && \
pip install --no-deps /wheel/*
ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"]

.gitignore (vendored, +3)

@@ -54,3 +54,6 @@ _trial_temp

 # OSX
 .DS_Store
+
+# vim-backup
+*.bak

requirements.txt (new file, +11)

@@ -0,0 +1,11 @@
chardet>=1.0.1
cchardet>=0.3.5
cssselect
html5lib
joblib
lxml>=2.3.3
numpy
regex>=1
scikit-learn>=1.0.0
scipy
six>=1.10.0

run_tests.sh (new executable file, +4)

@@ -0,0 +1,4 @@
#!/usr/bin/env bash
set -ex
REPORT_PATH="${REPORT_PATH:-./}"
nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon .

setup.py

@@ -19,17 +19,17 @@ class InstallCommand(install):
         if self.no_ml:
             dist = self.distribution
             dist.packages=find_packages(exclude=[
-                'tests',
-                'tests.*',
-                'talon.signature',
-                'talon.signature.*',
+                "tests",
+                "tests.*",
+                "talon.signature",
+                "talon.signature.*",
             ])
-            for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']:
+            for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]:
                 dist.install_requires.remove(not_required)

 setup(name='talon',
-      version='1.4.8',
+      version='1.5.0',
       description=("Mailgun library "
                    "to extract message quotations and signatures."),
       long_description=open("README.rst").read(),
@@ -44,20 +44,21 @@ setup(name='talon',
       include_package_data=True,
       zip_safe=True,
       install_requires=[
-          "lxml>=2.3.3",
-          "regex>=1",
+          "lxml",
+          "regex",
           "numpy",
           "scipy",
-          "scikit-learn==0.16.1",  # pickled versions of classifier, else rebuild
-          'chardet>=1.0.1',
-          'cchardet>=0.3.5',
-          'cssselect',
-          'six>=1.10.0',
-          'html5lib'
+          "scikit-learn>=1.0.0",
+          "chardet",
+          "cchardet",
+          "cssselect",
+          "six",
+          "html5lib",
+          "joblib",
           ],
       tests_require=[
           "mock",
-          "nose>=1.2.1",
+          "nose",
           "coverage"
           ]
       )

talon/quotations.py

@@ -193,9 +193,6 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")

 SPLITTER_MAX_LINES = 6
 MAX_LINES_COUNT = 1000

-# an extensive research shows that exceeding this limit
-# leads to excessive processing time
-MAX_HTML_LEN = 2794202
-
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -421,25 +418,31 @@ def extract_from_html(msg_body):

     Returns a unicode string.
     """
+    msg_body_bytes = msg_body
     if isinstance(msg_body, six.text_type):
-        msg_body = msg_body.encode('utf8')
-    elif not isinstance(msg_body, bytes):
-        msg_body = msg_body.encode('ascii')
-
-    result = _extract_from_html(msg_body)
-    if isinstance(result, bytes):
-        result = result.decode('utf8')
+        msg_body_bytes = msg_body.encode('utf8')
+
+    if msg_body_bytes.strip() == b'':
+        return msg_body
+
+    msg_body_bytes = msg_body_bytes.replace(b'\r\n', b'\n')
+
+    # Cut out xml and doctype tags to avoid conflict with unicode decoding.
+    msg_body_bytes = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", b"", msg_body_bytes)
+
+    html_tree = html_document_fromstring(msg_body_bytes)
+    if html_tree is None:
+        return msg_body
+
+    result = extract_from_html_tree(html_tree)
+    if not result:
+        return msg_body

     return result


-def _extract_from_html(msg_body):
+def extract_from_html_tree(html_tree):
     """
-    Extract not quoted message from provided html message body
-    using tags and plain text algorithm.
+    Extract not quoted message from provided parsed html tree using tags and
+    plain text algorithm.

-    Cut out first some encoding html tags such as xml and doctype
-    for avoiding conflict with unicode decoding
-
     Cut out the 'blockquote', 'gmail_quote' tags.
     Cut Microsoft quotations.
@@ -452,18 +455,6 @@ def _extract_from_html(msg_body):
     then checking deleted checkpoints,
     then deleting necessary tags.
     """
-    if msg_body.strip() == b'':
-        return msg_body
-
-    msg_body = msg_body.replace(b'\r\n', b'\n')
-
-    msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
-
-    html_tree = html_document_fromstring(msg_body)
-
-    if html_tree is None:
-        return msg_body
-
     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                       html_quotations.cut_zimbra_quote(html_tree) or
                       html_quotations.cut_blockquote(html_tree) or
@@ -481,7 +472,7 @@ def _extract_from_html(msg_body):

     # Don't process too long messages
     if len(lines) > MAX_LINES_COUNT:
-        return msg_body
+        return None

     # Collect checkpoints on each line
     line_checkpoints = [
@@ -500,7 +491,7 @@ def _extract_from_html(msg_body):

     lines_were_deleted, first_deleted, last_deleted = return_flags
     if not lines_were_deleted and not cut_quotations:
-        return msg_body
+        return None

     if lines_were_deleted:
         #collect checkpoints from deleted lines
@@ -514,7 +505,7 @@ def _extract_from_html(msg_body):
     )

     if _readable_text_empty(html_tree_copy):
-        return msg_body
+        return None

     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
     # parsers do not recognize namespaces in HTML tags. As such the rendered
@@ -540,7 +531,11 @@ def _extract_from_html(msg_body):
     # of replacing data outside the <tag> which might be essential to
     # the customer.
     remove_namespaces(html_tree_copy)
-    return html.tostring(html_tree_copy)
+
+    s = html.tostring(html_tree_copy)
+    if not s:
+        return None
+
+    return s.decode('utf-8')


 def remove_namespaces(root):
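Taken together, this refactor keeps extract_from_html() as the string-in, string-out entry point while exposing extract_from_html_tree() for callers that already hold a parsed lxml tree; the tree variant returns None when nothing was cut. A minimal usage sketch (the sample HTML is invented):

import talon
from talon import quotations
from talon.utils import html_document_fromstring

talon.init()  # registers the xpath extensions quotations relies on

html = '<html><body>Reply<blockquote>On Mon, Bob wrote: hi</blockquote></body></html>'

# String entry point: always returns a string (the original body on failure).
reply = quotations.extract_from_html(html)

# Tree entry point: skips re-parsing; returns None when no quotation was cut.
tree = html_document_fromstring(html)
reply_or_none = quotations.extract_from_html_tree(tree)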

talon/signature/__init__.py

@@ -23,17 +23,14 @@ trained against, don't forget to regenerate:
 from __future__ import absolute_import

 import os

-from . import extraction
-from . extraction import extract #noqa
-from . learning import classifier
-
-DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
-
-EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
-EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
+from talon.signature import extraction
+from talon.signature.extraction import extract
+from talon.signature.learning import classifier


 def initialize():
-    extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
-                                           EXTRACTOR_DATA)
+    data_dir = os.path.join(os.path.dirname(__file__), 'data')
+    extractor_filename = os.path.join(data_dir, 'classifier')
+    extractor_data_filename = os.path.join(data_dir, 'train.data')
+    extraction.EXTRACTOR = classifier.load(extractor_filename,
+                                           extractor_data_filename)
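For context, initialize() is normally reached through talon.init(); with this change the data paths are resolved at call time rather than import time. A quick usage sketch (the message and sender are invented):

import talon
from talon import signature

talon.init()  # calls signature.initialize() under the hood

text, sig = signature.extract(
    "Wonderful, thanks!\n--\nJohn Doe\n+1 555 0100",
    sender="John Doe <john@example.com>",
)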

talon/signature/data/classifier (binary file not shown)

talon/signature/data/train.data (diff suppressed because it is too large)

talon/signature/learning/classifier.py

@@ -8,7 +8,7 @@ body belongs to the signature.
 from __future__ import absolute_import

 from numpy import genfromtxt
-from sklearn.externals import joblib
+import joblib
 from sklearn.svm import LinearSVC
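sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23, so the standalone joblib package takes over persistence here. The dump/load API is the same; a small sketch with a hypothetical file name:

import joblib
from sklearn.svm import LinearSVC

clf = LinearSVC()
joblib.dump(clf, "classifier")  # was: sklearn.externals.joblib.dump(...)
clf = joblib.load("classifier")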

talon/signature/learning/helpers.py

@@ -102,7 +102,7 @@ def flatten_list(list_to_flatten):


 def contains_sender_names(sender):
-    '''Returns a functions to search sender\'s name or it\'s part.
+    """Returns a functions to search sender\'s name or it\'s part.

     >>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
     >>> feature("Sergey Obukhov")
@@ -115,7 +115,7 @@ def contains_sender_names(sender):
     1
     >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
     1
-    '''
+    """
     names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                         for e in extract_names(sender)]))
     names = names or sender
@@ -140,10 +140,16 @@ def extract_names(sender):
     sender = "".join([char if char.isalpha() else ' ' for char in sender])

     # Remove too short words and words from "black" list i.e.
     # words like `ru`, `gmail`, `com`, `org`, etc.
-    sender = [word for word in sender.split() if len(word) > 1 and
-              not word in BAD_SENDER_NAMES]
-
-    # Remove duplicates
-    names = list(set(sender))
+    names = list()
+    for word in sender.split():
+        if len(word) < 2:
+            continue
+        if word in BAD_SENDER_NAMES:
+            continue
+        if word in names:
+            continue
+        names.append(word)

     return names
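The rewrite folds the length and blacklist checks into a single pass and, unlike list(set(...)), preserves first-seen order, so extract_names is deterministic across interpreter runs (set iteration order varies with hash randomization). A standalone sketch of the same idea; the blacklist values are illustrative:

def dedupe_names(words, bad_names=frozenset({"ru", "com", "mail", "gmail"})):
    # Keep the first occurrence of each usable word, preserving input order.
    names = []
    for word in words:
        if len(word) < 2 or word in bad_names or word in names:
            continue
        names.append(word)
    return names

assert dedupe_names(["Sergey", "N", "Obukhov", "serobnic", "mail",
                     "Sergey"]) == ["Sergey", "Obukhov", "serobnic"]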
@@ -208,20 +214,26 @@ def many_capitalized_words(s):


 def has_signature(body, sender):
-    '''Checks if the body has signature. Returns True or False.'''
+    """Checks if the body has signature. Returns True or False."""
     non_empty = [line for line in body.splitlines() if line.strip()]
     candidate = non_empty[-SIGNATURE_MAX_LINES:]
     upvotes = 0
+    sender_check = contains_sender_names(sender)
     for line in candidate:
         # we check lines for sender's name, phone, email and url,
         # those signature lines don't take more then 27 lines
         if len(line.strip()) > 27:
             continue
-        elif contains_sender_names(sender)(line):
+
+        if sender_check(line):
             return True
-        elif (binary_regex_search(RE_RELAX_PHONE)(line) +
+
+        if (binary_regex_search(RE_RELAX_PHONE)(line) +
                 binary_regex_search(RE_EMAIL)(line) +
                 binary_regex_search(RE_URL)(line) == 1):
             upvotes += 1
             if upvotes > 1:
                 return True
+
     return False
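This is the PIP-1509 optimisation itself: contains_sender_names(sender) builds a matcher from the sender string, so hoisting the call out of the loop does that work once per message instead of once per candidate line. A simplified sketch of the pattern (the regex construction is illustrative, not talon's exact one):

import re

def contains_sender_names(sender):
    # Build the per-sender matcher once...
    pattern = re.compile("|".join(re.escape(w) for w in sender.split() if w))
    def check(line):
        return 1 if pattern.search(line) else 0
    return check

sender_check = contains_sender_names("John Doe <john@example.com>")
for line in ["Thanks,", "John Doe", "+1 555 0100"]:
    if sender_check(line):  # ...and reuse it for every line
        print("matched:", line)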

talon/utils.py

@@ -180,9 +180,6 @@ def html_fromstring(s):
     if isinstance(s, six.text_type):
         s = s.encode('utf8')
     try:
-        if html_too_big(s):
-            return None
-
         return html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -194,9 +191,6 @@ def html_document_fromstring(s):
     if isinstance(s, six.text_type):
         s = s.encode('utf8')
     try:
-        if html_too_big(s):
-            return None
-
         return html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
@@ -206,12 +200,6 @@ def cssselect(expr, tree):
     return CSSSelector(expr)(tree)


-def html_too_big(s):
-    if isinstance(s, six.text_type):
-        s = s.encode('utf8')
-    return s.count(b'<') > _MAX_TAGS_COUNT
-
-
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
@@ -258,7 +246,3 @@ _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']

 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
-
-# an extensive research shows that exceeding this limit
-# might lead to excessive processing time
-_MAX_TAGS_COUNT = 419

test-requirements.txt (new file, +3)

@@ -0,0 +1,3 @@
coverage
mock
nose>=1.2.1

tests/html_quotations_test.py

@@ -391,18 +391,6 @@ def test_gmail_forwarded_msg():
     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))


-@patch.object(u, '_MAX_TAGS_COUNT', 4)
-def test_too_large_html():
-    msg_body = 'Reply' \
-        '<div class="gmail_quote">' \
-        '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
-        '<div>Test</div>' \
-        '</div>' \
-        '</div>'
-
-    eq_(RE_WHITESPACE.sub('', msg_body),
-        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
-
-
 def test_readable_html_empty():
     msg_body = """
 <blockquote>

tests/text_quotations_test.py

@@ -826,10 +826,10 @@ The user experience was unparallelled. Please continue production. I'm sending p
 that this line is intact."""

     parsed = quotations.extract_from_plain(msg_body)
-    eq_(msg_body, parsed.decode('utf8'))
+    eq_(msg_body, parsed)


-def test_appointment():
+def test_appointment_2():
     msg_body = """Invitation for an interview:

 Date: Wednesday 3, October 2011
@@ -838,4 +838,4 @@ Address: 130 Fox St
 Please bring in your ID."""

     parsed = quotations.extract_from_plain(msg_body)
-    eq_(msg_body, parsed.decode('utf8'))
+    eq_(msg_body, parsed)

tests/utils_test.py

@@ -125,39 +125,13 @@ def test_html_fromstring_exception():
     eq_(None, u.html_fromstring("<html></html>"))


-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'fromstring')
-def test_html_fromstring_too_big(fromstring):
-    eq_(None, u.html_fromstring("<html></html>"))
-    assert_false(fromstring.called)
-
-
 @patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
     document_fromstring.side_effect = Exception()
     eq_(None, u.html_document_fromstring("<html></html>"))


-@patch.object(u, 'html_too_big', Mock())
-@patch.object(u.html5parser, 'document_fromstring')
-def test_html_document_fromstring_too_big(document_fromstring):
-    eq_(None, u.html_document_fromstring("<html></html>"))
-    assert_false(document_fromstring.called)
-
-
 @patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
     bad_html = "one<br>two<br>three"
     eq_(None, u.html_to_text(bad_html))


-@patch.object(u, '_MAX_TAGS_COUNT', 3)
-def test_html_too_big():
-    eq_(False, u.html_too_big("<div></div>"))
-    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
-
-
-@patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_to_text():
     eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
-    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))