Merge pull request #101 from mailgun/sergey/empty-html

if html stripped off quotations does not have readable text fallback …
bump version
2016-08-12 12:18:50 -07:00 · 2016-08-11 23:59:18 -07:00 · 2016-08-11 23:58:11 -07:00 · 2016-08-11 23:56:26 -07:00 · 2016-08-11 20:17:37 -07:00 · 2016-08-11 19:55:23 -07:00
8 changed files with 2583 additions and 2705 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):


 setup(name='talon',
-      version='1.2.11',
+      version='1.2.15',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,7 @@ from copy import deepcopy

 from lxml import html, etree

-from talon.utils import get_delimiter, html_to_text
+from talon.utils import get_delimiter, html_tree_to_text
 from talon import html_quotations
 from six.moves import range
 import six
@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")

 SPLITTER_MAX_LINES = 4
 MAX_LINES_COUNT = 1000
+# an extensive research shows that exceeding this limit
+# leads to excessive processing time
+MAX_HTML_LEN = 2794202

 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
@@ -382,10 +385,13 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
+    if len(msg_body) > MAX_HTML_LEN:
+        return msg_body
+
    if msg_body.strip() == b'':
        return msg_body

-    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
+    msg_body = msg_body.replace(b'\r\n', b'\n')
    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
@@ -401,8 +407,7 @@ def _extract_from_html(msg_body):

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
-    msg_with_checkpoints = html.tostring(html_tree)
-    plain_text = html_to_text(msg_with_checkpoints)
+    plain_text = html_tree_to_text(html_tree)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

@@ -425,25 +430,31 @@ def _extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
+
+    if not lines_were_deleted and not cut_quotations:
+        return msg_body
+
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
-    else:
-        if cut_quotations:
-            return html.tostring(html_tree_copy)
-        else:
-            return msg_body

        # Remove tags with quotation checkpoints
        html_quotations.delete_quotation_tags(
            html_tree_copy, 0, quotation_checkpoints
        )

+    if _readable_text_empty(html_tree_copy):
+        return msg_body
+
    return html.tostring(html_tree_copy)


+def _readable_text_empty(html_tree):
+    return not bool(html_tree_to_text(html_tree).strip())
+
+
 def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
--- a/talon/signature/data/train.data
+++ b/talon/signature/data/train.data
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -185,12 +185,13 @@ def capitalized_words_percent(s):
    s = to_unicode(s, precise=True)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
+    words = [w for w in words if len(w) > 2]    
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
-            if word[0].isupper():
+            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -112,25 +112,7 @@ def get_delimiter(msg_body):

    return delimiter

-
-def html_to_text(string):
-    """
-    Dead-simple HTML-to-text converter:
-        >>> html_to_text("one<br>two<br>three")
-        >>> "one\ntwo\nthree"
-
-    NOTES:
-        1. the string is expected to contain UTF-8 encoded HTML!
-        2. returns utf-8 encoded str (not unicode)
-    """
-    if isinstance(string, six.text_type):
-        string = string.encode('utf8')
-
-    s = _prepend_utf8_declaration(string)
-    s = s.replace(b"\n", b"")
-
-    tree = html.fromstring(s)
-
+def html_tree_to_text(tree):
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)

@@ -159,6 +141,26 @@ def html_to_text(string):
    return _encode_utf8(retval)


+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        >>> "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    if isinstance(string, six.text_type):
+        string = string.encode('utf8')
+
+    s = _prepend_utf8_declaration(string)
+    s = s.replace(b"\n", b"")
+
+    tree = html.fromstring(s)
+    return html_tree_to_text(tree)
+
+
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -356,7 +356,8 @@ def test_CRLF():
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))

-    msg_body = """Reply
+    msg_body = """My
+reply
 <blockquote>

  <div>
@@ -371,8 +372,8 @@ def test_CRLF():
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
-        RE_WHITESPACE.sub('', extracted))
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
+    eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)


 def test_gmail_forwarded_msg():
@@ -380,3 +381,33 @@ def test_gmail_forwarded_msg():
 </div><br></div>"""
    extracted = quotations.extract_from_html(msg_body)
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
+
+
+@patch.object(quotations, 'MAX_HTML_LEN', 1)
+def test_too_large_html():
+    msg_body = 'Reply' \
+               '<div class="gmail_quote">' \
+               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
+               '<div>Test</div>' \
+               '</div>' \
+               '</div>'
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
+
+
+def test_readable_html_empty():
+    msg_body = """
+<blockquote>
+  Reply
+  <div>
+    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
+  </div>
+
+  <div>
+    Test
+  </div>
+
+</blockquote>"""
+
+    eq_(RE_WHITESPACE.sub('', msg_body),
+        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -77,6 +77,31 @@ def test_basic():
        signature.extract(msg_body, 'Sergey'))


+def test_capitalized():
+    msg_body = """Hi Mary,
+
+Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
+
+DJ Doe 
+http://example.com
+Password: SUPERPASSWORD
+
+Would you like to check out more?
+
+
+At your service,
+
+John Smith
+Doe Inc
+555-531-7967"""
+
+    sig = """John Smith
+Doe Inc
+555-531-7967"""
+
+    eq_(sig, signature.extract(msg_body, 'Doe')[1])
+
+
 def test_over_2_text_lines_after_signature():
    body = """Blah

--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent):
 def test_capitalized_words_percent():
    eq_(0.0, h.capitalized_words_percent(''))
    eq_(100.0, h.capitalized_words_percent('Example Corp'))
-    eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))
+    eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss'))
    eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
    eq_(100.0, h.capitalized_words_percent('8th Floor'))
    eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
+    eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE'))


 def test_has_signature():
Author	SHA1	Message	Date
Sergey Obukhov	7bf37090ca	Merge pull request #101 from mailgun/sergey/empty-html if html stripped off quotations does not have readable text fallback …	2016-08-12 12:18:50 -07:00
Sergey Obukhov	44fcef7123	bump version	2016-08-11 23:59:18 -07:00
Sergey Obukhov	69a44b10a1	Merge branch 'master' into sergey/empty-html	2016-08-11 23:58:11 -07:00
Sergey Obukhov	b085e3d049	Merge pull request #104 from mailgun/sergey/spaces fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 23:56:26 -07:00
Sergey Obukhov	4b953bcddc	fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 20:17:37 -07:00
Sergey Obukhov	315eaa7080	if html stripped off quotations does not have readable text fallback to unparsed html	2016-08-11 19:55:23 -07:00
Sergey Obukhov	5a9bc967f1	Merge pull request #100 from mailgun/sergey/restrict do not parse html quotations if html is longer than a certain threshold	2016-08-11 16:08:03 -07:00
Sergey Obukhov	a0d7236d0b	bump version and add a comment	2016-08-11 15:49:09 -07:00
Sergey Obukhov	21e9a31ffe	add test	2016-08-09 17:15:49 -07:00
Sergey Obukhov	4ee46c0a97	do not parse html quotations if html is longer than a certain threshold	2016-08-09 17:08:58 -07:00
Sergey Obukhov	10d9a930f9	Merge pull request #99 from mailgun/sergey/capitalized consider word capitalized only if it is camel case - not all upper case	2016-07-20 16:47:12 -07:00
Sergey Obukhov	a21ccdb21b	consider word capitalized only if it is camel case - not all upper case	2016-07-19 17:37:36 -07:00