Merge pull request #120 from mailgun/sergey/talon-1.3.3

bump talon version
2016-11-30 18:28:39 -08:00 · 2016-11-30 12:56:06 -08:00 · 2016-11-30 12:51:32 -08:00 · 2016-11-22 20:02:54 +00:00 · 2016-11-22 20:00:31 +00:00 · 2016-11-22 19:56:57 +00:00
6 changed files with 116 additions and 5 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.3.1',
+      version='1.3.3',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -172,6 +172,9 @@ MAX_HTML_LEN = 2794202
 # Matches the leading quotation markers of a line: one or more '>'
 # characters followed by an optional single space.
 QUOT_PATTERN = re.compile(r'^>+ ?')
 # Matches a line that does not start with '>' and contains at least one
 # non-whitespace character after its first character.
 NO_QUOT_LINE = re.compile(r'^[^>].*[\S].*')
 # Regular expression to identify if a line is a header.
 RE_HEADER = re.compile(r": ")
 def extract_from(msg_body, content_type='text/plain'):
    try:
@@ -386,9 +389,6 @@ def _extract_from_html(msg_body):
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
    if len(msg_body) > MAX_HTML_LEN:
        return msg_body
    if msg_body.strip() == b'':
        return msg_body
@@ -453,6 +453,54 @@ def _extract_from_html(msg_body):
    return html.tostring(html_tree_copy)
 def split_emails(msg):
     """
     Mark up the lines of a message that may hold a whole email thread.

     Every line of the preprocessed message receives a marker (split line,
     content line or empty line). Split-line markers that land inside a
     header block, as recognised by the regular expression RE_HEADER, are
     then corrected.

     Return the corrected markers.
     """
     msg_body = preprocess(msg, get_delimiter(msg))

     # cap the work on very long messages: only the first MAX_LINES_COUNT
     # lines are marked
     lines = msg_body.splitlines()[:MAX_LINES_COUNT]

     # split lines are not wanted inside header blocks, so fix those up
     return _correct_splitlines_in_headers(mark_message_lines(lines), lines)
 def _correct_splitlines_in_headers(markers, lines):
     """Correct markers by removing splitlines deemed to be inside header blocks.

     A header block starts at a line marked 's' that matches RE_HEADER and
     lasts for as long as the following lines keep matching RE_HEADER. Any
     further 's' marker found inside such a block is rewritten to 't'.

     :param markers: string of one-character markers, one per entry in lines
     :param lines: the message lines the markers were computed from
     :return: the corrected markers joined into a single string
     """
     corrected = []
     in_header_block = False
     for idx, marker in enumerate(markers):
         # Run the header test once per line; it drives both decisions below.
         is_header = re.search(RE_HEADER, lines[idx]) is not None

         if marker == 's':
             if in_header_block:
                 # A split line inside a header block is not a real split.
                 marker = 't'
             elif is_header:
                 # Only an 's' line that is itself a header opens a block.
                 in_header_block = True

         # The first non-header line closes the current header block.
         if not is_header:
             in_header_block = False

         corrected.append(marker)

     # Join once at the end instead of growing a string inside the loop.
     return "".join(corrected)
 def _readable_text_empty(html_tree):
     """Return True when the text rendered from html_tree is empty or only whitespace."""
     readable_text = html_tree_to_text(html_tree)
     return not readable_text.strip()
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -178,6 +178,9 @@ def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
        if html_too_big(s):
            return None
        return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -187,6 +190,9 @@ def html_document_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.
    """
    try:
        if html_too_big(s):
            return None
        return html5parser.document_fromstring(s, parser=_html5lib_parser())
    except Exception:
        pass
@@ -196,6 +202,10 @@ def cssselect(expr, tree):
    return CSSSelector(expr)(tree)
 def html_too_big(s):
     """Return True if s contains more than _MAX_TAGS_COUNT '<' characters.

     The number of '<' characters is used as a cheap stand-in for the number
     of tags in the document. Both text and bytes input are accepted.
     """
     # On Python 3 bytes.count() rejects a text needle with a TypeError, so
     # the needle has to match the type of the input.
     needle = b'<' if isinstance(s, (bytes, bytearray)) else '<'
     return s.count(needle) > _MAX_TAGS_COUNT
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -243,3 +253,7 @@ _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 # NOTE(review): presumably the tags rendered as hard line breaks when
 # converting HTML to text -- confirm against the code that reads this list.
 _HARDBREAKS = ['br', 'hr', 'tr']
 # Matches a run of 2 to 10 consecutive newline characters.
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
 # Research showed that processing documents which exceed this limit
 # may take an excessive amount of time.
 _MAX_TAGS_COUNT = 419
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -385,7 +385,7 @@ def test_gmail_forwarded_msg():
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
-@patch.object(quotations, 'MAX_HTML_LEN', 1)
+@patch.object(u, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
    msg_body = 'Reply' \
               '<div class="gmail_quote">' \
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,27 @@ def test_standard_replies():
                "'%(reply)s' != %(stripped)s for %(fn)s" % \
                {'reply': reply_text, 'stripped': stripped_text,
                 'fn': filename}
 def test_split_email():
    # Feeds a thread with two header blocks and an "Original Message"
    # section to split_emails and compares the returned marker string.
    msg = """From: Mr. X
 Date: 24 February 2016
 To: Mr. Y
 Subject: Hi
 Attachments: none
 Goodbye.
 From: Mr. Y
 To: Mr. X
 Date: 24 February 2016
 Subject: Hi
 Attachments: none
 Hello.
 -- Original Message --
 On 24th February 2016 at 09.32am Conal Wrote:
 Hey!
 """
    # One marker character per message line; 's' marks a split line and 't'
    # is what split_emails substitutes for a split line inside a header
    # block. NOTE(review): 'e' presumably marks an empty line -- confirm
    # against mark_message_lines.
    expected_markers = "stttttsttttetestt"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -120,6 +120,12 @@ def test_comment_no_parent():
 def test_html_fromstring_exception():
    eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(parser_mock):
    """html_fromstring returns None without calling the parser when html_too_big is truthy."""
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
    # the parser must be bypassed entirely
    assert_false(parser_mock.called)
@patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
@@ -127,7 +133,26 @@ def test_html_document_fromstring_exception(document_fromstring):
    eq_(None, u.html_document_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_too_big(parser_mock):
    """html_document_fromstring returns None without calling the parser when html_too_big is truthy."""
    parsed = u.html_document_fromstring("<html></html>")
    eq_(None, parsed)
    # the parser must be bypassed entirely
    assert_false(parser_mock.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None))
def test_bad_html_to_text():
    """html_to_text returns None when html_fromstring yields no tree."""
    converted = u.html_to_text("one<br>two<br>three")
    eq_(None, converted)
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_too_big():
    """With the limit patched to 3, two '<' are accepted and four are too many."""
    cases = (
        (False, "<div></div>"),
        (True, "<div><span>Hi</span></div>"),
    )
    for expected, markup in cases:
        eq_(expected, u.html_too_big(markup))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
    """With the limit patched to 3, small markup is converted and larger markup yields None."""
    cases = (
        ("Hello", "<div>Hello</div>"),
        (None, "<div><span>Hi</span></div>"),
    )
    for expected, markup in cases:
        eq_(expected, u.html_to_text(markup))
Author	SHA1	Message	Date
Sergey Obukhov	015c8d2a78	Merge pull request #120 from mailgun/sergey/talon-1.3.3 bump talon version	2016-11-30 18:28:39 -08:00
Sergey Obukhov	5af846c13d	bump talon version	2016-11-30 12:56:06 -08:00
Sergey Obukhov	e69a9c7a54	Merge pull request #119 from conapart3/master Addition of new split_email method for issue:115	2016-11-30 12:51:32 -08:00
conapart3	23cb2a9a53	Merge pull request #1 from conapart3/issue-115-date-split-in-headers split_emails function added, test added	2016-11-22 20:02:54 +00:00
smitcona	b5e3397b88	Updating test to account for --original message-- case	2016-11-22 20:00:31 +00:00
smitcona	5685a4055a	Improved algorithm	2016-11-22 19:56:57 +00:00
smitcona	97b72ef767	Adding in_header_block variable for reliability	2016-11-22 19:06:34 +00:00
smitcona	31489848be	Remove print lines	2016-11-21 17:36:06 +00:00
smitcona	e5988d447b	Add space	2016-11-21 12:48:29 +00:00
smitcona	adfed748ce	split_emails function added, test added	2016-11-21 12:35:36 +00:00
Sergey Obukhov	2444ba87c0	Merge pull request #111 from mailgun/sergey/tagscount restrict html processing to a certain number of tags	2016-09-14 11:06:29 -07:00
Sergey Obukhov	534457e713	protect html_to_text as well	2016-09-14 09:58:41 -07:00
Sergey Obukhov	ea82a9730e	restrict html processing to a certain number of tags	2016-09-14 09:33:30 -07:00