Merge pull request #120 from mailgun/sergey/talon-1.3.3

bump talon version
2016-11-30 18:28:39 -08:00 · 2016-11-30 12:56:06 -08:00 · 2016-11-30 12:51:32 -08:00 · 2016-11-22 20:02:54 +00:00 · 2016-11-22 20:00:31 +00:00 · 2016-11-22 19:56:57 +00:00
11 changed files with 2800 additions and 2726 deletions
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
 setup(name='talon',
-      version='1.2.11',
+      version='1.3.3',
      description=("Mailgun library "
                   "to extract message quotations and signatures."),
      long_description=open("README.rst").read(),
@@ -53,6 +53,7 @@ setup(name='talon',
          'cchardet>=0.3.5',
          'cssselect',
          'six>=1.10.0',
          'html5lib'
          ],
      tests_require=[
          "mock",
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -6,6 +6,7 @@ messages (without quoted messages) from html
 from __future__ import absolute_import
 import regex as re
 from talon.utils import cssselect 
 CHECKPOINT_PREFIX = '#!%!'
 CHECKPOINT_SUFFIX = '!%!#'
@@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
 def cut_gmail_quote(html_message):
    ''' Cuts the outermost block element with class gmail_quote. '''
-    gmail_quote = html_message.cssselect('div.gmail_quote')
+    gmail_quote = cssselect('div.gmail_quote', html_message)
    if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
        gmail_quote[0].getparent().remove(gmail_quote[0])
        return True
@@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message):
 def cut_by_id(html_message):
    found = False
    for quote_id in QUOTE_IDS:
-        quote = html_message.cssselect('#{}'.format(quote_id))
+        quote = cssselect('#{}'.format(quote_id), html_message)
        if quote:
            found = True
            quote[0].getparent().remove(quote[0])
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -12,7 +12,8 @@ from copy import deepcopy
 from lxml import html, etree
-from talon.utils import get_delimiter, html_to_text
+from talon.utils import (get_delimiter, html_tree_to_text,
                         html_document_fromstring)
 from talon import html_quotations
 from six.moves import range
 import six
@@ -164,10 +165,16 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
 SPLITTER_MAX_LINES = 4
 MAX_LINES_COUNT = 1000
 # extensive research shows that exceeding this limit
 # leads to excessive processing time
 MAX_HTML_LEN = 2794202
 QUOT_PATTERN = re.compile('^>+ ?')
 NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
 # Regular expression to identify if a line is a header.
 RE_HEADER = re.compile(": ")
 def extract_from(msg_body, content_type='text/plain'):
    try:
@@ -385,11 +392,12 @@ def _extract_from_html(msg_body):
    if msg_body.strip() == b'':
        return msg_body
-    msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'')
+    msg_body = msg_body.replace(b'\r\n', b'\n')
-    html_tree = html.document_fromstring(
+    html_tree = html_document_fromstring(msg_body)
-        msg_body,
+
-        parser=html.HTMLParser(encoding="utf-8")
+    if html_tree is None:
-    )
+        return msg_body
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_zimbra_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
@@ -401,8 +409,7 @@ def _extract_from_html(msg_body):
    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
-    msg_with_checkpoints = html.tostring(html_tree)
+    plain_text = html_tree_to_text(html_tree)
    plain_text = html_to_text(msg_with_checkpoints)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()
@@ -425,25 +432,79 @@ def _extract_from_html(msg_body):
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
    if not lines_were_deleted and not cut_quotations:
        return msg_body
    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body
-    # Remove tags with quotation checkpoints
+        # Remove tags with quotation checkpoints
-    html_quotations.delete_quotation_tags(
+        html_quotations.delete_quotation_tags(
-        html_tree_copy, 0, quotation_checkpoints
+            html_tree_copy, 0, quotation_checkpoints
-    )
+        )
    if _readable_text_empty(html_tree_copy):
        return msg_body
    return html.tostring(html_tree_copy)
 def split_emails(msg):
    """
    Mark every line of a message that may hold a whole conversation thread.

    The message is preprocessed using its own line delimiter, cut down to
    the first MAX_LINES_COUNT lines and handed to mark_message_lines().
    The resulting markers are then passed, together with the lines, through
    _correct_splitlines_in_headers() (header blocks are recognised with
    RE_HEADER) and the corrected markers are returned.
    """
    preprocessed = preprocess(msg, get_delimiter(msg))
    # very long messages are truncated rather than processed in full
    capped_lines = preprocessed.splitlines()[:MAX_LINES_COUNT]
    # splitter markers that land inside header blocks get corrected
    return _correct_splitlines_in_headers(
        mark_message_lines(capped_lines), capped_lines)
 def _correct_splitlines_in_headers(markers, lines):
    """Corrects markers by removing splitlines deemed to be inside header blocks"""
    updated_markers = ""
    i = 0
    in_header_block = False
    for m in markers:
        # Only set in_header_block flag true when we hit an 's' and the line is a header.
        if m == 's':
            if not in_header_block:
                if bool(re.search(RE_HEADER, lines[i])):
                    in_header_block = True
            else:
                m = 't'
        # If the line is not a header line, set in_header_block false.
        if not bool(re.search(RE_HEADER, lines[i])):
            in_header_block = False
        # Add the marker to the new updated markers string.
        updated_markers += m
        i += 1
    return updated_markers
 def _readable_text_empty(html_tree):
    """Tell whether html_tree_to_text() yields nothing but whitespace."""
    readable = html_tree_to_text(html_tree).strip()
    return not readable
 def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
@@ -457,7 +518,7 @@ def is_splitter(line):
 def text_content(context):
    '''XPath Extension function to return a node text content.'''
-    return context.context_node.text_content().strip()
+    return context.context_node.xpath("string()").strip()
 def tail(context):
--- a/talon/signature/data/train.data
+++ b/talon/signature/data/train.data
--- a/talon/signature/learning/helpers.py
+++ b/talon/signature/learning/helpers.py
@@ -185,12 +185,13 @@ def capitalized_words_percent(s):
    s = to_unicode(s, precise=True)
    words = re.split('\s', s)
    words = [w for w in words if w.strip()]
    words = [w for w in words if len(w) > 2]    
    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
-            if word[0].isupper():
+            if word[0].isupper() and not word[1].isupper():
                capitalized_words_counter += 1
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -7,9 +7,11 @@ import chardet
 import cchardet
 import regex as re
-from lxml import html
+from lxml.html import html5parser
 from lxml.cssselect import CSSSelector
 import html5lib
 from talon.constants import RE_DELIMITER
 import six
@@ -113,29 +115,18 @@ def get_delimiter(msg_body):
    return delimiter
-def html_to_text(string):
+def html_tree_to_text(tree):
    """
    Dead-simple HTML-to-text converter:
        >>> html_to_text("one<br>two<br>three")
        >>> "one\ntwo\nthree"
    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
    """
    if isinstance(string, six.text_type):
        string = string.encode('utf8')
    s = _prepend_utf8_declaration(string)
    s = s.replace(b"\n", b"")
    tree = html.fromstring(s)
    for style in CSSSelector('style')(tree):
        style.getparent().remove(style)
    for c in tree.xpath('//comment()'):
-        c.getparent().remove(c)
+        parent = c.getparent()
        # comment with no parent does not impact produced text
        if parent is None:
            continue
        parent.remove(c)
    text   = ""
    for el in tree.iter():
@@ -159,6 +150,62 @@ def html_to_text(string):
    return _encode_utf8(retval)
 def html_to_text(string):
    r"""
    Dead-simple HTML-to-text converter, e.g.
        html_to_text("one<br>two<br>three")
    gives
        "one\ntwo\nthree"

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
        3. if html can't be parsed returns None
    """
    raw = (string.encode('utf8')
           if isinstance(string, six.text_type) else string)
    # newlines are dropped once the utf-8 declaration has been prepended
    markup = _prepend_utf8_declaration(raw).replace(b"\n", b"")
    parsed = html_fromstring(markup)
    return None if parsed is None else html_tree_to_text(parsed)
 def html_fromstring(s):
    """Build an html tree out of a string with html5parser.fromstring().

    None is returned when html_too_big() rejects the string or when any
    exception is raised along the way.
    """
    try:
        if not html_too_big(s):
            return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        # deliberate best effort: a failure is reported as None
        return None
    return None
 def html_document_fromstring(s):
    """Build an html document tree with html5parser.document_fromstring().

    None is returned when html_too_big() rejects the string or when any
    exception is raised along the way.
    """
    try:
        if not html_too_big(s):
            return html5parser.document_fromstring(
                s, parser=_html5lib_parser())
    except Exception:
        # deliberate best effort: a failure is reported as None
        return None
    return None
 def cssselect(expr, tree):
    """Run the CSS selector expression expr against tree and return the result."""
    selector = CSSSelector(expr)
    return selector(tree)
 def html_too_big(s):
    """Tell whether s holds more '<' characters than _MAX_TAGS_COUNT.

    Both text and byte strings are accepted. The needle is chosen to match
    the type of s, because on Python 3 bytes.count() raises TypeError when
    given a text argument.
    """
    opening = b'<' if isinstance(s, bytes) else '<'
    return s.count(opening) > _MAX_TAGS_COUNT
 def _contains_charset_spec(s):
    """Return True if the first 4KB contain charset spec
    """
@@ -183,6 +230,21 @@ def _encode_utf8(s):
    return s.encode('utf-8') if isinstance(s, six.text_type) else s
 def _html5lib_parser():
    """
    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries
    """
    # the parser builds an lxml tree
    lxml_builder = html5lib.treebuilders.getTreeBuilder("lxml")
    # keep the namespace value out of the lxml.html.html5parser element tag,
    # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
    # instead of "div", throwing the algo off
    return html5lib.HTMLParser(lxml_builder, namespaceHTMLElements=False)
 _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
                     b'charset=utf-8">')
@@ -190,5 +252,8 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
 _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
 _HARDBREAKS = ['br', 'hr', 'tr']
 _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
 # extensive research shows that exceeding this limit
 # might lead to excessive processing time
 _MAX_TAGS_COUNT = 419
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
 </blockquote>"""
-    eq_("<html><body><p>Reply</p></body></html>",
+    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p></body></html>",
+    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -62,7 +62,7 @@ def test_regular_blockquote():
  </div>
 </blockquote>
 """
-    eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
+    eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -85,6 +85,7 @@ Reply
    reply = """
 <html>
 <head></head>
 <body>
 Reply
@@ -128,7 +129,7 @@ def test_gmail_quote():
    </div>
  </div>
 </div>"""
-    eq_("<html><body><p>Reply</p></body></html>",
+    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -139,7 +140,7 @@ def test_gmail_quote_compact():
               '<div>Test</div>' \
               '</div>' \
               '</div>'
-    eq_("<html><body><p>Reply</p></body></html>",
+    eq_("<html><head></head><body>Reply</body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -166,7 +167,7 @@ def test_unicode_in_reply():
  Quote
 </blockquote>""".encode("utf-8")
-    eq_("<html><body><p>Reply&#160;&#160;Text<br></p><div><br></div>"
+    eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
        "</body></html>",
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -192,6 +193,7 @@ def test_blockquote_disclaimer():
    stripped_html = """
 <html>
  <head></head>
  <body>
  <div>
    <div>
@@ -223,7 +225,7 @@ def test_date_block():
  </div>
 </div>
 """
-    eq_('<html><body><div>message<br></div></body></html>',
+    eq_('<html><head></head><body><div>message<br></div></body></html>',
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br>
 text
 </div></div>
 """
-    eq_('<html><body><div>message<br></div></body></html>',
+    eq_('<html><head></head><body><div>message<br></div></body></html>',
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block():
  </div>
 </body>'''
-    eq_('<html><body><div>Blah<br><br></div></body></html>',
+    eq_('<html><head></head><body><div>Blah<br><br></div></body></html>',
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -269,13 +271,13 @@ def test_reply_quotations_share_block():
 def test_OLK_SRC_BODY_SECTION_stripped():
-    eq_('<html><body><div>Reply</div></body></html>',
+    eq_('<html><head></head><body><div>Reply</div></body></html>',
        RE_WHITESPACE.sub(
            '', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
 def test_reply_separated_by_hr():
-    eq_('<html><body><div>Hi<div>there</div></div></body></html>',
+    eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
        RE_WHITESPACE.sub(
            '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
@@ -296,7 +298,7 @@ Reply
  </div>
 </div>
 '''
-    eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
+    eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -356,7 +358,8 @@ def test_CRLF():
    assert_false(symbol in extracted)
    eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
-    msg_body = """Reply
+    msg_body = """My
 reply
 <blockquote>
  <div>
@@ -371,8 +374,8 @@ def test_CRLF():
    msg_body = msg_body.replace('\n', '\r\n')
    extracted = quotations.extract_from_html(msg_body)
    assert_false(symbol in extracted)    
-    eq_("<html><body><p>Reply</p></body></html>",
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
-        RE_WHITESPACE.sub('', extracted))
+    eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
 def test_gmail_forwarded_msg():
@@ -380,3 +383,39 @@ def test_gmail_forwarded_msg():
 </div><br></div>"""
    extracted = quotations.extract_from_html(msg_body)
    eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(u, '_MAX_TAGS_COUNT', 4)
 def test_too_large_html():
    # _MAX_TAGS_COUNT is patched down to 4; the body below contains more
    # '<' characters than that, and the extraction result is expected to
    # equal the input once whitespace is removed from both.
    msg_body = 'Reply' \
               '<div class="gmail_quote">' \
               '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
               '<div>Test</div>' \
               '</div>' \
               '</div>'
    eq_(RE_WHITESPACE.sub('', msg_body),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 def test_readable_html_empty():
    # The whole message, reply included, sits inside one blockquote.
    # The extraction result is expected to equal the input once whitespace
    # is removed from both.
    # NOTE(review): presumably this exercises the _readable_text_empty()
    # fallback in quotations -- confirm.
    msg_body = """
 <blockquote>
  Reply
  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>
  <div>
    Test
  </div>
 </blockquote>"""
    eq_(RE_WHITESPACE.sub('', msg_body),
        RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None))
 def test_bad_html():
    # html_document_fromstring is patched to return None (html could not be
    # parsed); extract_from_html must then hand the input back untouched.
    bad_html = "<html></html>"
    eq_(bad_html, quotations.extract_from_html(bad_html))
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -77,6 +77,31 @@ def test_basic():
        signature.extract(msg_body, 'Sergey'))
 def test_capitalized():
    # The body contains an all-upper-case word ("SUPERPASSWORD") ahead of
    # the real signature; element [1] of signature.extract(msg_body, 'Doe')
    # must still be exactly the trailing three-line signature.
    msg_body = """Hi Mary,
 Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
 DJ Doe 
 http://example.com
 Password: SUPERPASSWORD
 Would you like to check out more?
 At your service,
 John Smith
 Doe Inc
 555-531-7967"""
    sig = """John Smith
 Doe Inc
 555-531-7967"""
    eq_(sig, signature.extract(msg_body, 'Doe')[1])
 def test_over_2_text_lines_after_signature():
    body = """Blah
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent):
 def test_capitalized_words_percent():
    eq_(0.0, h.capitalized_words_percent(''))
    eq_(100.0, h.capitalized_words_percent('Example Corp'))
-    eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))
+    eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss'))
    eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
    eq_(100.0, h.capitalized_words_percent('8th Floor'))
    eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
    eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE'))
 def test_has_signature():
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -696,3 +696,27 @@ def test_standard_replies():
                "'%(reply)s' != %(stripped)s for %(fn)s" % \
                {'reply': reply_text, 'stripped': stripped_text,
                 'fn': filename}
 def test_split_email():
    # A thread with two "From:/Date:/To:/Subject:" header blocks followed by
    # an "-- Original Message --" section; split_emails() must return
    # exactly the marker string in expected_markers.
    msg = """From: Mr. X
 Date: 24 February 2016
 To: Mr. Y
 Subject: Hi
 Attachments: none
 Goodbye.
 From: Mr. Y
 To: Mr. X
 Date: 24 February 2016
 Subject: Hi
 Attachments: none
 Hello.
 -- Original Message --
 On 24th February 2016 at 09.32am Conal Wrote:
 Hey!
 """
    expected_markers = "stttttsttttetestt"
    markers = quotations.split_emails(msg)
    eq_(markers, expected_markers)
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -6,6 +6,7 @@ from . import *
 from talon import utils as u
 import cchardet
 import six
 from lxml import html
 def test_get_delimiter():
@@ -107,3 +108,51 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
 <p>TEXT 2 <!-- COMMENT 2 --></p>
 </div>"""
    eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
 def test_comment_no_parent():
    """A comment placed before any element must not break text extraction."""
    s = "<!-- COMMENT 1 --> no comment"
    d = u.html_document_fromstring(s)
    # html_tree_to_text returns utf-8 encoded bytes, so the expectation has
    # to be bytes as well; a text literal never equals bytes on Python 3
    eq_(b"no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
 def test_html_fromstring_exception():
    # html5parser.fromstring is patched to raise; html_fromstring must
    # swallow the exception and return None.
    eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
 def test_html_fromstring_too_big(fromstring):
    # html_too_big is replaced by a Mock, whose call result is truthy, so
    # html_fromstring must return None without ever calling the parser.
    eq_(None, u.html_fromstring("<html></html>"))
    assert_false(fromstring.called)
@patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_exception(document_fromstring):
    # the patched parser raises; html_document_fromstring must swallow the
    # exception and return None.
    document_fromstring.side_effect = Exception()
    eq_(None, u.html_document_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'document_fromstring')
 def test_html_document_fromstring_too_big(document_fromstring):
    # html_too_big is replaced by a Mock, whose call result is truthy, so
    # html_document_fromstring must return None without calling the parser.
    eq_(None, u.html_document_fromstring("<html></html>"))
    assert_false(document_fromstring.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None))
 def test_bad_html_to_text():
    # html_fromstring is patched to return None (html could not be parsed);
    # html_to_text must propagate that None.
    bad_html = "one<br>two<br>three"
    eq_(None, u.html_to_text(bad_html))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_too_big():
    # limit patched to 3: the first string holds two '<' characters (not
    # over the limit), the second holds four (over the limit).
    eq_(False, u.html_too_big("<div></div>"))
    eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_to_text():
    eq_("Hello", u.html_to_text("<div>Hello</div>"))
    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
Author	SHA1	Message	Date
Sergey Obukhov	015c8d2a78	Merge pull request #120 from mailgun/sergey/talon-1.3.3 bump talon version	2016-11-30 18:28:39 -08:00
Sergey Obukhov	5af846c13d	bump talon version	2016-11-30 12:56:06 -08:00
Sergey Obukhov	e69a9c7a54	Merge pull request #119 from conapart3/master Addition of new split_email method for issue:115	2016-11-30 12:51:32 -08:00
conapart3	23cb2a9a53	Merge pull request #1 from conapart3/issue-115-date-split-in-headers split_emails function added, test added	2016-11-22 20:02:54 +00:00
smitcona	b5e3397b88	Updating test to account for --original message-- case	2016-11-22 20:00:31 +00:00
smitcona	5685a4055a	Improved algorithm	2016-11-22 19:56:57 +00:00
smitcona	97b72ef767	Adding in_header_block variable for reliability	2016-11-22 19:06:34 +00:00
smitcona	31489848be	Remove print lines	2016-11-21 17:36:06 +00:00
smitcona	e5988d447b	Add space	2016-11-21 12:48:29 +00:00
smitcona	adfed748ce	split_emails function added, test added	2016-11-21 12:35:36 +00:00
Sergey Obukhov	2444ba87c0	Merge pull request #111 from mailgun/sergey/tagscount restrict html processing to a certain number of tags	2016-09-14 11:06:29 -07:00
Sergey Obukhov	534457e713	protect html_to_text as well	2016-09-14 09:58:41 -07:00
Sergey Obukhov	ea82a9730e	restrict html processing to a certain number of tags	2016-09-14 09:33:30 -07:00
Sergey Obukhov	f04b872e14	Merge pull request #108 from mailgun/sergey/html5lib-fix use new parser each time we parse a document	2016-08-22 18:10:35 -07:00
Sergey Obukhov	e61894e425	bump version	2016-08-22 17:34:18 -07:00
Sergey Obukhov	35fbdaadac	use new parser each time we parse a document	2016-08-22 16:25:04 -07:00
Sergey Obukhov	8441bc7328	Merge pull request #106 from mailgun/sergey/html5lib use html5lib to parse html	2016-08-19 15:58:07 -07:00
Sergey Obukhov	37c95ff97b	fallback untouched html if we can not parse html tree	2016-08-19 11:38:12 -07:00
Sergey Obukhov	5b1ca33c57	fix cssselect	2016-08-16 17:11:41 -07:00
Sergey Obukhov	ec8e09b34e	fix	2016-08-15 20:31:04 -07:00
Sergey Obukhov	bcf97eccfa	use html5lib to parse html	2016-08-15 19:36:21 -07:00
Sergey Obukhov	f53b5cc7a6	Merge pull request #105 from mailgun/sergey/fromstring html with comment that has no parent crashes html_tree_to_text	2016-08-15 13:40:37 -07:00
Sergey Obukhov	27adde7aa7	bump version	2016-08-15 13:21:10 -07:00
Sergey Obukhov	a9719833e0	html with comment that has no parent crashes html_tree_to_text	2016-08-12 17:40:12 -07:00
Sergey Obukhov	7bf37090ca	Merge pull request #101 from mailgun/sergey/empty-html if html stripped off quotations does not have readable text fallback …	2016-08-12 12:18:50 -07:00
Sergey Obukhov	44fcef7123	bump version	2016-08-11 23:59:18 -07:00
Sergey Obukhov	69a44b10a1	Merge branch 'master' into sergey/empty-html	2016-08-11 23:58:11 -07:00
Sergey Obukhov	b085e3d049	Merge pull request #104 from mailgun/sergey/spaces fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 23:56:26 -07:00
Sergey Obukhov	4b953bcddc	fixes mailgun/talon#103 keep newlines when parsing html quotations	2016-08-11 20:17:37 -07:00
Sergey Obukhov	315eaa7080	if html stripped off quotations does not have readable text fallback to unparsed html	2016-08-11 19:55:23 -07:00
Sergey Obukhov	5a9bc967f1	Merge pull request #100 from mailgun/sergey/restrict do not parse html quotations if html is longer then certain threshold	2016-08-11 16:08:03 -07:00
Sergey Obukhov	a0d7236d0b	bump version and add a comment	2016-08-11 15:49:09 -07:00
Sergey Obukhov	21e9a31ffe	add test	2016-08-09 17:15:49 -07:00
Sergey Obukhov	4ee46c0a97	do not parse html quotations if html is longer then certain threshold	2016-08-09 17:08:58 -07:00
Sergey Obukhov	10d9a930f9	Merge pull request #99 from mailgun/sergey/capitalized consider word capitilized only if it is camel case - not all upper case	2016-07-20 16:47:12 -07:00
Sergey Obukhov	a21ccdb21b	consider word capitilized only if it is camel case - not all upper case	2016-07-19 17:37:36 -07:00