36 Commits

Author SHA1 Message Date
Sergey Obukhov
015c8d2a78 Merge pull request #120 from mailgun/sergey/talon-1.3.3
bump talon version
2016-11-30 18:28:39 -08:00
Sergey Obukhov
5af846c13d bump talon version 2016-11-30 12:56:06 -08:00
Sergey Obukhov
e69a9c7a54 Merge pull request #119 from conapart3/master
Addition of new split_email method for issue:115
2016-11-30 12:51:32 -08:00
conapart3
23cb2a9a53 Merge pull request #1 from conapart3/issue-115-date-split-in-headers
split_emails function added, test added
2016-11-22 20:02:54 +00:00
smitcona
b5e3397b88 Updating test to account for --original message-- case 2016-11-22 20:00:31 +00:00
smitcona
5685a4055a Improved algorithm 2016-11-22 19:56:57 +00:00
smitcona
97b72ef767 Adding in_header_block variable for reliability 2016-11-22 19:06:34 +00:00
smitcona
31489848be Remove print lines 2016-11-21 17:36:06 +00:00
smitcona
e5988d447b Add space 2016-11-21 12:48:29 +00:00
smitcona
adfed748ce split_emails function added, test added 2016-11-21 12:35:36 +00:00
Sergey Obukhov
2444ba87c0 Merge pull request #111 from mailgun/sergey/tagscount
restrict html processing to a certain number of tags
2016-09-14 11:06:29 -07:00
Sergey Obukhov
534457e713 protect html_to_text as well 2016-09-14 09:58:41 -07:00
Sergey Obukhov
ea82a9730e restrict html processing to a certain number of tags 2016-09-14 09:33:30 -07:00
Sergey Obukhov
f04b872e14 Merge pull request #108 from mailgun/sergey/html5lib-fix
use new parser each time we parse a document
2016-08-22 18:10:35 -07:00
Sergey Obukhov
e61894e425 bump version 2016-08-22 17:34:18 -07:00
Sergey Obukhov
35fbdaadac use new parser each time we parse a document 2016-08-22 16:25:04 -07:00
Sergey Obukhov
8441bc7328 Merge pull request #106 from mailgun/sergey/html5lib
use html5lib to parse html
2016-08-19 15:58:07 -07:00
Sergey Obukhov
37c95ff97b fallback untouched html if we can not parse html tree 2016-08-19 11:38:12 -07:00
Sergey Obukhov
5b1ca33c57 fix cssselect 2016-08-16 17:11:41 -07:00
Sergey Obukhov
ec8e09b34e fix 2016-08-15 20:31:04 -07:00
Sergey Obukhov
bcf97eccfa use html5lib to parse html 2016-08-15 19:36:21 -07:00
Sergey Obukhov
f53b5cc7a6 Merge pull request #105 from mailgun/sergey/fromstring
html with comment that has no parent crashes html_tree_to_text
2016-08-15 13:40:37 -07:00
Sergey Obukhov
27adde7aa7 bump version 2016-08-15 13:21:10 -07:00
Sergey Obukhov
a9719833e0 html with comment that has no parent crashes html_tree_to_text 2016-08-12 17:40:12 -07:00
Sergey Obukhov
7bf37090ca Merge pull request #101 from mailgun/sergey/empty-html
if html stripped off quotations does not have readable text fallback …
2016-08-12 12:18:50 -07:00
Sergey Obukhov
44fcef7123 bump version 2016-08-11 23:59:18 -07:00
Sergey Obukhov
69a44b10a1 Merge branch 'master' into sergey/empty-html 2016-08-11 23:58:11 -07:00
Sergey Obukhov
b085e3d049 Merge pull request #104 from mailgun/sergey/spaces
fixes mailgun/talon#103 keep newlines when parsing html quotations
2016-08-11 23:56:26 -07:00
Sergey Obukhov
4b953bcddc fixes mailgun/talon#103 keep newlines when parsing html quotations 2016-08-11 20:17:37 -07:00
Sergey Obukhov
315eaa7080 if html stripped off quotations does not have readable text fallback to unparsed html 2016-08-11 19:55:23 -07:00
Sergey Obukhov
5a9bc967f1 Merge pull request #100 from mailgun/sergey/restrict
do not parse html quotations if html is longer then certain threshold
2016-08-11 16:08:03 -07:00
Sergey Obukhov
a0d7236d0b bump version and add a comment 2016-08-11 15:49:09 -07:00
Sergey Obukhov
21e9a31ffe add test 2016-08-09 17:15:49 -07:00
Sergey Obukhov
4ee46c0a97 do not parse html quotations if html is longer then certain threshold 2016-08-09 17:08:58 -07:00
Sergey Obukhov
10d9a930f9 Merge pull request #99 from mailgun/sergey/capitalized
consider word capitilized only if it is camel case - not all upper case
2016-07-20 16:47:12 -07:00
Sergey Obukhov
a21ccdb21b consider word capitilized only if it is camel case - not all upper case 2016-07-19 17:37:36 -07:00
11 changed files with 2800 additions and 2726 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon', setup(name='talon',
version='1.2.11', version='1.3.3',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -53,6 +53,7 @@ setup(name='talon',
'cchardet>=0.3.5', 'cchardet>=0.3.5',
'cssselect', 'cssselect',
'six>=1.10.0', 'six>=1.10.0',
'html5lib'
], ],
tests_require=[ tests_require=[
"mock", "mock",

View File

@@ -6,6 +6,7 @@ messages (without quoted messages) from html
from __future__ import absolute_import from __future__ import absolute_import
import regex as re import regex as re
from talon.utils import cssselect
CHECKPOINT_PREFIX = '#!%!' CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#' CHECKPOINT_SUFFIX = '!%!#'
@@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message): def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. ''' ''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote') gmail_quote = cssselect('div.gmail_quote', html_message)
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0]) gmail_quote[0].getparent().remove(gmail_quote[0])
return True return True
@@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message):
def cut_by_id(html_message): def cut_by_id(html_message):
found = False found = False
for quote_id in QUOTE_IDS: for quote_id in QUOTE_IDS:
quote = html_message.cssselect('#{}'.format(quote_id)) quote = cssselect('#{}'.format(quote_id), html_message)
if quote: if quote:
found = True found = True
quote[0].getparent().remove(quote[0]) quote[0].getparent().remove(quote[0])

View File

@@ -12,7 +12,8 @@ from copy import deepcopy
from lxml import html, etree from lxml import html, etree
from talon.utils import get_delimiter, html_to_text from talon.utils import (get_delimiter, html_tree_to_text,
html_document_fromstring)
from talon import html_quotations from talon import html_quotations
from six.moves import range from six.moves import range
import six import six
@@ -164,10 +165,16 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4 SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000 MAX_LINES_COUNT = 1000
# extensive research shows that exceeding this limit
# leads to excessive processing time
MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?') QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*') NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
# Regular expression to identify if a line is a header.
RE_HEADER = re.compile(": ")
def extract_from(msg_body, content_type='text/plain'): def extract_from(msg_body, content_type='text/plain'):
try: try:
@@ -385,11 +392,12 @@ def _extract_from_html(msg_body):
if msg_body.strip() == b'': if msg_body.strip() == b'':
return msg_body return msg_body
msg_body = msg_body.replace(b'\r\n', b'').replace(b'\n', b'') msg_body = msg_body.replace(b'\r\n', b'\n')
html_tree = html.document_fromstring( html_tree = html_document_fromstring(msg_body)
msg_body,
parser=html.HTMLParser(encoding="utf-8") if html_tree is None:
) return msg_body
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
@@ -401,8 +409,7 @@ def _extract_from_html(msg_body):
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) plain_text = html_tree_to_text(html_tree)
plain_text = html_to_text(msg_with_checkpoints)
plain_text = preprocess(plain_text, '\n', content_type='text/html') plain_text = preprocess(plain_text, '\n', content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
@@ -425,25 +432,79 @@ def _extract_from_html(msg_body):
return_flags = [] return_flags = []
process_marked_lines(lines, markers, return_flags) process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags lines_were_deleted, first_deleted, last_deleted = return_flags
if not lines_were_deleted and not cut_quotations:
return msg_body
if lines_were_deleted: if lines_were_deleted:
#collect checkpoints from deleted lines #collect checkpoints from deleted lines
for i in range(first_deleted, last_deleted): for i in range(first_deleted, last_deleted):
for checkpoint in line_checkpoints[i]: for checkpoint in line_checkpoints[i]:
quotation_checkpoints[checkpoint] = True quotation_checkpoints[checkpoint] = True
else:
if cut_quotations:
return html.tostring(html_tree_copy)
else:
return msg_body
# Remove tags with quotation checkpoints # Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags( html_quotations.delete_quotation_tags(
html_tree_copy, 0, quotation_checkpoints html_tree_copy, 0, quotation_checkpoints
) )
if _readable_text_empty(html_tree_copy):
return msg_body
return html.tostring(html_tree_copy) return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Mark every line of a message and fix the split markers found in headers.

    The message may be a whole conversation thread made of several emails.
    Its lines are marked as split lines, content lines and empty lines;
    split markers that fall inside header blocks (identified by the regular
    expression RE_HEADER) are then corrected.

    Return the corrected markers.
    """
    # don't process too long messages
    lines = preprocess(msg, get_delimiter(msg)).splitlines()[:MAX_LINES_COUNT]

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(mark_message_lines(lines), lines)
def _correct_splitlines_in_headers(markers, lines):
    """Return markers with the splitlines found inside header blocks removed.

    A header block opens at an 's' marker whose line matches RE_HEADER and
    stays open until a line that does not match it. Every further 's'
    marker met while the block is open is rewritten to 't'.
    """
    corrected = []
    inside_headers = False

    for index, marker in enumerate(markers):
        looks_like_header = bool(re.search(RE_HEADER, lines[index]))

        if marker == 's':
            if inside_headers:
                # a splitter in the middle of a header block is plain text
                marker = 't'
            elif looks_like_header:
                # only an 's' line that is a header opens a header block
                inside_headers = True

        # any non-header line closes the header block
        if not looks_like_header:
            inside_headers = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree):
    """Tell whether the text produced from the tree is empty once stripped."""
    readable_text = html_tree_to_text(html_tree)
    return not readable_text.strip()
def is_splitter(line): def is_splitter(line):
''' '''
Returns Matcher object if provided string is a splitter and Returns Matcher object if provided string is a splitter and
@@ -457,7 +518,7 @@ def is_splitter(line):
def text_content(context): def text_content(context):
'''XPath Extension function to return a node text content.''' '''XPath Extension function to return a node text content.'''
return context.context_node.text_content().strip() return context.context_node.xpath("string()").strip()
def tail(context): def tail(context):

File diff suppressed because it is too large Load Diff

View File

@@ -185,12 +185,13 @@ def capitalized_words_percent(s):
s = to_unicode(s, precise=True) s = to_unicode(s, precise=True)
words = re.split('\s', s) words = re.split('\s', s)
words = [w for w in words if w.strip()] words = [w for w in words if w.strip()]
words = [w for w in words if len(w) > 2]
capitalized_words_counter = 0 capitalized_words_counter = 0
valid_words_counter = 0 valid_words_counter = 0
for word in words: for word in words:
if not INVALID_WORD_START.match(word): if not INVALID_WORD_START.match(word):
valid_words_counter += 1 valid_words_counter += 1
if word[0].isupper(): if word[0].isupper() and not word[1].isupper():
capitalized_words_counter += 1 capitalized_words_counter += 1
if valid_words_counter > 0 and len(words) > 1: if valid_words_counter > 0 and len(words) > 1:
return 100 * float(capitalized_words_counter) / valid_words_counter return 100 * float(capitalized_words_counter) / valid_words_counter

View File

@@ -7,9 +7,11 @@ import chardet
import cchardet import cchardet
import regex as re import regex as re
from lxml import html from lxml.html import html5parser
from lxml.cssselect import CSSSelector from lxml.cssselect import CSSSelector
import html5lib
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
import six import six
@@ -113,29 +115,18 @@ def get_delimiter(msg_body):
return delimiter return delimiter
def html_to_text(string): def html_tree_to_text(tree):
"""
Dead-simple HTML-to-text converter:
>>> html_to_text("one<br>two<br>three")
>>> "one\ntwo\nthree"
NOTES:
1. the string is expected to contain UTF-8 encoded HTML!
2. returns utf-8 encoded str (not unicode)
"""
if isinstance(string, six.text_type):
string = string.encode('utf8')
s = _prepend_utf8_declaration(string)
s = s.replace(b"\n", b"")
tree = html.fromstring(s)
for style in CSSSelector('style')(tree): for style in CSSSelector('style')(tree):
style.getparent().remove(style) style.getparent().remove(style)
for c in tree.xpath('//comment()'): for c in tree.xpath('//comment()'):
c.getparent().remove(c) parent = c.getparent()
# comment with no parent does not impact produced text
if parent is None:
continue
parent.remove(c)
text = "" text = ""
for el in tree.iter(): for el in tree.iter():
@@ -159,6 +150,62 @@ def html_to_text(string):
return _encode_utf8(retval) return _encode_utf8(retval)
def html_to_text(string):
    """
    Dead-simple HTML-to-text converter:
        >>> html_to_text("one<br>two<br>three")
        "one\ntwo\nthree"

    NOTES:
        1. the string is expected to contain UTF-8 encoded HTML!
        2. returns utf-8 encoded str (not unicode)
        3. if html can't be parsed returns None
    """
    if isinstance(string, six.text_type):
        utf8_html = string.encode('utf8')
    else:
        utf8_html = string

    # newlines are dropped before the string is parsed
    flattened = _prepend_utf8_declaration(utf8_html).replace(b"\n", b"")

    tree = html_fromstring(flattened)
    return None if tree is None else html_tree_to_text(tree)
def html_fromstring(s):
    """Parse html tree from string. Return None if the string can't be parsed.

    None is also returned when html_too_big() rejects the string.
    """
    try:
        if not html_too_big(s):
            return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        # best effort: any failure is reported to the caller as "no tree"
        pass
    return None
def html_document_fromstring(s):
    """Parse html document tree from string. Return None if the string
    can't be parsed.

    None is also returned when html_too_big() rejects the string.
    """
    try:
        if not html_too_big(s):
            return html5parser.document_fromstring(
                s, parser=_html5lib_parser())
    except Exception:
        # best effort: any failure is reported to the caller as "no tree"
        pass
    return None
def cssselect(expr, tree):
    """Build a CSSSelector from `expr`, call it on `tree` and return the result."""
    selector = CSSSelector(expr)
    return selector(tree)
def html_too_big(s):
    """Tell whether the html holds more '<' characters than _MAX_TAGS_COUNT.

    Both text and byte strings are accepted. html_to_text() passes UTF-8
    encoded bytes down here, and on Python 3 counting a text '<' inside
    bytes raises TypeError, so the needle has to match the type of `s`.
    """
    opening_bracket = b'<' if isinstance(s, bytes) else '<'
    return s.count(opening_bracket) > _MAX_TAGS_COUNT
def _contains_charset_spec(s): def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec """Return True if the first 4KB contain charset spec
""" """
@@ -183,6 +230,21 @@ def _encode_utf8(s):
return s.encode('utf-8') if isinstance(s, six.text_type) else s return s.encode('utf-8') if isinstance(s, six.text_type) else s
def _html5lib_parser():
    """
    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries
    """
    # build lxml tree
    tree_builder = html5lib.treebuilders.getTreeBuilder("lxml")

    return html5lib.HTMLParser(
        tree_builder,
        # remove namespace value from inside lxml.html.html5parser element tag
        # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div"
        # instead of "div", throwing the algo off
        namespaceHTMLElements=False
    )
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
b'charset=utf-8">') b'charset=utf-8">')
@@ -190,5 +252,8 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr'] _HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
# extensive research shows that exceeding this limit
# might lead to excessive processing time
_MAX_TAGS_COUNT = 419

View File

@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>""" </blockquote>"""
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -62,7 +62,7 @@ def test_regular_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -85,6 +85,7 @@ Reply
reply = """ reply = """
<html> <html>
<head></head>
<body> <body>
Reply Reply
@@ -128,7 +129,7 @@ def test_gmail_quote():
</div> </div>
</div> </div>
</div>""" </div>"""
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -139,7 +140,7 @@ def test_gmail_quote_compact():
'<div>Test</div>' \ '<div>Test</div>' \
'</div>' \ '</div>' \
'</div>' '</div>'
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -166,7 +167,7 @@ def test_unicode_in_reply():
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
eq_("<html><body><p>Reply&#160;&#160;Text<br></p><div><br></div>" eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
"</body></html>", "</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -192,6 +193,7 @@ def test_blockquote_disclaimer():
stripped_html = """ stripped_html = """
<html> <html>
<head></head>
<body> <body>
<div> <div>
<div> <div>
@@ -223,7 +225,7 @@ def test_date_block():
</div> </div>
</div> </div>
""" """
eq_('<html><body><div>message<br></div></body></html>', eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br>
text text
</div></div> </div></div>
""" """
eq_('<html><body><div>message<br></div></body></html>', eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block():
</div> </div>
</body>''' </body>'''
eq_('<html><body><div>Blah<br><br></div></body></html>', eq_('<html><head></head><body><div>Blah<br><br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -269,13 +271,13 @@ def test_reply_quotations_share_block():
def test_OLK_SRC_BODY_SECTION_stripped(): def test_OLK_SRC_BODY_SECTION_stripped():
eq_('<html><body><div>Reply</div></body></html>', eq_('<html><head></head><body><div>Reply</div></body></html>',
RE_WHITESPACE.sub( RE_WHITESPACE.sub(
'', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) '', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
def test_reply_separated_by_hr(): def test_reply_separated_by_hr():
eq_('<html><body><div>Hi<div>there</div></div></body></html>', eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
RE_WHITESPACE.sub( RE_WHITESPACE.sub(
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
@@ -296,7 +298,7 @@ Reply
</div> </div>
</div> </div>
''' '''
eq_('<html><body><p>Reply</p><div><hr></div></body></html>', eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -356,7 +358,8 @@ def test_CRLF():
assert_false(symbol in extracted) assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
msg_body = """Reply msg_body = """My
reply
<blockquote> <blockquote>
<div> <div>
@@ -371,8 +374,8 @@ def test_CRLF():
msg_body = msg_body.replace('\n', '\r\n') msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body) extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted) assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>", # Keep new lines otherwise "My reply" becomes one word - "Myreply"
RE_WHITESPACE.sub('', extracted)) eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
def test_gmail_forwarded_msg(): def test_gmail_forwarded_msg():
@@ -380,3 +383,39 @@ def test_gmail_forwarded_msg():
</div><br></div>""" </div><br></div>"""
extracted = quotations.extract_from_html(msg_body) extracted = quotations.extract_from_html(msg_body)
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(u, '_MAX_TAGS_COUNT', 4)
def test_too_large_html():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
'<div>Test</div>' \
'</div>' \
'</div>'
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_readable_html_empty():
msg_body = """
<blockquote>
Reply
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None))
def test_bad_html():
bad_html = "<html></html>"
eq_(bad_html, quotations.extract_from_html(bad_html))

View File

@@ -77,6 +77,31 @@ def test_basic():
signature.extract(msg_body, 'Sergey')) signature.extract(msg_body, 'Sergey'))
def test_capitalized():
msg_body = """Hi Mary,
Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
DJ Doe
http://example.com
Password: SUPERPASSWORD
Would you like to check out more?
At your service,
John Smith
Doe Inc
555-531-7967"""
sig = """John Smith
Doe Inc
555-531-7967"""
eq_(sig, signature.extract(msg_body, 'Doe')[1])
def test_over_2_text_lines_after_signature(): def test_over_2_text_lines_after_signature():
body = """Blah body = """Blah

View File

@@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent):
def test_capitalized_words_percent(): def test_capitalized_words_percent():
eq_(0.0, h.capitalized_words_percent('')) eq_(0.0, h.capitalized_words_percent(''))
eq_(100.0, h.capitalized_words_percent('Example Corp')) eq_(100.0, h.capitalized_words_percent('Example Corp'))
eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss')) eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss'))
eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
eq_(100.0, h.capitalized_words_percent('8th Floor')) eq_(100.0, h.capitalized_words_percent('8th Floor'))
eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE'))
def test_has_signature(): def test_has_signature():

View File

@@ -696,3 +696,27 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \ "'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text, {'reply': reply_text, 'stripped': stripped_text,
'fn': filename} 'fn': filename}
def test_split_email():
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
-- Original Message --
On 24th February 2016 at 09.32am Conal Wrote:
Hey!
"""
expected_markers = "stttttsttttetestt"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)

View File

@@ -6,6 +6,7 @@ from . import *
from talon import utils as u from talon import utils as u
import cchardet import cchardet
import six import six
from lxml import html
def test_get_delimiter(): def test_get_delimiter():
@@ -107,3 +108,51 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
<p>TEXT 2 <!-- COMMENT 2 --></p> <p>TEXT 2 <!-- COMMENT 2 --></p>
</div>""" </div>"""
eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html))
def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment"
d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
def test_html_fromstring_exception():
eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
eq_(None, u.html_fromstring("<html></html>"))
assert_false(fromstring.called)
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_exception(document_fromstring):
document_fromstring.side_effect = Exception()
eq_(None, u.html_document_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_too_big(document_fromstring):
eq_(None, u.html_document_fromstring("<html></html>"))
assert_false(document_fromstring.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None))
def test_bad_html_to_text():
bad_html = "one<br>two<br>three"
eq_(None, u.html_to_text(bad_html))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_too_big():
eq_(False, u.html_too_big("<div></div>"))
eq_(True, u.html_too_big("<div><span>Hi</span></div>"))
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
eq_("Hello", u.html_to_text("<div>Hello</div>"))
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))