42 Commits

Author SHA1 Message Date
Sergey Obukhov
0b55e8fa77 Merge pull request #137 from mailgun/sergey/chardet
loosen the encoding requirement for detect_encoding
2017-04-25 11:29:06 -07:00
Sergey Obukhov
6f159e8959 loosen the encoding requirement for detect_encoding 2017-04-25 11:19:01 -07:00
Sergey Obukhov
85a4c1d855 Merge pull request #133 from mailgun/sergey/android
add android quotation pattern
2017-04-10 16:37:17 -07:00
Sergey Obukhov
0f5e72623b add android quotation pattern 2017-04-10 16:33:21 -07:00
Sergey Obukhov
061e549ad7 Merge pull request #128 from mailgun/sergey/1.3.4
bump version
2017-02-14 11:17:35 -08:00
Sergey Obukhov
49d1a5d248 bump version 2017-02-14 11:05:50 -08:00
Sergey Obukhov
03d6b00db8 Merge pull request #127 from conalsmith49/mark-splitlines-in-email-quotation-indents
Split_Email(): Mark splitlines for headers indented with spaces or email quotation indents (">")
2017-02-14 11:03:51 -08:00
smitcona
a2eb0f7201 Creating new method which removes initial spaces and marks the message lines. Removing ambiguity introduced to mark_message_lines 2017-02-14 18:19:45 +00:00
smitcona
5c71a0ca07 Split the comment lines so that they are not over 80 characters 2017-02-13 16:45:26 +00:00
Sergey Obukhov
489d16fad9 Merge branch 'master' into mark-splitlines-in-email-quotation-indents 2017-02-09 21:10:16 -08:00
Sergey Obukhov
a458707777 Merge pull request #124 from phanindra-ramesh/issue_123
Fixes issue #123
2017-02-09 20:55:36 -08:00
smitcona
a1d0a86305 Pass ignore_initial_spaces=True as this has better clarity than separate boolean variable 2017-02-07 12:47:33 +00:00
smitcona
29f1d21be7 fixed expected markers and incorrect condensed header not matching regex 2017-02-06 15:03:22 +00:00
smitcona
34c5b526c3 Remove the whitespace before the line if the flag is set 2017-02-03 12:57:26 +00:00
smitcona
3edb6578ba Dividing preprocess method into two methods, split_emails() now calls one without email content being altered. 2017-02-03 11:49:23 +00:00
smitcona
984c036b6e Set the marker back to 'm' rather than 't' if it matches the QUOT_PATTERN. Updated test case. 2017-02-01 18:28:19 +00:00
smitcona
a403ecb5c9 Adding two level indentation test 2017-02-01 18:09:35 +00:00
smitcona
a44713409c Added additional case for testing new functionality of split_emails() 2017-02-01 17:40:59 +00:00
smitcona
567467b8ed Update comment 2017-02-01 17:29:05 +00:00
smitcona
139edd6104 Add new method which marks as splitlines, lines which are splitlines but start with email quotation indents ("> ") 2017-02-01 17:16:30 +00:00
Phanindra Ramesh Challa
e756d55abf Fixes issue #123 2016-12-27 13:53:40 +05:30
Sergey Obukhov
015c8d2a78 Merge pull request #120 from mailgun/sergey/talon-1.3.3
bump talon version
2016-11-30 18:28:39 -08:00
Sergey Obukhov
5af846c13d bump talon version 2016-11-30 12:56:06 -08:00
Sergey Obukhov
e69a9c7a54 Merge pull request #119 from conapart3/master
Addition of new split_email method for issue:115
2016-11-30 12:51:32 -08:00
conapart3
23cb2a9a53 Merge pull request #1 from conapart3/issue-115-date-split-in-headers
split_emails function added, test added
2016-11-22 20:02:54 +00:00
smitcona
b5e3397b88 Updating test to account for --original message-- case 2016-11-22 20:00:31 +00:00
smitcona
5685a4055a Improved algorithm 2016-11-22 19:56:57 +00:00
smitcona
97b72ef767 Adding in_header_block variable for reliability 2016-11-22 19:06:34 +00:00
smitcona
31489848be Remove print lines 2016-11-21 17:36:06 +00:00
smitcona
e5988d447b Add space 2016-11-21 12:48:29 +00:00
smitcona
adfed748ce split_emails function added, test added 2016-11-21 12:35:36 +00:00
Sergey Obukhov
2444ba87c0 Merge pull request #111 from mailgun/sergey/tagscount
restrict html processing to a certain number of tags
2016-09-14 11:06:29 -07:00
Sergey Obukhov
534457e713 protect html_to_text as well 2016-09-14 09:58:41 -07:00
Sergey Obukhov
ea82a9730e restrict html processing to a certain number of tags 2016-09-14 09:33:30 -07:00
Sergey Obukhov
f04b872e14 Merge pull request #108 from mailgun/sergey/html5lib-fix
use new parser each time we parse a document
2016-08-22 18:10:35 -07:00
Sergey Obukhov
e61894e425 bump version 2016-08-22 17:34:18 -07:00
Sergey Obukhov
35fbdaadac use new parser each time we parse a document 2016-08-22 16:25:04 -07:00
Sergey Obukhov
8441bc7328 Merge pull request #106 from mailgun/sergey/html5lib
use html5lib to parse html
2016-08-19 15:58:07 -07:00
Sergey Obukhov
37c95ff97b fallback untouched html if we can not parse html tree 2016-08-19 11:38:12 -07:00
Sergey Obukhov
5b1ca33c57 fix cssselect 2016-08-16 17:11:41 -07:00
Sergey Obukhov
ec8e09b34e fix 2016-08-15 20:31:04 -07:00
Sergey Obukhov
bcf97eccfa use html5lib to parse html 2016-08-15 19:36:21 -07:00
7 changed files with 332 additions and 41 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.2.16',
version='1.3.6',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
@@ -48,11 +48,12 @@ setup(name='talon',
"regex>=1",
"numpy",
"scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect',
'six>=1.10.0',
'html5lib'
],
tests_require=[
"mock",

View File

@@ -6,6 +6,7 @@ messages (without quoted messages) from html
from __future__ import absolute_import
import regex as re
from talon.utils import cssselect
CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
@@ -78,7 +79,7 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote')
gmail_quote = cssselect('div.gmail_quote', html_message)
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)):
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
@@ -135,7 +136,7 @@ def cut_microsoft_quote(html_message):
def cut_by_id(html_message):
found = False
for quote_id in QUOTE_IDS:
quote = html_message.cssselect('#{}'.format(quote_id))
quote = cssselect('#{}'.format(quote_id), html_message)
if quote:
found = True
quote[0].getparent().remove(quote[0])

View File

@@ -12,7 +12,8 @@ from copy import deepcopy
from lxml import html, etree
from talon.utils import get_delimiter, html_tree_to_text
from talon.utils import (get_delimiter, html_tree_to_text,
html_document_fromstring)
from talon import html_quotations
from six.moves import range
import six
@@ -138,6 +139,13 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I)
# ---- John Smith wrote ----
RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
u'|'.join((
# English
'wrote'
))), re.I)
SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE,
RE_ON_DATE_SMB_WROTE,
@@ -153,10 +161,10 @@ SPLITTER_PATTERNS = [
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote')
re.compile('Sent from Samsung .*@.*> wrote'),
RE_ANDROID_WROTE
]
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
@@ -171,6 +179,9 @@ MAX_HTML_LEN = 2794202
QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
# Regular expression to identify if a line is a header.
RE_HEADER = re.compile(": ")
def extract_from(msg_body, content_type='text/plain'):
try:
@@ -184,6 +195,19 @@ def extract_from(msg_body, content_type='text/plain'):
return msg_body
def remove_initial_spaces_and_mark_message_lines(lines):
    """
    Strip leading spaces from every line, then mark the message lines.

    Headers that are indented with spaces can only be recognised once the
    indentation is gone. Only the space character is stripped, and the
    list is updated in place, so the caller sees the stripped lines too.

    Returns the result of mark_message_lines() for the stripped lines.
    """
    for index, line in enumerate(lines):
        lines[index] = line.lstrip(' ')
    return mark_message_lines(lines)
def mark_message_lines(lines):
"""Mark message lines with markers to distinguish quotation lines.
@@ -286,9 +310,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
Converts msg_body into a unicode.
"""
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
# so that '>' closing the link couldn't be mistakenly taken for quotation
# marker.
msg_body = _replace_link_brackets(msg_body)
msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type)
return msg_body
def _replace_link_brackets(msg_body):
"""
Normalize links i.e. replace '<', '>' wrapping the link with some symbols
so that '>' closing the link couldn't be mistakenly taken for quotation
marker.
Converts msg_body into a unicode
"""
if isinstance(msg_body, bytes):
msg_body = msg_body.decode('utf8')
@@ -300,7 +336,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
return "@@%s@@" % link.group(1)
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
return msg_body
def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'):
"""
Splits line in two if splitter pattern preceded by some text on the same
line (done only for 'On <date> <person> wrote:' pattern.
"""
def splitter_wrapper(splitter):
"""Wraps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
@@ -385,17 +428,15 @@ def _extract_from_html(msg_body):
then checking deleted checkpoints,
then deleting necessary tags.
"""
if len(msg_body) > MAX_HTML_LEN:
return msg_body
if msg_body.strip() == b'':
return msg_body
msg_body = msg_body.replace(b'\r\n', b'\n')
html_tree = html.document_fromstring(
msg_body,
parser=html.HTMLParser(encoding="utf-8")
)
html_tree = html_document_fromstring(msg_body)
if html_tree is None:
return msg_body
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
@@ -451,6 +492,82 @@ def _extract_from_html(msg_body):
return html.tostring(html_tree_copy)
def split_emails(msg):
    """
    Mark every line of a message that may hold a whole email thread.

    The lines are marked to identify split lines, content lines and empty
    lines. Quoted lines that match a splitter pattern are then promoted to
    split lines, and finally split line markers that fall inside header
    blocks (lines matching RE_HEADER) are corrected.

    Returns the corrected markers, one character per processed line.
    """
    normalized = _replace_link_brackets(msg)

    # don't process too long messages
    lines = normalized.splitlines()[:MAX_LINES_COUNT]

    # NOTE: this step also strips the leading spaces from `lines` in place,
    # so the following steps work on the stripped lines.
    initial_markers = remove_initial_spaces_and_mark_message_lines(lines)
    quoted_markers = _mark_quoted_email_splitlines(initial_markers, lines)

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(quoted_markers, lines)
def _mark_quoted_email_splitlines(markers, lines):
    """
    Promote quoted header lines that look like splitters to splitlines.

    Headers indented with '>' characters are marked 'm'. Every line that
    carries the 'm' marker and matches one of SPLITTER_PATTERNS gets the 's'
    marker instead; all other markers are left as they are.

    Returns the new markers as a string.
    """
    # A list makes it easy to replace individual marker characters.
    updated = list(markers)
    for index, line in enumerate(lines):
        if updated[index] == 'm' and any(
                re.search(pattern, line) for pattern in SPLITTER_PATTERNS):
            updated[index] = 's'
    return "".join(updated)
def _correct_splitlines_in_headers(markers, lines):
    """
    Demote splitline markers that are deemed to be inside a header block.

    A header block opens on an 's' line that matches RE_HEADER and stays
    open until a line that does not match RE_HEADER is seen. Any further
    's' marker met while the block is open becomes 'm' when the line
    matches QUOT_PATTERN and 't' otherwise.

    Returns the corrected markers as a string.
    """
    corrected = []
    in_header_block = False

    for index, marker in enumerate(markers):
        line = lines[index]
        is_header = bool(re.search(RE_HEADER, line))

        if marker == 's':
            if in_header_block:
                # A splitline inside an open header block is not a real one.
                marker = 'm' if QUOT_PATTERN.match(line) else 't'
            elif is_header:
                # Only an 's' line that is itself a header opens a block.
                in_header_block = True

        # Any non-header line closes the current header block.
        if not is_header:
            in_header_block = False

        corrected.append(marker)

    return "".join(corrected)
def _readable_text_empty(html_tree):
return not bool(html_tree_to_text(html_tree).strip())
@@ -468,7 +585,7 @@ def is_splitter(line):
def text_content(context):
'''XPath Extension function to return a node text content.'''
return context.context_node.text_content().strip()
return context.context_node.xpath("string()").strip()
def tail(context):

View File

@@ -7,9 +7,11 @@ import chardet
import cchardet
import regex as re
from lxml import html
from lxml.html import html5parser
from lxml.cssselect import CSSSelector
import html5lib
from talon.constants import RE_DELIMITER
import six
@@ -112,6 +114,7 @@ def get_delimiter(msg_body):
return delimiter
def html_tree_to_text(tree):
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
@@ -120,7 +123,7 @@ def html_tree_to_text(tree):
parent = c.getparent()
# comment with no parent does not impact produced text
if not parent:
if parent is None:
continue
parent.remove(c)
@@ -156,17 +159,53 @@ def html_to_text(string):
NOTES:
1. the string is expected to contain UTF-8 encoded HTML!
2. returns utf-8 encoded str (not unicode)
3. if html can't be parsed returns None
"""
if isinstance(string, six.text_type):
string = string.encode('utf8')
s = _prepend_utf8_declaration(string)
s = s.replace(b"\n", b"")
tree = html_fromstring(s)
if tree is None:
return None
tree = html.fromstring(s)
return html_tree_to_text(tree)
def html_fromstring(s):
    """Parse html tree from string using html5parser.fromstring.

    Return None if the string is too big (see html_too_big) or can't be
    parsed.
    """
    try:
        if not html_too_big(s):
            return html5parser.fromstring(s, parser=_html5lib_parser())
    except Exception:
        # Best effort: every failure, including one raised by the size
        # check itself, is reported to the caller as None.
        return None
    return None
def html_document_fromstring(s):
    """Parse html tree from string using html5parser.document_fromstring.

    Return None if the string is too big (see html_too_big) or can't be
    parsed.
    """
    try:
        if not html_too_big(s):
            return html5parser.document_fromstring(
                s, parser=_html5lib_parser())
    except Exception:
        # Best effort: every failure, including one raised by the size
        # check itself, is reported to the caller as None.
        return None
    return None
def cssselect(expr, tree):
    """Apply the CSS selector expression `expr` to `tree`.

    Returns whatever CSSSelector(expr) yields when called on the tree.
    """
    selector = CSSSelector(expr)
    return selector(tree)
def html_too_big(s):
    """Return True if `s` holds more '<' characters than _MAX_TAGS_COUNT.

    The number of '<' characters is used as a cheap estimate of the number
    of tags. Both text and byte strings are accepted.
    """
    # On Python 3 bytes.count() rejects a text needle (and str.count() a
    # bytes needle), so the needle has to match the type of `s`. Without
    # this, byte input raised TypeError, which callers that wrap this check
    # in a broad `except` silently turned into "can't parse".
    needle = b'<' if isinstance(s, bytes) else u'<'
    return s.count(needle) > _MAX_TAGS_COUNT
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
@@ -191,6 +230,21 @@ def _encode_utf8(s):
return s.encode('utf-8') if isinstance(s, six.text_type) else s
def _html5lib_parser():
    """
    Build a new html5lib parser that produces lxml trees.

    html5lib is a pure-python library that conforms to the WHATWG HTML spec
    and is not vulnerable to certain attacks common for XML libraries
    """
    # build lxml tree
    lxml_tree_builder = html5lib.treebuilders.getTreeBuilder("lxml")

    return html5lib.HTMLParser(
        lxml_tree_builder,
        # remove namespace value from inside lxml.html.html5parser element
        # tag, otherwise it yields something like
        # "{http://www.w3.org/1999/xhtml}div" instead of "div", throwing the
        # algo off
        namespaceHTMLElements=False
    )
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
b'charset=utf-8">')
@@ -198,5 +252,8 @@ _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")
# extensive research shows that exceeding this limit
# might lead to excessive processing time
_MAX_TAGS_COUNT = 419

View File

@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
</blockquote>"""
eq_("<html><body><p>Reply</p></body></html>",
eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
</div>
</blockquote>
"""
eq_("<html><body><p>Reply</p></body></html>",
eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -62,7 +62,7 @@ def test_regular_blockquote():
</div>
</blockquote>
"""
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -85,6 +85,7 @@ Reply
reply = """
<html>
<head></head>
<body>
Reply
@@ -128,7 +129,7 @@ def test_gmail_quote():
</div>
</div>
</div>"""
eq_("<html><body><p>Reply</p></body></html>",
eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -139,7 +140,7 @@ def test_gmail_quote_compact():
'<div>Test</div>' \
'</div>' \
'</div>'
eq_("<html><body><p>Reply</p></body></html>",
eq_("<html><head></head><body>Reply</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -166,7 +167,7 @@ def test_unicode_in_reply():
Quote
</blockquote>""".encode("utf-8")
eq_("<html><body><p>Reply&#160;&#160;Text<br></p><div><br></div>"
eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
"</body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -192,6 +193,7 @@ def test_blockquote_disclaimer():
stripped_html = """
<html>
<head></head>
<body>
<div>
<div>
@@ -223,7 +225,7 @@ def test_date_block():
</div>
</div>
"""
eq_('<html><body><div>message<br></div></body></html>',
eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -240,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br>
text
</div></div>
"""
eq_('<html><body><div>message<br></div></body></html>',
eq_('<html><head></head><body><div>message<br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -258,7 +260,7 @@ def test_reply_shares_div_with_from_block():
</div>
</body>'''
eq_('<html><body><div>Blah<br><br></div></body></html>',
eq_('<html><head></head><body><div>Blah<br><br></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -269,13 +271,13 @@ def test_reply_quotations_share_block():
def test_OLK_SRC_BODY_SECTION_stripped():
eq_('<html><body><div>Reply</div></body></html>',
eq_('<html><head></head><body><div>Reply</div></body></html>',
RE_WHITESPACE.sub(
'', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
def test_reply_separated_by_hr():
eq_('<html><body><div>Hi<div>there</div></div></body></html>',
eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>',
RE_WHITESPACE.sub(
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
@@ -296,7 +298,7 @@ Reply
</div>
</div>
'''
eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -373,7 +375,7 @@ reply
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
# Keep new lines otherwise "My reply" becomes one word - "Myreply"
eq_("<html><body><p>My\nreply\n</p></body></html>", extracted)
eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
def test_gmail_forwarded_msg():
@@ -383,7 +385,7 @@ def test_gmail_forwarded_msg():
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
@patch.object(quotations, 'MAX_HTML_LEN', 1)
@patch.object(u, '_MAX_TAGS_COUNT', 4)
def test_too_large_html():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
@@ -411,3 +413,9 @@ def test_readable_html_empty():
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@patch.object(quotations, 'html_document_fromstring', Mock(return_value=None))
def test_bad_html():
    """The input comes back untouched when no html tree can be built."""
    bad_html = "<html></html>"
    extracted = quotations.extract_from_html(bad_html)
    eq_(bad_html, extracted)

View File

@@ -142,7 +142,8 @@ def _check_pattern_original_message(original_message_indicator):
-----{}-----
Test"""
eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator))))
eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message():
_check_pattern_original_message('Original Message')
@@ -165,6 +166,17 @@ Test reply"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_android_wrote():
msg_body = """Test reply
---- John Smith wrote ----
> quoted
> text
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_reply_wraps_quotations():
msg_body = """Test reply
@@ -696,3 +708,52 @@ def test_standard_replies():
"'%(reply)s' != %(stripped)s for %(fn)s" % \
{'reply': reply_text, 'stripped': stripped_text,
'fn': filename}
def test_split_email():
msg = """From: Mr. X
Date: 24 February 2016
To: Mr. Y
Subject: Hi
Attachments: none
Goodbye.
From: Mr. Y
To: Mr. X
Date: 24 February 2016
Subject: Hi
Attachments: none
Hello.
On 24th February 2016 at 09.32am, Conal wrote:
Hey!
On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote:
> Mohan,
>
> We have not yet migrated the systems.
>
> Dan
>
> > -----Original Message-----
> > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > Subject: Test
> > From: bob@xxx.mailgun.org
> > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> >
> > Hi
> >
> > > From: bob@xxx.mailgun.org
> > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com
> > > Date: Mon, 2 Apr 2012 17:44:22 +0400
> > > Subject: Test
> > > Hi
> > >
> >
>
>
"""
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)

View File

@@ -29,7 +29,9 @@ def test_unicode():
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower())
ok_ (u.detect_encoding(
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8
with patch.object(u.chardet, 'detect') as detect:
@@ -39,7 +41,9 @@ def test_detect_encoding():
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower())
ok_ (u.quick_detect_encoding(
u'Versi\xf3n'.encode('windows-1252')).lower() in [
'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
@@ -112,5 +116,47 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment"
d = html.document_fromstring(s)
d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
def test_html_fromstring_exception():
    """A failure inside the parser is reported as None."""
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
    """The parser is never called when the size check reports too big."""
    # html_too_big is replaced by a Mock, whose return value is truthy.
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
    assert_false(fromstring.called)
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_exception(document_fromstring):
    """A failure inside the document parser is reported as None."""
    document_fromstring.side_effect = Exception()
    parsed = u.html_document_fromstring("<html></html>")
    eq_(None, parsed)
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'document_fromstring')
def test_html_document_fromstring_too_big(document_fromstring):
    """The document parser is never called when the input is too big."""
    # html_too_big is replaced by a Mock, whose return value is truthy.
    parsed = u.html_document_fromstring("<html></html>")
    eq_(None, parsed)
    assert_false(document_fromstring.called)
@patch.object(u, 'html_fromstring', Mock(return_value=None))
def test_bad_html_to_text():
    """html_to_text returns None when the html can't be parsed."""
    bad_html = "one<br>two<br>three"
    text = u.html_to_text(bad_html)
    eq_(None, text)
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_too_big():
    """With the limit patched to 3, two '<' fit and four do not."""
    within_limit = u.html_too_big("<div></div>")
    over_limit = u.html_too_big("<div><span>Hi</span></div>")
    eq_(False, within_limit)
    eq_(True, over_limit)
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
    """Small html is converted to text; html over the tag limit gives None."""
    small_text = u.html_to_text("<div>Hello</div>")
    big_text = u.html_to_text("<div><span>Hi</span></div>")
    eq_("Hello", small_text)
    eq_(None, big_text)