27 Commits

Author SHA1 Message Date
Sergey Obukhov
0e6d5f993c fix appointments in text 2017-10-23 16:32:42 -07:00
Sergey Obukhov
60637ff13a Merge pull request #152 from mailgun/sergey/v1.4.4
bump version
2017-08-24 16:00:05 -07:00
Sergey Obukhov
df8259e3fe bump version 2017-08-24 15:58:53 -07:00
Sergey Obukhov
aab3b1cc75 Merge pull request #150 from ezrapagel/fix_greedy_dash_regex
android_wrote regex incorrectly matching
2017-08-24 15:52:29 -07:00
Sergey Obukhov
9492b39f2d Merge branch 'master' into fix_greedy_dash_regex 2017-08-24 15:39:28 -07:00
Sergey Obukhov
b9ac866ea7 Merge pull request #151 from mailgun/sergey/reshape
reshape data as suggested by sklearn
2017-08-24 12:04:58 -07:00
Sergey Obukhov
678517dd89 reshape data as suggested by sklearn 2017-08-24 12:03:47 -07:00
Ezra Pagel
221774c6f8 android_wrote regex was incorrectly iterating characters in 'wrote', resulting in greedy regex that
matched many strings with dashes
2017-08-21 12:47:06 -05:00
Sergey Obukhov
a2aa345712 Merge pull request #148 from mailgun/sergey/v1.4.2
bump version after adding support for Vietnamese format
2017-07-10 11:44:46 -07:00
Sergey Obukhov
d998beaff3 bump version after adding support for Vietnamese format 2017-07-10 11:42:52 -07:00
Sergey Obukhov
a379bc4e7c Merge pull request #147 from hnx116/master
add support for Vietnamese reply format
2017-07-10 11:40:04 -07:00
Hung Nguyen
b8e1894f3b add test case 2017-07-10 13:28:33 +07:00
Hung Nguyen
0b5a44090f add support for Vietnamese reply format 2017-07-10 11:18:57 +07:00
Sergey Obukhov
b40835eca2 Merge pull request #145 from mailgun/sergey/outlook-2013-version-bump
bump version after merging outlook 2013 support PR
2017-06-18 22:56:16 -07:00
Sergey Obukhov
b38562c7cc bump version after merging outlook 2013 support PR 2017-06-18 22:55:15 -07:00
Sergey Obukhov
70e9fb415e Merge pull request #139 from Savageman/patch-1
Added Outlook 2013 rules
2017-06-18 22:53:18 -07:00
Sergey Obukhov
64612099cd Merge branch 'master' into patch-1 2017-06-18 22:51:46 -07:00
Sergey Obukhov
45c20f979d Merge pull request #144 from mailgun/sergey/python3-support-version-bump
bump version after merging python 3 support PR
2017-06-18 22:49:20 -07:00
Sergey Obukhov
743c76f159 bump version after merging python 3 support PR 2017-06-18 22:48:12 -07:00
Sergey Obukhov
bc5dad75d3 Merge pull request #141 from yfilali/master
Python 3 compatibility up to 3.6.1
2017-06-18 22:44:07 -07:00
Yacine Filali
4acf05cf28 Only use load compat if we can't load the classifier 2017-05-24 13:29:59 -07:00
Yacine Filali
f5f7264077 Can now handle read only classifier data as well 2017-05-24 13:22:24 -07:00
Yacine Filali
4364bebf38 Added exception checking for pickle format conversion 2017-05-24 10:26:33 -07:00
Yacine Filali
15e61768f2 Encoding fixes 2017-05-23 16:17:39 -07:00
Yacine Filali
dd0a0f5c4d Python 2.7 backward compat 2017-05-23 16:10:13 -07:00
Yacine Filali
086f5ba43b Updated talon for Python 3 2017-05-23 15:39:50 -07:00
Esperat Julian
e16dcf629e Added Outlook 2013 rules
Only the border color changes (compared to Outlook 2007, 2010) from `#B5C4DF` to `#E1E1E1`.
2017-04-27 11:34:01 +02:00
13 changed files with 216 additions and 95 deletions

4
.gitignore vendored
View File

@@ -39,6 +39,8 @@ nosetests.xml
/.emacs.desktop
/.emacs.desktop.lock
.elc
.idea
.cache
auto-save-list
tramp
.\#*
@@ -51,4 +53,4 @@ tramp
_trial_temp
# OSX
.DS_Store
.DS_Store

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.3.7',
version='1.4.5',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

View File

@@ -94,6 +94,12 @@ def cut_microsoft_quote(html_message):
#outlook 2007, 2010 (american)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#outlook 2013 (international)
"//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#outlook 2013 (american)
"//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#windows mail
"//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); "

View File

@@ -42,6 +42,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
u'',
# Swedish, Danish
'Den',
# Vietnamese
u'Vào',
)),
# Date and sender separator
u'|'.join((
@@ -64,6 +66,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
'schrieb',
# Norwegian, Swedish
'skrev',
# Vietnamese
u'đã viết',
))
))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
@@ -143,7 +147,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*
RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
u'|'.join((
# English
'wrote'
'wrote',
))), re.I)
# Support polymail.io reply format
@@ -161,15 +165,15 @@ SPLITTER_PATTERNS = [
RE_FROM_COLON_OR_DATE_COLON,
# 02.04.2012 14:20 пользователь "bob@example.com" <
# bob@xxx.mailgun.org> написал:
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S),
# 2014-10-17 11:28 GMT+03:00 Bob <
# bob@example.com>:
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'),
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote'),
re.compile('Sent from Samsung.* \S+@\S+> wrote'),
RE_ANDROID_WROTE,
RE_POLYMAIL
]

View File

@@ -1,15 +1,15 @@
from __future__ import absolute_import
import logging
import regex as re
from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
from talon.utils import get_delimiter
log = logging.getLogger(__name__)
# regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r'''
(
@@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r'''
)
''', re.I | re.X | re.M | re.S)
# signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r'''
(
@@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r'''
)
''', re.I | re.X | re.M | re.S)
# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
@@ -112,7 +110,7 @@ def extract_signature(msg_body):
return (stripped_body.strip(),
signature.strip())
except Exception as e:
except Exception:
log.exception('ERROR extracting signature')
return (msg_body, None)
@@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate):
'cdc'
"""
# at first consider everything to be potential signature lines
markers = bytearray('c'*len(candidate))
markers = list('c' * len(candidate))
# mark lines starting from bottom up
for i, line_idx in reversed(list(enumerate(candidate))):
@@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate):
if line.startswith('-') and line.strip("-"):
markers[i] = 'd'
return markers
return "".join(markers)
def _process_marked_candidate_indexes(candidate, markers):

View File

@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import logging
import regex as re
import numpy
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
import regex as re
from talon.signature.bruteforce import get_signature_candidate
from talon.signature.learning.featurespace import features, build_pattern
from talon.signature.learning.helpers import has_signature
from talon.utils import get_delimiter
log = logging.getLogger(__name__)
@@ -33,7 +32,7 @@ RE_REVERSE_SIGNATURE = re.compile(r'''
def is_signature_line(line, sender, classifier):
'''Checks if the line belongs to signature. Returns True or False.'''
data = numpy.array(build_pattern(line, features(sender)))
data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1)
return classifier.predict(data) > 0
@@ -58,7 +57,7 @@ def extract(body, sender):
text = delimiter.join(text)
if text.strip():
return (text, delimiter.join(signature))
except Exception:
except Exception as e:
log.exception('ERROR when extracting signature with classifiers')
return (body, None)
@@ -81,7 +80,7 @@ def _mark_lines(lines, sender):
candidate = get_signature_candidate(lines)
# at first consider everything to be text no signature
markers = bytearray('t'*len(lines))
markers = list('t' * len(lines))
# mark lines starting from bottom up
# mark only lines that belong to candidate
@@ -96,7 +95,7 @@ def _mark_lines(lines, sender):
elif is_signature_line(line, sender, EXTRACTOR):
markers[j] = 's'
return markers
return "".join(markers)
def _process_marked_lines(lines, markers):
@@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers):
return (lines[:-signature.end()], lines[-signature.end():])
return (lines, None)

View File

@@ -6,9 +6,10 @@ body belongs to the signature.
"""
from __future__ import absolute_import
from numpy import genfromtxt
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
def init():
@@ -29,4 +30,40 @@ def train(classifier, train_data_filename, save_classifier_filename=None):
def load(saved_classifier_filename, train_data_filename):
"""Loads saved classifier. """
return joblib.load(saved_classifier_filename)
try:
return joblib.load(saved_classifier_filename)
except Exception:
import sys
if sys.version_info > (3, 0):
return load_compat(saved_classifier_filename)
raise
def load_compat(saved_classifier_filename):
    """Load a classifier that joblib could not read directly.

    The file is unpickled with ``encoding='latin1'`` and then re-serialised
    with joblib so that the final object is produced by ``joblib.load``.
    The converted classifier is written back over the original file when
    that is possible; otherwise a temporary file is used.

    :param saved_classifier_filename: path to the saved classifier.
    :returns: the classifier object returned by ``joblib.load``.
    :raises: whatever ``open``, ``pickle.load`` or ``joblib`` raise; the
        working directory is restored in every case.
    """
    import os
    import pickle
    import tempfile

    # Resolve the path before changing directory, so a relative path with a
    # directory component still points at the same file afterwards.
    saved_classifier_filename = os.path.abspath(saved_classifier_filename)

    # we need to switch to the data path to properly load the related
    # _xx.npy files
    cwd = os.getcwd()
    os.chdir(os.path.dirname(saved_classifier_filename) or os.curdir)

    tmp = None
    try:
        # convert encoding using pickle.load; the result is written out again
        # below and that copy is what joblib is told to use.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # this assumes the classifier file is trusted - confirm with callers.
        with open(saved_classifier_filename, 'rb') as pickle_file:
            classifier = pickle.load(pickle_file, encoding='latin1')

        source = saved_classifier_filename
        try:
            # save our conversion if permissions allow
            joblib.dump(classifier, saved_classifier_filename)
        except Exception:
            # can't write to classifier, use a temp file
            tmp = tempfile.SpooledTemporaryFile()
            joblib.dump(classifier, tmp)
            # rewind so the load below reads from the start of the buffer
            tmp.seek(0)
            source = tmp

        # important, use joblib.load before switching back to original cwd
        return joblib.load(source)
    finally:
        os.chdir(cwd)
        if tmp is not None:
            tmp.close()

View File

@@ -17,13 +17,14 @@ suffix which should be `_sender`.
"""
from __future__ import absolute_import
import os
import regex as re
from six.moves import range
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.featurespace import build_pattern, features
from six.moves import range
SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'
@@ -57,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True):
algorithm:
>>> parse_msg_sender(filename, False)
"""
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
sender, msg = None, None
if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename) as f:
with open(filename, **kwargs) as f:
msg = f.read()
sender = u''
if sender_known:
@@ -147,7 +153,7 @@ def build_extraction_dataset(folder, dataset_filename,
continue
lines = msg.splitlines()
for i in range(1, min(SIGNATURE_MAX_LINES,
len(lines)) + 1):
len(lines)) + 1):
line = lines[-i]
label = -1
if line[:len(SIGNATURE_ANNOTATION)] == \

View File

@@ -1,19 +1,18 @@
# coding:utf-8
from __future__ import absolute_import
import logging
from random import shuffle
import chardet
import cchardet
import regex as re
from lxml.html import html5parser
from lxml.cssselect import CSSSelector
import chardet
import html5lib
import regex as re
import six
from lxml.cssselect import CSSSelector
from lxml.html import html5parser
from talon.constants import RE_DELIMITER
import six
def safe_format(format_string, *args, **kwargs):
@@ -128,7 +127,7 @@ def html_tree_to_text(tree):
parent.remove(c)
text = ""
text = ""
for el in tree.iter():
el_text = (el.text or '') + (el.tail or '')
if len(el_text) > 1:
@@ -177,6 +176,8 @@ def html_to_text(string):
def html_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed.
"""
if isinstance(s, six.text_type):
s = s.encode('utf8')
try:
if html_too_big(s):
return None
@@ -189,6 +190,8 @@ def html_fromstring(s):
def html_document_fromstring(s):
"""Parse html tree from string. Return None if the string can't be parsed.
"""
if isinstance(s, six.text_type):
s = s.encode('utf8')
try:
if html_too_big(s):
return None
@@ -203,7 +206,9 @@ def cssselect(expr, tree):
def html_too_big(s):
return s.count('<') > _MAX_TAGS_COUNT
if isinstance(s, six.text_type):
s = s.encode('utf8')
return s.count(b'<') > _MAX_TAGS_COUNT
def _contains_charset_spec(s):
@@ -248,8 +253,7 @@ def _html5lib_parser():
_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
b'charset=utf-8">')
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
import regex as re
# noinspection PyUnresolvedReferences
import re
from talon import quotations, utils as u
from . import *
from .fixtures import *
RE_WHITESPACE = re.compile("\s")
RE_DOUBLE_WHITESPACE = re.compile("\s")
@@ -303,7 +303,12 @@ Reply
def extract_reply_and_check(filename):
f = open(filename)
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
f = open(filename, **kwargs)
msg_body = f.read()
reply = quotations.extract_from_html(msg_body)
@@ -373,7 +378,7 @@ reply
</blockquote>"""
msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
assert_false(symbol in extracted)
# Keep new lines otherwise "My reply" becomes one word - "Myreply"
eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)

View File

@@ -1,16 +1,16 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .. import *
import os
from talon.signature.learning import dataset
from talon import signature
from talon.signature import extraction as e
from talon.signature import bruteforce
from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from .. import *
def test_message_shorter_SIGNATURE_MAX_LINES():
sender = "bob@foo.bar"
@@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES():
Thanks in advance,
Bob"""
text, extracted_signature = signature.extract(body, sender)
text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-2:]), extracted_signature)
def test_messages_longer_SIGNATURE_MAX_LINES():
import sys
kwargs = {}
if sys.version_info > (3, 0):
kwargs["encoding"] = "utf8"
for filename in os.listdir(STRIPPED):
filename = os.path.join(STRIPPED, filename)
if not filename.endswith('_body'):
continue
sender, body = dataset.parse_msg_sender(filename)
text, extracted_signature = signature.extract(body, sender)
text, extracted_signature = extract(body, sender)
extracted_signature = extracted_signature or ''
with open(filename[:-len('body')] + 'signature') as ms:
with open(filename[:-len('body')] + 'signature', **kwargs) as ms:
msg_signature = ms.read()
eq_(msg_signature.strip(), extracted_signature.strip())
stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)]
stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)]
eq_(stripped_msg.strip(), text.strip())
@@ -47,7 +52,7 @@ Thanks in advance,
some text which doesn't seem to be a signature at all
Bob"""
text, extracted_signature = signature.extract(body, sender)
text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:2]), text)
eq_('\n'.join(body.splitlines()[-3:]), extracted_signature)
@@ -60,7 +65,7 @@ Thanks in advance,
some long text here which doesn't seem to be a signature at all
Bob"""
text, extracted_signature = signature.extract(body, sender)
text, extracted_signature = extract(body, sender)
eq_('\n'.join(body.splitlines()[:-1]), text)
eq_('Bob', extracted_signature)
@@ -68,13 +73,13 @@ Bob"""
some *long* text here which doesn't seem to be a signature at all
"""
((body, None), signature.extract(body, "david@example.com"))
((body, None), extract(body, "david@example.com"))
def test_basic():
msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov'
eq_(('Blah', '--\r\n\r\nSergey Obukhov'),
signature.extract(msg_body, 'Sergey'))
extract(msg_body, 'Sergey'))
def test_capitalized():
@@ -99,7 +104,7 @@ Doe Inc
Doe Inc
555-531-7967"""
eq_(sig, signature.extract(msg_body, 'Doe')[1])
eq_(sig, extract(msg_body, 'Doe')[1])
def test_over_2_text_lines_after_signature():
@@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature():
2 non signature lines in the end
It's not signature
"""
text, extracted_signature = signature.extract(body, "Bob")
text, extracted_signature = extract(body, "Bob")
eq_(extracted_signature, None)
def test_no_signature():
sender, body = "bob@foo.bar", "Hello"
eq_((body, None), signature.extract(body, sender))
eq_((body, None), extract(body, sender))
def test_handles_unicode():
sender, body = dataset.parse_msg_sender(UNICODE_MSG)
text, extracted_signature = signature.extract(body, sender)
text, extracted_signature = extract(body, sender)
@patch.object(signature.extraction, 'has_signature')
@patch.object(extraction, 'has_signature')
def test_signature_extract_crash(has_signature):
has_signature.side_effect = Exception('Bam!')
msg_body = u'Blah\r\n--\r\n\r\nСергей'
eq_((msg_body, None), signature.extract(msg_body, 'Сергей'))
eq_((msg_body, None), extract(msg_body, 'Сергей'))
def test_mark_lines():
@@ -137,19 +142,19 @@ def test_mark_lines():
# (starting from the bottom) because we don't count empty line
eq_('ttset',
e._mark_lines(['Bob Smith',
'Bob Smith',
'Bob Smith',
'',
'some text'], 'Bob Smith'))
'Bob Smith',
'Bob Smith',
'',
'some text'], 'Bob Smith'))
with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
# we don't analyse the 1st line because
# signature cant start from the 1st line
eq_('tset',
e._mark_lines(['Bob Smith',
'Bob Smith',
'',
'some text'], 'Bob Smith'))
'Bob Smith',
'',
'some text'], 'Bob Smith'))
def test_process_marked_lines():

View File

@@ -119,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_appointment():
    """An appointment line like '10/19/2017 @ 9:30 am' stays in the reply.

    The expected reply keeps everything up to the phone number and drops
    the trailing 'From: / Sent: / To: / Subject:' block and the text after it.
    """
    message = """Response
10/19/2017 @ 9:30 am for physical therapy
Bla
1517 4th Avenue Ste 300
London CA 19129, 555-421-6780
John Doe, FCLS
Mailgun Inc
555-941-0697
From: from@example.com [mailto:from@example.com]
Sent: Wednesday, October 18, 2017 2:05 PM
To: John Doer - SIU <jd@example.com>
Subject: RE: Claim # 5551188-1
Text"""

    reply = """Response
10/19/2017 @ 9:30 am for physical therapy
Bla
1517 4th Avenue Ste 300
London CA 19129, 555-421-6780
John Doe, FCLS
Mailgun Inc
555-941-0697"""

    extracted = quotations.extract_from_plain(message)
    eq_(reply, extracted)
def test_line_starts_with_on():
msg_body = """Blah-blah-blah
On blah-blah-blah"""
@@ -401,6 +433,14 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
"""))
def test_vietnamese_from_block():
    """A Vietnamese 'Vào ... đã viết:' header and the quote below it are cut."""
    message = u"""Hello
Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết:
> Xin chào
"""
    extracted = quotations.extract_from_plain(message)
    eq_('Hello', extracted)
def test_quotation_marker_false_positive():
msg_body = """Visit us now for assistance...
@@ -770,3 +810,16 @@ def test_split_email():
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)
def test_feedback_below_left_unparsed():
    """A dashed 'Enter Feedback Below' banner must not split the message.

    The whole body is expected back unchanged.
    """
    original = """Please enter your feedback below. Thank you.
------------------------------------- Enter Feedback Below -------------------------------------
The user experience was unparallelled. Please continue production. I'm sending payment to ensure
that this line is intact."""

    extracted = quotations.extract_from_plain(original)
    # the extracted value is decoded from UTF-8 before the comparison
    decoded = extracted.decode('utf8')
    eq_(original, decoded)

View File

@@ -1,12 +1,12 @@
# coding:utf-8
from __future__ import absolute_import
from . import *
from talon import utils as u
import cchardet
import six
from lxml import html
from talon import utils as u
from . import *
def test_get_delimiter():
@@ -16,35 +16,35 @@ def test_get_delimiter():
def test_unicode():
eq_ (u'hi', u.to_unicode('hi'))
eq_ (type(u.to_unicode('hi')), six.text_type )
eq_ (type(u.to_unicode(u'hi')), six.text_type )
eq_ (type(u.to_unicode('привет')), six.text_type )
eq_ (type(u.to_unicode(u'привет')), six.text_type )
eq_ (u"привет", u.to_unicode('привет'))
eq_ (u"привет", u.to_unicode(u'привет'))
eq_(u'hi', u.to_unicode('hi'))
eq_(type(u.to_unicode('hi')), six.text_type)
eq_(type(u.to_unicode(u'hi')), six.text_type)
eq_(type(u.to_unicode('привет')), six.text_type)
eq_(type(u.to_unicode(u'привет')), six.text_type)
eq_(u"привет", u.to_unicode('привет'))
eq_(u"привет", u.to_unicode(u'привет'))
# some latin1 stuff
eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding(b'qwe').lower())
ok_ (u.detect_encoding(
eq_('ascii', u.detect_encoding(b'qwe').lower())
ok_(u.detect_encoding(
u'Versi\xf3n'.encode('iso-8859-2')).lower() in [
'iso-8859-1', 'iso-8859-2'])
eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower())
# fallback to utf-8
with patch.object(u.chardet, 'detect') as detect:
detect.side_effect = Exception
eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower())
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower())
ok_ (u.quick_detect_encoding(
eq_('ascii', u.quick_detect_encoding(b'qwe').lower())
ok_(u.quick_detect_encoding(
u'Versi\xf3n'.encode('windows-1252')).lower() in [
'windows-1252', 'windows-1250'])
eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower())
@patch.object(cchardet, 'detect')
@@ -84,7 +84,7 @@ Haha
eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
html = '<body><br/><br/>Hi</body>'
eq_ (b'Hi', u.html_to_text(html))
eq_(b'Hi', u.html_to_text(html))
html = """Hi
<style type="text/css">
@@ -104,7 +104,7 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
eq_ (b'Hi', u.html_to_text(html))
eq_(b'Hi', u.html_to_text(html))
html = """<div>
<!-- COMMENT 1 -->
@@ -115,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif;
def test_comment_no_parent():
s = "<!-- COMMENT 1 --> no comment"
s = b'<!-- COMMENT 1 --> no comment'
d = u.html_document_fromstring(s)
eq_("no comment", u.html_tree_to_text(d))
eq_(b"no comment", u.html_tree_to_text(d))
@patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
def test_html_fromstring_exception():
eq_(None, u.html_fromstring("<html></html>"))
@patch.object(u, 'html_too_big', Mock())
@patch.object(u.html5parser, 'fromstring')
def test_html_fromstring_too_big(fromstring):
@@ -158,5 +159,5 @@ def test_html_too_big():
@patch.object(u, '_MAX_TAGS_COUNT', 3)
def test_html_to_text():
eq_("Hello", u.html_to_text("<div>Hello</div>"))
eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
eq_(None, u.html_to_text("<div><span>Hi</span></div>"))