Operate on unicode data exclusively

This commit is contained in:
Maxim Vladimirskiy
2022-02-04 16:08:41 +03:00
parent a8c7e6a972
commit 14f106ee76
6 changed files with 72 additions and 266 deletions

View File

@@ -4,14 +4,17 @@ from __future__ import absolute_import
# noinspection PyUnresolvedReferences
import re
from unittest.mock import Mock, patch
from nose.tools import assert_false, assert_true, eq_, ok_
from tests.fixtures import (OLK_SRC_BODY_SECTION,
REPLY_QUOTATIONS_SHARE_BLOCK,
REPLY_SEPARATED_BY_HR)
from talon import quotations, utils as u
from . import *
from .fixtures import *
from lxml import html
-RE_WHITESPACE = re.compile("\s")
-RE_DOUBLE_WHITESPACE = re.compile("\s")
+RE_WHITESPACE = re.compile(r"\s")
+RE_DOUBLE_WHITESPACE = re.compile(r"\s")
def test_quotation_splitter_inside_blockquote():
@@ -166,7 +169,7 @@ def test_unicode_in_reply():
<blockquote>
Quote
-</blockquote>""".encode("utf-8")
+</blockquote>"""
eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
"</body></html>",
@@ -314,7 +317,6 @@ def extract_reply_and_check(filename):
msg_body = f.read()
reply = quotations.extract_from_html(msg_body)
plain_reply = u.html_to_text(reply)
-plain_reply = plain_reply.decode('utf8')
eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
RE_WHITESPACE.sub('', plain_reply))