29 Commits

Author SHA1 Message Date
Derrick J. Wippler
3083f86c75 Continue with quotation cut even if html cut throws an exception 2020-02-10 11:40:00 -06:00
Derrick J. Wippler
c575beb27d Test import clean up and pep8 2020-01-30 11:50:41 -06:00
Sergey Obukhov
d9ed7cc6d1 Merge pull request #190 from yoks/master
Add __init__.py into data folder, add data files into MANIFEST.in
2019-07-02 18:56:47 +03:00
Sergey Obukhov
0a0808c0a8 Merge branch 'master' into master 2019-07-01 20:48:46 +03:00
Sergey Obukhov
16354e3528 Merge pull request #191 from mailgun/thrawn/develop
PIP-423: Now removing namespaces from parsed HTML
2019-05-12 11:54:17 +03:00
Derrick J. Wippler
1018e88ec1 Now removing namespaces from parsed HTML 2019-05-10 11:16:12 -05:00
Ivan Anisimov
2916351517 Update setup.py 2019-03-16 22:17:26 +03:00
Ivan Anisimov
46d4b02c81 Update setup.py 2019-03-16 22:15:43 +03:00
Ivan Anisimov
58eac88a10 Update MANIFEST.in 2019-03-16 22:03:40 +03:00
Ivan Anisimov
2ef3d8dfbe Update MANIFEST.in 2019-03-16 22:01:00 +03:00
Ivan Anisimov
7cf4c29340 Create __init__.py 2019-03-16 21:54:09 +03:00
Sergey Obukhov
cdd84563dd Merge pull request #183 from mailgun/sergey/date
fix text with Date: misclassified as quotations splitter
2019-01-18 17:32:10 +03:00
Sergey Obukhov
8138ea9a60 fix text with Date: misclassified as quotations splitter 2019-01-18 16:49:39 +03:00
Sergey Obukhov
c171f9a875 Merge pull request #169 from Savageman/patch-2
Use regex match to detect outlook 2007, 2010, 2013
2018-11-05 10:43:20 +03:00
Sergey Obukhov
3f97a8b8ff Merge branch 'master' into patch-2 2018-11-05 10:42:00 +03:00
Esperat Julian
1147767ff3 Fix regression: windows mail format was left forgotten
Missing a | at the end of the regex, so next lines are part of the global search.
2018-11-04 19:42:12 +01:00
Sergey Obukhov
6a304215c3 Merge pull request #177 from mailgun/obukhov-sergey-patch-1
Update Readme with how to retrain on your own data
2018-11-02 15:22:18 +03:00
Sergey Obukhov
31714506bd Update Readme with how to retrain on your own data 2018-11-02 15:21:36 +03:00
Sergey Obukhov
403d80cf3b Merge pull request #161 from glaand/master
Fix: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
2018-11-02 15:03:02 +03:00
Sergey Obukhov
7cf20f2877 Merge branch 'master' into master 2018-11-02 14:52:38 +03:00
Sergey Obukhov
afff08b017 Merge branch 'master' into patch-2 2018-11-02 09:13:42 +03:00
Sergey Obukhov
685abb1905 Merge pull request #171 from gabriellima95/Add-Portuguese-Language
Add Portuguese language to quotations
2018-11-02 09:12:43 +03:00
Sergey Obukhov
41990727a3 Merge branch 'master' into Add-Portuguese-Language 2018-11-02 09:11:07 +03:00
Sergey Obukhov
b113d8ab33 Merge pull request #172 from ad-m/patch-1
Fix catastrophic backtracking in regexp
2018-11-02 09:09:49 +03:00
Adam Dobrawy
7bd0e9cc2f Fix catastrophic backtracking in regexp
Co-Author: @Nipsuli
2018-09-21 22:00:10 +02:00
gabriellima95
1e030a51d4 Add Portuguese language to quotations 2018-09-11 15:27:39 -03:00
Esperat Julian
238a5de5cc Use regex match to detect outlook 2007, 2010, 2013
I encountered a variant of the outlook quotations with a space after the semicolon.

To prevent multiplying the number of rules, I implemented a regex match instead (I found how to here: https://stackoverflow.com/a/34093801/211204).

I documented all the different variants as cleanly as I could.
2018-08-31 12:39:52 +02:00
André Glatzl
53b24ffb3d Cut out first some encoding html tags such as xml and doctype for avoiding conflict with unicode decoding 2017-12-19 15:15:10 +01:00
Sergey Obukhov
a7404afbcb Merge pull request #155 from mailgun/sergey/appointment
fix appointments in text
2017-10-23 16:34:08 -07:00
18 changed files with 228 additions and 72 deletions

View File

@@ -5,3 +5,10 @@ include classifier
include LICENSE
include MANIFEST.in
include README.rst
include talon/signature/data/train.data
include talon/signature/data/classifier
include talon/signature/data/classifier_01.npy
include talon/signature/data/classifier_02.npy
include talon/signature/data/classifier_03.npy
include talon/signature/data/classifier_04.npy
include talon/signature/data/classifier_05.npy

View File

@@ -129,6 +129,22 @@ start using it for talon.
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
.. _forge: https://github.com/mailgun/forge
Training on your dataset
------------------------
talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do:
.. code:: python
from talon.signature.learning.dataset import build_extraction_dataset
from talon.signature.learning import classifier as c
build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data")
c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier")
Note that for signature extraction you need just the folder with the positive samples with annotated signature lines (P folder).
.. _forge: https://github.com/mailgun/forge
Research
--------

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.4.5',
version='1.4.8',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),
@@ -48,7 +48,7 @@ setup(name='talon',
"regex>=1",
"numpy",
"scipy",
"scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect',

View File

@@ -87,23 +87,24 @@ def cut_gmail_quote(html_message):
def cut_microsoft_quote(html_message):
''' Cuts splitter block and all following blocks. '''
#use EXSLT extensions to have a regex match() function with lxml
ns = {"re": "http://exslt.org/regular-expressions"}
#general pattern: @style='border:none;border-top:solid <color> 1.0pt;padding:3.0pt 0<unit> 0<unit> 0<unit>'
#outlook 2007, 2010 (international) <color=#B5C4DF> <unit=cm>
#outlook 2007, 2010 (american) <color=#B5C4DF> <unit=pt>
#outlook 2013 (international) <color=#E1E1E1> <unit=cm>
#outlook 2013 (american) <color=#E1E1E1> <unit=pt>
#also handles a variant with a space after the semicolon
splitter = html_message.xpath(
#outlook 2007, 2010 (international)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#outlook 2007, 2010 (american)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#outlook 2013 (international)
"//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#outlook 2013 (american)
"//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#outlook 2007, 2010, 2013 (international, american)
"//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?"
"padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]|"
#windows mail
"//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); "
"border-top-width: 1px; border-top-style: solid;']"
, namespaces=ns
)
if splitter:

View File

@@ -22,7 +22,7 @@ import six
log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+\s*$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
@@ -38,6 +38,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
'Op',
# German
'Am',
# Portuguese
'Em',
# Norwegian
u'',
# Swedish, Danish
@@ -64,6 +66,8 @@ RE_ON_DATE_SMB_WROTE = re.compile(
'schreef','verzond','geschreven',
# German
'schrieb',
# Portuguese
'escreveu',
# Norwegian, Swedish
'skrev',
# Vietnamese
@@ -135,13 +139,17 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
'Oprindelig meddelelse',
))), re.I)
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'((_+\r?\n)?[\s]*:?[*]?({})[\s]?:([^\n$]+\n){{1,2}}){{2,}}'.format(
u'|'.join((
# "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', u'Från',
# "Date" in different languages.
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
))), re.I)
'Date', '[S]ent', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Gesendet',
# "Subject" in different languages.
'Subject', 'Betreff', 'Objet', 'Emne', u'Ämne',
# "To" in different languages.
'To', 'An', 'Til', u'À', 'Till'
))), re.I | re.M)
# ---- John Smith wrote ----
RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
@@ -286,7 +294,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
# inlined reply
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
# both 't' entries should be found
for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers):
# long links could break sequence of quotation lines but they shouldn't
# be considered an inline reply
links = (
@@ -430,6 +438,9 @@ def _extract_from_html(msg_body):
Extract not quoted message from provided html message body
using tags and plain text algorithm.
First cut out encoding-related html tags such as xml and doctype
to avoid conflicts with unicode decoding
Cut out the 'blockquote', 'gmail_quote' tags.
Cut Microsoft quotations.
@@ -445,18 +456,27 @@ def _extract_from_html(msg_body):
return msg_body
msg_body = msg_body.replace(b'\r\n', b'\n')
msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body)
html_tree = html_document_fromstring(msg_body)
if html_tree is None:
return msg_body
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree)
)
cut_quotations = False
try:
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree)
)
except Exception as e:
log.exception('during html quotations cut')
pass
html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
@@ -502,9 +522,69 @@ def _extract_from_html(msg_body):
if _readable_text_empty(html_tree_copy):
return msg_body
# NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML
# parsers do not recognize namespaces in HTML tags. As such the rendered
# HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes
# <oU0003Ap>. When we port this to golang we should look into using an
# XML Parser NOT an HTML5 Parser since we do not know what input a
# customer will send us. Switching to a common XML parser in python
# opens us up to a host of vulnerabilities.
# See https://docs.python.org/3/library/xml.html#xml-vulnerabilities
#
# The down sides to removing the namespaces is that customers might
# judge the XML namespaces important. If that is the case then support
# should encourage customers to perform XML parsing of the un-stripped
# body to get the full unmodified XML payload.
#
# Alternatives to this approach are
# 1. Ignore the U0003A in tag names and let the customer deal with it.
# This is not ideal, as most customers use stripped-html for viewing
# emails sent from a recipient, as such they cannot control the HTML
# provided by a recipient.
# 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML
# string. While this would solve the issue simply, it runs the risk
# of replacing data outside the <tag> which might be essential to
# the customer.
remove_namespaces(html_tree_copy)
return html.tostring(html_tree_copy)
def remove_namespaces(root):
    """
    Strip namespace remnants from every element under ``root``, in place.

    HTML start tags do NOT have a namespace; COLON characters have no
    special meaning. If the namespace is left in place the parser translates
    the colon into a unicode representation, for example <o:p> becomes
    <oU0003Ap>. This function looks for that ``U0003A`` marker and:

    * removes any attribute whose name contains it

        <html xmlns:o="urn:schemas-microsoft-com:office:office">
        becomes
        <html>

    * renames any tag that contains it to the part after the last marker

        <o:p>Hi</o:p>
        becomes
        <p>Hi</p>

    See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags

    :param root: root element of the parsed document; it must provide
        ``iter()`` and its elements must expose ``attrib`` and ``tag``.
    :return: the same ``root`` object, modified in place.
    """
    marker = "U0003A"

    for child in root.iter():
        # Iterate over a snapshot: removing entries from a mapping while
        # walking its live view is unsafe (a plain dict raises RuntimeError).
        for key, _value in list(child.attrib.items()):
            # If the attribute includes a colon
            if marker in key:
                child.attrib.pop(key)

        tag = child.tag
        # Some nodes (e.g. comments in ElementTree-style trees) carry a
        # factory function instead of a string as their tag; there is no
        # name to rewrite for those.
        if not hasattr(tag, "rfind"):
            continue

        # If the tag includes a colon keep only what follows the last marker
        idx = tag.rfind(marker)
        if idx != -1:
            child.tag = tag[idx + len(marker):]

    return root
def split_emails(msg):
"""
Given a message (which may consist of an email conversation thread with
@@ -557,7 +637,6 @@ def _correct_splitlines_in_headers(markers, lines):
updated_markers = ""
i = 0
in_header_block = False
for m in markers:
# Only set in_header_block flag when we hit an 's' and line is a header
if m == 's':

View File

@@ -62,7 +62,7 @@ RE_SIGNATURE_CANDIDATE = re.compile(r'''
def extract_signature(msg_body):
'''
"""
Analyzes message for a presence of signature block (by common patterns)
and returns tuple with two elements: message text without signature block
and the signature itself.
@@ -72,7 +72,7 @@ def extract_signature(msg_body):
>>> extract_signature('Hey man!')
('Hey man!', None)
'''
"""
try:
# identify line delimiter first
delimiter = get_delimiter(msg_body)

View File

@@ -0,0 +1 @@

View File

@@ -131,7 +131,7 @@ def html_tree_to_text(tree):
for el in tree.iter():
el_text = (el.text or '') + (el.tail or '')
if len(el_text) > 1:
if el.tag in _BLOCKTAGS:
if el.tag in _BLOCKTAGS + _HARDBREAKS:
text += "\n"
if el.tag == 'li':
text += " * "
@@ -142,7 +142,8 @@ def html_tree_to_text(tree):
if href:
text += "(%s) " % href
if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
if (el.tag in _HARDBREAKS and text and
not text.endswith("\n") and not el_text):
text += "\n"
retval = _rm_excessive_newlines(text)

View File

@@ -1,6 +1,4 @@
from __future__ import absolute_import
from nose.tools import *
from mock import *
import talon

View File

@@ -2,12 +2,11 @@
from __future__ import absolute_import
# noinspection PyUnresolvedReferences
import re
from tests.fixtures import REPLY_QUOTATIONS_SHARE_BLOCK, OLK_SRC_BODY_SECTION, REPLY_SEPARATED_BY_HR
from nose.tools import eq_, ok_, assert_false, assert_true
from talon import quotations, utils as u
from . import *
from .fixtures import *
from mock import Mock, patch
import re
RE_WHITESPACE = re.compile("\s")
RE_DOUBLE_WHITESPACE = re.compile("\s")
@@ -424,3 +423,23 @@ def test_readable_html_empty():
def test_bad_html():
    # An <html> element with nothing inside must come back exactly as given.
    markup = "<html></html>"
    result = quotations.extract_from_html(markup)
    eq_(markup, result)
def test_remove_namespaces():
    # Markup using the office "o:" prefix; extraction is expected to turn
    # <o:p> into plain <p> and drop the prefixed xmlns:o declaration, while
    # the un-prefixed xmlns attribute is still present in the output.
    html_with_namespaces = """
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
<body>
<o:p>Dear Sir,</o:p>
<o:p>Thank you for the email.</o:p>
<blockquote>thing</blockquote>
</body>
</html>
"""

    output = quotations.extract_from_html(html_with_namespaces)

    # Fragments that must survive.
    for kept in ("<p>", "xmlns"):
        assert_true(kept in output)

    # Fragments that must be gone.
    for stripped in ("<o:p>", "<xmlns:o>"):
        assert_true(stripped not in output)

View File

@@ -1,10 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
from mock import Mock, patch
from talon import quotations
from nose.tools import eq_
@patch.object(quotations, 'extract_from_html')

View File

@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from .. import *
from nose.tools import eq_
from talon.signature import bruteforce
from mock import patch, Mock
def test_empty_body():

View File

@@ -2,14 +2,14 @@
from __future__ import absolute_import
import os
from six.moves import range
from talon.signature import bruteforce, extraction, extract
from talon.signature import extraction as e
from talon.signature.learning import dataset
from .. import *
from nose.tools import eq_
from .. import STRIPPED, UNICODE_MSG
from six.moves import range
from mock import patch
import os
def test_message_shorter_SIGNATURE_MAX_LINES():

View File

@@ -1,14 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import *
import os
from numpy import genfromtxt
from talon.signature.learning import dataset as d
from ... import EML_MSG_FILENAME, MSG_FILENAME_WITH_BODY_SUFFIX, TMP_DIR, EMAILS_DIR
from talon.signature.learning.featurespace import features
from talon.signature.learning import dataset as d
from nose.tools import eq_, assert_false, ok_
from numpy import genfromtxt
import os
def test_is_sender_filename():

View File

@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import *
from talon.signature.learning import featurespace as fs
from nose.tools import eq_, assert_false, ok_
from mock import patch
def test_apply_features():

View File

@@ -1,13 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from ... import *
import regex as re
from talon.signature.learning import helpers as h
from talon.signature.learning.helpers import *
from talon.signature.learning.helpers import RE_RELAX_PHONE, RE_NAME
from nose.tools import eq_, ok_, assert_false, assert_in
from mock import patch, Mock
from six.moves import range
import re
# First testing regex constants.
VALID = '''

View File

@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from . import *
from . fixtures import *
import os
import email.iterators
from tests.fixtures import STANDARD_REPLIES
from talon import quotations
import six
from six.moves import range
from six import StringIO
from nose.tools import eq_
from mock import patch
import email.iterators
import six
import os
@patch.object(quotations, 'MAX_LINES_COUNT', 1)
@@ -35,6 +34,7 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_polymail():
msg_body = """Test reply
@@ -190,14 +190,17 @@ Test"""
eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
def test_english_original_message():
    # Run each English indicator through the shared pattern checker.
    for indicator in ('Original Message', 'Reply Message'):
        _check_pattern_original_message(indicator)
def test_german_original_message():
    # Run each German indicator through the shared pattern checker.
    for indicator in (u'Ursprüngliche Nachricht', 'Antwort Nachricht'):
        _check_pattern_original_message(indicator)
def test_danish_original_message():
_check_pattern_original_message('Oprindelig meddelelse')
@@ -296,6 +299,7 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
> Hello"""
eq_("Hi", quotations.extract_from_plain(msg_body))
def test_with_indent():
msg_body = """YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.
@@ -303,7 +307,8 @@ def test_with_indent():
Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
"""
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.",
quotations.extract_from_plain(msg_body))
def test_short_quotation_with_newline():
@@ -343,6 +348,7 @@ Subject: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_german_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
"""Allo! Follow up MIME!
@@ -355,6 +361,7 @@ Betreff: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_french_multiline_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -367,6 +374,7 @@ Objet : Follow Up
Blah-blah-blah
"""))
def test_french_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -375,6 +383,7 @@ Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@x
Bonjour!"""))
def test_polish_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
u"""Lorem ipsum
@@ -385,6 +394,7 @@ napisał:
Blah!
"""))
def test_danish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
"""Allo! Follow up MIME!
@@ -397,6 +407,7 @@ Emne: The manager has commented on your Loop
Blah-blah-blah
"""))
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
@@ -408,6 +419,7 @@ Till: Isacson Leiff
Blah-blah-blah
"""))
def test_swedish_from_line():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
@@ -416,6 +428,7 @@ Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
@@ -424,6 +437,7 @@ På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud.
@@ -433,6 +447,7 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g
Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
"""))
def test_vietnamese_from_block():
eq_('Hello', quotations.extract_from_plain(
u"""Hello
@@ -442,6 +457,7 @@ Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết:
> Xin chào
"""))
def test_quotation_marker_false_positive():
msg_body = """Visit us now for assistance...
>>> >>> http://www.domain.com <<<
@@ -453,7 +469,8 @@ def test_link_closed_with_quotation_marker_on_new_line():
msg_body = '''8.45am-1pm
From: somebody@example.com
Date: Wed, 16 May 2012 00:15:02 -0600
<http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT
> <bob@example.com <mailto:bob@example.com> >
@@ -494,7 +511,9 @@ def test_from_block_starts_with_date():
msg_body = """Blah
Date: Wed, 16 May 2012 00:15:02 -0600
To: klizhentas@example.com"""
To: klizhentas@example.com
"""
eq_('Blah', quotations.extract_from_plain(msg_body))
@@ -564,11 +583,12 @@ def test_mark_message_lines():
# next line should be marked as splitter
'_____________',
'From: foo@bar.com',
'Date: Wed, 16 May 2012 00:15:02 -0600',
'',
'> Hi',
'',
'Signature']
eq_('tessemet', quotations.mark_message_lines(lines))
eq_('tesssemet', quotations.mark_message_lines(lines))
lines = ['Just testing the email reply',
'',
@@ -807,7 +827,7 @@ def test_split_email():
>
>
"""
expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
expected_markers = "stttttsttttetesetesmmmmmmsmmmmmmmmmmmmmmmm"
markers = quotations.split_emails(msg)
eq_(markers, expected_markers)
@@ -823,3 +843,15 @@ that this line is intact."""
parsed = quotations.extract_from_plain(msg_body)
eq_(msg_body, parsed.decode('utf8'))
def test_appointment():
    # Plain text that merely contains "Date:" / "Time:" lines; the whole
    # body is expected back unchanged from extract_from_plain.
    invitation = """Invitation for an interview:
Date: Wednesday 3, October 2011
Time: 7 : 00am
Address: 130 Fox St
Please bring in your ID."""

    extracted = quotations.extract_from_plain(invitation)
    eq_(invitation, extracted.decode('utf8'))

View File

@@ -2,12 +2,13 @@
from __future__ import absolute_import
from nose.tools import eq_, ok_, assert_false
from talon import utils as u
from mock import patch, Mock
import cchardet
import six
from talon import utils as u
from . import *
def test_get_delimiter():
eq_('\r\n', u.get_delimiter('abc\r\n123'))