1 Commits

Author SHA1 Message Date
Ralph Meijer
2377c387c7 Actually bump up talon's version up to 1.0.5 to match the tag. 2015-09-09 22:46:18 +02:00
16 changed files with 98 additions and 606 deletions

View File

@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
apply to a message (``featurespace.py``), how data sets are built apply to a message (``featurespace.py``), how data sets are built
(``dataset.py``), classifiers interface (``classifier.py``). (``dataset.py``), classifiers interface (``classifier.py``).
Currently the data used for training is taken from our personal email The data used for training is taken from our personal email
conversations and from `ENRON`_ dataset. As a result of applying our set conversations and from `ENRON`_ dataset. As a result of applying our set
of features to the dataset we provide files ``classifier`` and of features to the dataset we provide files ``classifier`` and
``train.data`` that dont have any personal information but could be ``train.data`` that dont have any personal information but could be

View File

@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
setup(name='talon', setup(name='talon',
version='1.2.9', version='1.0.5',
description=("Mailgun library " description=("Mailgun library "
"to extract message quotations and signatures."), "to extract message quotations and signatures."),
long_description=open("README.rst").read(), long_description=open("README.rst").read(),
@@ -14,14 +14,12 @@ setup(name='talon',
include_package_data=True, include_package_data=True,
zip_safe=True, zip_safe=True,
install_requires=[ install_requires=[
"lxml>=2.3.3", "lxml==2.3.3",
"regex>=1", "regex>=1",
"html2text",
"numpy", "numpy",
"scipy", "scipy",
"scikit-learn==0.16.1", # pickled versions of classifier, else rebuild "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild
'chardet>=1.0.1',
'cchardet>=0.3.5',
'cssselect'
], ],
tests_require=[ tests_require=[
"mock", "mock",

View File

@@ -12,7 +12,6 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids) # HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
def add_checkpoint(html_note, counter): def add_checkpoint(html_note, counter):
@@ -77,8 +76,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints):
def cut_gmail_quote(html_message): def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. ''' ''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('div.gmail_quote') gmail_quote = html_message.cssselect('.gmail_quote')
if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0]) gmail_quote[0].getparent().remove(gmail_quote[0])
return True return True
@@ -86,12 +85,9 @@ def cut_gmail_quote(html_message):
def cut_microsoft_quote(html_message): def cut_microsoft_quote(html_message):
''' Cuts splitter block and all following blocks. ''' ''' Cuts splitter block and all following blocks. '''
splitter = html_message.xpath( splitter = html_message.xpath(
#outlook 2007, 2010 (international) #outlook 2007, 2010
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|" "padding:3.0pt 0cm 0cm 0cm']|"
#outlook 2007, 2010 (american)
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0in 0in 0in']|"
#windows mail #windows mail
"//div[@style='padding-top: 5px; " "//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); " "border-top-color: rgb(229, 229, 229); "
@@ -142,14 +138,9 @@ def cut_by_id(html_message):
def cut_blockquote(html_message): def cut_blockquote(html_message):
''' Cuts the last non-nested blockquote with wrapping elements.''' ''' Cuts blockquote with wrapping elements. '''
quote = html_message.xpath( quote = html_message.find('.//blockquote')
'(.//blockquote)' if quote is not None:
'[not(@class="gmail_quote") and not(ancestor::blockquote)]'
'[last()]')
if quote:
quote = quote[0]
quote.getparent().remove(quote) quote.getparent().remove(quote)
return True return True
@@ -163,40 +154,13 @@ def cut_from_block(html_message):
if block: if block:
block = block[-1] block = block[-1]
parent_div = None
while block.getparent() is not None: while block.getparent() is not None:
if block.tag == 'div': if block.tag == 'div':
parent_div = block block.getparent().remove(block)
break
block = block.getparent()
if parent_div is not None:
maybe_body = parent_div.getparent()
# In cases where removing this enclosing div will remove all
# content, we should assume the quote is not enclosed in a tag.
parent_div_is_all_content = (
maybe_body is not None and maybe_body.tag == 'body' and
len(maybe_body.getchildren()) == 1)
if not parent_div_is_all_content:
parent = block.getparent()
next_sibling = block.getnext()
# remove all tags after found From block
# (From block and quoted message are in separate divs)
while next_sibling is not None:
parent.remove(block)
block = next_sibling
next_sibling = block.getnext()
# remove the last sibling (or the
# From block if no siblings)
if block is not None:
parent.remove(block)
return True return True
else: else:
return False block = block.getparent()
else:
# handle the case when From: block goes right after e.g. <hr> # handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag # and not enclosed in some tag
block = html_message.xpath( block = html_message.xpath(
@@ -204,17 +168,7 @@ def cut_from_block(html_message):
"//*[starts-with(mg:tail(), 'Date:')]")) "//*[starts-with(mg:tail(), 'Date:')]"))
if block: if block:
block = block[0] block = block[0]
if RE_FWD.match(block.getparent().text or ''):
return False
while(block.getnext() is not None): while(block.getnext() is not None):
block.getparent().remove(block.getnext()) block.getparent().remove(block.getnext())
block.getparent().remove(block) block.getparent().remove(block)
return True return True
def cut_zimbra_quote(html_message):
zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]')
if zDivider:
zDivider[0].getparent().remove(zDivider[0])
return True

View File

@@ -10,8 +10,9 @@ import logging
from copy import deepcopy from copy import deepcopy
from lxml import html, etree from lxml import html, etree
import html2text
from talon.utils import get_delimiter, html_to_text from talon.utils import get_delimiter
from talon import html_quotations from talon import html_quotations
@@ -21,7 +22,7 @@ log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile( RE_ON_DATE_SMB_WROTE = re.compile(
u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
# Beginning of the line # Beginning of the line
u'|'.join(( u'|'.join((
# English # English
@@ -31,13 +32,7 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
'W dniu', 'W dniu',
# Dutch # Dutch
'Op', 'Op'
# German
'Am',
# Norwegian
u'',
# Swedish, Danish
'Den',
)), )),
# Date and sender separator # Date and sender separator
u'|'.join(( u'|'.join((
@@ -55,28 +50,18 @@ RE_ON_DATE_SMB_WROTE = re.compile(
# Polish # Polish
u'napisał', u'napisał',
# Dutch # Dutch
'schreef','verzond','geschreven', 'schreef','verzond','geschreven'
# German
'schrieb',
# Norwegian, Swedish
'skrev',
)) ))
)) ))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile( RE_ON_DATE_WROTE_SMB = re.compile(
u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format(
# Beginning of the line # Beginning of the line
u'|'.join((
'Op', 'Op',
#German
'Am'
)),
# Ending of the line # Ending of the line
u'|'.join(( u'|'.join((
# Dutch # Dutch
'schreef','verzond','geschreven', 'schreef','verzond','geschreven'
# German
'schrieb'
)) ))
) )
) )
@@ -107,7 +92,7 @@ RE_EMPTY_QUOTATION = re.compile(
( (
# quotation border: splitter line or a number of quotation marker lines # quotation border: splitter line or a number of quotation marker lines
(?: (?:
(?:se*)+ s
| |
(?:me*){2,} (?:me*){2,}
) )
@@ -130,27 +115,20 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
u'|'.join(( u'|'.join((
# "From" in different languages. # "From" in different languages.
'From', 'Van', 'De', 'Von', 'Fra', u'Från', 'From', 'Van', 'De', 'Von', 'Fra',
# "Date" in different languages. # "Date" in different languages.
'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Date', 'Datum', u'Envoyé'
))), re.I) ))), re.I)
SPLITTER_PATTERNS = [ SPLITTER_PATTERNS = [
RE_ORIGINAL_MESSAGE, RE_ORIGINAL_MESSAGE,
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
RE_ON_DATE_WROTE_SMB, RE_ON_DATE_WROTE_SMB,
RE_FROM_COLON_OR_DATE_COLON, RE_FROM_COLON_OR_DATE_COLON,
# 02.04.2012 14:20 пользователь "bob@example.com" <
# bob@xxx.mailgun.org> написал:
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S),
# 2014-10-17 11:28 GMT+03:00 Bob <
# bob@example.com>:
re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S),
# Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>:
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:'), '( \S+){3,6}@\S+:')
# Sent from Samsung MobileName <address@example.com> wrote:
re.compile('Sent from Samsung .*@.*> wrote')
] ]
@@ -203,7 +181,6 @@ def mark_message_lines(lines):
else: else:
# in case splitter is spread across several lines # in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter: if splitter:
# append as many splitter markers as lines in splitter # append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines() splitter_lines = splitter.group().splitlines()
@@ -316,8 +293,12 @@ def extract_from_plain(msg_body):
delimiter = get_delimiter(msg_body) delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter) msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages # don't process too long messages
lines = msg_body.splitlines()[:MAX_LINES_COUNT] if len(lines) > MAX_LINES_COUNT:
return stripped_text
markers = mark_message_lines(lines) markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers) lines = process_marked_lines(lines, markers)
@@ -343,28 +324,43 @@ def extract_from_html(msg_body):
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting necessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '': if msg_body.strip() == '':
return msg_body return msg_body
msg_body = msg_body.replace('\r\n', '').replace('\n', '')
html_tree = html.document_fromstring( html_tree = html.document_fromstring(
msg_body, msg_body,
parser=html.HTMLParser(encoding="utf-8") parser=html.HTMLParser(encoding="utf-8")
) )
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_zimbra_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree) html_quotations.cut_from_block(html_tree)
) )
html_tree_copy = deepcopy(html_tree) html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False] * number_of_checkpoints quotation_checkpoints = [False] * number_of_checkpoints
msg_with_checkpoints = html.tostring(html_tree) msg_with_checkpoints = html.tostring(html_tree)
plain_text = html_to_text(msg_with_checkpoints)
plain_text = preprocess(plain_text, '\n', content_type='text/html') h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines() lines = plain_text.splitlines()
# Don't process too long messages # Don't process too long messages
@@ -386,6 +382,7 @@ def extract_from_html(msg_body):
return_flags = [] return_flags = []
process_marked_lines(lines, markers, return_flags) process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted: if lines_were_deleted:
#collect checkpoints from deleted lines #collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted): for i in xrange(first_deleted, last_deleted):

Binary file not shown.

View File

@@ -16,7 +16,7 @@ from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile rc = re.compile
RE_EMAIL = rc('\S@\S') RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
@@ -120,7 +120,7 @@ def contains_sender_names(sender):
names = names or sender names = names or sender
if names != '': if names != '':
return binary_regex_search(re.compile(names)) return binary_regex_search(re.compile(names))
return lambda s: 0 return lambda s: False
def extract_names(sender): def extract_names(sender):
@@ -134,7 +134,7 @@ def extract_names(sender):
>>> extract_names('') >>> extract_names('')
[] []
""" """
sender = to_unicode(sender, precise=True) sender = to_unicode(sender)
# Remove non-alphabetical characters # Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender]) sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e. # Remove too short words and words from "black" list i.e.
@@ -161,7 +161,7 @@ def categories_percent(s, categories):
50.0 50.0
''' '''
count = 0 count = 0
s = to_unicode(s, precise=True) s = to_unicode(s)
for c in s: for c in s:
if unicodedata.category(c) in categories: if unicodedata.category(c) in categories:
count += 1 count += 1
@@ -181,7 +181,7 @@ def punctuation_percent(s):
def capitalized_words_percent(s): def capitalized_words_percent(s):
'''Returns capitalized words percent.''' '''Returns capitalized words percent.'''
s = to_unicode(s, precise=True) s = to_unicode(s)
words = re.split('\s', s) words = re.split('\s', s)
words = [w for w in words if w.strip()] words = [w for w in words if w.strip()]
capitalized_words_counter = 0 capitalized_words_counter = 0

View File

@@ -2,16 +2,13 @@
import logging import logging
from random import shuffle from random import shuffle
import chardet
import cchardet
import regex as re
from lxml import html
from lxml.cssselect import CSSSelector
from talon.constants import RE_DELIMITER from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs): def safe_format(format_string, *args, **kwargs):
""" """
Helper: formats string with any combination of bytestrings/unicode Helper: formats string with any combination of bytestrings/unicode
@@ -45,42 +42,12 @@ def to_unicode(str_or_unicode, precise=False):
u'привет' u'привет'
If `precise` flag is True, tries to guess the correct encoding first. If `precise` flag is True, tries to guess the correct encoding first.
""" """
encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str): if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace') return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode return str_or_unicode
def detect_encoding(string):
"""
Tries to detect the encoding of the passed string.
Defaults to UTF-8.
"""
try:
detected = chardet.detect(string)
if detected:
return detected.get('encoding') or 'utf-8'
except Exception, e:
pass
return 'utf-8'
def quick_detect_encoding(string):
"""
Tries to detect the encoding of the passed string.
Uses cchardet. Fallbacks to detect_encoding.
"""
try:
detected = cchardet.detect(string)
if detected:
return detected.get('encoding') or detect_encoding(string)
except Exception, e:
pass
return detect_encoding(string)
def to_utf8(str_or_unicode): def to_utf8(str_or_unicode):
""" """
Safely returns a UTF-8 version of a given string Safely returns a UTF-8 version of a given string
@@ -107,81 +74,3 @@ def get_delimiter(msg_body):
delimiter = '\n' delimiter = '\n'
return delimiter return delimiter
def html_to_text(string):
"""
Dead-simple HTML-to-text converter:
>>> html_to_text("one<br>two<br>three")
>>> "one\ntwo\nthree"
NOTES:
1. the string is expected to contain UTF-8 encoded HTML!
2. returns utf-8 encoded str (not unicode)
"""
s = _prepend_utf8_declaration(string)
s = s.replace("\n", "")
tree = html.fromstring(s)
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
for c in tree.xpath('//comment()'):
c.getparent().remove(c)
text = ""
for el in tree.iter():
el_text = (el.text or '') + (el.tail or '')
if len(el_text) > 1:
if el.tag in _BLOCKTAGS:
text += "\n"
if el.tag == 'li':
text += " * "
text += el_text.strip() + " "
# add href to the output
href = el.attrib.get('href')
if href:
text += "(%s) " % href
if el.tag in _HARDBREAKS and text and not text.endswith("\n"):
text += "\n"
retval = _rm_excessive_newlines(text)
return _encode_utf8(retval)
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
return s.lower().find('html; charset=', 0, 4096) != -1
def _prepend_utf8_declaration(s):
"""Prepend 'utf-8' encoding declaration if the first 4KB don't have any
"""
return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s
def _rm_excessive_newlines(s):
"""Remove excessive newlines that often happen due to tons of divs
"""
return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip()
def _encode_utf8(s):
"""Encode in 'utf-8' if unicode
"""
return s.encode('utf-8') if isinstance(s, unicode) else s
_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
'charset=utf-8">')
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
_HARDBREAKS = ['br', 'hr', 'tr']
_RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}")

View File

@@ -1,4 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<html> <html>
<head> <head>
<style><!-- <style><!--

View File

@@ -1,87 +0,0 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<style><!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0in;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
h3
{mso-style-priority:9;
mso-style-link:"Heading 3 Char";
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:13.5pt;
font-family:"Times New Roman","serif";
font-weight:bold;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
p
{mso-style-priority:99;
mso-margin-top-alt:auto;
margin-right:0in;
mso-margin-bottom-alt:auto;
margin-left:0in;
font-size:12.0pt;
font-family:"Times New Roman","serif";}
span.Heading3Char
{mso-style-name:"Heading 3 Char";
mso-style-priority:9;
mso-style-link:"Heading 3";
font-family:"Cambria","serif";
color:#4F81BD;
font-weight:bold;}
span.EmailStyle19
{mso-style-type:personal-reply;
font-family:"Calibri","sans-serif";
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-family:"Calibri","sans-serif";}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026" />
</xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]-->
</head>
<body lang="EN-US" link="blue" vlink="purple">
<div class="WordSection1">
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Hi. I am fine.<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Thanks,<o:p></o:p></span></p>
<p class="MsoNormal"><span style="font-size:11.0pt;font-family:&quot;Calibri&quot;,&quot;sans-serif&quot;;color:#1F497D">Alex<o:p></o:p></span></p>
<p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;">From:</span></b><span style="font-size:10.0pt;font-family:&quot;Tahoma&quot;,&quot;sans-serif&quot;"> Foo [mailto:foo@bar.com]
<b>On Behalf Of </b>baz@bar.com<br>
<b>Sent:</b> Monday, January 01, 2000 12:00 AM<br>
<b>To:</b> john@bar.com<br>
<b>Cc:</b> jane@bar.io<br>
<b>Subject:</b> Conversation<o:p></o:p></span></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
<p>Hello! How are you?<o:p></o:p></p>
<p class="MsoNormal"><o:p>&nbsp;</o:p></p>
</div>
</body>
</html>

View File

@@ -1,19 +0,0 @@
Content-Type: text/plain;
charset=us-ascii
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\))
Subject: Re: Hello there
X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4
From: Adam Renberg <adam@tictail.com>
In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
Date: Sat, 22 Aug 2015 19:22:20 +0200
Content-Transfer-Encoding: 7bit
X-Smtp-Server: smtp.gmail.com:adam@tictail.com
Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com>
References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com>
To: Adam Renberg <tgwizard@gmail.com>
Hello
> On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote:
>
> Hi there!

View File

@@ -5,7 +5,9 @@ from . fixtures import *
import regex as re import regex as re
from talon import quotations, utils as u from talon import quotations
import html2text
RE_WHITESPACE = re.compile("\s") RE_WHITESPACE = re.compile("\s")
@@ -43,25 +45,7 @@ def test_quotation_splitter_outside_blockquote():
</div> </div>
</blockquote> </blockquote>
""" """
eq_("<html><body><p>Reply</p></body></html>", eq_("<html><body><p>Reply</p><div></div></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_regular_blockquote():
msg_body = """Reply
<blockquote>Regular</blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<blockquote>
<div>
<blockquote>Nested</blockquote>
</div>
</blockquote>
"""
eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -131,29 +115,6 @@ def test_gmail_quote():
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_compact():
msg_body = 'Reply' \
'<div class="gmail_quote">' \
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:' \
'<div>Test</div>' \
'</div>' \
'</div>'
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote():
msg_body = """Message
<blockquote class="gmail_quote">
<div class="gmail_default">
My name is William Shakespeare.
<br/>
</div>
</blockquote>"""
eq_(RE_WHITESPACE.sub('', msg_body),
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_unicode_in_reply(): def test_unicode_in_reply():
msg_body = u"""Reply \xa0 \xa0 Text<br> msg_body = u"""Reply \xa0 \xa0 Text<br>
@@ -161,7 +122,7 @@ def test_unicode_in_reply():
<br> <br>
</div> </div>
<blockquote> <blockquote class="gmail_quote">
Quote Quote
</blockquote>""".encode("utf-8") </blockquote>""".encode("utf-8")
@@ -279,35 +240,26 @@ def test_reply_separated_by_hr():
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
def test_from_block_and_quotations_in_separate_divs(): RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$")
msg_body = '''
Reply
<div>
<hr/>
<div>
<font>
<b>From: bob@example.com</b>
<b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
</font>
</div>
<div>
Quoted message
</div>
</div>
'''
eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def extract_reply_and_check(filename): def extract_reply_and_check(filename):
f = open(filename) f = open(filename)
msg_body = f.read() msg_body = f.read().decode("utf-8")
reply = quotations.extract_from_html(msg_body) reply = quotations.extract_from_html(msg_body)
plain_reply = u.html_to_text(reply)
eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), h = html2text.HTML2Text()
RE_WHITESPACE.sub('', plain_reply)) h.body_width = 0
plain_reply = h.handle(reply)
#remove &nbsp; spaces
plain_reply = plain_reply.replace(u'\xa0', u' ')
if RE_REPLY.match(plain_reply):
eq_(1, 1)
else:
eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply)
def test_gmail_reply(): def test_gmail_reply():
@@ -330,10 +282,6 @@ def test_ms_outlook_2007_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html")
def test_ms_outlook_2010_reply():
extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html")
def test_thunderbird_reply(): def test_thunderbird_reply():
extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html")
@@ -344,37 +292,3 @@ def test_windows_mail_reply():
def test_yandex_ru_reply(): def test_yandex_ru_reply():
extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html")
def test_CRLF():
"""CR is not converted to '&#13;'
"""
symbol = '&#13;'
extracted = quotations.extract_from_html('<html>\r\n</html>')
assert_false(symbol in extracted)
eq_('<html></html>', RE_WHITESPACE.sub('', extracted))
msg_body = """Reply
<blockquote>
<div>
On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>
<div>
Test
</div>
</blockquote>"""
msg_body = msg_body.replace('\n', '\r\n')
extracted = quotations.extract_from_html(msg_body)
assert_false(symbol in extracted)
eq_("<html><body><p>Reply</p></body></html>",
RE_WHITESPACE.sub('', extracted))
def test_gmail_forwarded_msg():
msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr">&lt;<a href="mailto:bob@example.com">bob@example.com</a>&gt;</span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary &lt;<a href="mailto:mary@example.com">mary@example.com</a>&gt;<br><br><br><div dir="ltr">eom</div>
</div><br></div>"""
extracted = quotations.extract_from_html(msg_body)
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))

View File

@@ -6,9 +6,7 @@ from talon.signature.learning import featurespace as fs
def test_apply_features(): def test_apply_features():
s = '''This is John Doe s = '''John Doe
Tuesday @3pm suits. I'll chat to you then.
VP Research and Development, Xxxx Xxxx Xxxxx VP Research and Development, Xxxx Xxxx Xxxxx
@@ -21,12 +19,11 @@ john@example.com'''
# note that we don't consider the first line because signatures don't # note that we don't consider the first line because signatures don't
# usually take all the text, empty lines are not considered # usually take all the text, empty lines are not considered
eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): with patch.object(fs, 'SIGNATURE_MAX_LINES', 4):
features = fs.features(sender) features = fs.features(sender)
new_result = fs.apply_features(s, features) new_result = fs.apply_features(s, features)
# result remains the same because we don't consider empty lines # result remains the same because we don't consider empty lines

View File

@@ -12,11 +12,11 @@ from talon import quotations
@patch.object(quotations, 'MAX_LINES_COUNT', 1) @patch.object(quotations, 'MAX_LINES_COUNT', 1)
def test_too_many_lines(): def test_too_many_lines():
msg_body = """Test reply msg_body = """Test reply
Hi
-----Original Message----- -----Original Message-----
Test""" Test"""
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_(msg_body, quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote(): def test_pattern_on_date_somebody_wrote():
@@ -32,19 +32,6 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_sent_from_samsung_smb_wrote():
msg_body = """Test reply
Sent from Samsung MobileName <address@example.com> wrote:
>
> Test
>
> Roman"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_wrote_somebody(): def test_pattern_on_date_wrote_somebody():
eq_('Lorem', quotations.extract_from_plain( eq_('Lorem', quotations.extract_from_plain(
"""Lorem """Lorem
@@ -67,18 +54,6 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote:
eq_("Test reply", quotations.extract_from_plain(msg_body)) eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_date_time_email_splitter():
msg_body = """Test reply
2014-10-17 11:28 GMT+03:00 Postmaster <
postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>:
> First from site
>
"""
eq_("Test reply", quotations.extract_from_plain(msg_body))
def test_pattern_on_date_somebody_wrote_allows_space_in_front(): def test_pattern_on_date_somebody_wrote_allows_space_in_front():
msg_body = """Thanks Thanmai msg_body = """Thanks Thanmai
On Mar 8, 2012 9:59 AM, "Example.com" < On Mar 8, 2012 9:59 AM, "Example.com" <
@@ -336,33 +311,6 @@ Emne: The manager has commented on your Loop
Blah-blah-blah Blah-blah-blah
""")) """))
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
u"""Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
Ämne: RE: Week 36
Blah-blah-blah
"""))
def test_swedish_from_line():
eq_('Lorem', quotations.extract_from_plain(
"""Lorem
Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
u"""Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
def test_dutch_from_block(): def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
"""Gluten-free culpa lo-fi et nesciunt nostrud. """Gluten-free culpa lo-fi et nesciunt nostrud.

View File

@@ -1,107 +1,9 @@
# coding:utf-8
from . import * from . import *
from talon import utils as u from talon import utils
import cchardet
def test_get_delimiter(): def test_get_delimiter():
eq_('\r\n', u.get_delimiter('abc\r\n123')) eq_('\r\n', utils.get_delimiter('abc\r\n123'))
eq_('\n', u.get_delimiter('abc\n123')) eq_('\n', utils.get_delimiter('abc\n123'))
eq_('\n', u.get_delimiter('abc')) eq_('\n', utils.get_delimiter('abc'))
def test_unicode():
eq_ (u'hi', u.to_unicode('hi'))
eq_ (type(u.to_unicode('hi')), unicode )
eq_ (type(u.to_unicode(u'hi')), unicode )
eq_ (type(u.to_unicode('привет')), unicode )
eq_ (type(u.to_unicode(u'привет')), unicode )
eq_ (u"привет", u.to_unicode('привет'))
eq_ (u"привет", u.to_unicode(u'привет'))
# some latin1 stuff
eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True))
def test_detect_encoding():
eq_ ('ascii', u.detect_encoding('qwe').lower())
eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.detect_encoding('привет').lower())
# fallback to utf-8
with patch.object(u.chardet, 'detect') as detect:
detect.side_effect = Exception
eq_ ('utf-8', u.detect_encoding('qwe').lower())
def test_quick_detect_encoding():
eq_ ('ascii', u.quick_detect_encoding('qwe').lower())
eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower())
eq_ ('utf-8', u.quick_detect_encoding('привет').lower())
@patch.object(cchardet, 'detect')
@patch.object(u, 'detect_encoding')
def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect):
cchardet_detect.return_value = {'encoding': 'ascii'}
eq_('ascii', u.quick_detect_encoding("qwe"))
cchardet_detect.assert_called_once_with("qwe")
# fallback to detect_encoding
cchardet_detect.return_value = {}
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
# exception
detect_encoding.reset_mock()
cchardet_detect.side_effect = Exception()
detect_encoding.return_value = 'utf-8'
eq_('utf-8', u.quick_detect_encoding("qwe"))
ok_(detect_encoding.called)
def test_html_to_text():
html = """<body>
<p>Hello world!</p>
<br>
<ul>
<li>One!</li>
<li>Two</li>
</ul>
<p>
Haha
</p>
</body>"""
text = u.html_to_text(html)
eq_("Hello world! \n\n * One! \n * Two \nHaha", text)
eq_("привет!", u.html_to_text("<b>привет!</b>"))
html = '<body><br/><br/>Hi</body>'
eq_ ('Hi', u.html_to_text(html))
html = """Hi
<style type="text/css">
div, p, li {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>
<style type="text/css">
h1 {
font: 13px 'Lucida Grande', Arial, sans-serif;
}
</style>"""
eq_ ('Hi', u.html_to_text(html))
html = """<div>
<!-- COMMENT 1 -->
<span>TEXT 1</span>
<p>TEXT 2 <!-- COMMENT 2 --></p>
</div>"""
eq_('TEXT 1 \nTEXT 2', u.html_to_text(html))