Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
35645f9ade | ||
|
|
7c3d91301c | ||
|
|
5bcf7403ad | ||
|
|
2d6c092b65 | ||
|
|
6d0689cad6 | ||
|
|
3f80e93ee0 | ||
|
|
1b18abab1d | ||
|
|
03dd5af5ab | ||
|
|
dfba82b07c | ||
|
|
08ca02c87f | ||
|
|
b61f4ec095 | ||
|
|
9dbe6a494b | ||
|
|
44e70939d6 |
13
README.rst
13
README.rst
@@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in
|
|||||||
apply to a message (``featurespace.py``), how data sets are built
|
apply to a message (``featurespace.py``), how data sets are built
|
||||||
(``dataset.py``), classifier’s interface (``classifier.py``).
|
(``dataset.py``), classifier’s interface (``classifier.py``).
|
||||||
|
|
||||||
The data used for training is taken from our personal email
|
Currently the data used for training is taken from our personal email
|
||||||
conversations and from `ENRON`_ dataset. As a result of applying our set
|
conversations and from `ENRON`_ dataset. As a result of applying our set
|
||||||
of features to the dataset we provide files ``classifier`` and
|
of features to the dataset we provide files ``classifier`` and
|
||||||
``train.data`` that don’t have any personal information but could be
|
``train.data`` that don’t have any personal information but could be
|
||||||
@@ -116,8 +116,19 @@ or
|
|||||||
from talon.signature.learning.classifier import train, init
|
from talon.signature.learning.classifier import train, init
|
||||||
train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
|
train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME)
|
||||||
|
|
||||||
|
Open-source Dataset
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
Recently, we started the `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we
|
||||||
|
used a subset of the `ENRON`_ data, cleansed of private, health, and financial information by `EDRM`_. At the moment, over 190
|
||||||
|
emails are annotated. Contributions and collaboration on the project are welcome. Once the dataset is ready, we plan to
|
||||||
|
start using it for talon.
|
||||||
|
|
||||||
.. _scikit-learn: http://scikit-learn.org
|
.. _scikit-learn: http://scikit-learn.org
|
||||||
.. _ENRON: https://www.cs.cmu.edu/~enron/
|
.. _ENRON: https://www.cs.cmu.edu/~enron/
|
||||||
|
.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set
|
||||||
|
.. _forge: https://github.com/mailgun/forge
|
||||||
|
|
||||||
|
|
||||||
Research
|
Research
|
||||||
--------
|
--------
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.2.6',
|
version='1.2.10',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -86,9 +86,12 @@ def cut_gmail_quote(html_message):
|
|||||||
def cut_microsoft_quote(html_message):
|
def cut_microsoft_quote(html_message):
|
||||||
''' Cuts splitter block and all following blocks. '''
|
''' Cuts splitter block and all following blocks. '''
|
||||||
splitter = html_message.xpath(
|
splitter = html_message.xpath(
|
||||||
#outlook 2007, 2010
|
#outlook 2007, 2010 (international)
|
||||||
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
|
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
|
||||||
"padding:3.0pt 0cm 0cm 0cm']|"
|
"padding:3.0pt 0cm 0cm 0cm']|"
|
||||||
|
#outlook 2007, 2010 (american)
|
||||||
|
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
|
||||||
|
"padding:3.0pt 0in 0in 0in']|"
|
||||||
#windows mail
|
#windows mail
|
||||||
"//div[@style='padding-top: 5px; "
|
"//div[@style='padding-top: 5px; "
|
||||||
"border-top-color: rgb(229, 229, 229); "
|
"border-top-color: rgb(229, 229, 229); "
|
||||||
@@ -175,7 +178,21 @@ def cut_from_block(html_message):
|
|||||||
len(maybe_body.getchildren()) == 1)
|
len(maybe_body.getchildren()) == 1)
|
||||||
|
|
||||||
if not parent_div_is_all_content:
|
if not parent_div_is_all_content:
|
||||||
block.getparent().remove(block)
|
parent = block.getparent()
|
||||||
|
next_sibling = block.getnext()
|
||||||
|
|
||||||
|
# remove all tags after found From block
|
||||||
|
# (From block and quoted message are in separate divs)
|
||||||
|
while next_sibling is not None:
|
||||||
|
parent.remove(block)
|
||||||
|
block = next_sibling
|
||||||
|
next_sibling = block.getnext()
|
||||||
|
|
||||||
|
# remove the last sibling (or the
|
||||||
|
# From block if no siblings)
|
||||||
|
if block is not None:
|
||||||
|
parent.remove(block)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|||||||
@@ -279,6 +279,26 @@ def test_reply_separated_by_hr():
|
|||||||
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
|
'', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
|
||||||
|
|
||||||
|
|
||||||
|
def test_from_block_and_quotations_in_separate_divs():
|
||||||
|
msg_body = '''
|
||||||
|
Reply
|
||||||
|
<div>
|
||||||
|
<hr/>
|
||||||
|
<div>
|
||||||
|
<font>
|
||||||
|
<b>From: bob@example.com</b>
|
||||||
|
<b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
|
||||||
|
</font>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
Quoted message
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
eq_('<html><body><p>Reply</p><div><hr></div></body></html>',
|
||||||
|
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||||
|
|
||||||
|
|
||||||
def extract_reply_and_check(filename):
|
def extract_reply_and_check(filename):
|
||||||
f = open(filename)
|
f = open(filename)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user