diff --git a/README.rst b/README.rst index c804e61..1f8d7e0 100644 --- a/README.rst +++ b/README.rst @@ -116,8 +116,19 @@ or from talon.signature.learning.classifier import train, init train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) +Open-source Dataset +------------------- + +Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we +used a subset of the `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment, over 190 +emails are annotated. Contributions and collaboration on the project are welcome. Once the dataset is ready, we plan to +start using it in talon. + .. _scikit-learn: http://scikit-learn.org .. _ENRON: https://www.cs.cmu.edu/~enron/ +.. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set +.. _forge: https://github.com/mailgun/forge + Research -------- diff --git a/setup.py b/setup.py index bedb0db..8be1da5 100755 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup(name='talon', - version='1.2.9', + version='1.2.10', description=("Mailgun library " "to extract message quotations and signatures."), long_description=open("README.rst").read(),