7 Commits

Author SHA1 Message Date
Sergey Obukhov
52505bba8a Update README.rst
Clarified that some signature extraction methods require initializing the lib first.
2014-09-14 09:03:10 -07:00
Sergey Obukhov
79cd4fcc52 Merge pull request #15 from willemdelbare/master
added extra splitter expressions for Dutch, French, German
2014-09-14 08:38:39 -07:00
Willem Delbare
a4f156b174 added extra splitter expressions for Dutch, French, German 2014-09-13 15:33:08 +02:00
Sergey Obukhov
1789ccf3c8 Merge branch 'master' of github.com:mailgun/talon 2014-07-24 20:37:47 -07:00
Sergey Obukhov
7a42ab3b28 fix #4 add flanker to setup.py 2014-07-24 20:37:33 -07:00
Sergey Obukhov
12b0e88a01 Merge pull request #5 from pborreli/typos
Fixed typos
2014-07-24 20:32:57 -07:00
Pascal Borreli
8b78da5977 Fixed typos 2014-07-25 02:40:37 +00:00
6 changed files with 26 additions and 17 deletions

View File

@@ -3,7 +3,7 @@ talon
Mailgun library to extract message quotations and signatures. Mailgun library to extract message quotations and signatures.
If you ever tried to parse message quotations or signatures you know that absense of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. Thats what a good quotations and signature parser should be like :smile: If you ever tried to parse message quotations or signatures you know that absence of any formatting standards in this area could make this task a nightmare. Hopefully this library will make your life much easier. The name of the project is inspired by TALON - multipurpose robot designed to perform missions ranging from reconnaissance to combat and operate in a number of hostile environments. Thats what a good quotations and signature parser should be like :smile:
Usage Usage
----- -----
@@ -71,6 +71,11 @@ the power of machine learning algorithms:
.. code:: python .. code:: python
import talon
# don't forget to init the library first
# it loads machine learning classifiers
talon.init()
from talon import signature from talon import signature

View File

@@ -26,7 +26,8 @@ setup(name='talon',
"html2text", "html2text",
"nose==1.2.1", "nose==1.2.1",
"mock", "mock",
"coverage" "coverage",
"flanker"
] ]
) )

View File

@@ -73,6 +73,9 @@ SPLITTER_PATTERNS = [
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
RE_ON_DATE_SMB_WROTE, RE_ON_DATE_SMB_WROTE,
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'), re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Van|Datum):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?De|Date):[*]? .*'),
re.compile('(_+\r?\n)?[\s]*(:?[*]?Von|Datum):[*]? .*'),
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:') '( \S+){3,6}@\S+:')
] ]
@@ -81,7 +84,7 @@ SPLITTER_PATTERNS = [
RE_LINK = re.compile('<(http://[^>]*)>') RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
RE_PARANTHESIS_LINK = re.compile("\(https?://") RE_PARENTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4 SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000 MAX_LINES_COUNT = 1000
@@ -169,8 +172,8 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
# long links could break sequence of quotation lines but they shouldn't # long links could break sequence of quotation lines but they shouldn't
# be considered an inline reply # be considered an inline reply
links = ( links = (
RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or RE_PARENTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip())) RE_PARENTHESIS_LINK.match(lines[inline_reply.start()].strip()))
if not links: if not links:
return_flags[:] = [False, -1, -1] return_flags[:] = [False, -1, -1]
return lines return lines
@@ -197,7 +200,7 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
"""Prepares msg_body for being stripped. """Prepares msg_body for being stripped.
Replaces link brackets so that they couldn't be taken for quotation marker. Replaces link brackets so that they couldn't be taken for quotation marker.
Splits line in two if splitter pattern preceeded by some text on the same Splits line in two if splitter pattern preceded by some text on the same
line (done only for 'On <date> <person> wrote:' pattern). line (done only for 'On <date> <person> wrote:' pattern).
""" """
# normalize links i.e. replace '<', '>' wrapping the link with some symbols # normalize links i.e. replace '<', '>' wrapping the link with some symbols
@@ -213,7 +216,7 @@ def preprocess(msg_body, delimiter, content_type='text/plain'):
msg_body = re.sub(RE_LINK, link_wrapper, msg_body) msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
def splitter_wrapper(splitter): def splitter_wrapper(splitter):
"""Wrapps splitter with new line""" """Wraps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n': if splitter.start() and msg_body[splitter.start() - 1] != '\n':
return '%s%s' % (delimiter, splitter.group()) return '%s%s' % (delimiter, splitter.group())
else: else:
@@ -268,7 +271,7 @@ def extract_from_html(msg_body):
then converting html to text, then converting html to text,
then extracting quotations from text, then extracting quotations from text,
then checking deleted checkpoints, then checking deleted checkpoints,
then deleting neccessary tags. then deleting necessary tags.
""" """
if msg_body.strip() == '': if msg_body.strip() == '':

View File

@@ -49,7 +49,7 @@ RE_PHONE_SIGNATURE = re.compile(r'''
# c - could be signature line # c - could be signature line
# d - line starts with dashes (could be signature or list item) # d - line starts with dashes (could be signature or list item)
# l - long line # l - long line
RE_SIGNATURE_CANDIDAATE = re.compile(r''' RE_SIGNATURE_CANDIDATE = re.compile(r'''
(?P<candidate>c+d)[^d] (?P<candidate>c+d)[^d]
| |
(?P<candidate>c+d)$ (?P<candidate>c+d)$
@@ -184,5 +184,5 @@ def _process_marked_candidate_indexes(candidate, markers):
>>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc') >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
[15, 17] [15, 17]
""" """
match = RE_SIGNATURE_CANDIDAATE.match(markers[::-1]) match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
return candidate[-match.end('candidate'):] if match else [] return candidate[-match.end('candidate'):] if match else []

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" The module provides functions for convertion of a message body/body lines """ The module provides functions for conversion of a message body/body lines
into classifiers features space. into classifiers features space.
The body and the message sender string are converted into unicode before The body and the message sender string are converted into unicode before
@@ -47,9 +47,9 @@ def apply_features(body, features):
'''Applies features to message body lines. '''Applies features to message body lines.
Returns list of lists. Each of the lists corresponds to the body line Returns list of lists. Each of the lists corresponds to the body line
and is constituted by the numbers of features occurances (0 or 1). and is constituted by the numbers of features occurrences (0 or 1).
E.g. if element j of list i equals 1 this means that E.g. if element j of list i equals 1 this means that
feature j occured in line i (counting from the last line of the body). feature j occurred in line i (counting from the last line of the body).
''' '''
# collect all non empty lines # collect all non empty lines
lines = [line for line in body.splitlines() if line.strip()] lines = [line for line in body.splitlines() if line.strip()]
@@ -66,7 +66,7 @@ def build_pattern(body, features):
'''Converts body into a pattern i.e. a point in the features space. '''Converts body into a pattern i.e. a point in the features space.
Applies features to the body lines and sums up the results. Applies features to the body lines and sums up the results.
Elements of the pattern indicate how many times a certain feature occured Elements of the pattern indicate how many times a certain feature occurred
in the last lines of the body. in the last lines of the body.
''' '''
line_patterns = apply_features(body, features) line_patterns = apply_features(body, features)

View File

@@ -94,7 +94,7 @@ def binary_regex_match(prog):
def flatten_list(list_to_flatten): def flatten_list(list_to_flatten):
"""Simple list comprehesion to flatten list. """Simple list comprehension to flatten list.
>>> flatten_list([[1, 2], [3, 4, 5]]) >>> flatten_list([[1, 2], [3, 4, 5]])
[1, 2, 3, 4, 5] [1, 2, 3, 4, 5]
@@ -155,7 +155,7 @@ def extract_names(sender):
def categories_percent(s, categories): def categories_percent(s, categories):
'''Returns category characters persent. '''Returns category characters percent.
>>> categories_percent("qqq ggg hhh", ["Po"]) >>> categories_percent("qqq ggg hhh", ["Po"])
0.0 0.0
@@ -177,7 +177,7 @@ def categories_percent(s, categories):
def punctuation_percent(s): def punctuation_percent(s):
'''Returns punctuation persent. '''Returns punctuation percent.
>>> punctuation_percent("qqq ggg hhh") >>> punctuation_percent("qqq ggg hhh")
0.0 0.0