initial commit

Sergey Obukhov
2014-07-23 21:12:54 -07:00
commit 170f11038b
80 changed files with 7481 additions and 0 deletions

7
talon/__init__.py Normal file

@@ -0,0 +1,7 @@
from talon.quotations import register_xpath_extensions
from talon import signature
def init():
register_xpath_extensions()
signature.initialize()
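
For context, a minimal usage sketch (illustrative, not part of this commit; it assumes the package is installed together with its lxml/PyML dependencies and the shipped classifier data):

>>> import talon
>>> from talon import quotations
>>> talon.init()  # register XPath extensions, load the signature classifiers
>>> quotations.extract_from('Hi\n\nOn Tue, Bob <bob@example.com> wrote:\n\n> old text', 'text/plain')
'Hi'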

4
talon/constants.py Normal file

@@ -0,0 +1,4 @@
import regex as re
RE_DELIMITER = re.compile('\r?\n')

174
talon/html_quotations.py Normal file

@@ -0,0 +1,174 @@
"""
The module's functions operate on message bodies trying to extract original
messages (without quoted messages) from html
"""
import regex as re
CHECKPOINT_PREFIX = '#!%!'
CHECKPOINT_SUFFIX = '!%!#'
CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
def add_checkpoint(html_note, counter):
"""Recursively adds checkpoints to html tree.
"""
if html_note.text:
html_note.text = (html_note.text + CHECKPOINT_PREFIX +
str(counter) + CHECKPOINT_SUFFIX)
else:
html_note.text = (CHECKPOINT_PREFIX + str(counter) +
CHECKPOINT_SUFFIX)
counter += 1
for child in html_note.iterchildren():
counter = add_checkpoint(child, counter)
if html_note.tail:
html_note.tail = (html_note.tail + CHECKPOINT_PREFIX +
str(counter) + CHECKPOINT_SUFFIX)
else:
html_note.tail = (CHECKPOINT_PREFIX + str(counter) +
CHECKPOINT_SUFFIX)
counter += 1
return counter
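
Illustrative sketch of what add_checkpoint produces (not part of the file): it annotates every text and tail slot in document order and returns the checkpoint count.

>>> from lxml import html
>>> tree = html.fragment_fromstring('<div>reply<b>quote</b>tail</div>')
>>> add_checkpoint(tree, 0)
4
>>> html.tostring(tree)
'<div>reply#!%!0!%!#<b>quote#!%!1!%!#</b>tail#!%!2!%!#</div>#!%!3!%!#'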
def delete_quotation_tags(html_note, counter, quotation_checkpoints):
"""Deletes tags with quotation checkpoints from html tree.
"""
tag_in_quotation = True
if quotation_checkpoints[counter]:
html_note.text = ''
else:
tag_in_quotation = False
counter += 1
quotation_children = [] # Children tags which are in quotation.
for child in html_note.iterchildren():
counter, child_tag_in_quotation = delete_quotation_tags(
child, counter,
quotation_checkpoints
)
if child_tag_in_quotation:
quotation_children.append(child)
if quotation_checkpoints[counter]:
html_note.tail = ''
else:
tag_in_quotation = False
counter += 1
if tag_in_quotation:
return counter, tag_in_quotation
else:
# Remove quotation children.
for child in quotation_children:
html_note.remove(child)
return counter, tag_in_quotation
def cut_gmail_quote(html_message):
''' Cuts the outermost block element with class gmail_quote. '''
gmail_quote = html_message.cssselect('.gmail_quote')
if gmail_quote:
gmail_quote[0].getparent().remove(gmail_quote[0])
return True
def cut_microsoft_quote(html_message):
''' Cuts splitter block and all following blocks. '''
splitter = html_message.xpath(
#outlook 2007, 2010
"//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
"padding:3.0pt 0cm 0cm 0cm']|"
#windows mail
"//div[@style='padding-top: 5px; "
"border-top-color: rgb(229, 229, 229); "
"border-top-width: 1px; border-top-style: solid;']"
)
if splitter:
splitter = splitter[0]
#outlook 2010
if splitter == splitter.getparent().getchildren()[0]:
splitter = splitter.getparent()
else:
#outlook 2003
splitter = html_message.xpath(
"//div"
"/div[@class='MsoNormal' and @align='center' "
"and @style='text-align:center']"
"/font"
"/span"
"/hr[@size='3' and @width='100%' and @align='center' "
"and @tabindex='-1']"
)
if len(splitter):
splitter = splitter[0]
splitter = splitter.getparent().getparent()
splitter = splitter.getparent().getparent()
if len(splitter):
parent = splitter.getparent()
after_splitter = splitter.getnext()
while after_splitter is not None:
parent.remove(after_splitter)
after_splitter = splitter.getnext()
parent.remove(splitter)
return True
return False
def cut_by_id(html_message):
found = False
for quote_id in QUOTE_IDS:
quote = html_message.cssselect('#{}'.format(quote_id))
if quote:
found = True
quote[0].getparent().remove(quote[0])
return found
def cut_blockquote(html_message):
''' Cuts blockquote with wrapping elements. '''
quote = html_message.find('.//blockquote')
if quote is not None:
quote.getparent().remove(quote)
return True
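
Illustrative sketch of the cut helpers (not part of the file; cssselect support for lxml is assumed):

>>> from lxml import html
>>> tree = html.document_fromstring(
...     '<html><body>reply<div class="gmail_quote">quoted</div></body></html>')
>>> cut_gmail_quote(tree)
True
>>> html.tostring(tree)
'<html><body>reply</body></html>'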
def cut_from_block(html_message):
"""Cuts div tag which wraps block starting with "From:"."""
# handle the case when From: block is enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:text_content(), 'From:')]|"
"//*[starts-with(mg:text_content(), 'Date:')]"))
if block:
block = block[-1]
while block.getparent() is not None:
if block.tag == 'div':
block.getparent().remove(block)
return True
else:
block = block.getparent()
else:
# handle the case when From: block goes right after e.g. <hr>
# and not enclosed in some tag
block = html_message.xpath(
("//*[starts-with(mg:tail(), 'From:')]|"
"//*[starts-with(mg:tail(), 'Date:')]"))
if block:
block = block[0]
        while block.getnext() is not None:
block.getparent().remove(block.getnext())
block.getparent().remove(block)
return True

376
talon/quotations.py Normal file

@@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-
"""
The module's functions operate on message bodies trying to extract
original messages (without quoted messages)
"""
import regex as re
import logging
from copy import deepcopy
from lxml import html, etree
import html2text
from talon.constants import RE_DELIMITER
from talon.utils import random_token, get_delimiter
from talon import html_quotations
log = logging.getLogger(__name__)
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
r'''
(
-* # could include dashes
[ ]?On[ ].*, # date part ends with comma
(.*\n){0,2} # splitter takes 4 lines at most
.*(wrote|sent):
)
''', re.VERBOSE)
RE_QUOTATION = re.compile(
r'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
s
|
(?:me*){2,}
)
# quotation lines could be marked as splitter or text, etc.
.*
# but we expect it to end with a quotation marker line
me*
)
# after quotations should be text only or nothing at all
[te]*$
''', re.VERBOSE)
RE_EMPTY_QUOTATION = re.compile(
r'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
s
|
(?:me*){2,}
)
)
e*
''', re.VERBOSE)
SPLITTER_PATTERNS = [
# ------Original Message------ or ---- Reply Message ----
re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
# <date> <person>
re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
RE_ON_DATE_SMB_WROTE,
re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
'( \S+){3,6}@\S+:')
]
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')
RE_PARANTHESIS_LINK = re.compile("\(https?://")
SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000
QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
def extract_from(msg_body, content_type='text/plain'):
    """Extracts the original message (without quotations) from msg_body.
    Dispatches on content_type; on any error returns msg_body unchanged.
    """
try:
if content_type == 'text/plain':
return extract_from_plain(msg_body)
elif content_type == 'text/html':
return extract_from_html(msg_body)
    except Exception:
log.exception('ERROR extracting message')
return msg_body
def mark_message_lines(lines):
"""Mark message lines with markers to distinguish quotation lines.
Markers:
    * e - empty line
    * f - line that matches the forwarded-message pattern (RE_FWD)
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * t - presumably lines from the last message in the conversation
>>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
'tsem'
"""
markers = bytearray(len(lines))
i = 0
while i < len(lines):
if not lines[i].strip():
markers[i] = 'e' # empty line
elif QUOT_PATTERN.match(lines[i]):
markers[i] = 'm' # line with quotation marker
elif RE_FWD.match(lines[i]):
markers[i] = 'f' # ---- Forwarded message ----
else:
# in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
if splitter:
# append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines()
for j in xrange(len(splitter_lines)):
markers[i + j] = 's'
# skip splitter lines
i += len(splitter_lines) - 1
else:
# probably the line from the last message in the conversation
markers[i] = 't'
i += 1
return markers
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.
    Returns only the last message's lines.
    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
    ['Hello']
    The outcome is also reported through the mutable `return_flags` argument:
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    # if there is no splitter there should be no quotation markers
if 's' not in markers and not re.search('(me*){3}', markers):
markers = markers.replace('m', 't')
if re.match('[te]*f', markers):
return_flags[:] = [False, -1, -1]
return lines
# inlined reply
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
# both 't' entries should be found
for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
# long links could break sequence of quotation lines but they shouldn't
# be considered an inline reply
links = (
RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip()))
if not links:
return_flags[:] = [False, -1, -1]
return lines
# cut out text lines coming after splitter if there are no markers there
quotation = re.search('(se*)+((t|f)+e*)+', markers)
if quotation:
return_flags[:] = [True, quotation.start(), len(lines)]
return lines[:quotation.start()]
# handle the case with markers
quotation = (RE_QUOTATION.search(markers) or
RE_EMPTY_QUOTATION.search(markers))
if quotation:
return_flags[:] = True, quotation.start(1), quotation.end(1)
return lines[:quotation.start(1)] + lines[quotation.end(1):]
return_flags[:] = [False, -1, -1]
return lines
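
Illustrative run of the two-step pipeline (not part of the file): mark the lines first, then strip the marked quotation.

>>> lines = ['Thanks!', '', 'On Tue, Bob <bob@example.com> wrote:', '> we should meet']
>>> markers = mark_message_lines(lines)  # bytearray(b'tesm')
>>> process_marked_lines(lines, markers)
['Thanks!', '']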
def preprocess(msg_body, delimiter, content_type='text/plain'):
"""Prepares msg_body for being stripped.
Replaces link brackets so that they couldn't be taken for quotation marker.
Splits line in two if splitter pattern preceeded by some text on the same
line (done only for 'On <date> <person> wrote:' pattern).
"""
# normalize links i.e. replace '<', '>' wrapping the link with some symbols
# so that '>' closing the link couldn't be mistakenly taken for quotation
# marker.
def link_wrapper(link):
newline_index = msg_body[:link.start()].rfind("\n")
if msg_body[newline_index + 1] == ">":
return link.group()
else:
return "@@%s@@" % link.group(1)
msg_body = re.sub(RE_LINK, link_wrapper, msg_body)
def splitter_wrapper(splitter):
"""Wrapps splitter with new line"""
if splitter.start() and msg_body[splitter.start() - 1] != '\n':
return '%s%s' % (delimiter, splitter.group())
else:
return splitter.group()
if content_type == 'text/plain':
msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)
return msg_body
def postprocess(msg_body):
"""Make up for changes done at preprocessing message.
Replace link brackets back to '<' and '>'.
"""
return re.sub(RE_NORMALIZED_LINK, r'<\1>', msg_body).strip()
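
Illustrative round trip (not part of the file): preprocess masks link brackets, postprocess restores them.

>>> preprocess('See <http://example.com/docs> for details', '\n')
'See @@http://example.com/docs@@ for details'
>>> postprocess('See @@http://example.com/docs@@ for details')
'See <http://example.com/docs> for details'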
def extract_from_plain(msg_body):
"""Extracts a non quoted message from provided plain text."""
stripped_text = msg_body
delimiter = get_delimiter(msg_body)
msg_body = preprocess(msg_body, delimiter)
lines = msg_body.splitlines()
# don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return stripped_text
markers = mark_message_lines(lines)
lines = process_marked_lines(lines, markers)
# concatenate lines, change links back, strip and return
msg_body = delimiter.join(lines)
msg_body = postprocess(msg_body)
return msg_body
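
End-to-end sketch for plain text (illustrative, not part of the file):

>>> extract_from_plain('Hello,\n\nOn Tue, 2014 at 2 PM, Bob <bob@example.com> wrote:\n\n> Previous email')
'Hello,'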
def extract_from_html(msg_body):
"""
    Extract a non-quoted message from the provided HTML message body
    using tags and the plain text algorithm.
    Cuts out the 'blockquote' and 'gmail_quote' tags.
    Cuts Microsoft quotations.
    Then uses the plain text algorithm to cut out a splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
"""
if msg_body.strip() == '':
return msg_body
html_tree = html.document_fromstring(
msg_body,
parser=html.HTMLParser(encoding="utf-8")
)
cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
html_quotations.cut_blockquote(html_tree) or
html_quotations.cut_microsoft_quote(html_tree) or
html_quotations.cut_by_id(html_tree) or
html_quotations.cut_from_block(html_tree)
)
html_tree_copy = deepcopy(html_tree)
number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
msg_with_checkpoints = html.tostring(html_tree)
h = html2text.HTML2Text()
h.body_width = 0 # generate plain text without wrap
# html2text adds unnecessary star symbols. Remove them.
# Mask star symbols
msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
plain_text = h.handle(msg_with_checkpoints)
# Remove created star symbols
plain_text = plain_text.replace('*', '')
# Unmask saved star symbols
plain_text = plain_text.replace('3423oorkg432', '*')
delimiter = get_delimiter(plain_text)
plain_text = preprocess(plain_text, delimiter, content_type='text/html')
lines = plain_text.splitlines()
# Don't process too long messages
if len(lines) > MAX_LINES_COUNT:
return msg_body
# Collect checkpoints on each line
line_checkpoints = [
[int(i[4:-4]) # Only checkpoint number
for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
for line in lines]
# Remove checkpoints
lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
for line in lines]
# Use plain text quotation extracting algorithm
markers = mark_message_lines(lines)
return_flags = []
process_marked_lines(lines, markers, return_flags)
lines_were_deleted, first_deleted, last_deleted = return_flags
if lines_were_deleted:
#collect checkpoints from deleted lines
for i in xrange(first_deleted, last_deleted):
for checkpoint in line_checkpoints[i]:
quotation_checkpoints[checkpoint] = True
else:
if cut_quotations:
return html.tostring(html_tree_copy)
else:
return msg_body
# Remove tags with quotation checkpoints
html_quotations.delete_quotation_tags(
html_tree_copy, 0, quotation_checkpoints
)
return html.tostring(html_tree_copy)
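
End-to-end sketch for HTML (illustrative; the exact serialization may vary with the lxml version):

>>> extract_from_html('<html><body>Reply<blockquote>Old thread</blockquote></body></html>')
'<html><body>Reply</body></html>'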
def is_splitter(line):
'''
    Returns a match object if the provided string is a splitter and
    None otherwise.
'''
for pattern in SPLITTER_PATTERNS:
matcher = re.match(pattern, line)
if matcher:
return matcher
def text_content(context):
'''XPath Extension function to return a node text content.'''
return context.context_node.text_content().strip()
def tail(context):
'''XPath Extension function to return a node tail text.'''
return context.context_node.tail or ''
def register_xpath_extensions():
ns = etree.FunctionNamespace("http://mailgun.net")
ns.prefix = 'mg'
ns['text_content'] = text_content
ns['tail'] = tail
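
Illustrative use of the registered extensions (not part of the file). Note that //* also matches ancestors whose text_content() starts with 'From:', which is why cut_from_block picks the last, i.e. innermost, hit:

>>> from lxml import html
>>> register_xpath_extensions()
>>> tree = html.document_fromstring('<html><body><div><span>From: Bob</span></div></body></html>')
>>> [e.tag for e in tree.xpath("//*[starts-with(mg:text_content(), 'From:')]")]
['html', 'body', 'div', 'span']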

48
talon/signature/__init__.py Normal file

@@ -0,0 +1,48 @@
"""The package exploits machine learning for parsing message signatures.
The public interface consists of only one `extract` function:
>>> (body, signature) = extract(body, sender)
Where body is the original message `body` and `sender` corresponds to a person
who sent the message.
When importing the package, classifier instances are loaded.
So each process will have its classifiers in memory.
The import of the package and the call to the `extract` function are better
enclosed in a try/except block in case they fail.
.. warning:: When making changes to features or emails the classifier is
trained against, don't forget to regenerate:
* signature/data/train.data and
* signature/data/classifier
"""
import os
import sys
from cStringIO import StringIO
from . import extraction
from . extraction import extract
from . learning import classifier
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier')
EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data')
def initialize():
try:
# redirect output
so, sys.stdout = sys.stdout, StringIO()
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
EXTRACTOR_DATA)
sys.stdout = so
except Exception, e:
raise Exception(
"Failed initializing signature parsing with classifiers", e)

188
talon/signature/bruteforce.py Normal file

@@ -0,0 +1,188 @@
import logging
import regex as re
from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
log = logging.getLogger(__name__)
# regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r'''
(
(?:
^[\s]*--*[\s]*[a-z \.]*$
|
^thanks[\s,!]*$
|
^regards[\s,!]*$
|
^cheers[\s,!]*$
|
^best[ a-z]*[\s,!]*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r'''
(
(?:
^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
|
^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
|
^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
|
^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
)
.*
)
''', re.I | re.X | re.M | re.S)
# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
RE_SIGNATURE_CANDIDATE = re.compile(r'''
(?P<candidate>c+d)[^d]
|
(?P<candidate>c+d)$
|
(?P<candidate>c+)
|
(?P<candidate>d)[^d]
|
(?P<candidate>d)$
''', re.I | re.X | re.M | re.S)
def extract_signature(msg_body):
'''
Analyzes message for a presence of signature block (by common patterns)
and returns tuple with two elements: message text without signature block
and the signature itself.
>>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
('Hey man! How r u?', '--\nRegards,\nRoman')
>>> extract_signature('Hey man!')
('Hey man!', None)
'''
try:
# identify line delimiter first
delimiter = get_delimiter(msg_body)
        # assume the whole body is the stripped body until proven otherwise
stripped_body = msg_body.strip()
phone_signature = None
# strip off phone signature
phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
if phone_signature:
stripped_body = stripped_body[:phone_signature.start()]
phone_signature = phone_signature.group()
# decide on signature candidate
lines = stripped_body.splitlines()
candidate = get_signature_candidate(lines)
candidate = delimiter.join(candidate)
# try to extract signature
signature = RE_SIGNATURE.search(candidate)
if not signature:
return (stripped_body.strip(), phone_signature)
else:
signature = signature.group()
            # splitlines() followed by join can lose a trailing newline;
            # that happened when the candidate was built, so rebuild
            # stripped_body the same way before slicing the signature off
stripped_body = delimiter.join(lines)
stripped_body = stripped_body[:-len(signature)]
if phone_signature:
signature = delimiter.join([signature, phone_signature])
return (stripped_body.strip(),
signature.strip())
    except Exception:
log.exception('ERROR extracting signature')
return (msg_body, None)
def get_signature_candidate(lines):
"""Return lines that could hold signature
The lines should:
* be among last SIGNATURE_MAX_LINES non-empty lines.
* not include first line
* be shorter than TOO_LONG_SIGNATURE_LINE
* not include more than one line that starts with dashes
"""
# non empty lines indexes
non_empty = [i for i, line in enumerate(lines) if line.strip()]
# if message is empty or just one line then there is no signature
if len(non_empty) <= 1:
return []
# we don't expect signature to start at the 1st line
candidate = non_empty[1:]
    # signature shouldn't be longer than SIGNATURE_MAX_LINES
candidate = candidate[-SIGNATURE_MAX_LINES:]
markers = _mark_candidate_indexes(lines, candidate)
candidate = _process_marked_candidate_indexes(candidate, markers)
# get actual lines for the candidate instead of indexes
if candidate:
candidate = lines[candidate[0]:]
return candidate
return []
def _mark_candidate_indexes(lines, candidate):
"""Mark candidate indexes with markers
Markers:
* c - line that could be a signature line
* l - long line
* d - line that starts with dashes but has other chars as well
    >>> _mark_candidate_indexes(['Some text', '', '-Roman', 'Bob'], [0, 2, 3])
'cdc'
"""
# at first consider everything to be potential signature lines
markers = bytearray('c'*len(candidate))
# mark lines starting from bottom up
for i, line_idx in reversed(list(enumerate(candidate))):
if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
markers[i] = 'l'
else:
line = lines[line_idx].strip()
if line.startswith('-') and line.strip("-"):
markers[i] = 'd'
return markers
def _process_marked_candidate_indexes(candidate, markers):
"""
Run regexes against candidate's marked indexes to strip
signature candidate.
    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clldc')
[15, 17]
"""
    match = RE_SIGNATURE_CANDIDATE.match(markers[::-1])
return candidate[-match.end('candidate'):] if match else []

2
talon/signature/constants.py Normal file

@@ -0,0 +1,2 @@
SIGNATURE_MAX_LINES = 11
TOO_LONG_SIGNATURE_LINE = 60

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

116
talon/signature/extraction.py Normal file

@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
import os
import logging
import regex as re
from PyML import SparseDataSet
from talon.constants import RE_DELIMITER
from talon.signature.constants import (SIGNATURE_MAX_LINES,
TOO_LONG_SIGNATURE_LINE)
from talon.signature.learning.featurespace import features, build_pattern
from talon.utils import get_delimiter
from talon.signature.bruteforce import get_signature_candidate
from talon.signature.learning.helpers import has_signature
log = logging.getLogger(__name__)
EXTRACTOR = None
# regex signature pattern for reversed lines
# assumes that all long lines have been excluded
RE_REVERSE_SIGNATURE = re.compile(r'''
    # signature should consist of blocks like this
(?:
# it could end with empty line
e*
# there could be text lines but no more than 2 in a row
(te*){,2}
# every block should end with signature line
s
)+
''', re.I | re.X | re.M | re.S)
def is_signature_line(line, sender, classifier):
'''Checks if the line belongs to signature. Returns True or False.'''
data = SparseDataSet([build_pattern(line, features(sender))])
return classifier.decisionFunc(data, 0) > 0
def extract(body, sender):
"""Strips signature from the body of the message.
Returns stripped body and signature as a tuple.
If no signature is found the corresponding returned value is None.
"""
try:
delimiter = get_delimiter(body)
body = body.strip()
if has_signature(body, sender):
lines = body.splitlines()
markers = _mark_lines(lines, sender)
text, signature = _process_marked_lines(lines, markers)
if signature:
text = delimiter.join(text)
if text.strip():
return (text, delimiter.join(signature))
    except Exception:
log.exception('ERROR when extracting signature with classifiers')
return (body, None)
def _mark_lines(lines, sender):
"""Mark message lines with markers to distinguish signature lines.
Markers:
* e - empty line
* s - line identified as signature
* t - other i.e. ordinary text line
    >>> _mark_lines(['Some text', '', 'Bob'], 'Bob')
'tes'
"""
global EXTRACTOR
candidate = get_signature_candidate(lines)
    # at first, consider everything to be text, not signature
markers = bytearray('t'*len(lines))
# mark lines starting from bottom up
# mark only lines that belong to candidate
# no need to mark all lines of the message
for i, line in reversed(list(enumerate(candidate))):
        # markers correspond to `lines`, not to `candidate`,
        # so recalculate the index to be relative to `lines`
j = len(lines) - len(candidate) + i
if not line.strip():
markers[j] = 'e'
elif is_signature_line(line, sender, EXTRACTOR):
markers[j] = 's'
return markers
def _process_marked_lines(lines, markers):
"""Run regexes against message's marked lines to strip signature.
>>> _process_marked_lines(['Some text', '', 'Bob'], 'tes')
(['Some text', ''], ['Bob'])
"""
# reverse lines and match signature pattern for reversed lines
signature = RE_REVERSE_SIGNATURE.match(markers[::-1])
if signature:
return (lines[:-signature.end()], lines[-signature.end():])
return (lines, None)

talon/signature/learning/__init__.py Normal file (empty)

36
talon/signature/learning/classifier.py Normal file

@@ -0,0 +1,36 @@
# -*- coding: utf-8 -*-
"""The module's functions could init, train, save and load a classifier.
The classifier could be used to detect if a certain line of the message
body belongs to the signature.
"""
import os
import sys
from PyML import SparseDataSet, SVM
def init():
'''Inits classifier with optimal options.'''
return SVM(C=10, optimization='liblinear')
def train(classifier, train_data_filename, save_classifier_filename=None):
'''Trains and saves classifier so that it could be easily loaded later.'''
data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier.train(data)
if save_classifier_filename:
classifier.save(save_classifier_filename)
return classifier
def load(saved_classifier_filename, train_data_filename):
"""Loads saved classifier.
Classifier should be loaded with the same data it was trained against
"""
train_data = SparseDataSet(train_data_filename, labelsColumn=-1)
classifier = init()
classifier.load(saved_classifier_filename, train_data)
return classifier
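
Hedged workflow sketch (the file names here are hypothetical; PyML must be installed):

>>> clf = train(init(), 'train.data', 'classifier')  # fit and persist
>>> clf = load('classifier', 'train.data')  # reload later, e.g. in another process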

161
talon/signature/learning/dataset.py Normal file

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""The module's functions build datasets to train/assess classifiers.
For signature detection the input should be a folder with two directories
that contain emails with and without signatures.
For signature extraction the input should be a folder with annotated emails.
To indicate that a line is a signature line use #sig# at the start of the line.
A sender of an email could be specified in the same file as
the message body e.g. when .eml format is used or in a separate file.
In the latter case it is assumed that a body filename ends with the `_body`
suffix and the corresponding sender file has the same name except for the
suffix which should be `_sender`.
"""
import os
import regex as re
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.featurespace import build_pattern, features
SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'
SIGNATURE_ANNOTATION = '#sig#'
REPLY_ANNOTATION = '#reply#'
ANNOTATIONS = [SIGNATURE_ANNOTATION, REPLY_ANNOTATION]
def is_sender_filename(filename):
"""Checks if the file could contain message sender's name."""
return filename.endswith(SENDER_SUFFIX)
def build_sender_filename(msg_filename):
"""By the message filename gives expected sender's filename."""
return msg_filename[:-len(BODY_SUFFIX)] + SENDER_SUFFIX
def parse_msg_sender(filename, sender_known=True):
"""Given a filename returns the sender and the message.
Here the message is assumed to be a whole MIME message or just
message body.
>>> sender, msg = parse_msg_sender('msg.eml')
>>> sender, msg = parse_msg_sender('msg_body')
If you don't want to consider the sender's name in your classification
algorithm:
>>> parse_msg_sender(filename, False)
"""
sender, msg = None, None
if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename) as f:
msg = f.read()
sender = u''
if sender_known:
sender_filename = build_sender_filename(filename)
if os.path.exists(sender_filename):
with open(sender_filename) as sender_file:
sender = sender_file.read().strip()
else:
                    # no separate sender file, so fall back to the 'From:'
                    # line of the message body itself
lines = msg.splitlines()
for line in lines:
match = re.match('From:(.*)', line)
if match:
sender = match.group(1)
break
return (sender, msg)
def build_detection_class(folder, dataset_filename,
label, sender_known=True):
"""Builds signature detection class.
Signature detection dataset includes patterns for two classes:
* class for positive patterns (goes with label 1)
* class for negative patterns (goes with label -1)
    The patterns are built from emails in `folder` and appended to the
    dataset file.
    >>> build_detection_class('emails/P', 'train.data', 1)
"""
with open(dataset_filename, 'a') as dataset:
for filename in os.listdir(folder):
filename = os.path.join(folder, filename)
sender, msg = parse_msg_sender(filename, sender_known)
if sender is None or msg is None:
continue
msg = re.sub('|'.join(ANNOTATIONS), '', msg)
X = build_pattern(msg, features(sender))
X.append(label)
labeled_pattern = ','.join([str(e) for e in X])
dataset.write(labeled_pattern + '\n')
def build_detection_dataset(folder, dataset_filename,
sender_known=True):
"""Builds signature detection dataset using emails from folder.
    `folder` should have the following structure:
    folder
    |-- P
    |   |-- positive sample email 1
    |   |-- positive sample email 2
    |   `-- ...
    `-- N
        |-- negative sample email 1
        |-- negative sample email 2
        `-- ...
    If the dataset file already exists it is rewritten.
"""
if os.path.exists(dataset_filename):
os.remove(dataset_filename)
build_detection_class(os.path.join(folder, u'P'),
dataset_filename, 1)
build_detection_class(os.path.join(folder, u'N'),
dataset_filename, -1)
def build_extraction_dataset(folder, dataset_filename,
sender_known=True):
"""Builds signature extraction dataset using emails in the `folder`.
The emails in the `folder` should be annotated i.e. signature lines
should be marked with `#sig#`.
"""
if os.path.exists(dataset_filename):
os.remove(dataset_filename)
with open(dataset_filename, 'a') as dataset:
for filename in os.listdir(folder):
filename = os.path.join(folder, filename)
sender, msg = parse_msg_sender(filename, sender_known)
if not sender or not msg:
continue
lines = msg.splitlines()
for i in xrange(1, min(SIGNATURE_MAX_LINES,
len(lines)) + 1):
line = lines[-i]
label = -1
if line[:len(SIGNATURE_ANNOTATION)] == \
SIGNATURE_ANNOTATION:
label = 1
line = line[len(SIGNATURE_ANNOTATION):]
elif line[:len(REPLY_ANNOTATION)] == REPLY_ANNOTATION:
line = line[len(REPLY_ANNOTATION):]
X = build_pattern(line, features(sender))
X.append(label)
labeled_pattern = ','.join([str(e) for e in X])
dataset.write(labeled_pattern + '\n')
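
Hedged usage sketch (folder layout as described above; the paths are hypothetical):

>>> build_detection_dataset('emails', 'detection.data')
>>> build_extraction_dataset('annotated_emails', 'extraction.data')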

73
talon/signature/learning/featurespace.py Normal file

@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
""" The module provides functions for convertion of a message body/body lines
into classifiers features space.
The body and the message sender string are converted into unicode before
applying features to them.
"""
from talon.signature.constants import SIGNATURE_MAX_LINES
from talon.signature.learning.helpers import *
def features(sender=''):
'''Returns a list of signature features.'''
return [
# This one isn't from paper.
        # Meant to match company names, sender names, addresses.
many_capitalized_words,
# This one is not from paper.
# Line is too long.
# This one is less aggressive than `Line is too short`
lambda line: 1 if len(line) > 60 else 0,
# Line contains email pattern.
binary_regex_search(RE_EMAIL),
# Line contains url.
binary_regex_search(RE_URL),
# Line contains phone number pattern.
binary_regex_search(RE_RELAX_PHONE),
# Line matches the regular expression "^[\s]*---*[\s]*$".
binary_regex_match(RE_SEPARATOR),
# Line has a sequence of 10 or more special characters.
binary_regex_search(RE_SPECIAL_CHARS),
# Line contains any typical signature words.
binary_regex_search(RE_SIGNATURE_WORDS),
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
binary_regex_search(RE_NAME),
# Percentage of punctuation symbols in the line is larger than 50%
lambda line: 1 if punctuation_percent(line) > 50 else 0,
# Percentage of punctuation symbols in the line is larger than 90%
lambda line: 1 if punctuation_percent(line) > 90 else 0,
contains_sender_names(sender)
]
def apply_features(body, features):
'''Applies features to message body lines.
    Returns a list of lists. Each inner list corresponds to a body line
    and holds the feature occurrence indicators (0 or 1).
    E.g. if element j of list i equals 1 this means that
    feature j occurred in line i (counting from the last line of the body).
'''
# collect all non empty lines
lines = [line for line in body.splitlines() if line.strip()]
# take the last SIGNATURE_MAX_LINES
last_lines = lines[-SIGNATURE_MAX_LINES:]
# apply features, fallback to zeros
return ([[f(line) for f in features] for line in last_lines] or
[[0 for f in features]])
def build_pattern(body, features):
'''Converts body into a pattern i.e. a point in the features space.
Applies features to the body lines and sums up the results.
    Elements of the pattern indicate how many times a certain feature occurred
in the last lines of the body.
'''
line_patterns = apply_features(body, features)
return reduce(lambda x, y: [i + j for i, j in zip(x, y)], line_patterns)
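
Illustrative shape check (not part of the file): the pattern holds one summed count per feature.

>>> fs = features('Sergey Obukhov <serobnic@mail.ru>')
>>> len(fs)
12
>>> pattern = build_pattern('Thanks,\nSergey\nserobnic@mail.ru', fs)
>>> len(pattern)
12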

233
talon/signature/learning/helpers.py Normal file

@@ -0,0 +1,233 @@
# -*- coding: utf-8 -*-
""" The module provides:
* functions used when evaluating signature's features
* regexp constants used when evaluating signature's features
"""
import unicodedata
import regex as re
from talon.utils import to_unicode
from talon.signature.constants import SIGNATURE_MAX_LINES
rc = re.compile
RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc('^[\s]*---*[\s]*$')
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
'[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))
RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
'^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
'(C|c)orporation|Group'))
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')
# Pattern to match if e.g. 'Sender:' header field has sender names.
SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)
# Reply line clue line endings, as in regular expression:
# " wrote:$" or " writes:$"
RE_CLUE_LINE_END = rc('.*(W|w)(rote|rites):$')
INVALID_WORD_START = rc('\(|\+|[\d]')
BAD_SENDER_NAMES = [
# known mail domains
'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
'example',
# first level domains
'com', 'org', 'net', 'ru',
# bad words
'mailto'
]
def binary_regex_search(prog):
'''Returns a function that returns 1 or 0 depending on regex search result.
If regular expression compiled into prog is present in a string
the result of calling the returned function with the string will be 1
and 0 otherwise.
>>> import regex as re
>>> binary_regex_search(re.compile("12"))("12")
1
>>> binary_regex_search(re.compile("12"))("34")
0
'''
return lambda s: 1 if prog.search(s) else 0
def binary_regex_match(prog):
'''Returns a function that returns 1 or 0 depending on regex match result.
If a string matches regular expression compiled into prog
the result of calling the returned function with the string will be 1
and 0 otherwise.
>>> import regex as re
>>> binary_regex_match(re.compile("12"))("12 3")
1
>>> binary_regex_match(re.compile("12"))("3 12")
0
'''
return lambda s: 1 if prog.match(s) else 0
def flatten_list(list_to_flatten):
"""Simple list comprehesion to flatten list.
>>> flatten_list([[1, 2], [3, 4, 5]])
[1, 2, 3, 4, 5]
>>> flatten_list([[1], [[2]]])
[1, [2]]
>>> flatten_list([1, [2]])
Traceback (most recent call last):
...
TypeError: 'int' object is not iterable
"""
return [e for sublist in list_to_flatten for e in sublist]
def contains_sender_names(sender):
    '''Returns a function that searches a string for the sender's name or its parts.
>>> feature = contains_sender_names("Sergey N. Obukhov <xxx@example.com>")
>>> feature("Sergey Obukhov")
1
>>> feature("BR, Sergey N.")
1
>>> feature("Sergey")
1
>>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
1
>>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
1
'''
names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
for e in extract_names(sender)]))
names = names or sender
if names != '':
return binary_regex_search(re.compile(names))
    return lambda s: 0
def extract_names(sender):
"""Tries to extract sender's names from `From:` header.
It could extract not only the actual names but e.g.
the name of the company, parts of email, etc.
>>> extract_names('Sergey N. Obukhov <serobnic@mail.ru>')
['Sergey', 'Obukhov', 'serobnic']
>>> extract_names('')
[]
"""
sender = to_unicode(sender)
# Remove non-alphabetical characters
sender = "".join([char if char.isalpha() else ' ' for char in sender])
# Remove too short words and words from "black" list i.e.
# words like `ru`, `gmail`, `com`, `org`, etc.
sender = [word for word in sender.split() if len(word) > 1 and
not word in BAD_SENDER_NAMES]
# Remove duplicates
names = list(set(sender))
return names
def categories_percent(s, categories):
    '''Returns the percent of characters from the given Unicode categories.
>>> categories_percent("qqq ggg hhh", ["Po"])
0.0
>>> categories_percent("q,w.", ["Po"])
50.0
>>> categories_percent("qqq ggg hhh", ["Nd"])
0.0
>>> categories_percent("q5", ["Nd"])
50.0
>>> categories_percent("s.s,5s", ["Po", "Nd"])
50.0
'''
count = 0
s = to_unicode(s)
for c in s:
if unicodedata.category(c) in categories:
count += 1
return 100 * float(count) / len(s) if len(s) else 0
def punctuation_percent(s):
    '''Returns punctuation percent.
>>> punctuation_percent("qqq ggg hhh")
0.0
>>> punctuation_percent("q,w.")
50.0
'''
return categories_percent(s, ['Po'])
def capitalized_words_percent(s):
'''Returns capitalized words percent.'''
s = to_unicode(s)
words = re.split('\s', s)
words = [w for w in words if w.strip()]
capitalized_words_counter = 0
valid_words_counter = 0
for word in words:
if not INVALID_WORD_START.match(word):
valid_words_counter += 1
if word[0].isupper():
capitalized_words_counter += 1
if valid_words_counter > 0 and len(words) > 1:
return 100 * float(capitalized_words_counter) / valid_words_counter
return 0
def many_capitalized_words(s):
"""Returns a function to check percentage of capitalized words.
The function returns 1 if percentage greater then 65% and 0 otherwise.
"""
return 1 if capitalized_words_percent(s) > 66 else 0
def has_signature(body, sender):
'''Checks if the body has signature. Returns True or False.'''
non_empty = [line for line in body.splitlines() if line.strip()]
candidate = non_empty[-SIGNATURE_MAX_LINES:]
upvotes = 0
for line in candidate:
        # we check the line for the sender's name, phone, email and url;
        # such signature lines rarely exceed 27 characters
if len(line.strip()) > 27:
continue
elif contains_sender_names(sender)(line):
return True
elif (binary_regex_search(RE_RELAX_PHONE)(line) +
binary_regex_search(RE_EMAIL)(line) +
binary_regex_search(RE_URL)(line) == 1):
upvotes += 1
if upvotes > 1:
return True
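
Illustrative check (not part of the file): a short trailing line that matches the sender's name is enough.

>>> has_signature('Thanks,\nBob\nbob@example.com', 'Bob <bob@example.com>')
True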

76
talon/utils.py Normal file

@@ -0,0 +1,76 @@
# coding:utf-8
import logging
from random import shuffle
from talon.constants import RE_DELIMITER
log = logging.getLogger(__name__)
def safe_format(format_string, *args, **kwargs):
"""
Helper: formats string with any combination of bytestrings/unicode
strings without raising exceptions
"""
try:
if not args and not kwargs:
return format_string
else:
return format_string.format(*args, **kwargs)
# catch encoding errors and transform everything into utf-8 string
# before logging:
except (UnicodeEncodeError, UnicodeDecodeError):
format_string = to_utf8(format_string)
args = [to_utf8(p) for p in args]
kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
return format_string.format(*args, **kwargs)
# ignore other errors
except:
return u''
def to_unicode(str_or_unicode, precise=False):
"""
Safely returns a unicode version of a given string
>>> utils.to_unicode('привет')
u'привет'
>>> utils.to_unicode(u'привет')
u'привет'
If `precise` flag is True, tries to guess the correct encoding first.
"""
encoding = detect_encoding(str_or_unicode) if precise else 'utf-8'
if isinstance(str_or_unicode, str):
return unicode(str_or_unicode, encoding, 'replace')
return str_or_unicode
def to_utf8(str_or_unicode):
"""
Safely returns a UTF-8 version of a given string
>>> utils.to_utf8(u'hi')
'hi'
"""
if isinstance(str_or_unicode, unicode):
return str_or_unicode.encode("utf-8", "ignore")
return str(str_or_unicode)
def random_token(length=7):
    """Returns a random token of [a-z0-9] characters
    (sampled without replacement, so length must be <= 36).
    """
vals = ("a b c d e f g h i j k l m n o p q r s t u v w x y z "
"0 1 2 3 4 5 6 7 8 9").split(' ')
shuffle(vals)
return ''.join(vals[:length])
def get_delimiter(msg_body):
delimiter = RE_DELIMITER.search(msg_body)
if delimiter:
delimiter = delimiter.group()
else:
delimiter = '\n'
return delimiter
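
Illustrative behavior (not part of the file):

>>> get_delimiter('foo\r\nbar')
'\r\n'
>>> get_delimiter('no line breaks here')
'\n'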