initial commit
talon/quotations.py
@@ -0,0 +1,376 @@
# -*- coding: utf-8 -*-

"""
The module's functions operate on message bodies trying to extract
original messages (without quoted messages)
"""

import regex as re
import logging
from copy import deepcopy

from lxml import html, etree
import html2text

from talon.constants import RE_DELIMITER
from talon.utils import random_token, get_delimiter
from talon import html_quotations


log = logging.getLogger(__name__)


RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

RE_ON_DATE_SMB_WROTE = re.compile(
    r'''
    (
        -*  # could include dashes
        [ ]?On[ ].*,  # date part ends with comma
        (.*\n){0,2}  # splitter takes 4 lines at most
        .*(wrote|sent):
    )
    ''', re.VERBOSE)

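# For illustration only (a made-up header, not taken from the code or its
# tests): RE_ON_DATE_SMB_WROTE is meant to match reply splitters such as
#
#   On Mon, Dec 3, 2012 at 11:33 AM, John Smith <john@example.com> wrote:
#
# including variants where the header is wrapped over up to four lines.
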
RE_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )

        # quotation lines could be marked as splitter or text, etc.
        .*

        # but we expect it to end with a quotation marker line
        me*
    )

    # after quotations should be text only or nothing at all
    [te]*$
    ''', re.VERBOSE)

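# Rough illustration (an assumed marker string, not a doctest): for markers
# like 'tsmmme' (text, splitter, then quoted lines) the group above captures
# 'smmme', so process_marked_lines() below keeps only the leading 't' line.
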
RE_EMPTY_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )
    )
    e*
    ''', re.VERBOSE)

SPLITTER_PATTERNS = [
    # ------Original Message------ or ---- Reply Message ----
    re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
    # <date> <person>
    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]


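# Purely illustrative examples of lines these patterns are intended to catch
# (addresses and dates are made up, not taken from the code or its tests):
#
#   -----Original Message-----
#   From: John Smith <john@example.com>
#   11/23/2012 john@example.com wrote:
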
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

RE_PARANTHESIS_LINK = re.compile("\(https?://")

SPLITTER_MAX_LINES = 4
MAX_LINES_COUNT = 1000

QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')


def extract_from(msg_body, content_type='text/plain'):
    try:
        if content_type == 'text/plain':
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
    except Exception, e:
        log.exception('ERROR extracting message')

    return msg_body


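# Usage sketch (illustrative, not a doctest; the quoted header below is made
# up):
#
#   >>> extract_from("Reply text\n\nOn Mon, Dec 3, 2012, John <j@example.com> wrote:\n> quoted text", 'text/plain')
#   'Reply text'
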
def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

    Markers:

    * e - empty line
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * t - presumably lines from the last message in the conversation

    >>> str(mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']))
    'tsem'
    """
    markers = bytearray(len(lines))
    i = 0
    while i < len(lines):
        if not lines[i].strip():
            markers[i] = 'e'  # empty line
        elif QUOT_PATTERN.match(lines[i]):
            markers[i] = 'm'  # line with quotation marker
        elif RE_FWD.match(lines[i]):
            markers[i] = 'f'  # ---- Forwarded message ----
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
                for j in xrange(len(splitter_lines)):
                    markers[i + j] = 's'

                # skip splitter lines
                i += len(splitter_lines) - 1
            else:
                # probably the line from the last message in the conversation
                markers[i] = 't'
        i += 1

    return markers


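# A couple of additional illustrations (assumed inputs, not doctests):
#
#   >>> str(mark_message_lines(['---------- Forwarded message ----------',
#   ...                         'From: foo@bar.com']))
#   'fs'
#
# A splitter wrapped over several physical lines gets one 's' marker per line,
# e.g. an 'On <date>, ... wrote:' header broken across two lines yields 'ss'.
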
def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
    """Run regexes against message's marked lines to strip quotations.

    Return only last message lines.
    >>> process_marked_lines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem')
    ['Hello']

    Also returns return_flags.
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    # if there is no splitter there should be no markers
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # inlined reply
    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
    # both 't' entries should be found
    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (
            RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
            RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip()))
        if not links:
            return_flags[:] = [False, -1, -1]
            return lines

    # cut out text lines coming after splitter if there are no markers there
    quotation = re.search('(se*)+((t|f)+e*)+', markers)
    if quotation:
        return_flags[:] = [True, quotation.start(), len(lines)]
        return lines[:quotation.start()]

    # handle the case with markers
    quotation = (RE_QUOTATION.search(markers) or
                 RE_EMPTY_QUOTATION.search(markers))

    if quotation:
        return_flags[:] = True, quotation.start(1), quotation.end(1)
        return lines[:quotation.start(1)] + lines[quotation.end(1):]

    return_flags[:] = [False, -1, -1]
    return lines


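# Sketch of the return_flags protocol (illustrative values, not a doctest):
#
#   >>> flags = [False, -1, -1]
#   >>> process_marked_lines(['Hello', 'On ..., foo@bar.com wrote:', '> Hi'], 'tsm', flags)
#   ['Hello']
#   >>> flags   # lines[1:3] were deleted
#   [True, 1, 3]
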
def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they can't be mistaken for quotation markers.
    Splits the line in two if the splitter pattern is preceded by some text on
    the same line (done only for the 'On <date> <person> wrote:' pattern).
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wraps splitter with new line"""
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body


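# Illustration of the link normalization (made-up input, not a doctest):
#
#   >>> preprocess('See <http://example.com/a>\nOn Mon, 2012, Bob wrote:\n> hi', '\n')
#   'See @@http://example.com/a@@\nOn Mon, 2012, Bob wrote:\n> hi'
#
# A link on a quoted line (one starting with '>') is left untouched, and a
# splitter that shares a line with preceding text is pushed onto its own line.
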
def postprocess(msg_body):
    """Make up for changes done during message preprocessing.

    Replace link brackets back to '<' and '>'.
    """
    return re.sub(RE_NORMALIZED_LINK, r'<\1>', msg_body).strip()


def extract_from_plain(msg_body):
    """Extracts a non-quoted message from provided plain text."""
    stripped_text = msg_body

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
    lines = msg_body.splitlines()

    # don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return stripped_text

    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

    # concatenate lines, change links back, strip and return
    msg_body = delimiter.join(lines)
    msg_body = postprocess(msg_body)
    return msg_body


def extract_from_html(msg_body):
    """
    Extract a non-quoted message from the provided html message body
    using tags and the plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use the plain text algorithm to cut out a splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """

    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )

    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        # collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy)


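# Rough usage sketch (hypothetical markup; the exact serialization returned by
# lxml may differ):
#
#   >>> extract_from_html('<html><body><p>Reply</p>'
#   ...                   '<blockquote>Quoted text</blockquote></body></html>')
#   '<html><body><p>Reply</p></body></html>'   # approximately
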
def is_splitter(line):
    '''
    Returns match object if provided string is a splitter and
    None otherwise.
    '''
    for pattern in SPLITTER_PATTERNS:
        matcher = re.match(pattern, line)
        if matcher:
            return matcher


def text_content(context):
    '''XPath Extension function to return a node text content.'''
    return context.context_node.text_content().strip()


def tail(context):
    '''XPath Extension function to return a node tail text.'''
    return context.context_node.tail or ''


def register_xpath_extensions():
    ns = etree.FunctionNamespace("http://mailgun.net")
    ns.prefix = 'mg'
    ns['text_content'] = text_content
    ns['tail'] = tail

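# Usage sketch (hypothetical document; the functions above take no XPath
# arguments, so they are called with empty parentheses inside predicates):
#
#   >>> register_xpath_extensions()
#   >>> tree = html.fromstring('<div><p>Hello</p>world</div>')
#   >>> tree.xpath('//p[mg:text_content() = "Hello"]')   # -> [<Element p ...>]
#   >>> tree.xpath('//p[mg:tail() = "world"]')           # -> same <p> element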