initial commit
This commit is contained in:
		
							
								
								
									
										7
									
								
								talon/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								talon/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,7 @@ | ||||
from talon.quotations import register_xpath_extensions
from talon import signature


def init():
    """Initialize the talon library.

    Registers the mg: XPath extension functions used by the html quotation
    cutters and loads the signature-parsing classifiers into memory
    (see talon.signature.initialize).
    """
    register_xpath_extensions()
    signature.initialize()
							
								
								
									
										4
									
								
								talon/constants.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								talon/constants.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | ||||
# Shared regular-expression constants for talon.

try:
    import regex as re
except ImportError:
    # the patterns here are simple enough for the stdlib engine,
    # so fall back to it when the third-party `regex` package is absent
    import re


# Matches one line delimiter: CRLF ('\r\n') or bare LF ('\n').
# Raw string used so the regex escapes are explicit.
RE_DELIMITER = re.compile(r'\r?\n')
							
								
								
									
										174
									
								
								talon/html_quotations.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										174
									
								
								talon/html_quotations.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,174 @@ | ||||
| """ | ||||
| The module's functions operate on message bodies trying to extract original | ||||
| messages (without quoted messages) from html | ||||
| """ | ||||
|  | ||||
| import regex as re | ||||
|  | ||||
|  | ||||
| CHECKPOINT_PREFIX = '#!%!' | ||||
| CHECKPOINT_SUFFIX = '!%!#' | ||||
| CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) | ||||
|  | ||||
| # HTML quote indicators (tag ids) | ||||
| QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | ||||
|  | ||||
|  | ||||
def add_checkpoint(html_note, counter):
    """Recursively adds checkpoints to html tree.

    Appends a '#!%!<n>!%!#' sentinel to the text and the tail of every node,
    numbering them depth-first starting at `counter`.  Returns the next
    unused checkpoint number.
    """
    text_checkpoint = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
    if html_note.text:
        html_note.text = html_note.text + text_checkpoint
    else:
        html_note.text = text_checkpoint
    counter += 1

    # number all descendants before this node's tail checkpoint
    for child in html_note.iterchildren():
        counter = add_checkpoint(child, counter)

    tail_checkpoint = CHECKPOINT_PREFIX + str(counter) + CHECKPOINT_SUFFIX
    if html_note.tail:
        html_note.tail = html_note.tail + tail_checkpoint
    else:
        html_note.tail = tail_checkpoint
    counter += 1

    return counter
|  | ||||
|  | ||||
def delete_quotation_tags(html_note, counter, quotation_checkpoints):
    """Deletes tags with quotation checkpoints from html tree.

    Walks the tree in the same depth-first order as add_checkpoint().
    Blanks the text/tail pieces whose checkpoints were flagged as quotation;
    a child is physically removed only when it is fully quoted but its
    parent is not.  Returns (next counter, whether this tag is fully quoted).
    """
    fully_quoted = True

    # leading text checkpoint
    if quotation_checkpoints[counter]:
        html_note.text = ''
    else:
        fully_quoted = False
    counter += 1

    quoted_children = []  # children that are entirely inside the quotation
    for subtree in html_note.iterchildren():
        counter, child_quoted = delete_quotation_tags(
            subtree, counter, quotation_checkpoints)
        if child_quoted:
            quoted_children.append(subtree)

    # tail text checkpoint
    if quotation_checkpoints[counter]:
        html_note.tail = ''
    else:
        fully_quoted = False
    counter += 1

    if not fully_quoted:
        # this tag survives, so drop only its fully-quoted children
        for subtree in quoted_children:
            html_note.remove(subtree)
    return counter, fully_quoted
|  | ||||
|  | ||||
def cut_gmail_quote(html_message):
    """Cuts the outermost block element with class gmail_quote.

    Returns True when a quote block was found and removed, False otherwise
    (explicit boolean for consistency with cut_microsoft_quote).
    """
    gmail_quote = html_message.cssselect('.gmail_quote')
    if not gmail_quote:
        return False
    gmail_quote[0].getparent().remove(gmail_quote[0])
    return True
|  | ||||
|  | ||||
def cut_microsoft_quote(html_message):
    """Cuts splitter block and all following blocks.

    Handles Outlook 2007/2010, Windows Mail and Outlook 2003 splitters.
    Returns True if a quotation was found and removed, False otherwise.
    """
    splitter = None

    found = html_message.xpath(
        #outlook 2007, 2010
        "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;"
        "padding:3.0pt 0cm 0cm 0cm']|"
        #windows mail
        "//div[@style='padding-top: 5px; "
        "border-top-color: rgb(229, 229, 229); "
        "border-top-width: 1px; border-top-style: solid;']"
    )

    if found:
        splitter = found[0]
        #outlook 2010: the splitter is the first child of a wrapper div,
        #so cut the wrapper instead
        if splitter == splitter.getparent().getchildren()[0]:
            splitter = splitter.getparent()
    else:
        #outlook 2003
        found = html_message.xpath(
            "//div"
            "/div[@class='MsoNormal' and @align='center' "
            "and @style='text-align:center']"
            "/font"
            "/span"
            "/hr[@size='3' and @width='100%' and @align='center' "
            "and @tabindex='-1']"
        )
        if found:
            # climb hr -> span -> font -> inner div -> outer div
            splitter = found[0].getparent().getparent()
            splitter = splitter.getparent().getparent()

    if splitter is None:
        # BUGFIX: the old `len(splitter)` test counted an element's
        # children, so a childless splitter element was wrongly skipped.
        return False

    # remove the splitter and every sibling that follows it
    parent = splitter.getparent()
    after_splitter = splitter.getnext()
    while after_splitter is not None:
        parent.remove(after_splitter)
        after_splitter = splitter.getnext()
    parent.remove(splitter)
    return True
|  | ||||
|  | ||||
def cut_by_id(html_message):
    """Cuts tags whose id marks them as quotations (see QUOTE_IDS).

    Returns True when at least one quote tag was removed.
    """
    found = False
    for quote_id in QUOTE_IDS:
        hits = html_message.cssselect('#{}'.format(quote_id))
        if not hits:
            continue
        hits[0].getparent().remove(hits[0])
        found = True
    return found
|  | ||||
|  | ||||
def cut_blockquote(html_message):
    """Cuts the first blockquote (with wrapping elements) from the tree.

    Returns True when a blockquote was removed, None otherwise.
    """
    quote = html_message.find('.//blockquote')
    if quote is None:
        return None
    quote.getparent().remove(quote)
    return True
|  | ||||
|  | ||||
def cut_from_block(html_message):
    """Cuts div tag which wraps block starting with "From:".

    Relies on the mg:text_content / mg:tail XPath extension functions
    registered by talon.quotations.register_xpath_extensions.
    Returns True when a block was removed, None otherwise.
    """
    # handle the case when From: block is enclosed in some tag
    block = html_message.xpath(
        ("//*[starts-with(mg:text_content(), 'From:')]|"
         "//*[starts-with(mg:text_content(), 'Date:')]"))

    if block:
        # take the innermost (last) match and climb to the enclosing div
        block = block[-1]
        while block.getparent() is not None:
            if block.tag == 'div':
                block.getparent().remove(block)
                return True
            else:
                block = block.getparent()
    else:
        # handle the case when From: block goes right after e.g. <hr>
        # and not enclosed in some tag
        block = html_message.xpath(
            ("//*[starts-with(mg:tail(), 'From:')]|"
             "//*[starts-with(mg:tail(), 'Date:')]"))
        if block:
            block = block[0]
            # drop every sibling after the marker element, then the marker
            while(block.getnext() is not None):
                block.getparent().remove(block.getnext())
            block.getparent().remove(block)
            return True
							
								
								
									
										376
									
								
								talon/quotations.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										376
									
								
								talon/quotations.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,376 @@ | ||||
# -*- coding: utf-8 -*-

"""
The module's functions operate on message bodies trying to extract
original messages (without quoted messages)
"""

import regex as re
import logging
from copy import deepcopy

from lxml import html, etree
import html2text

from talon.constants import RE_DELIMITER
from talon.utils import random_token, get_delimiter
from talon import html_quotations


log = logging.getLogger(__name__)


# '---- Forwarded message ----' separator line
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)

# 'On <date>, <person> wrote:' splitter; may span several lines
RE_ON_DATE_SMB_WROTE = re.compile(
    r'''
    (
        -*  # could include dashes
        [ ]?On[ ].*,  # date part ends with comma
        (.*\n){0,2}  # splitter takes 4 lines at most
        .*(wrote|sent):
    )
    ''', re.VERBOSE)

# NOTE: the two patterns below run against the per-line *marker* string
# produced by mark_message_lines() (chars s/m/t/e/f), not the message text.
RE_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )

        # quotation lines could be marked as splitter or text, etc.
        .*

        # but we expect it to end with a quotation marker line
        me*
    )

    # after quotations should be text only or nothing at all
    [te]*$
    ''', re.VERBOSE)

RE_EMPTY_QUOTATION = re.compile(
    r'''
    (
        # quotation border: splitter line or a number of quotation marker lines
        (?:
            s
            |
            (?:me*){2,}
        )
    )
    e*
    ''', re.VERBOSE)

# Patterns recognising the first line(s) of a quoted block.
SPLITTER_PATTERNS = [
    # ------Original Message------ or ---- Reply Message ----
    re.compile("[\s]*[-]+[ ]*(Original|Reply) Message[ ]*[-]+", re.I),
    # <date> <person>
    re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE),
    RE_ON_DATE_SMB_WROTE,
    re.compile('(_+\r?\n)?[\s]*(:?[*]?From|Date):[*]? .*'),
    re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?'
               '( \S+){3,6}@\S+:')
    ]


# '<http://...>' links, normalized to '@@http://...@@' during preprocessing
# so that the closing '>' can't be mistaken for a quotation marker
RE_LINK = re.compile('<(http://[^>]*)>')
RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@')

RE_PARANTHESIS_LINK = re.compile("\(https?://")

# a splitter may span at most this many lines
SPLITTER_MAX_LINES = 4
# messages longer than this many lines are returned untouched
MAX_LINES_COUNT = 1000

QUOT_PATTERN = re.compile('^>+ ?')
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|  | ||||
def extract_from(msg_body, content_type='text/plain'):
    """Extract the last (non quoted) message from a message body.

    Dispatches on content_type ('text/plain' or 'text/html'); for any other
    content type — or on any internal error — the body is returned unchanged.
    """
    try:
        if content_type == 'text/plain':
            return extract_from_plain(msg_body)
        elif content_type == 'text/html':
            return extract_from_html(msg_body)
    except Exception:
        # best-effort: never let extraction errors propagate to the caller
        # (the old `except Exception, e` bound an unused name and is
        # Python-2-only syntax)
        log.exception('ERROR extracting message')

    return msg_body
|  | ||||
|  | ||||
def mark_message_lines(lines):
    """Mark message lines with markers to distinguish quotation lines.

    Markers:

    * e - empty line
    * m - line that starts with quotation marker '>'
    * s - splitter line
    * t - presumably lines from the last message in the conversation

    Returns a bytearray with one marker char per input line
    (NOTE(review): single-char str assignment into a bytearray is
    Python-2-only semantics).

    >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
    'tsem'
    """
    markers = bytearray(len(lines))
    i = 0
    while i < len(lines):
        if not lines[i].strip():
            markers[i] = 'e'  # empty line
        elif QUOT_PATTERN.match(lines[i]):
            markers[i] = 'm'  # line with quotation marker
        elif RE_FWD.match(lines[i]):
            markers[i] = 'f'  # ---- Forwarded message ----
        else:
            # in case splitter is spread across several lines
            splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
            if splitter:
                # append as many splitter markers as lines in splitter
                splitter_lines = splitter.group().splitlines()
                for j in xrange(len(splitter_lines)):
                    markers[i + j] = 's'

                # skip splitter lines
                i += len(splitter_lines) - 1
            else:
                # probably the line from the last message in the conversation
                markers[i] = 't'
        i += 1

    return markers
|  | ||||
|  | ||||
def process_marked_lines(lines, markers, return_flags=None):
    """Run regexes against message's marked lines to strip quotations.

    Returns only the last message lines.

    >>> process_marked_lines(['answer', 'From: foo@bar.com', '', '> question'],
    ...                      'tsem')
    ['answer']

    Also fills return_flags in place when a list is passed:
    return_flags = [were_lines_deleted, first_deleted_line,
                    last_deleted_line]
    """
    # fresh list per call — a mutable default argument would be shared
    # (and mutated via the slice assignments below) across all calls
    if return_flags is None:
        return_flags = [False, -1, -1]

    # if there are no splitter there should be no markers
    if 's' not in markers and not re.search('(me*){3}', markers):
        markers = markers.replace('m', 't')

    # forwarded message: strip nothing
    if re.match('[te]*f', markers):
        return_flags[:] = [False, -1, -1]
        return lines

    # inlined reply
    # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
    # both 't' entries should be found
    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
        # long links could break sequence of quotation lines but they shouldn't
        # be considered an inline reply
        links = (
            RE_PARANTHESIS_LINK.search(lines[inline_reply.start() - 1]) or
            RE_PARANTHESIS_LINK.match(lines[inline_reply.start()].strip()))
        if not links:
            return_flags[:] = [False, -1, -1]
            return lines

    # cut out text lines coming after splitter if there are no markers there
    quotation = re.search('(se*)+((t|f)+e*)+', markers)
    if quotation:
        return_flags[:] = [True, quotation.start(), len(lines)]
        return lines[:quotation.start()]

    # handle the case with markers
    quotation = (RE_QUOTATION.search(markers) or
                 RE_EMPTY_QUOTATION.search(markers))

    if quotation:
        return_flags[:] = True, quotation.start(1), quotation.end(1)
        return lines[:quotation.start(1)] + lines[quotation.end(1):]

    return_flags[:] = [False, -1, -1]
    return lines
|  | ||||
|  | ||||
def preprocess(msg_body, delimiter, content_type='text/plain'):
    """Prepares msg_body for being stripped.

    Replaces link brackets so that they couldn't be taken for quotation marker.
    Splits line in two if splitter pattern preceeded by some text on the same
    line (done only for 'On <date> <person> wrote:' pattern).

    `delimiter` is the line delimiter of msg_body (e.g. '\\n' or '\\r\\n').
    """
    # normalize links i.e. replace '<', '>' wrapping the link with some symbols
    # so that '>' closing the link couldn't be mistakenly taken for quotation
    # marker.
    def link_wrapper(link):
        # keep links on already-quoted lines (starting with '>') untouched
        newline_index = msg_body[:link.start()].rfind("\n")
        if msg_body[newline_index + 1] == ">":
            return link.group()
        else:
            return "@@%s@@" % link.group(1)

    msg_body = re.sub(RE_LINK, link_wrapper, msg_body)

    def splitter_wrapper(splitter):
        """Wrapps splitter with new line"""
        # NOTE: closes over msg_body *after* the link substitution above
        if splitter.start() and msg_body[splitter.start() - 1] != '\n':
            return '%s%s' % (delimiter, splitter.group())
        else:
            return splitter.group()

    if content_type == 'text/plain':
        msg_body = re.sub(RE_ON_DATE_SMB_WROTE, splitter_wrapper, msg_body)

    return msg_body
|  | ||||
|  | ||||
def postprocess(msg_body):
    """Make up for changes done at preprocessing message.

    Replace link brackets back to '<' and '>'.
    """
    restored = RE_NORMALIZED_LINK.sub(r'<\1>', msg_body)
    return restored.strip()
|  | ||||
|  | ||||
def extract_from_plain(msg_body):
    """Extracts a non quoted message from provided plain text.

    Messages longer than MAX_LINES_COUNT lines are returned untouched.
    """
    stripped_text = msg_body

    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
    lines = msg_body.splitlines()

    # don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return stripped_text

    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

    # concatenate lines, change links back, strip and return
    msg_body = delimiter.join(lines)
    msg_body = postprocess(msg_body)
    return msg_body
|  | ||||
|  | ||||
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting neccessary tags.
    """

    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )

    # tag-based quotation removal; `or` stops at the first cutter
    # that reports success
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )

    # keep a pristine copy — add_checkpoint below mutates html_tree
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        # text algorithm removed nothing: fall back to the tag-based result
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy)
|  | ||||
|  | ||||
def is_splitter(line):
    '''
    Returns Matcher object if provided string is a splitter and
    None otherwise.
    '''
    for candidate in SPLITTER_PATTERNS:
        found = candidate.match(line)
        if found:
            return found
    return None
|  | ||||
|  | ||||
def text_content(context):
    '''XPath Extension function to return a node text content.'''
    node = context.context_node
    return node.text_content().strip()
|  | ||||
|  | ||||
def tail(context):
    '''XPath Extension function to return a node tail text.'''
    node_tail = context.context_node.tail
    return node_tail if node_tail else ''
|  | ||||
|  | ||||
def register_xpath_extensions():
    """Register the mg: XPath extension functions (text_content, tail)."""
    namespace = etree.FunctionNamespace("http://mailgun.net")
    namespace.prefix = 'mg'
    namespace['text_content'] = text_content
    namespace['tail'] = tail
							
								
								
									
										48
									
								
								talon/signature/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								talon/signature/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| """The package exploits machine learning for parsing message signatures. | ||||
|  | ||||
| The public interface consists of only one `extract` function: | ||||
|  | ||||
| >>> (body, signature) = extract(body, sender) | ||||
|  | ||||
| Where body is the original message `body` and `sender` corresponds to a person | ||||
| who sent the message. | ||||
|  | ||||
| When importing the package classifiers instances are loaded. | ||||
| So each process will have it's classifiers in memory. | ||||
|  | ||||
| The import of the package and the call to the `extract` function are better be | ||||
| enclosed in a try-catch block in case they fail. | ||||
|  | ||||
| .. warning:: When making changes to features or emails the classifier is | ||||
| trained against, don't forget to regenerate: | ||||
|  | ||||
| * signature/data/train.data and | ||||
| * signature/data/classifier | ||||
| """ | ||||
|  | ||||
| import os | ||||
| import sys | ||||
| from cStringIO import StringIO | ||||
|  | ||||
| from . import extraction | ||||
| from . extraction import extract | ||||
| from . learning import classifier | ||||
|  | ||||
|  | ||||
| DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') | ||||
|  | ||||
| EXTRACTOR_FILENAME = os.path.join(DATA_DIR, 'classifier') | ||||
| EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') | ||||
|  | ||||
|  | ||||
def initialize():
    """Load the signature classifier into extraction.EXTRACTOR.

    classifier.load writes progress to stdout, so stdout is temporarily
    redirected while it runs.  Raises a wrapping Exception on failure.
    """
    saved_stdout = sys.stdout
    try:
        # redirect output
        # BUGFIX: restore stdout in `finally` — previously an exception in
        # classifier.load left sys.stdout pointing at the StringIO buffer
        sys.stdout = StringIO()
        extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
                                               EXTRACTOR_DATA)
    except Exception as e:
        raise Exception(
            "Failed initializing signature parsing with classifiers", e)
    finally:
        sys.stdout = saved_stdout
							
								
								
									
										188
									
								
								talon/signature/bruteforce.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										188
									
								
								talon/signature/bruteforce.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,188 @@ | ||||
import logging

import regex as re

from talon.utils import get_delimiter
from talon.signature.constants import (SIGNATURE_MAX_LINES,
                                       TOO_LONG_SIGNATURE_LINE)

log = logging.getLogger(__name__)


# regex to fetch signature based on common signature words
RE_SIGNATURE = re.compile(r'''
               (
                   (?:
                       ^[\s]*--*[\s]*[a-z \.]*$
                       |
                       ^thanks[\s,!]*$
                       |
                       ^regards[\s,!]*$
                       |
                       ^cheers[\s,!]*$
                       |
                       ^best[ a-z]*[\s,!]*$
                   )
                   .*
               )
               ''', re.I | re.X | re.M | re.S)


# signatures appended by phone email clients
RE_PHONE_SIGNATURE = re.compile(r'''
               (
                   (?:
                       ^sent[ ]{1}from[ ]{1}my[\s,!\w]*$
                       |
                       ^sent[ ]from[ ]Mailbox[ ]for[ ]iPhone.*$
                       |
                       ^sent[ ]([\S]*[ ])?from[ ]my[ ]BlackBerry.*$
                       |
                       ^Enviado[ ]desde[ ]mi[ ]([\S]+[ ]){0,2}BlackBerry.*$
                   )
                   .*
               )
               ''', re.I | re.X | re.M | re.S)


# see _mark_candidate_indexes() for details
# c - could be signature line
# d - line starts with dashes (could be signature or list item)
# l - long line
# NOTE: the 'CANDIDAATE' misspelling is preserved — this name is referenced
# elsewhere in the module.
RE_SIGNATURE_CANDIDAATE = re.compile(r'''
    (?P<candidate>c+d)[^d]
    |
    (?P<candidate>c+d)$
    |
    (?P<candidate>c+)
    |
    (?P<candidate>d)[^d]
    |
    (?P<candidate>d)$
''', re.I | re.X | re.M | re.S)
|  | ||||
|  | ||||
def extract_signature(msg_body):
    '''
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)

    Never raises: on any internal error the original body is returned
    with a None signature.
    '''
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()
        phone_signature = None

        # strip off phone signature
        phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_signature:
            stripped_body = stripped_body[:phone_signature.start()]
            phone_signature = phone_signature.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)
        else:
            signature = signature.group()
            # when we splitlines() and then join them
            # we can lose a new line at the end
            # we did it when identifying a candidate
            # so we had to do it for stripped_body now
            stripped_body = delimiter.join(lines)
            stripped_body = stripped_body[:-len(signature)]

            if phone_signature:
                signature = delimiter.join([signature, phone_signature])

            return (stripped_body.strip(),
                    signature.strip())
    except Exception:
        # the old `except Exception, e` bound an unused name and is
        # Python-2-only syntax
        log.exception('ERROR extracting signature')
        return (msg_body, None)
|  | ||||
|  | ||||
def get_signature_candidate(lines):
    """Return lines that could hold signature

    The lines should:

    * be among last SIGNATURE_MAX_LINES non-empty lines.
    * not include first line
    * be shorter than TOO_LONG_SIGNATURE_LINE
    * not include more than one line that starts with dashes
    """
    # indexes of the non-empty lines
    non_empty = [idx for idx, line in enumerate(lines) if line.strip()]

    # an empty or single-line message carries no signature
    if len(non_empty) <= 1:
        return []

    # skip the first line, keep at most SIGNATURE_MAX_LINES from the bottom
    indexes = non_empty[1:][-SIGNATURE_MAX_LINES:]

    markers = _mark_candidate_indexes(lines, indexes)
    indexes = _process_marked_candidate_indexes(indexes, markers)

    # resolve surviving indexes back into actual lines
    if not indexes:
        return []
    return lines[indexes[0]:]
|  | ||||
|  | ||||
def _mark_candidate_indexes(lines, candidate):
    """Mark signature candidate lines with markers.

    `candidate` holds indexes into `lines`.  Returns a string with one
    marker character per candidate index:

    * c - line that could be a signature line
    * l - long line
    * d - line that starts with dashes but has other chars as well

    >>> _mark_candidate_indexes(['Some text', '', '-', 'Bob'], [0, 2, 3])
    'ccc'
    """
    # The old docstring referenced a nonexistent `_mark_candidate_lines`
    # and showed output that didn't match the code.  Also, building the
    # markers as bytearray('c'*n) with str item assignment only worked
    # on Python 2; joining characters into a str is portable and the
    # caller only reverses the markers and regex-matches them.
    markers = []
    for line_idx in candidate:
        line = lines[line_idx].strip()
        if len(line) > TOO_LONG_SIGNATURE_LINE:
            # too long to be a signature line
            markers.append('l')
        elif line.startswith('-') and line.strip('-'):
            # dashes with other characters, e.g. '--Bob'
            markers.append('d')
        else:
            markers.append('c')
    return ''.join(markers)
|  | ||||
|  | ||||
def _process_marked_candidate_indexes(candidate, markers):
    """
    Run regexes against candidate's marked indexes to strip
    signature candidate.

    >>> _process_marked_candidate_indexes([9, 12, 14, 15, 17], 'clddc')
    [15, 17]
    """
    # the pattern is written for reversed markers so it can anchor at
    # the end of the message
    match = RE_SIGNATURE_CANDIDAATE.match(markers[::-1])
    if not match:
        return []
    return candidate[-match.end('candidate'):]
							
								
								
									
										2
									
								
								talon/signature/constants.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								talon/signature/constants.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | ||||
# a signature is assumed to occupy at most this many trailing lines
SIGNATURE_MAX_LINES = 11
# lines longer than this are considered too long to be signature lines
TOO_LONG_SIGNATURE_LINE = 60
							
								
								
									
										1
									
								
								talon/signature/data/classifier
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								talon/signature/data/classifier
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										2912
									
								
								talon/signature/data/train.data
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2912
									
								
								talon/signature/data/train.data
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										116
									
								
								talon/signature/extraction.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										116
									
								
								talon/signature/extraction.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,116 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| import os | ||||
| import logging | ||||
|  | ||||
| import regex as re | ||||
| from PyML import SparseDataSet | ||||
|  | ||||
| from talon.constants import RE_DELIMITER | ||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||
|                                        TOO_LONG_SIGNATURE_LINE) | ||||
| from talon.signature.learning.featurespace import features, build_pattern | ||||
| from talon.utils import get_delimiter | ||||
| from talon.signature.bruteforce import get_signature_candidate | ||||
| from talon.signature.learning.helpers import has_signature | ||||
|  | ||||
|  | ||||
log = logging.getLogger(__name__)

# classifier instance used by is_signature_line(); stays None until it
# is assigned elsewhere (presumably by the package's initialization
# code - confirm against talon.signature)
EXTRACTOR = None

# regex signature pattern for reversed lines
# assumes that all long lines have been excluded
RE_REVERSE_SIGNATURE = re.compile(r'''
# signature should consists of blocks like this
(?:
   # it could end with empty line
   e*
   # there could be text lines but no more than 2 in a row
   (te*){,2}
   # every block should end with signature line
   s
)+
''', re.I | re.X | re.M | re.S)
|  | ||||
|  | ||||
def is_signature_line(line, sender, classifier):
    '''Checks if the line belongs to signature. Returns True or False.'''
    # build the feature-space pattern for this single line and ask the
    # trained classifier for its decision value
    pattern = build_pattern(line, features(sender))
    dataset = SparseDataSet([pattern])
    return classifier.decisionFunc(dataset, 0) > 0
|  | ||||
|  | ||||
def extract(body, sender):
    """Strips signature from the body of the message.

    Returns stripped body and signature as a tuple.
    If no signature is found the corresponding returned value is None.

    Any internal error degrades to "no signature found" so extraction
    never breaks message processing.
    """
    try:
        delimiter = get_delimiter(body)

        body = body.strip()

        if has_signature(body, sender):
            lines = body.splitlines()

            # label every line as empty/text/signature, then split off
            # the trailing signature block the markers describe
            markers = _mark_lines(lines, sender)
            text, signature = _process_marked_lines(lines, markers)

            if signature:
                text = delimiter.join(text)
                # never return an empty body: in that case treat the
                # whole message as having no signature
                if text.strip():
                    return (text, delimiter.join(signature))
    except Exception:
        # `except Exception, e` was Python-2-only syntax and `e` was
        # never used; log.exception records the traceback anyway
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)
|  | ||||
|  | ||||
def _mark_lines(lines, sender):
    """Mark message lines with markers to distinguish signature lines.

    Markers:

    * e - empty line
    * s - line identified as signature
    * t - other i.e. ordinary text line

    Returns one marker character per message line, e.g. for
    ['Some text', '', 'Bob'] with a matching sender the result
    would be 'tes'.
    """
    # Fixes: the old docstring referenced a nonexistent
    # `mark_message_lines`; bytearray('t'*n) with str item assignment
    # only worked on Python 2.  The caller just reverses the markers
    # and regex-matches them, so a plain str is equivalent.
    global EXTRACTOR

    candidate = get_signature_candidate(lines)

    # at first consider everything to be ordinary text
    markers = ['t'] * len(lines)

    # only candidate lines can belong to the signature, so only they
    # need to be classified; walk them bottom up
    offset = len(lines) - len(candidate)
    for i, line in reversed(list(enumerate(candidate))):
        # markers are indexed by message line, not by candidate line
        j = offset + i
        if not line.strip():
            markers[j] = 'e'
        elif is_signature_line(line, sender, EXTRACTOR):
            markers[j] = 's'

    return ''.join(markers)
|  | ||||
|  | ||||
def _process_marked_lines(lines, markers):
    """Run regexes against message's marked lines to strip signature.

    >>> _process_marked_lines(['Some text', '', 'Bob'], 'tes')
    (['Some text', ''], ['Bob'])
    """
    # the signature pattern is written for reversed markers so it can
    # anchor at the end of the message
    match = RE_REVERSE_SIGNATURE.match(markers[::-1])
    if not match:
        return (lines, None)

    split_at = len(lines) - match.end()
    return (lines[:split_at], lines[split_at:])
							
								
								
									
										0
									
								
								talon/signature/learning/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								talon/signature/learning/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										36
									
								
								talon/signature/learning/classifier.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										36
									
								
								talon/signature/learning/classifier.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,36 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| """The module's functions could init, train, save and load a classifier. | ||||
| The classifier could be used to detect if a certain line of the message | ||||
| body belongs to the signature. | ||||
| """ | ||||
|  | ||||
| import os | ||||
| import sys | ||||
|  | ||||
| from PyML import SparseDataSet, SVM | ||||
|  | ||||
|  | ||||
def init():
    '''Inits classifier with optimal options.'''
    # liblinear optimization with C=10 - the tuned configuration
    classifier = SVM(C=10, optimization='liblinear')
    return classifier
|  | ||||
|  | ||||
def train(classifier, train_data_filename, save_classifier_filename=None):
    '''Trains and saves classifier so that it could be easily loaded later.'''
    # labels live in the last column of the training file
    dataset = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier.train(dataset)
    if save_classifier_filename:
        classifier.save(save_classifier_filename)
    return classifier
|  | ||||
|  | ||||
def load(saved_classifier_filename, train_data_filename):
    """Loads saved classifier.

    Classifier should be loaded with the same data it was trained against
    """
    # labels live in the last column of the training file
    training_data = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier = init()
    classifier.load(saved_classifier_filename, training_data)
    return classifier
							
								
								
									
										161
									
								
								talon/signature/learning/dataset.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										161
									
								
								talon/signature/learning/dataset.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,161 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| """The module's functions build datasets to train/assess classifiers. | ||||
|  | ||||
| For signature detection the input should be a folder with two directories | ||||
| that contain emails with and without signatures. | ||||
|  | ||||
| For signature extraction the input should be a folder with annotated emails. | ||||
| To indicate that a line is a signature line use #sig# at the start of the line. | ||||
|  | ||||
| A sender of an email could be specified in the same file as | ||||
| the message body e.g. when .eml format is used or in a separate file. | ||||
|  | ||||
| In the letter case it is assumed that a body filename ends with the `_body` | ||||
| suffix and the corresponding sender file has the same name except for the | ||||
| suffix which should be `_sender`. | ||||
| """ | ||||
|  | ||||
| import os | ||||
| import regex as re | ||||
|  | ||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | ||||
| from talon.signature.learning.featurespace import build_pattern, features | ||||
|  | ||||
|  | ||||
# filename suffixes that pair a message body file with its sender file
SENDER_SUFFIX = '_sender'
BODY_SUFFIX = '_body'

# line-level annotations used in hand-labeled training emails
SIGNATURE_ANNOTATION = '#sig#'
REPLY_ANNOTATION = '#reply#'

ANNOTATIONS = [SIGNATURE_ANNOTATION, REPLY_ANNOTATION]
|  | ||||
|  | ||||
def is_sender_filename(filename):
    """Checks if the file could contain message sender's name."""
    # sender files are distinguished purely by their filename suffix
    return filename.endswith(SENDER_SUFFIX)
|  | ||||
|  | ||||
def build_sender_filename(msg_filename):
    """By the message filename gives expected sender's filename."""
    # swap the trailing `_body` suffix for `_sender`
    base = msg_filename[:-len(BODY_SUFFIX)]
    return base + SENDER_SUFFIX
|  | ||||
|  | ||||
def parse_msg_sender(filename, sender_known=True):
    """Given a filename returns the sender and the message.

    Here the message is assumed to be a whole MIME message or just
    message body.

    >>> sender, msg = parse_msg_sender('msg.eml')
    >>> sender, msg = parse_msg_sender('msg_body')

    If you don't want to consider the sender's name in your classification
    algorithm:
    >>> parse_msg_sender(filename, False)
    """
    sender, msg = None, None

    # sender files themselves never hold a message body
    if not os.path.isfile(filename) or is_sender_filename(filename):
        return (sender, msg)

    with open(filename) as f:
        msg = f.read()

    sender = u''
    if sender_known:
        sender_filename = build_sender_filename(filename)
        if os.path.exists(sender_filename):
            with open(sender_filename) as sender_file:
                sender = sender_file.read().strip()
        else:
            # no companion sender file: fall back to the `From:` header
            # inside the message itself, if one exists
            for line in msg.splitlines():
                match = re.match('From:(.*)', line)
                if match:
                    sender = match.group(1)
                    break

    return (sender, msg)
|  | ||||
|  | ||||
def build_detection_class(folder, dataset_filename,
                          label, sender_known=True):
    """Builds signature detection class.

    Signature detection dataset includes patterns for two classes:
    * class for positive patterns (goes with label 1)
    * class for negative patterns (goes with label -1)

    The patterns are built of emails from `folder` and appended to
    dataset file.

    >>> build_detection_class('emails/P', 'train.data', 1)
    """
    # fixed: the docstring example called a nonexistent
    # `build_signature_detection_class`
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if sender is None or msg is None:
                # not a message file (e.g. a sender file) - skip it
                continue
            # strip the manual annotations so they don't leak into
            # the feature values
            msg = re.sub('|'.join(ANNOTATIONS), '', msg)
            X = build_pattern(msg, features(sender))
            X.append(label)
            labeled_pattern = ','.join([str(e) for e in X])
            dataset.write(labeled_pattern + '\n')
|  | ||||
|  | ||||
def build_detection_dataset(folder, dataset_filename,
                            sender_known=True):
    """Builds signature detection dataset using emails from folder.

    folder should have the following structure:
    x-- folder
    |    x-- P
    |    |    | -- positive sample email 1
    |    |    | -- positive sample email 2
    |    |    | -- ...
    |    x-- N
    |    |    | -- negative sample email 1
    |    |    | -- negative sample email 2
    |    |    | -- ...

    If the dataset file already exist it is rewritten.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    # bug fix: `sender_known` used to be silently dropped here instead
    # of being forwarded to build_detection_class
    build_detection_class(os.path.join(folder, u'P'),
                          dataset_filename, 1, sender_known)
    build_detection_class(os.path.join(folder, u'N'),
                          dataset_filename, -1, sender_known)
|  | ||||
|  | ||||
def build_extraction_dataset(folder, dataset_filename,
                             sender_known=True):
    """Builds signature extraction dataset using emails in the `folder`.

    The emails in the `folder` should be annotated i.e. signature lines
    should be marked with `#sig#`.

    If the dataset file already exists it is rewritten.
    """
    if os.path.exists(dataset_filename):
        os.remove(dataset_filename)
    with open(dataset_filename, 'a') as dataset:
        for filename in os.listdir(folder):
            filename = os.path.join(folder, filename)
            sender, msg = parse_msg_sender(filename, sender_known)
            if not sender or not msg:
                continue
            lines = msg.splitlines()
            # walk at most SIGNATURE_MAX_LINES lines from the bottom;
            # lines[-i] is the i-th line counted from the end
            for i in xrange(1, min(SIGNATURE_MAX_LINES,
                                   len(lines)) + 1):
                line = lines[-i]
                label = -1
                # idiom fix: startswith() instead of slicing the line
                # and comparing the prefix manually
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                elif line.startswith(REPLY_ANNOTATION):
                    line = line[len(REPLY_ANNOTATION):]

                X = build_pattern(line, features(sender))
                X.append(label)
                labeled_pattern = ','.join([str(e) for e in X])
                dataset.write(labeled_pattern + '\n')
							
								
								
									
										73
									
								
								talon/signature/learning/featurespace.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										73
									
								
								talon/signature/learning/featurespace.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,73 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| """ The module provides functions for convertion of a message body/body lines | ||||
| into classifiers features space. | ||||
|  | ||||
| The body and the message sender string are converted into unicode before | ||||
| applying features to them. | ||||
| """ | ||||
|  | ||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | ||||
| from talon.signature.learning.helpers import * | ||||
|  | ||||
|  | ||||
def features(sender=''):
    '''Returns a list of signature features.

    Each feature is a callable taking a single message line and
    returning 1 or 0.  The order of this list is significant: it fixes
    the position of each feature inside the vectors built by
    apply_features() and build_pattern(), so entries must not be
    reordered.
    '''
    return [
        # This one isn't from paper.
        # Meant to match companies names, sender's names, address.
        many_capitalized_words,
        # This one is not from paper.
        # Line is too long.
        # This one is less aggressive than `Line is too short`
        lambda line: 1 if len(line) > 60 else 0,
        # Line contains email pattern.
        binary_regex_search(RE_EMAIL),
        # Line contains url.
        binary_regex_search(RE_URL),
        # Line contains phone number pattern.
        binary_regex_search(RE_RELAX_PHONE),
        # Line matches the regular expression "^[\s]*---*[\s]*$".
        binary_regex_match(RE_SEPARATOR),
        # Line has a sequence of 10 or more special characters.
        binary_regex_search(RE_SPECIAL_CHARS),
        # Line contains any typical signature words.
        binary_regex_search(RE_SIGNATURE_WORDS),
        # Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
        binary_regex_search(RE_NAME),
        # Percentage of punctuation symbols in the line is larger than 50%
        lambda line: 1 if punctuation_percent(line) > 50 else 0,
        # Percentage of punctuation symbols in the line is larger than 90%
        lambda line: 1 if punctuation_percent(line) > 90 else 0,
        # Line contains sender's name or a part of it.
        contains_sender_names(sender)
        ]
|  | ||||
|  | ||||
def apply_features(body, features):
    '''Applies features to message body lines.

    Returns list of lists. Each of the lists corresponds to the body line
    and is constituted by the numbers of features occurances (0 or 1).
    E.g. if element j of list i equals 1 this means that
    feature j occured in line i (counting from the last line of the body).
    '''
    # drop blank lines, then keep only the message tail where a
    # signature could plausibly live
    non_empty = [line for line in body.splitlines() if line.strip()]
    tail = non_empty[-SIGNATURE_MAX_LINES:]

    if not tail:
        # no usable lines at all: fall back to an all-zeros pattern
        return [[0 for f in features]]

    return [[f(line) for f in features] for line in tail]
|  | ||||
|  | ||||
def build_pattern(body, features):
    '''Converts body into a pattern i.e. a point in the features space.

    Applies features to the body lines and sums up the results.
    Elements of the pattern indicate how many times a certain feature
    occured in the last lines of the body.
    '''
    # apply_features always returns at least one row, so the column-wise
    # sum is well defined; this replaces the bare `reduce` builtin,
    # which no longer exists in Python 3
    line_patterns = apply_features(body, features)
    return [sum(column) for column in zip(*line_patterns)]
							
								
								
									
										233
									
								
								talon/signature/learning/helpers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										233
									
								
								talon/signature/learning/helpers.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,233 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| """ The module provides: | ||||
| * functions used when evaluating signature's features | ||||
| * regexp's constants used when evaluating signature's features | ||||
|  | ||||
| """ | ||||
|  | ||||
| import unicodedata | ||||
| import regex as re | ||||
|  | ||||
| from talon.utils import to_unicode | ||||
|  | ||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | ||||
|  | ||||
|  | ||||
# short alias: this module compiles many small patterns below
rc = re.compile

RE_EMAIL = rc('@')
RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}')
RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = rc('^[\s]*---*[\s]*$')

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = rc(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
                       '[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))

RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|'
                         '^sent[ ]{1}from[ ]{1}my[\s,!\w]*$|BR|(S|s)incerely|'
                         '(C|c)orporation|Group'))

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

# Pattern to match if e.g. 'Sender:' header field has sender names.
SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*'
RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN)

# Reply line clue line endings, as in regular expression:
# " wrote:$" or " writes:$"
RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$')

# words opening with '(', '+' or a digit are skipped when counting
# capitalized words in capitalized_words_percent()
INVALID_WORD_START = rc('\(|\+|[\d]')

# words that are never treated as sender names by extract_names()
BAD_SENDER_NAMES = [
    # known mail domains
    'hotmail', 'gmail', 'yandex', 'mail', 'yahoo', 'mailgun', 'mailgunhq',
    'example',
    # first level domains
    'com', 'org', 'net', 'ru',
    # bad words
    'mailto'
    ]
|  | ||||
|  | ||||
def binary_regex_search(prog):
    '''Returns a function that returns 1 or 0 depending on regex search result.

    If regular expression compiled into prog is present in a string
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_search(re.compile("12"))("12")
    1
    >>> binary_regex_search(re.compile("12"))("34")
    0
    '''
    def searcher(s):
        # search() matches anywhere in the string
        return 1 if prog.search(s) else 0
    return searcher
|  | ||||
|  | ||||
def binary_regex_match(prog):
    '''Returns a function that returns 1 or 0 depending on regex match result.

    If a string matches regular expression compiled into prog
    the result of calling the returned function with the string will be 1
    and 0 otherwise.

    >>> import regex as re
    >>> binary_regex_match(re.compile("12"))("12 3")
    1
    >>> binary_regex_match(re.compile("12"))("3 12")
    0
    '''
    def matcher(s):
        # match() is anchored at the start of the string, unlike search()
        return 1 if prog.match(s) else 0
    return matcher
|  | ||||
|  | ||||
def flatten_list(list_to_flatten):
    """Simple list comprehesion to flatten list.

    Flattens exactly one level of nesting; every element of the input
    must itself be iterable.

    >>> flatten_list([[1, 2], [3, 4, 5]])
    [1, 2, 3, 4, 5]
    >>> flatten_list([[1], [[2]]])
    [1, [2]]
    >>> flatten_list([1, [2]])
    Traceback (most recent call last):
    ...
    TypeError: 'int' object is not iterable
    """
    flat = []
    for sublist in list_to_flatten:
        flat.extend(sublist)
    return flat
|  | ||||
|  | ||||
def contains_sender_names(sender):
    '''Returns a function to search sender\'s name or its part.

    >>> feature = contains_sender_names("Sergey N.  Obukhov <xxx@example.com>")
    >>> feature("Sergey Obukhov")
    1
    >>> feature("BR, Sergey N.")
    1
    >>> feature("Sergey")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("Serobnic")
    1
    >>> contains_sender_names("<serobnic@mail.ru>")("serobnic")
    1
    '''
    names = '( |$)|'.join(flatten_list([[e, e.capitalize()]
                                        for e in extract_names(sender)]))
    # NOTE(review): when no names could be extracted the raw sender
    # string is used as the pattern without escaping; regex
    # metacharacters in it could misbehave - confirm callers never
    # pass such values
    names = names or sender
    if names != '':
        return binary_regex_search(re.compile(names))
    # consistency fix: every other feature returns int 0/1, so the
    # degenerate case returns 0 instead of False
    return lambda s: 0
|  | ||||
|  | ||||
def extract_names(sender):
    """Tries to extract sender's names from `From:` header.

    It could extract not only the actual names but e.g.
    the name of the company, parts of email, etc.

    >>> extract_names('Sergey N.  Obukhov <serobnic@mail.ru>')
    ['Sergey', 'Obukhov', 'serobnic']
    >>> extract_names('')
    []
    """
    sender = to_unicode(sender)
    # Remove non-alphabetical characters
    sender = "".join([char if char.isalpha() else ' ' for char in sender])
    # Remove too short words and words from "black" list i.e.
    # words like `ru`, `gmail`, `com`, `org`, etc.
    words = [word for word in sender.split() if len(word) > 1 and
             word not in BAD_SENDER_NAMES]
    # Remove duplicates, preserving first-occurrence order.
    # (The previous list(set(...)) made the result order
    # nondeterministic, so the doctest above could fail at random.)
    names = []
    for word in words:
        if word not in names:
            names.append(word)
    return names
|  | ||||
|  | ||||
def categories_percent(s, categories):
    '''Returns category characters persent.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    '''
    s = to_unicode(s)
    if not s:
        return 0
    # count characters whose unicode category is one of `categories`
    matched = sum(1 for c in s if unicodedata.category(c) in categories)
    return 100 * float(matched) / len(s)
|  | ||||
|  | ||||
def punctuation_percent(s):
    '''Returns punctuation persent.

    >>> punctuation_percent("qqq ggg hhh")
    0.0
    >>> punctuation_percent("q,w.")
    50.0
    '''
    # 'Po' is the unicode category "Punctuation, other"
    return categories_percent(s, ['Po'])
|  | ||||
|  | ||||
def capitalized_words_percent(s):
    '''Returns capitalized words percent.'''
    s = to_unicode(s)
    words = [w for w in re.split('\s', s) if w.strip()]

    capitalized = 0
    valid = 0
    for word in words:
        if INVALID_WORD_START.match(word):
            # words opening with '(', '+' or a digit don't count
            continue
        valid += 1
        if word[0].isupper():
            capitalized += 1

    # a single-word string never counts as "capitalized words"
    if valid > 0 and len(words) > 1:
        return 100 * float(capitalized) / valid

    return 0
|  | ||||
|  | ||||
def many_capitalized_words(s):
    """Checks that a high percentage of the words are capitalized.

    Returns 1 if more than 66% of the words in `s` are capitalized
    and 0 otherwise.
    """
    # doc fix: the old docstring claimed this "returns a function" with
    # a 65% threshold; the code returns 1/0 directly and uses > 66
    return 1 if capitalized_words_percent(s) > 66 else 0
|  | ||||
|  | ||||
def has_signature(body, sender):
    '''Checks if the body has signature. Returns True or False.'''
    non_empty = [line for line in body.splitlines() if line.strip()]
    candidate = non_empty[-SIGNATURE_MAX_LINES:]

    # build the sender-names feature once: the old code recompiled the
    # names regex for every candidate line
    sender_check = contains_sender_names(sender)

    upvotes = 0
    for line in candidate:
        # we check lines for sender's name, phone, email and url,
        # those signature lines don't take more then 27 lines
        if len(line.strip()) > 27:
            continue
        if sender_check(line):
            return True
        if (binary_regex_search(RE_RELAX_PHONE)(line) +
                binary_regex_search(RE_EMAIL)(line) +
                binary_regex_search(RE_URL)(line) == 1):
            upvotes += 1

    if upvotes > 1:
        return True
    # bug fix: the function used to fall off the end and return None
    # although the docstring promises True or False
    return False
							
								
								
									
										76
									
								
								talon/utils.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								talon/utils.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,76 @@ | ||||
| # coding:utf-8 | ||||
|  | ||||
import logging
from random import SystemRandom, shuffle

from talon.constants import RE_DELIMITER
|  | ||||
|  | ||||
| log = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
def safe_format(format_string, *args, **kwargs):
    """
    Helper: formats string with any combination of bytestrings/unicode
    strings without raising exceptions.

    On encoding errors everything is converted to utf-8 bytestrings and
    the format is retried; on any other formatting error an empty
    unicode string is returned (best-effort, used for logging).
    """
    try:
        if not args and not kwargs:
            return format_string
        else:
            return format_string.format(*args, **kwargs)

    # catch encoding errors and transform everything into utf-8 string
    # before logging:
    except (UnicodeEncodeError, UnicodeDecodeError):
        format_string = to_utf8(format_string)
        args = [to_utf8(p) for p in args]
        kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()}
        return format_string.format(*args, **kwargs)

    # ignore other formatting errors — but not Exception-free ones:
    # the previous bare `except:` also swallowed KeyboardInterrupt
    # and SystemExit
    except Exception:
        return u''
|  | ||||
|  | ||||
def to_unicode(str_or_unicode, precise=False):
    """
    Safely returns a unicode version of a given string
    >>> utils.to_unicode('привет')
        u'привет'
    >>> utils.to_unicode(u'привет')
        u'привет'
    If `precise` flag is True, tries to guess the correct encoding first.

    NOTE(review): `detect_encoding` is neither defined nor imported in
    this module — `precise=True` looks like it would raise NameError;
    confirm where it is expected to come from.
    """
    if precise:
        encoding = detect_encoding(str_or_unicode)
    else:
        encoding = 'utf-8'
    if isinstance(str_or_unicode, str):
        # decode bytestrings, replacing undecodable bytes with U+FFFD
        return unicode(str_or_unicode, encoding, 'replace')
    return str_or_unicode
|  | ||||
|  | ||||
def to_utf8(str_or_unicode):
    """
    Safely returns a UTF-8 version of a given string
    >>> utils.to_utf8(u'hi')
        'hi'
    """
    is_text = isinstance(str_or_unicode, unicode)
    # encode unicode (dropping unencodable chars); stringify anything else
    if is_text:
        return str_or_unicode.encode("utf-8", "ignore")
    return str(str_or_unicode)
|  | ||||
|  | ||||
def random_token(length=7):
    """Returns a random lowercase-alphanumeric token of `length` chars.

    Draws characters *with* replacement from OS entropy (SystemRandom),
    so any length is supported and characters may repeat.  The previous
    shuffle-and-slice approach silently capped tokens at 36 characters,
    never repeated a character (reducing entropy), and used the
    non-cryptographic default PRNG.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789'
    rng = SystemRandom()
    return ''.join(rng.choice(alphabet) for _ in range(length))
|  | ||||
|  | ||||
def get_delimiter(msg_body):
    """Returns the line delimiter used in `msg_body`.

    The first match of RE_DELIMITER ('\\r\\n' or '\\n') wins; defaults
    to '\\n' when the body has no line breaks at all.
    """
    match = RE_DELIMITER.search(msg_body)
    return match.group() if match else '\n'
		Reference in New Issue
	
	Block a user