Compare commits
	
		
			66 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | f04b872e14 | ||
|  | e61894e425 | ||
|  | 35fbdaadac | ||
|  | 8441bc7328 | ||
|  | 37c95ff97b | ||
|  | 5b1ca33c57 | ||
|  | ec8e09b34e | ||
|  | bcf97eccfa | ||
|  | f53b5cc7a6 | ||
|  | 27adde7aa7 | ||
|  | a9719833e0 | ||
|  | 7bf37090ca | ||
|  | 44fcef7123 | ||
|  | 69a44b10a1 | ||
|  | b085e3d049 | ||
|  | 4b953bcddc | ||
|  | 315eaa7080 | ||
|  | 5a9bc967f1 | ||
|  | a0d7236d0b | ||
|  | 21e9a31ffe | ||
|  | 4ee46c0a97 | ||
|  | 10d9a930f9 | ||
|  | a21ccdb21b | ||
|  | 7cdd7a8f35 | ||
|  | 01e03a47e0 | ||
|  | 1b9a71551a | ||
|  | 911efd1db4 | ||
|  | e61f0a68c4 | ||
|  | cefbcffd59 | ||
|  | 622a98d6d5 | ||
|  | 7901f5d1dc | ||
|  | 555c34d7a8 | ||
|  | dcc0d1de20 | ||
|  | 7bdf4d622b | ||
|  | 4a7207b0d0 | ||
|  | ad9c2ca0e8 | ||
|  | da998ddb60 | ||
|  | 07f68815df | ||
|  | 35645f9ade | ||
|  | 7c3d91301c | ||
|  | 5bcf7403ad | ||
|  | 2d6c092b65 | ||
|  | 6d0689cad6 | ||
|  | 3f80e93ee0 | ||
|  | 1b18abab1d | ||
|  | 03dd5af5ab | ||
|  | dfba82b07c | ||
|  | 08ca02c87f | ||
|  | b61f4ec095 | ||
|  | 9dbe6a494b | ||
|  | 44e70939d6 | ||
|  | ab6066eafa | ||
|  | 42258cdd36 | ||
|  | d3de9e6893 | ||
|  | 333beb94af | ||
|  | f3c0942c49 | ||
|  | 02adf53ab9 | ||
|  | 3497b5cab4 | ||
|  | 9c17dca17c | ||
|  | de342d3177 | ||
|  | 743b452daf | ||
|  | c762f3c337 | ||
|  | 31803d41bc | ||
|  | 2ecd9779fc | ||
|  | 5a7047233e | ||
|  | 999e9c3725 | 
| @@ -1,5 +1,3 @@ | |||||||
| recursive-include tests * |  | ||||||
| recursive-include talon * |  | ||||||
| recursive-exclude tests *.pyc *~ | recursive-exclude tests *.pyc *~ | ||||||
| recursive-exclude talon *.pyc *~ | recursive-exclude talon *.pyc *~ | ||||||
| include train.data | include train.data | ||||||
|   | |||||||
							
								
								
									
										13
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								README.rst
									
									
									
									
									
								
							| @@ -95,7 +95,7 @@ classifiers. The core of machine learning algorithm lays in | |||||||
| apply to a message (``featurespace.py``), how data sets are built | apply to a message (``featurespace.py``), how data sets are built | ||||||
| (``dataset.py``), classifier’s interface (``classifier.py``). | (``dataset.py``), classifier’s interface (``classifier.py``). | ||||||
|  |  | ||||||
| The data used for training is taken from our personal email | Currently the data used for training is taken from our personal email | ||||||
| conversations and from `ENRON`_ dataset. As a result of applying our set | conversations and from `ENRON`_ dataset. As a result of applying our set | ||||||
| of features to the dataset we provide files ``classifier`` and | of features to the dataset we provide files ``classifier`` and | ||||||
| ``train.data`` that don’t have any personal information but could be | ``train.data`` that don’t have any personal information but could be | ||||||
| @@ -116,8 +116,19 @@ or | |||||||
|     from talon.signature.learning.classifier import train, init |     from talon.signature.learning.classifier import train, init | ||||||
|     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) |     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) | ||||||
|  |  | ||||||
|  | Open-source Dataset | ||||||
|  | ------------------- | ||||||
|  |  | ||||||
|  | Recently we started a `forge`_ project to create an open-source, annotated dataset of raw emails. In the project we | ||||||
|  | used a subset of `ENRON`_ data, cleansed of private, health and financial information by `EDRM`_. At the moment over 190 | ||||||
|  | emails are annotated. Any contribution and collaboration on the project are welcome. Once the dataset is ready we plan to | ||||||
|  | start using it for talon. | ||||||
|  |  | ||||||
| .. _scikit-learn: http://scikit-learn.org | .. _scikit-learn: http://scikit-learn.org | ||||||
| .. _ENRON: https://www.cs.cmu.edu/~enron/ | .. _ENRON: https://www.cs.cmu.edu/~enron/ | ||||||
|  | .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | ||||||
|  | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
|  |  | ||||||
| Research | Research | ||||||
| -------- | -------- | ||||||
|   | |||||||
							
								
								
									
										38
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										38
									
								
								setup.py
									
									
									
									
									
								
							| @@ -1,8 +1,35 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from setuptools import setup, find_packages | from setuptools import setup, find_packages | ||||||
|  | from setuptools.command.install import install | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class InstallCommand(install): | ||||||
|  |     user_options = install.user_options + [ | ||||||
|  |         ('no-ml', None, "Don't install without Machine Learning modules."), | ||||||
|  |     ] | ||||||
|  |  | ||||||
|  |     boolean_options = install.boolean_options + ['no-ml'] | ||||||
|  |  | ||||||
|  |     def initialize_options(self): | ||||||
|  |         install.initialize_options(self) | ||||||
|  |         self.no_ml = None | ||||||
|  |  | ||||||
|  |     def finalize_options(self): | ||||||
|  |         install.finalize_options(self) | ||||||
|  |         if self.no_ml: | ||||||
|  |             dist = self.distribution | ||||||
|  |             dist.packages=find_packages(exclude=[ | ||||||
|  |                 'tests', | ||||||
|  |                 'tests.*', | ||||||
|  |                 'talon.signature', | ||||||
|  |                 'talon.signature.*', | ||||||
|  |             ]) | ||||||
|  |             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: | ||||||
|  |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.1', |       version='1.3.1', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -10,7 +37,10 @@ setup(name='talon', | |||||||
|       author_email='admin@mailgunhq.com', |       author_email='admin@mailgunhq.com', | ||||||
|       url='https://github.com/mailgun/talon', |       url='https://github.com/mailgun/talon', | ||||||
|       license='APACHE2', |       license='APACHE2', | ||||||
|       packages=find_packages(exclude=['tests']), |       cmdclass={ | ||||||
|  |           'install': InstallCommand, | ||||||
|  |       }, | ||||||
|  |       packages=find_packages(exclude=['tests', 'tests.*']), | ||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
| @@ -21,7 +51,9 @@ setup(name='talon', | |||||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||||
|           'chardet>=1.0.1', |           'chardet>=1.0.1', | ||||||
|           'cchardet>=0.3.5', |           'cchardet>=0.3.5', | ||||||
|           'cssselect' |           'cssselect', | ||||||
|  |           'six>=1.10.0', | ||||||
|  |           'html5lib' | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|   | |||||||
| @@ -1,7 +1,13 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from talon.quotations import register_xpath_extensions | from talon.quotations import register_xpath_extensions | ||||||
| from talon import signature | try: | ||||||
|  |     from talon import signature | ||||||
|  |     ML_ENABLED = True | ||||||
|  | except ImportError: | ||||||
|  |     ML_ENABLED = False | ||||||
|  |  | ||||||
|  |  | ||||||
| def init(): | def init(): | ||||||
|     register_xpath_extensions() |     register_xpath_extensions() | ||||||
|  |     if ML_ENABLED: | ||||||
|         signature.initialize() |         signature.initialize() | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,8 +3,10 @@ The module's functions operate on message bodies trying to extract original | |||||||
| messages (without quoted messages) from html | messages (without quoted messages) from html | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
|  | from talon.utils import cssselect  | ||||||
|  |  | ||||||
| CHECKPOINT_PREFIX = '#!%!' | CHECKPOINT_PREFIX = '#!%!' | ||||||
| CHECKPOINT_SUFFIX = '!%!#' | CHECKPOINT_SUFFIX = '!%!#' | ||||||
| @@ -12,6 +14,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) | |||||||
|  |  | ||||||
| # HTML quote indicators (tag ids) | # HTML quote indicators (tag ids) | ||||||
| QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | ||||||
|  | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
|  |  | ||||||
| def add_checkpoint(html_note, counter): | def add_checkpoint(html_note, counter): | ||||||
| @@ -76,8 +79,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('div.gmail_quote') |     gmail_quote = cssselect('div.gmail_quote', html_message) | ||||||
|     if gmail_quote: |     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -85,9 +88,12 @@ def cut_gmail_quote(html_message): | |||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 |         #outlook 2007, 2010 (international) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2007, 2010 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
| @@ -130,7 +136,7 @@ def cut_microsoft_quote(html_message): | |||||||
| def cut_by_id(html_message): | def cut_by_id(html_message): | ||||||
|     found = False |     found = False | ||||||
|     for quote_id in QUOTE_IDS: |     for quote_id in QUOTE_IDS: | ||||||
|         quote = html_message.cssselect('#{}'.format(quote_id)) |         quote = cssselect('#{}'.format(quote_id), html_message) | ||||||
|         if quote: |         if quote: | ||||||
|             found = True |             found = True | ||||||
|             quote[0].getparent().remove(quote[0]) |             quote[0].getparent().remove(quote[0]) | ||||||
| @@ -172,8 +178,23 @@ def cut_from_block(html_message): | |||||||
|             parent_div_is_all_content = ( |             parent_div_is_all_content = ( | ||||||
|                 maybe_body is not None and maybe_body.tag == 'body' and |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|                 len(maybe_body.getchildren()) == 1) |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|             if not parent_div_is_all_content: |             if not parent_div_is_all_content: | ||||||
|                 block.getparent().remove(block) |                 parent = block.getparent() | ||||||
|  |                 next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove all tags after found From block | ||||||
|  |                 # (From block and quoted message are in separate divs) | ||||||
|  |                 while next_sibling is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |                     block = next_sibling | ||||||
|  |                     next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove the last sibling (or the | ||||||
|  |                 # From block if no siblings) | ||||||
|  |                 if block is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |  | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|             return False |             return False | ||||||
| @@ -185,7 +206,17 @@ def cut_from_block(html_message): | |||||||
|          "//*[starts-with(mg:tail(), 'Date:')]")) |          "//*[starts-with(mg:tail(), 'Date:')]")) | ||||||
|     if block: |     if block: | ||||||
|         block = block[0] |         block = block[0] | ||||||
|  |  | ||||||
|  |         if RE_FWD.match(block.getparent().text or ''): | ||||||
|  |             return False | ||||||
|  |          | ||||||
|         while(block.getnext() is not None): |         while(block.getnext() is not None): | ||||||
|             block.getparent().remove(block.getnext()) |             block.getparent().remove(block.getnext()) | ||||||
|         block.getparent().remove(block) |         block.getparent().remove(block) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  | def cut_zimbra_quote(html_message): | ||||||
|  |     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') | ||||||
|  |     if zDivider: | ||||||
|  |         zDivider[0].getparent().remove(zDivider[0]) | ||||||
|  |         return True | ||||||
|   | |||||||
| @@ -5,14 +5,18 @@ The module's functions operate on message bodies trying to extract | |||||||
| original messages (without quoted messages) | original messages (without quoted messages) | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import regex as re | import regex as re | ||||||
| import logging | import logging | ||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | from lxml import html, etree | ||||||
|  |  | ||||||
| from talon.utils import get_delimiter, html_to_text | from talon.utils import (get_delimiter, html_tree_to_text, | ||||||
|  |                          html_document_fromstring) | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
|  | from six.moves import range | ||||||
|  | import six | ||||||
|  |  | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
| @@ -137,13 +141,20 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? . | |||||||
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     # <date> <person> |  | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), |  | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
|     RE_ON_DATE_WROTE_SMB, |     RE_ON_DATE_WROTE_SMB, | ||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|  |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|  |     # bob@xxx.mailgun.org> написал: | ||||||
|  |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), | ||||||
|  |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|  |     # bob@example.com>: | ||||||
|  |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), | ||||||
|  |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:') |                '( \S+){3,6}@\S+:'), | ||||||
|  |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |     re.compile('Sent from Samsung .*@.*> wrote') | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -154,6 +165,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") | |||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 4 | SPLITTER_MAX_LINES = 4 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
|  | # extensive research shows that exceeding this limit | ||||||
|  | # leads to excessive processing time | ||||||
|  | MAX_HTML_LEN = 2794202 | ||||||
|  |  | ||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
| @@ -184,7 +198,7 @@ def mark_message_lines(lines): | |||||||
|     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) |     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question']) | ||||||
|     'tsem' |     'tsem' | ||||||
|     """ |     """ | ||||||
|     markers = bytearray(len(lines)) |     markers = ['e' for _ in lines] | ||||||
|     i = 0 |     i = 0 | ||||||
|     while i < len(lines): |     while i < len(lines): | ||||||
|         if not lines[i].strip(): |         if not lines[i].strip(): | ||||||
| @@ -200,7 +214,7 @@ def mark_message_lines(lines): | |||||||
|             if splitter: |             if splitter: | ||||||
|                 # append as many splitter markers as lines in splitter |                 # append as many splitter markers as lines in splitter | ||||||
|                 splitter_lines = splitter.group().splitlines() |                 splitter_lines = splitter.group().splitlines() | ||||||
|                 for j in xrange(len(splitter_lines)): |                 for j in range(len(splitter_lines)): | ||||||
|                     markers[i + j] = 's' |                     markers[i + j] = 's' | ||||||
|  |  | ||||||
|                 # skip splitter lines |                 # skip splitter lines | ||||||
| @@ -210,7 +224,7 @@ def mark_message_lines(lines): | |||||||
|                 markers[i] = 't' |                 markers[i] = 't' | ||||||
|         i += 1 |         i += 1 | ||||||
|  |  | ||||||
|     return markers |     return ''.join(markers) | ||||||
|  |  | ||||||
|  |  | ||||||
| def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | ||||||
| @@ -224,6 +238,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | |||||||
|     return_flags = [were_lines_deleted, first_deleted_line, |     return_flags = [were_lines_deleted, first_deleted_line, | ||||||
|                     last_deleted_line] |                     last_deleted_line] | ||||||
|     """ |     """ | ||||||
|  |     markers = ''.join(markers) | ||||||
|     # if there are no splitter there should be no markers |     # if there are no splitter there should be no markers | ||||||
|     if 's' not in markers and not re.search('(me*){3}', markers): |     if 's' not in markers and not re.search('(me*){3}', markers): | ||||||
|         markers = markers.replace('m', 't') |         markers = markers.replace('m', 't') | ||||||
| @@ -269,10 +284,15 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | |||||||
|     Replaces link brackets so that they couldn't be taken for quotation marker. |     Replaces link brackets so that they couldn't be taken for quotation marker. | ||||||
|     Splits line in two if splitter pattern preceded by some text on the same |     Splits line in two if splitter pattern preceded by some text on the same | ||||||
|     line (done only for 'On <date> <person> wrote:' pattern). |     line (done only for 'On <date> <person> wrote:' pattern). | ||||||
|  |  | ||||||
|  |     Converts msg_body into a unicode string. | ||||||
|     """ |     """ | ||||||
|     # normalize links i.e. replace '<', '>' wrapping the link with some symbols |     # normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||||
|     # so that '>' closing the link couldn't be mistakenly taken for quotation |     # so that '>' closing the link couldn't be mistakenly taken for quotation | ||||||
|     # marker. |     # marker. | ||||||
|  |     if isinstance(msg_body, bytes): | ||||||
|  |         msg_body = msg_body.decode('utf8') | ||||||
|  |  | ||||||
|     def link_wrapper(link): |     def link_wrapper(link): | ||||||
|         newline_index = msg_body[:link.start()].rfind("\n") |         newline_index = msg_body[:link.start()].rfind("\n") | ||||||
|         if msg_body[newline_index + 1] == ">": |         if msg_body[newline_index + 1] == ">": | ||||||
| @@ -335,16 +355,51 @@ def extract_from_html(msg_body): | |||||||
|     then extracting quotations from text, |     then extracting quotations from text, | ||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|  |  | ||||||
|  |     Returns a unicode string. | ||||||
|     """ |     """ | ||||||
|     if msg_body.strip() == '': |     if isinstance(msg_body, six.text_type): | ||||||
|  |         msg_body = msg_body.encode('utf8') | ||||||
|  |     elif not isinstance(msg_body, bytes): | ||||||
|  |         msg_body = msg_body.encode('ascii') | ||||||
|  |  | ||||||
|  |     result = _extract_from_html(msg_body) | ||||||
|  |     if isinstance(result, bytes): | ||||||
|  |         result = result.decode('utf8') | ||||||
|  |  | ||||||
|  |     return result | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _extract_from_html(msg_body): | ||||||
|  |     """ | ||||||
|  |     Extract not quoted message from provided html message body | ||||||
|  |     using tags and plain text algorithm. | ||||||
|  |  | ||||||
|  |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|  |     Cut Microsoft quotations. | ||||||
|  |  | ||||||
|  |     Then use plain text algorithm to cut out splitter or | ||||||
|  |     leftover quotation. | ||||||
|  |     This works by adding checkpoint text to all html tags, | ||||||
|  |     then converting html to text, | ||||||
|  |     then extracting quotations from text, | ||||||
|  |     then checking deleted checkpoints, | ||||||
|  |     then deleting necessary tags. | ||||||
|  |     """ | ||||||
|  |     if len(msg_body) > MAX_HTML_LEN: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     if msg_body.strip() == b'': | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|  |     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||||
|  |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |  | ||||||
|  |     if html_tree is None: | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace('\r\n', '').replace('\n', '') |  | ||||||
|     html_tree = html.document_fromstring( |  | ||||||
|         msg_body, |  | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |  | ||||||
|     ) |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|  |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
| @@ -354,8 +409,7 @@ def extract_from_html(msg_body): | |||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False] * number_of_checkpoints |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     plain_text = html_tree_to_text(html_tree) | ||||||
|     plain_text = html_to_text(msg_with_checkpoints) |  | ||||||
|     plain_text = preprocess(plain_text, '\n', content_type='text/html') |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
| @@ -378,25 +432,31 @@ def extract_from_html(msg_body): | |||||||
|     return_flags = [] |     return_flags = [] | ||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|  |     if not lines_were_deleted and not cut_quotations: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in xrange(first_deleted, last_deleted): |         for i in range(first_deleted, last_deleted): | ||||||
|             for checkpoint in line_checkpoints[i]: |             for checkpoint in line_checkpoints[i]: | ||||||
|                 quotation_checkpoints[checkpoint] = True |                 quotation_checkpoints[checkpoint] = True | ||||||
|     else: |  | ||||||
|         if cut_quotations: |  | ||||||
|             return html.tostring(html_tree_copy) |  | ||||||
|         else: |  | ||||||
|             return msg_body |  | ||||||
|  |  | ||||||
|         # Remove tags with quotation checkpoints |         # Remove tags with quotation checkpoints | ||||||
|         html_quotations.delete_quotation_tags( |         html_quotations.delete_quotation_tags( | ||||||
|             html_tree_copy, 0, quotation_checkpoints |             html_tree_copy, 0, quotation_checkpoints | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     if _readable_text_empty(html_tree_copy): | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _readable_text_empty(html_tree): | ||||||
|  |     return not bool(html_tree_to_text(html_tree).strip()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def is_splitter(line): | def is_splitter(line): | ||||||
|     ''' |     ''' | ||||||
|     Returns Matcher object if provided string is a splitter and |     Returns Matcher object if provided string is a splitter and | ||||||
| @@ -410,7 +470,7 @@ def is_splitter(line): | |||||||
|  |  | ||||||
| def text_content(context): | def text_content(context): | ||||||
|     '''XPath Extension function to return a node text content.''' |     '''XPath Extension function to return a node text content.''' | ||||||
|     return context.context_node.text_content().strip() |     return context.context_node.xpath("string()").strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def tail(context): | def tail(context): | ||||||
|   | |||||||
| @@ -20,6 +20,7 @@ trained against, don't forget to regenerate: | |||||||
| * signature/data/classifier | * signature/data/classifier | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from . import extraction | from . import extraction | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
| @@ -111,7 +112,7 @@ def extract_signature(msg_body): | |||||||
|  |  | ||||||
|             return (stripped_body.strip(), |             return (stripped_body.strip(), | ||||||
|                     signature.strip()) |                     signature.strip()) | ||||||
|     except Exception, e: |     except Exception as e: | ||||||
|         log.exception('ERROR extracting signature') |         log.exception('ERROR extracting signature') | ||||||
|         return (msg_body, None) |         return (msg_body, None) | ||||||
|  |  | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|   | |||||||
| @@ -5,6 +5,7 @@ The classifier could be used to detect if a certain line of the message | |||||||
| body belongs to the signature. | body belongs to the signature. | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.svm import LinearSVC | from sklearn.svm import LinearSVC | ||||||
| from sklearn.externals import joblib | from sklearn.externals import joblib | ||||||
|   | |||||||
| @@ -16,11 +16,13 @@ suffix and the corresponding sender file has the same name except for the | |||||||
| suffix which should be `_sender`. | suffix which should be `_sender`. | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import os | import os | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | from talon.signature.constants import SIGNATURE_MAX_LINES | ||||||
| from talon.signature.learning.featurespace import build_pattern, features | from talon.signature.learning.featurespace import build_pattern, features | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
|  |  | ||||||
| SENDER_SUFFIX = '_sender' | SENDER_SUFFIX = '_sender' | ||||||
| @@ -144,7 +146,7 @@ def build_extraction_dataset(folder, dataset_filename, | |||||||
|             if not sender or not msg: |             if not sender or not msg: | ||||||
|                 continue |                 continue | ||||||
|             lines = msg.splitlines() |             lines = msg.splitlines() | ||||||
|             for i in xrange(1, min(SIGNATURE_MAX_LINES, |             for i in range(1, min(SIGNATURE_MAX_LINES, | ||||||
|                                    len(lines)) + 1): |                                    len(lines)) + 1): | ||||||
|                 line = lines[-i] |                 line = lines[-i] | ||||||
|                 label = -1 |                 label = -1 | ||||||
|   | |||||||
| @@ -7,9 +7,12 @@ The body and the message sender string are converted into unicode before | |||||||
| applying features to them. | applying features to them. | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, | from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||||
|                                        TOO_LONG_SIGNATURE_LINE) |                                        TOO_LONG_SIGNATURE_LINE) | ||||||
| from talon.signature.learning.helpers import * | from talon.signature.learning.helpers import * | ||||||
|  | from six.moves import zip | ||||||
|  | from functools import reduce | ||||||
|  |  | ||||||
|  |  | ||||||
| def features(sender=''): | def features(sender=''): | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ | |||||||
|  |  | ||||||
| """ | """ | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import unicodedata | import unicodedata | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| @@ -184,12 +185,13 @@ def capitalized_words_percent(s): | |||||||
|     s = to_unicode(s, precise=True) |     s = to_unicode(s, precise=True) | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|  |     words = [w for w in words if len(w) > 2]     | ||||||
|     capitalized_words_counter = 0 |     capitalized_words_counter = 0 | ||||||
|     valid_words_counter = 0 |     valid_words_counter = 0 | ||||||
|     for word in words: |     for word in words: | ||||||
|         if not INVALID_WORD_START.match(word): |         if not INVALID_WORD_START.match(word): | ||||||
|             valid_words_counter += 1 |             valid_words_counter += 1 | ||||||
|             if word[0].isupper(): |             if word[0].isupper() and not word[1].isupper(): | ||||||
|                 capitalized_words_counter += 1 |                 capitalized_words_counter += 1 | ||||||
|     if valid_words_counter > 0 and len(words) > 1: |     if valid_words_counter > 0 and len(words) > 1: | ||||||
|         return 100 * float(capitalized_words_counter) / valid_words_counter |         return 100 * float(capitalized_words_counter) / valid_words_counter | ||||||
|   | |||||||
							
								
								
									
										114
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										114
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -1,15 +1,19 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| import logging | import logging | ||||||
| from random import shuffle | from random import shuffle | ||||||
| import chardet | import chardet | ||||||
| import cchardet | import cchardet | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from lxml import html | from lxml.html import html5parser | ||||||
| from lxml.cssselect import CSSSelector | from lxml.cssselect import CSSSelector | ||||||
|  |  | ||||||
|  | import html5lib | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  | import six | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): | def safe_format(format_string, *args, **kwargs): | ||||||
| @@ -28,7 +32,7 @@ def safe_format(format_string, *args, **kwargs): | |||||||
|     except (UnicodeEncodeError, UnicodeDecodeError): |     except (UnicodeEncodeError, UnicodeDecodeError): | ||||||
|         format_string = to_utf8(format_string) |         format_string = to_utf8(format_string) | ||||||
|         args = [to_utf8(p) for p in args] |         args = [to_utf8(p) for p in args] | ||||||
|         kwargs = {k: to_utf8(v) for k, v in kwargs.iteritems()} |         kwargs = {k: to_utf8(v) for k, v in six.iteritems(kwargs)} | ||||||
|         return format_string.format(*args, **kwargs) |         return format_string.format(*args, **kwargs) | ||||||
|  |  | ||||||
|     # ignore other errors |     # ignore other errors | ||||||
| @@ -45,9 +49,9 @@ def to_unicode(str_or_unicode, precise=False): | |||||||
|         u'привет' |         u'привет' | ||||||
|     If `precise` flag is True, tries to guess the correct encoding first. |     If `precise` flag is True, tries to guess the correct encoding first. | ||||||
|     """ |     """ | ||||||
|  |     if not isinstance(str_or_unicode, six.text_type): | ||||||
|         encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' |         encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' | ||||||
|     if isinstance(str_or_unicode, str): |         return six.text_type(str_or_unicode, encoding, 'replace') | ||||||
|         return unicode(str_or_unicode, encoding, 'replace') |  | ||||||
|     return str_or_unicode |     return str_or_unicode | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -57,11 +61,12 @@ def detect_encoding(string): | |||||||
|  |  | ||||||
|     Defaults to UTF-8. |     Defaults to UTF-8. | ||||||
|     """ |     """ | ||||||
|  |     assert isinstance(string, bytes) | ||||||
|     try: |     try: | ||||||
|         detected = chardet.detect(string) |         detected = chardet.detect(string) | ||||||
|         if detected: |         if detected: | ||||||
|             return detected.get('encoding') or 'utf-8' |             return detected.get('encoding') or 'utf-8' | ||||||
|     except Exception, e: |     except Exception as e: | ||||||
|         pass |         pass | ||||||
|     return 'utf-8' |     return 'utf-8' | ||||||
|  |  | ||||||
| @@ -72,11 +77,12 @@ def quick_detect_encoding(string): | |||||||
|  |  | ||||||
|     Uses cchardet. Falls back to detect_encoding. |     Uses cchardet. Falls back to detect_encoding. | ||||||
|     """ |     """ | ||||||
|  |     assert isinstance(string, bytes) | ||||||
|     try: |     try: | ||||||
|         detected = cchardet.detect(string) |         detected = cchardet.detect(string) | ||||||
|         if detected: |         if detected: | ||||||
|             return detected.get('encoding') or detect_encoding(string) |             return detected.get('encoding') or detect_encoding(string) | ||||||
|     except Exception, e: |     except Exception as e: | ||||||
|         pass |         pass | ||||||
|     return detect_encoding(string) |     return detect_encoding(string) | ||||||
|  |  | ||||||
| @@ -87,7 +93,7 @@ def to_utf8(str_or_unicode): | |||||||
|     >>> utils.to_utf8(u'hi') |     >>> utils.to_utf8(u'hi') | ||||||
|         'hi' |         'hi' | ||||||
|     """ |     """ | ||||||
|     if isinstance(str_or_unicode, unicode): |     if not isinstance(str_or_unicode, six.text_type): | ||||||
|         return str_or_unicode.encode("utf-8", "ignore") |         return str_or_unicode.encode("utf-8", "ignore") | ||||||
|     return str(str_or_unicode) |     return str(str_or_unicode) | ||||||
|  |  | ||||||
| @@ -109,26 +115,18 @@ def get_delimiter(msg_body): | |||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
| def html_to_text(string): | def html_tree_to_text(tree): | ||||||
|     """ |  | ||||||
|     Dead-simple HTML-to-text converter: |  | ||||||
|         >>> html_to_text("one<br>two<br>three") |  | ||||||
|         >>> "one\ntwo\nthree" |  | ||||||
|  |  | ||||||
|     NOTES: |  | ||||||
|         1. the string is expected to contain UTF-8 encoded HTML! |  | ||||||
|         2. returns utf-8 encoded str (not unicode) |  | ||||||
|     """ |  | ||||||
|     s = _prepend_utf8_declaration(string) |  | ||||||
|     s = s.replace("\n", "") |  | ||||||
|  |  | ||||||
|     tree = html.fromstring(s) |  | ||||||
|  |  | ||||||
|     for style in CSSSelector('style')(tree): |     for style in CSSSelector('style')(tree): | ||||||
|         style.getparent().remove(style) |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|     for c in tree.xpath('//comment()'): |     for c in tree.xpath('//comment()'): | ||||||
|         c.getparent().remove(c) |         parent = c.getparent() | ||||||
|  |  | ||||||
|  |         # comment with no parent does not impact produced text | ||||||
|  |         if parent is None: | ||||||
|  |             continue | ||||||
|  |  | ||||||
|  |         parent.remove(c) | ||||||
|  |  | ||||||
|     text   = "" |     text   = "" | ||||||
|     for el in tree.iter(): |     for el in tree.iter(): | ||||||
| @@ -152,10 +150,56 @@ def html_to_text(string): | |||||||
|     return _encode_utf8(retval) |     return _encode_utf8(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(string): | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         >>> "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. returns utf-8 encoded str (not unicode) | ||||||
|  |         3. if html can't be parsed returns None | ||||||
|  |     """ | ||||||
|  |     if isinstance(string, six.text_type): | ||||||
|  |         string = string.encode('utf8') | ||||||
|  |  | ||||||
|  |     s = _prepend_utf8_declaration(string) | ||||||
|  |     s = s.replace(b"\n", b"") | ||||||
|  |     tree = html_fromstring(s) | ||||||
|  |  | ||||||
|  |     if tree is None: | ||||||
|  |         return None | ||||||
|  |  | ||||||
|  |     return html_tree_to_text(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_fromstring(s): | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         return html5parser.fromstring(s, parser=_html5lib_parser()) | ||||||
|  |     except Exception: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_document_fromstring(s): | ||||||
|  |     """Parse html tree from string. Return None if the string can't be parsed. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         return html5parser.document_fromstring(s, parser=_html5lib_parser()) | ||||||
|  |     except Exception: | ||||||
|  |         pass | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def cssselect(expr, tree): | ||||||
|  |     return CSSSelector(expr)(tree) | ||||||
|  |  | ||||||
|  |  | ||||||
| def _contains_charset_spec(s): | def _contains_charset_spec(s): | ||||||
|     """Return True if the first 4KB contain charset spec |     """Return True if the first 4KB contain charset spec | ||||||
|     """ |     """ | ||||||
|     return s.lower().find('html; charset=', 0, 4096) != -1 |     return s.lower().find(b'html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
| def _prepend_utf8_declaration(s): | def _prepend_utf8_declaration(s): | ||||||
| @@ -173,15 +217,29 @@ def _rm_excessive_newlines(s): | |||||||
| def _encode_utf8(s): | def _encode_utf8(s): | ||||||
|     """Encode in 'utf-8' if unicode |     """Encode in 'utf-8' if unicode | ||||||
|     """ |     """ | ||||||
|     return s.encode('utf-8') if isinstance(s, unicode) else s |     return s.encode('utf-8') if isinstance(s, six.text_type) else s | ||||||
|  |  | ||||||
|  |  | ||||||
| _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | def _html5lib_parser(): | ||||||
|                      'charset=utf-8">') |     """ | ||||||
|  |     html5lib is a pure-python library that conforms to the WHATWG HTML spec | ||||||
|  |     and is not vulnerable to certain attacks common for XML libraries | ||||||
|  |     """ | ||||||
|  |     return html5lib.HTMLParser( | ||||||
|  |         # build lxml tree | ||||||
|  |         html5lib.treebuilders.getTreeBuilder("lxml"), | ||||||
|  |         # remove namespace value from inside lxml.html.html5parser element tag | ||||||
|  |         # otherwise it yields something like "{http://www.w3.org/1999/xhtml}div" | ||||||
|  |         # instead of "div", throwing the algo off | ||||||
|  |         namespaceHTMLElements=False | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | ||||||
|  |                      b'charset=utf-8">') | ||||||
|  |  | ||||||
|  |  | ||||||
| _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  |  | ||||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from nose.tools import * | from nose.tools import * | ||||||
| from mock import * | from mock import * | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| @@ -26,7 +27,7 @@ def test_quotation_splitter_inside_blockquote(): | |||||||
|  |  | ||||||
| </blockquote>""" | </blockquote>""" | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -43,7 +44,7 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -61,7 +62,7 @@ def test_regular_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", |     eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -84,6 +85,7 @@ Reply | |||||||
|  |  | ||||||
|     reply = """ |     reply = """ | ||||||
| <html> | <html> | ||||||
|  | <head></head> | ||||||
| <body> | <body> | ||||||
| Reply | Reply | ||||||
|  |  | ||||||
| @@ -127,7 +129,18 @@ def test_gmail_quote(): | |||||||
|     </div> |     </div> | ||||||
|   </div> |   </div> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_compact(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_("<html><head></head><body>Reply</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -154,7 +167,7 @@ def test_unicode_in_reply(): | |||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""".encode("utf-8") | ||||||
|  |  | ||||||
|     eq_("<html><body><p>Reply  Text<br></p><div><br></div>" |     eq_("<html><head></head><body>Reply  Text<br><div><br></div>" | ||||||
|         "</body></html>", |         "</body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
| @@ -180,6 +193,7 @@ def test_blockquote_disclaimer(): | |||||||
|  |  | ||||||
|     stripped_html = """ |     stripped_html = """ | ||||||
| <html> | <html> | ||||||
|  |   <head></head> | ||||||
|   <body> |   <body> | ||||||
|   <div> |   <div> | ||||||
|     <div> |     <div> | ||||||
| @@ -211,7 +225,7 @@ def test_date_block(): | |||||||
|   </div> |   </div> | ||||||
| </div> | </div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -228,7 +242,7 @@ Subject: You Have New Mail From Mary!<br><br> | |||||||
| text | text | ||||||
| </div></div> | </div></div> | ||||||
| """ | """ | ||||||
|     eq_('<html><body><div>message<br></div></body></html>', |     eq_('<html><head></head><body><div>message<br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -246,7 +260,7 @@ def test_reply_shares_div_with_from_block(): | |||||||
|  |  | ||||||
|   </div> |   </div> | ||||||
| </body>''' | </body>''' | ||||||
|     eq_('<html><body><div>Blah<br><br></div></body></html>', |     eq_('<html><head></head><body><div>Blah<br><br></div></body></html>', | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -257,23 +271,44 @@ def test_reply_quotations_share_block(): | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_OLK_SRC_BODY_SECTION_stripped(): | def test_OLK_SRC_BODY_SECTION_stripped(): | ||||||
|     eq_('<html><body><div>Reply</div></body></html>', |     eq_('<html><head></head><body><div>Reply</div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) |             '', quotations.extract_from_html(OLK_SRC_BODY_SECTION))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_separated_by_hr(): | def test_reply_separated_by_hr(): | ||||||
|     eq_('<html><body><div>Hi<div>there</div></div></body></html>', |     eq_('<html><head></head><body><div>Hi<div>there</div></div></body></html>', | ||||||
|         RE_WHITESPACE.sub( |         RE_WHITESPACE.sub( | ||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_from_block_and_quotations_in_separate_divs(): | ||||||
|  |     msg_body = ''' | ||||||
|  | Reply | ||||||
|  | <div> | ||||||
|  |   <hr/> | ||||||
|  |   <div> | ||||||
|  |     <font> | ||||||
|  |       <b>From: bob@example.com</b> | ||||||
|  |       <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> | ||||||
|  |     </font> | ||||||
|  |   </div> | ||||||
|  |   <div> | ||||||
|  |     Quoted message | ||||||
|  |   </div> | ||||||
|  | </div> | ||||||
|  | ''' | ||||||
|  |     eq_('<html><head></head><body>Reply<div><hr></div></body></html>', | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|     msg_body = f.read() |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|     plain_reply = u.html_to_text(reply) |     plain_reply = u.html_to_text(reply) | ||||||
|  |     plain_reply = plain_reply.decode('utf8') | ||||||
|  |  | ||||||
|     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|         RE_WHITESPACE.sub('', plain_reply)) |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
| @@ -323,7 +358,8 @@ def test_CRLF(): | |||||||
|     assert_false(symbol in extracted) |     assert_false(symbol in extracted) | ||||||
|     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|     msg_body = """Reply |     msg_body = """My | ||||||
|  | reply | ||||||
| <blockquote> | <blockquote> | ||||||
|  |  | ||||||
|   <div> |   <div> | ||||||
| @@ -338,5 +374,48 @@ def test_CRLF(): | |||||||
|     msg_body = msg_body.replace('\n', '\r\n') |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|     extracted = quotations.extract_from_html(msg_body) |     extracted = quotations.extract_from_html(msg_body) | ||||||
|     assert_false(symbol in extracted)     |     assert_false(symbol in extracted)     | ||||||
|     eq_("<html><body><p>Reply</p></body></html>", |     # Keep new lines otherwise "My reply" becomes one word - "Myreply"  | ||||||
|         RE_WHITESPACE.sub('', extracted)) |     eq_("<html><head></head><body>My\nreply\n</body></html>", extracted) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_forwarded_msg(): | ||||||
|  |     msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr"><<a href="mailto:bob@example.com">bob@example.com</a>></span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary <<a href="mailto:mary@example.com">mary@example.com</a>><br><br><br><div dir="ltr">eom</div> | ||||||
|  | </div><br></div>""" | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(quotations, 'MAX_HTML_LEN', 1) | ||||||
|  | def test_too_large_html(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_readable_html_empty(): | ||||||
|  |     msg_body = """ | ||||||
|  | <blockquote> | ||||||
|  |   Reply | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |  | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(quotations, 'html_document_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html(): | ||||||
|  |     bad_html = "<html></html>" | ||||||
|  |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from .. import * | from .. import * | ||||||
|  |  | ||||||
| from talon.signature import bruteforce | from talon.signature import bruteforce | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from .. import * | from .. import * | ||||||
|  |  | ||||||
| import os | import os | ||||||
| @@ -8,6 +9,7 @@ from talon.signature.learning import dataset | |||||||
| from talon import signature | from talon import signature | ||||||
| from talon.signature import extraction as e | from talon.signature import extraction as e | ||||||
| from talon.signature import bruteforce | from talon.signature import bruteforce | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_message_shorter_SIGNATURE_MAX_LINES(): | def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||||
| @@ -75,6 +77,31 @@ def test_basic(): | |||||||
|         signature.extract(msg_body, 'Sergey')) |         signature.extract(msg_body, 'Sergey')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_capitalized(): | ||||||
|  |     msg_body = """Hi Mary, | ||||||
|  |  | ||||||
|  | Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date. | ||||||
|  |  | ||||||
|  | DJ Doe  | ||||||
|  | http://example.com | ||||||
|  | Password: SUPERPASSWORD | ||||||
|  |  | ||||||
|  | Would you like to check out more? | ||||||
|  |  | ||||||
|  |  | ||||||
|  | At your service, | ||||||
|  |  | ||||||
|  | John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     sig = """John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     eq_(sig, signature.extract(msg_body, 'Doe')[1]) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_over_2_text_lines_after_signature(): | def test_over_2_text_lines_after_signature(): | ||||||
|     body = """Blah |     body = """Blah | ||||||
|  |  | ||||||
| @@ -127,20 +154,20 @@ def test_mark_lines(): | |||||||
|  |  | ||||||
| def test_process_marked_lines(): | def test_process_marked_lines(): | ||||||
|     # no signature found |     # no signature found | ||||||
|     eq_((range(5), None), e._process_marked_lines(range(5), 'telt')) |     eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt')) | ||||||
|  |  | ||||||
|     # signature in the middle of the text |     # signature in the middle of the text | ||||||
|     eq_((range(9), None), e._process_marked_lines(range(9), 'tesestelt')) |     eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt')) | ||||||
|  |  | ||||||
|     # long line splits signature |     # long line splits signature | ||||||
|     eq_((range(7), [7, 8]), |     eq_((list(range(7)), [7, 8]), | ||||||
|         e._process_marked_lines(range(9), 'tsslsless')) |         e._process_marked_lines(list(range(9)), 'tsslsless')) | ||||||
|  |  | ||||||
|     eq_((range(20), [20]), |     eq_((list(range(20)), [20]), | ||||||
|         e._process_marked_lines(range(21), 'ttttttstttesllelelets')) |         e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets')) | ||||||
|  |  | ||||||
|     # some signature lines could be identified as text |     # some signature lines could be identified as text | ||||||
|     eq_(([0], range(1, 9)), e._process_marked_lines(range(9), 'tsetetest')) |     eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest')) | ||||||
|  |  | ||||||
|     eq_(([], range(5)), |     eq_(([], list(range(5))), | ||||||
|         e._process_marked_lines(range(5), "ststt")) |         e._process_marked_lines(list(range(5)), "ststt")) | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
| import os | import os | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
|  |  | ||||||
| from talon.signature.learning import featurespace as fs | from talon.signature.learning import featurespace as fs | ||||||
|   | |||||||
| @@ -1,11 +1,13 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from ... import * | from ... import * | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
|  |  | ||||||
| from talon.signature.learning import helpers as h | from talon.signature.learning import helpers as h | ||||||
| from talon.signature.learning.helpers import * | from talon.signature.learning.helpers import * | ||||||
|  | from six.moves import range | ||||||
|  |  | ||||||
| # First testing regex constants. | # First testing regex constants. | ||||||
| VALID = ''' | VALID = ''' | ||||||
| @@ -154,7 +156,7 @@ def test_extract_names(): | |||||||
|         # check that extracted names could be compiled |         # check that extracted names could be compiled | ||||||
|         try: |         try: | ||||||
|             re.compile("|".join(extracted_names)) |             re.compile("|".join(extracted_names)) | ||||||
|         except Exception, e: |         except Exception as e: | ||||||
|             ok_(False, ("Failed to compile extracted names {}" |             ok_(False, ("Failed to compile extracted names {}" | ||||||
|                         "\n\nReason: {}").format(extracted_names, e)) |                         "\n\nReason: {}").format(extracted_names, e)) | ||||||
|         if expected_names: |         if expected_names: | ||||||
| @@ -190,10 +192,11 @@ def test_punctuation_percent(categories_percent): | |||||||
| def test_capitalized_words_percent(): | def test_capitalized_words_percent(): | ||||||
|     eq_(0.0, h.capitalized_words_percent('')) |     eq_(0.0, h.capitalized_words_percent('')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Example Corp')) |     eq_(100.0, h.capitalized_words_percent('Example Corp')) | ||||||
|     eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss')) |     eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) |     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('8th Floor')) |     eq_(100.0, h.capitalized_words_percent('8th Floor')) | ||||||
|     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) |     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) | ||||||
|  |     eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_has_signature(): | def test_has_signature(): | ||||||
| @@ -204,7 +207,7 @@ def test_has_signature(): | |||||||
|                         'sender@example.com')) |                         'sender@example.com')) | ||||||
|     assert_false(h.has_signature('http://www.example.com/555-555-5555', |     assert_false(h.has_signature('http://www.example.com/555-555-5555', | ||||||
|                                  'sender@example.com')) |                                  'sender@example.com')) | ||||||
|     long_line = ''.join(['q' for e in xrange(28)]) |     long_line = ''.join(['q' for e in range(28)]) | ||||||
|     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com')) |     assert_false(h.has_signature(long_line + ' sender', 'sender@example.com')) | ||||||
|     # wont crash on an empty string |     # wont crash on an empty string | ||||||
|     assert_false(h.has_signature('', '')) |     assert_false(h.has_signature('', '')) | ||||||
|   | |||||||
| @@ -1,5 +1,6 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| @@ -7,6 +8,9 @@ import os | |||||||
|  |  | ||||||
| import email.iterators | import email.iterators | ||||||
| from talon import quotations | from talon import quotations | ||||||
|  | import six | ||||||
|  | from six.moves import range | ||||||
|  | from six import StringIO | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_LINES_COUNT', 1) | @patch.object(quotations, 'MAX_LINES_COUNT', 1) | ||||||
| @@ -32,6 +36,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |  | ||||||
|  | > | ||||||
|  | > Test | ||||||
|  | > | ||||||
|  | > Roman""" | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_wrote_somebody(): | def test_pattern_on_date_wrote_somebody(): | ||||||
|     eq_('Lorem', quotations.extract_from_plain( |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|     """Lorem |     """Lorem | ||||||
| @@ -54,6 +71,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_date_time_email_splitter(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | 2014-10-17 11:28 GMT+03:00 Postmaster < | ||||||
|  | postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>: | ||||||
|  |  | ||||||
|  | > First from site | ||||||
|  | > | ||||||
|  |     """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | ||||||
|     msg_body = """Thanks Thanmai |     msg_body = """Thanks Thanmai | ||||||
|  On Mar 8, 2012 9:59 AM, "Example.com" < |  On Mar 8, 2012 9:59 AM, "Example.com" < | ||||||
| @@ -113,7 +142,7 @@ def _check_pattern_original_message(original_message_indicator): | |||||||
| -----{}----- | -----{}----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_('Test reply', quotations.extract_from_plain(msg_body.format(unicode(original_message_indicator)))) |     eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) | ||||||
|  |  | ||||||
| def test_english_original_message(): | def test_english_original_message(): | ||||||
|     _check_pattern_original_message('Original Message') |     _check_pattern_original_message('Original Message') | ||||||
| @@ -637,6 +666,15 @@ def test_preprocess_postprocess_2_links(): | |||||||
|     eq_(msg_body, quotations.extract_from_plain(msg_body)) |     eq_(msg_body, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def body_iterator(msg, decode=False): | ||||||
|  |     for subpart in msg.walk(): | ||||||
|  |         payload = subpart.get_payload(decode=decode) | ||||||
|  |         if isinstance(payload, six.text_type): | ||||||
|  |             yield payload | ||||||
|  |         else: | ||||||
|  |             yield payload.decode('utf8') | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_standard_replies(): | def test_standard_replies(): | ||||||
|     for filename in os.listdir(STANDARD_REPLIES): |     for filename in os.listdir(STANDARD_REPLIES): | ||||||
|         filename = os.path.join(STANDARD_REPLIES, filename) |         filename = os.path.join(STANDARD_REPLIES, filename) | ||||||
| @@ -644,8 +682,8 @@ def test_standard_replies(): | |||||||
|             continue |             continue | ||||||
|         with open(filename) as f: |         with open(filename) as f: | ||||||
|             message = email.message_from_file(f) |             message = email.message_from_file(f) | ||||||
|             body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() |             body = next(email.iterators.typed_subpart_iterator(message, subtype='plain')) | ||||||
|             text = ''.join(email.iterators.body_line_iterator(body, True)) |             text = ''.join(body_iterator(body, True)) | ||||||
|  |  | ||||||
|             stripped_text = quotations.extract_from_plain(text) |             stripped_text = quotations.extract_from_plain(text) | ||||||
|             reply_text_fn = filename[:-4] + '_reply_text' |             reply_text_fn = filename[:-4] + '_reply_text' | ||||||
|   | |||||||
| @@ -1,9 +1,12 @@ | |||||||
| # coding:utf-8 | # coding:utf-8 | ||||||
|  |  | ||||||
|  | from __future__ import absolute_import | ||||||
| from . import * | from . import * | ||||||
|  |  | ||||||
| from talon import utils as u | from talon import utils as u | ||||||
| import cchardet | import cchardet | ||||||
|  | import six | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
| @@ -14,49 +17,49 @@ def test_get_delimiter(): | |||||||
|  |  | ||||||
| def test_unicode(): | def test_unicode(): | ||||||
|     eq_ (u'hi', u.to_unicode('hi')) |     eq_ (u'hi', u.to_unicode('hi')) | ||||||
|     eq_ (type(u.to_unicode('hi')), unicode ) |     eq_ (type(u.to_unicode('hi')), six.text_type ) | ||||||
|     eq_ (type(u.to_unicode(u'hi')), unicode ) |     eq_ (type(u.to_unicode(u'hi')), six.text_type ) | ||||||
|     eq_ (type(u.to_unicode('привет')), unicode ) |     eq_ (type(u.to_unicode('привет')), six.text_type ) | ||||||
|     eq_ (type(u.to_unicode(u'привет')), unicode ) |     eq_ (type(u.to_unicode(u'привет')), six.text_type ) | ||||||
|     eq_ (u"привет", u.to_unicode('привет')) |     eq_ (u"привет", u.to_unicode('привет')) | ||||||
|     eq_ (u"привет", u.to_unicode(u'привет')) |     eq_ (u"привет", u.to_unicode(u'привет')) | ||||||
|     # some latin1 stuff |     # some latin1 stuff | ||||||
|     eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True)) |     eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_detect_encoding(): | def test_detect_encoding(): | ||||||
|     eq_ ('ascii', u.detect_encoding('qwe').lower()) |     eq_ ('ascii', u.detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower()) |     eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) | ||||||
|     eq_ ('utf-8', u.detect_encoding('привет').lower()) |     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|     # fallback to utf-8 |     # fallback to utf-8 | ||||||
|     with patch.object(u.chardet, 'detect') as detect: |     with patch.object(u.chardet, 'detect') as detect: | ||||||
|         detect.side_effect = Exception |         detect.side_effect = Exception | ||||||
|         eq_ ('utf-8', u.detect_encoding('qwe').lower()) |         eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_quick_detect_encoding(): | def test_quick_detect_encoding(): | ||||||
|     eq_ ('ascii', u.quick_detect_encoding('qwe').lower()) |     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) | ||||||
|     eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower()) |     eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) | ||||||
|     eq_ ('utf-8', u.quick_detect_encoding('привет').lower()) |     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(cchardet, 'detect') | @patch.object(cchardet, 'detect') | ||||||
| @patch.object(u, 'detect_encoding') | @patch.object(u, 'detect_encoding') | ||||||
| def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): | def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): | ||||||
|     cchardet_detect.return_value = {'encoding': 'ascii'} |     cchardet_detect.return_value = {'encoding': 'ascii'} | ||||||
|     eq_('ascii', u.quick_detect_encoding("qwe")) |     eq_('ascii', u.quick_detect_encoding(b"qwe")) | ||||||
|     cchardet_detect.assert_called_once_with("qwe") |     cchardet_detect.assert_called_once_with(b"qwe") | ||||||
|  |  | ||||||
|     # fallback to detect_encoding |     # fallback to detect_encoding | ||||||
|     cchardet_detect.return_value = {} |     cchardet_detect.return_value = {} | ||||||
|     detect_encoding.return_value = 'utf-8' |     detect_encoding.return_value = 'utf-8' | ||||||
|     eq_('utf-8', u.quick_detect_encoding("qwe")) |     eq_('utf-8', u.quick_detect_encoding(b"qwe")) | ||||||
|  |  | ||||||
|     # exception |     # exception | ||||||
|     detect_encoding.reset_mock() |     detect_encoding.reset_mock() | ||||||
|     cchardet_detect.side_effect = Exception() |     cchardet_detect.side_effect = Exception() | ||||||
|     detect_encoding.return_value = 'utf-8' |     detect_encoding.return_value = 'utf-8' | ||||||
|     eq_('utf-8', u.quick_detect_encoding("qwe")) |     eq_('utf-8', u.quick_detect_encoding(b"qwe")) | ||||||
|     ok_(detect_encoding.called) |     ok_(detect_encoding.called) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -73,11 +76,11 @@ Haha | |||||||
| </p> | </p> | ||||||
| </body>""" | </body>""" | ||||||
|     text = u.html_to_text(html) |     text = u.html_to_text(html) | ||||||
|     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) |     eq_(b"Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|     eq_("привет!", u.html_to_text("<b>привет!</b>")) |     eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8')) | ||||||
|  |  | ||||||
|     html = '<body><br/><br/>Hi</body>' |     html = '<body><br/><br/>Hi</body>' | ||||||
|     eq_ ('Hi', u.html_to_text(html)) |     eq_ (b'Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """Hi |     html = """Hi | ||||||
| <style type="text/css"> | <style type="text/css"> | ||||||
| @@ -97,11 +100,34 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | |||||||
|  |  | ||||||
| } | } | ||||||
| </style>""" | </style>""" | ||||||
|     eq_ ('Hi', u.html_to_text(html)) |     eq_ (b'Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|     html = """<div> |     html = """<div> | ||||||
| <!-- COMMENT 1 --> | <!-- COMMENT 1 --> | ||||||
| <span>TEXT 1</span> | <span>TEXT 1</span> | ||||||
| <p>TEXT 2 <!-- COMMENT 2 --></p> | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
| </div>""" | </div>""" | ||||||
|     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) |     eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_comment_no_parent(): | ||||||
|  |     s = "<!-- COMMENT 1 --> no comment" | ||||||
|  |     d = u.html_document_fromstring(s) | ||||||
|  |     eq_("no comment", u.html_tree_to_text(d)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) | ||||||
|  | def test_html_fromstring_exception(): | ||||||
|  |     eq_(None, u.html_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u.html5parser, 'document_fromstring') | ||||||
|  | def test_html_document_fromstring_exception(document_fromstring): | ||||||
|  |     document_fromstring.side_effect = Exception() | ||||||
|  |     eq_(None, u.html_document_fromstring("<html></html>")) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(u, 'html_fromstring', Mock(return_value=None)) | ||||||
|  | def test_bad_html_to_text(): | ||||||
|  |     bad_html = "one<br>two<br>three" | ||||||
|  |     eq_(None, u.html_to_text(bad_html)) | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								train.py
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								train.py
									
									
									
									
									
								
							| @@ -1,3 +1,4 @@ | |||||||
|  | from __future__ import absolute_import | ||||||
| from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | ||||||
| from talon.signature.learning.classifier import train, init | from talon.signature.learning.classifier import train, init | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user