Compare commits
	
		
			72 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 5bcf7403ad | ||
|  | 2d6c092b65 | ||
|  | 6d0689cad6 | ||
|  | 3f80e93ee0 | ||
|  | 1b18abab1d | ||
|  | 03dd5af5ab | ||
|  | dfba82b07c | ||
|  | 08ca02c87f | ||
|  | b61f4ec095 | ||
|  | 9dbe6a494b | ||
|  | 44e70939d6 | ||
|  | ab6066eafa | ||
|  | 42258cdd36 | ||
|  | d3de9e6893 | ||
|  | 333beb94af | ||
|  | f3c0942c49 | ||
|  | 02adf53ab9 | ||
|  | 3497b5cab4 | ||
|  | 9c17dca17c | ||
|  | de342d3177 | ||
|  | 743b452daf | ||
|  | c762f3c337 | ||
|  | 31803d41bc | ||
|  | 2ecd9779fc | ||
|  | 5a7047233e | ||
|  | 999e9c3725 | ||
|  | f6940fe878 | ||
|  | ce65ff8fc8 | ||
|  | eed6784f25 | ||
|  | 3d9ae356ea | ||
|  | f688d074b5 | ||
|  | 41457d8fbd | ||
|  | 2c416ecc0e | ||
|  | 3ab33c557b | ||
|  | 8db05f4950 | ||
|  | 3d5bc82a03 | ||
|  | 14e3a0d80b | ||
|  | fcd9e2716a | ||
|  | d62d633215 | ||
|  | 3b0c9273c1 | ||
|  | e4c1c11845 | ||
|  | ae508fe0e5 | ||
|  | 2cb9b5399c | ||
|  | 134c47f515 | ||
|  | d328c9d128 | ||
|  | 77b62b0fef | ||
|  | ad09b18f3f | ||
|  | b5af9c03a5 | ||
|  | 176c7e7532 | ||
|  | 15976888a0 | ||
|  | 9bee502903 | ||
|  | e3cb8dc3e6 | ||
|  | 385285e5de | ||
|  | 127771dac9 | ||
|  | cc98befba5 | ||
|  | 567549cba4 | ||
|  | 76c4f49be8 | ||
|  | d9d89dc250 | ||
|  | 9358db6cee | ||
|  | 08c9d7db03 | ||
|  | 390b0a6dc9 | ||
|  | ed6b861a47 | ||
|  | 85c7ee980c | ||
|  | 7ea773e6a9 | ||
|  | e3c4ff38fe | ||
|  | 8b1f87b1c0 | ||
|  | c5e4cd9ab4 | ||
|  | 215e36e9ed | ||
|  | e3ef622031 | ||
|  | f16760c466 | ||
|  | b36287e573 | ||
|  | 4df7aa284b | 
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -49,3 +49,6 @@ tramp | |||||||
|  |  | ||||||
| # Trial temp | # Trial temp | ||||||
| _trial_temp | _trial_temp | ||||||
|  |  | ||||||
|  | # OSX | ||||||
|  | .DS_Store | ||||||
							
								
								
									
										20
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										20
									
								
								README.rst
									
									
									
									
									
								
							| @@ -89,20 +89,34 @@ the power of machine learning algorithms: | |||||||
|     # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." |     # text == "Thanks Sasha, I can't go any higher and is why I limited it to the\nhomepage." | ||||||
|     # signature == "John Doe\nvia mobile" |     # signature == "John Doe\nvia mobile" | ||||||
|  |  | ||||||
| For machine learning talon currently uses `PyML`_ library to build SVM | For machine learning talon currently uses the `scikit-learn`_ library to build SVM | ||||||
| classifiers. The core of machine learning algorithm lays in | classifiers. The core of machine learning algorithm lays in | ||||||
| ``talon.signature.learning package``. It defines a set of features to | ``talon.signature.learning package``. It defines a set of features to | ||||||
| apply to a message (``featurespace.py``), how data sets are built | apply to a message (``featurespace.py``), how data sets are built | ||||||
| (``dataset.py``), classifier’s interface (``classifier.py``). | (``dataset.py``), classifier’s interface (``classifier.py``). | ||||||
|  |  | ||||||
| The data used for training is taken from our personal email | Currently the data used for training is taken from our personal email | ||||||
| conversations and from `ENRON`_ dataset. As a result of applying our set | conversations and from `ENRON`_ dataset. As a result of applying our set | ||||||
| of features to the dataset we provide files ``classifier`` and | of features to the dataset we provide files ``classifier`` and | ||||||
| ``train.data`` that don’t have any personal information but could be | ``train.data`` that don’t have any personal information but could be | ||||||
| used to load trained classifier. Those files should be regenerated every | used to load trained classifier. Those files should be regenerated every | ||||||
| time the feature/data set is changed. | time the feature/data set is changed. | ||||||
|  |  | ||||||
| .. _PyML: http://pyml.sourceforge.net/ | To regenerate the model files, you can run | ||||||
|  |  | ||||||
|  | .. code:: sh | ||||||
|  |  | ||||||
|  |     python train.py | ||||||
|  |  | ||||||
|  | or | ||||||
|  |  | ||||||
|  | .. code:: python | ||||||
|  |      | ||||||
|  |     from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | ||||||
|  |     from talon.signature.learning.classifier import train, init | ||||||
|  |     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) | ||||||
|  |  | ||||||
|  | .. _scikit-learn: http://scikit-learn.org | ||||||
| .. _ENRON: https://www.cs.cmu.edu/~enron/ | .. _ENRON: https://www.cs.cmu.edu/~enron/ | ||||||
|  |  | ||||||
| Research | Research | ||||||
|   | |||||||
							
								
								
									
										101
									
								
								setup.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										101
									
								
								setup.py
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							| @@ -1,13 +1,8 @@ | |||||||
| import os |  | ||||||
| import sys |  | ||||||
| import contextlib |  | ||||||
|  |  | ||||||
| from distutils.spawn import find_executable |  | ||||||
| from setuptools import setup, find_packages | from setuptools import setup, find_packages | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.0.2', |       version='1.2.9', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -19,88 +14,18 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml==2.3.3", |           "lxml>=2.3.3", | ||||||
|           "regex==0.1.20110315", |           "regex>=1", | ||||||
|           "chardet==1.0.1", |           "numpy", | ||||||
|           "dnspython==1.11.1", |           "scipy", | ||||||
|           "html2text", |           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||||
|           "nose==1.2.1", |           'chardet>=1.0.1', | ||||||
|  |           'cchardet>=0.3.5', | ||||||
|  |           'cssselect' | ||||||
|  |           ], | ||||||
|  |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|           "coverage", |           "nose>=1.2.1", | ||||||
|           "flanker" |           "coverage" | ||||||
|           ] |           ] | ||||||
|       ) |       ) | ||||||
|  |  | ||||||
|  |  | ||||||
| def install_pyml(): |  | ||||||
|     ''' |  | ||||||
|     Downloads and installs PyML |  | ||||||
|     ''' |  | ||||||
|     try: |  | ||||||
|         import PyML |  | ||||||
|     except: |  | ||||||
|         pass |  | ||||||
|     else: |  | ||||||
|         return |  | ||||||
|  |  | ||||||
|     # install numpy first |  | ||||||
|     pip('install numpy==1.6.1 --upgrade') |  | ||||||
|  |  | ||||||
|     pyml_tarball = ( |  | ||||||
|         'http://09cce49df173f6f6e61f-fd6930021b51685920a6fa76529ee321' |  | ||||||
|         '.r45.cf2.rackcdn.com/PyML-0.7.9.tar.gz') |  | ||||||
|     pyml_srcidr = 'PyML-0.7.9' |  | ||||||
|  |  | ||||||
|     # see if PyML tarball needs to be fetched: |  | ||||||
|     if not dir_exists(pyml_srcidr): |  | ||||||
|         run("curl %s | tar -xz" % pyml_tarball) |  | ||||||
|  |  | ||||||
|     # compile&install: |  | ||||||
|     with cd(pyml_srcidr): |  | ||||||
|         python('setup.py build') |  | ||||||
|         python('setup.py install') |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def run(command): |  | ||||||
|     if os.system(command) != 0: |  | ||||||
|         raise Exception("Failed '{}'".format(command)) |  | ||||||
|     else: |  | ||||||
|         return 0 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def python(command): |  | ||||||
|     command = '{} {}'.format(sys.executable, command) |  | ||||||
|     run(command) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def enforce_executable(name, install_info): |  | ||||||
|     if os.system("which {}".format(name)) != 0: |  | ||||||
|         raise Exception( |  | ||||||
|             '{} utility is missing.\nTo install, run:\n\n{}\n'.format( |  | ||||||
|                 name, install_info)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def pip(command): |  | ||||||
|     command = '{} {}'.format(find_executable('pip'), command) |  | ||||||
|     run(command) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def dir_exists(path): |  | ||||||
|     return os.path.isdir(path) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| @contextlib.contextmanager |  | ||||||
| def cd(directory): |  | ||||||
|     curdir = os.getcwd() |  | ||||||
|     try: |  | ||||||
|         os.chdir(directory) |  | ||||||
|         yield {} |  | ||||||
|     finally: |  | ||||||
|         os.chdir(curdir) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == '__main__': |  | ||||||
|     if len(sys.argv) > 1 and sys.argv[1] in ['develop', 'install']: |  | ||||||
|         enforce_executable('curl', 'sudo aptitude install curl') |  | ||||||
|  |  | ||||||
|         install_pyml() |  | ||||||
|   | |||||||
| @@ -12,6 +12,7 @@ CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX) | |||||||
|  |  | ||||||
| # HTML quote indicators (tag ids) | # HTML quote indicators (tag ids) | ||||||
| QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | QUOTE_IDS = ['OLK_SRC_BODY_SECTION'] | ||||||
|  | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
|  |  | ||||||
| def add_checkpoint(html_note, counter): | def add_checkpoint(html_note, counter): | ||||||
| @@ -76,8 +77,8 @@ def delete_quotation_tags(html_note, counter, quotation_checkpoints): | |||||||
|  |  | ||||||
| def cut_gmail_quote(html_message): | def cut_gmail_quote(html_message): | ||||||
|     ''' Cuts the outermost block element with class gmail_quote. ''' |     ''' Cuts the outermost block element with class gmail_quote. ''' | ||||||
|     gmail_quote = html_message.cssselect('.gmail_quote') |     gmail_quote = html_message.cssselect('div.gmail_quote') | ||||||
|     if gmail_quote: |     if gmail_quote and (gmail_quote[0].text is None or not RE_FWD.match(gmail_quote[0].text)): | ||||||
|         gmail_quote[0].getparent().remove(gmail_quote[0]) |         gmail_quote[0].getparent().remove(gmail_quote[0]) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -85,9 +86,12 @@ def cut_gmail_quote(html_message): | |||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 |         #outlook 2007, 2010 (international) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2007, 2010 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
| @@ -138,9 +142,14 @@ def cut_by_id(html_message): | |||||||
|  |  | ||||||
|  |  | ||||||
| def cut_blockquote(html_message): | def cut_blockquote(html_message): | ||||||
|     ''' Cuts blockquote with wrapping elements. ''' |     ''' Cuts the last non-nested blockquote with wrapping elements.''' | ||||||
|     quote = html_message.find('.//blockquote') |     quote = html_message.xpath( | ||||||
|     if quote is not None: |         '(.//blockquote)' | ||||||
|  |         '[not(@class="gmail_quote") and not(ancestor::blockquote)]' | ||||||
|  |         '[last()]') | ||||||
|  |  | ||||||
|  |     if quote: | ||||||
|  |         quote = quote[0] | ||||||
|         quote.getparent().remove(quote) |         quote.getparent().remove(quote) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
| @@ -154,13 +163,40 @@ def cut_from_block(html_message): | |||||||
|  |  | ||||||
|     if block: |     if block: | ||||||
|         block = block[-1] |         block = block[-1] | ||||||
|  |         parent_div = None | ||||||
|         while block.getparent() is not None: |         while block.getparent() is not None: | ||||||
|             if block.tag == 'div': |             if block.tag == 'div': | ||||||
|                 block.getparent().remove(block) |                 parent_div = block | ||||||
|  |                 break | ||||||
|  |             block = block.getparent() | ||||||
|  |         if parent_div is not None: | ||||||
|  |             maybe_body = parent_div.getparent() | ||||||
|  |             # In cases where removing this enclosing div will remove all | ||||||
|  |             # content, we should assume the quote is not enclosed in a tag. | ||||||
|  |             parent_div_is_all_content = ( | ||||||
|  |                 maybe_body is not None and maybe_body.tag == 'body' and | ||||||
|  |                 len(maybe_body.getchildren()) == 1) | ||||||
|  |  | ||||||
|  |             if not parent_div_is_all_content: | ||||||
|  |                 parent = block.getparent() | ||||||
|  |                 next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove all tags after found From block | ||||||
|  |                 # (From block and quoted message are in separate divs) | ||||||
|  |                 while next_sibling is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |                     block = next_sibling | ||||||
|  |                     next_sibling = block.getnext() | ||||||
|  |  | ||||||
|  |                 # remove the last sibling (or the | ||||||
|  |                 # From block if no siblings) | ||||||
|  |                 if block is not None: | ||||||
|  |                     parent.remove(block) | ||||||
|  |  | ||||||
|                 return True |                 return True | ||||||
|         else: |         else: | ||||||
|                 block = block.getparent() |             return False | ||||||
|     else: |  | ||||||
|     # handle the case when From: block goes right after e.g. <hr> |     # handle the case when From: block goes right after e.g. <hr> | ||||||
|     # and not enclosed in some tag |     # and not enclosed in some tag | ||||||
|     block = html_message.xpath( |     block = html_message.xpath( | ||||||
| @@ -168,7 +204,17 @@ def cut_from_block(html_message): | |||||||
|          "//*[starts-with(mg:tail(), 'Date:')]")) |          "//*[starts-with(mg:tail(), 'Date:')]")) | ||||||
|     if block: |     if block: | ||||||
|         block = block[0] |         block = block[0] | ||||||
|  |  | ||||||
|  |         if RE_FWD.match(block.getparent().text or ''): | ||||||
|  |             return False | ||||||
|  |          | ||||||
|         while(block.getnext() is not None): |         while(block.getnext() is not None): | ||||||
|             block.getparent().remove(block.getnext()) |             block.getparent().remove(block.getnext()) | ||||||
|         block.getparent().remove(block) |         block.getparent().remove(block) | ||||||
|         return True |         return True | ||||||
|  |  | ||||||
|  | def cut_zimbra_quote(html_message): | ||||||
|  |     zDivider = html_message.xpath('//hr[@data-marker="__DIVIDER__"]') | ||||||
|  |     if zDivider: | ||||||
|  |         zDivider[0].getparent().remove(zDivider[0]) | ||||||
|  |         return True | ||||||
|   | |||||||
| @@ -10,10 +10,8 @@ import logging | |||||||
| from copy import deepcopy | from copy import deepcopy | ||||||
|  |  | ||||||
| from lxml import html, etree | from lxml import html, etree | ||||||
| import html2text |  | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.utils import get_delimiter, html_to_text | ||||||
| from talon.utils import random_token, get_delimiter |  | ||||||
| from talon import html_quotations | from talon import html_quotations | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -23,7 +21,7 @@ log = logging.getLogger(__name__) | |||||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||||
|  |  | ||||||
| RE_ON_DATE_SMB_WROTE = re.compile( | RE_ON_DATE_SMB_WROTE = re.compile( | ||||||
|     u'(-*[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( |     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( | ||||||
|         # Beginning of the line |         # Beginning of the line | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
|             # English |             # English | ||||||
| @@ -33,7 +31,13 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             # Polish |             # Polish | ||||||
|             'W dniu', |             'W dniu', | ||||||
|             # Dutch |             # Dutch | ||||||
|             'Op' |             'Op', | ||||||
|  |             # German | ||||||
|  |             'Am', | ||||||
|  |             # Norwegian | ||||||
|  |             u'På', | ||||||
|  |             # Swedish, Danish | ||||||
|  |             'Den', | ||||||
|         )), |         )), | ||||||
|         # Date and sender separator |         # Date and sender separator | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
| @@ -51,18 +55,28 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             # Polish |             # Polish | ||||||
|             u'napisał', |             u'napisał', | ||||||
|             # Dutch |             # Dutch | ||||||
|             'schreef','verzond','geschreven' |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb', | ||||||
|  |             # Norwegian, Swedish | ||||||
|  |             'skrev', | ||||||
|         )) |         )) | ||||||
|     )) |     )) | ||||||
| # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | ||||||
| RE_ON_DATE_WROTE_SMB = re.compile( | RE_ON_DATE_WROTE_SMB = re.compile( | ||||||
|     u'(-*[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ].*:)'.format( |     u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format( | ||||||
|         # Beginning of the line |         # Beginning of the line | ||||||
|  |         u'|'.join(( | ||||||
|         	'Op', |         	'Op', | ||||||
|  |         	#German | ||||||
|  |         	'Am' | ||||||
|  |         )), | ||||||
|         # Ending of the line |         # Ending of the line | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
|             # Dutch |             # Dutch | ||||||
|             'schreef','verzond','geschreven' |             'schreef','verzond','geschreven', | ||||||
|  |             # German | ||||||
|  |             'schrieb' | ||||||
|         )) |         )) | ||||||
|     ) |     ) | ||||||
|     ) |     ) | ||||||
| @@ -93,7 +107,7 @@ RE_EMPTY_QUOTATION = re.compile( | |||||||
|     ( |     ( | ||||||
|         # quotation border: splitter line or a number of quotation marker lines |         # quotation border: splitter line or a number of quotation marker lines | ||||||
|         (?: |         (?: | ||||||
|             s |             (?:se*)+ | ||||||
|             | |             | | ||||||
|             (?:me*){2,} |             (?:me*){2,} | ||||||
|         ) |         ) | ||||||
| @@ -116,20 +130,27 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( | |||||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | ||||||
|     u'|'.join(( |     u'|'.join(( | ||||||
|         # "From" in different languages. |         # "From" in different languages. | ||||||
|         'From', 'Van', 'De', 'Von', 'Fra', |         'From', 'Van', 'De', 'Von', 'Fra', u'Från', | ||||||
|         # "Date" in different languages. |         # "Date" in different languages. | ||||||
|         'Date', 'Datum', u'Envoyé' |         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
| SPLITTER_PATTERNS = [ | SPLITTER_PATTERNS = [ | ||||||
|     RE_ORIGINAL_MESSAGE, |     RE_ORIGINAL_MESSAGE, | ||||||
|     # <date> <person> |  | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.VERBOSE), |  | ||||||
|     RE_ON_DATE_SMB_WROTE, |     RE_ON_DATE_SMB_WROTE, | ||||||
|     RE_ON_DATE_WROTE_SMB, |     RE_ON_DATE_WROTE_SMB, | ||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|  |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|  |     # bob@xxx.mailgun.org> написал: | ||||||
|  |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), | ||||||
|  |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|  |     # bob@example.com>: | ||||||
|  |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), | ||||||
|  |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:') |                '( \S+){3,6}@\S+:'), | ||||||
|  |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |     re.compile('Sent from Samsung .*@.*> wrote') | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -151,7 +172,7 @@ def extract_from(msg_body, content_type='text/plain'): | |||||||
|             return extract_from_plain(msg_body) |             return extract_from_plain(msg_body) | ||||||
|         elif content_type == 'text/html': |         elif content_type == 'text/html': | ||||||
|             return extract_from_html(msg_body) |             return extract_from_html(msg_body) | ||||||
|     except Exception, e: |     except Exception: | ||||||
|         log.exception('ERROR extracting message') |         log.exception('ERROR extracting message') | ||||||
|  |  | ||||||
|     return msg_body |     return msg_body | ||||||
| @@ -182,6 +203,7 @@ def mark_message_lines(lines): | |||||||
|         else: |         else: | ||||||
|             # in case splitter is spread across several lines |             # in case splitter is spread across several lines | ||||||
|             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) |             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES])) | ||||||
|  |  | ||||||
|             if splitter: |             if splitter: | ||||||
|                 # append as many splitter markers as lines in splitter |                 # append as many splitter markers as lines in splitter | ||||||
|                 splitter_lines = splitter.group().splitlines() |                 splitter_lines = splitter.group().splitlines() | ||||||
| @@ -294,12 +316,8 @@ def extract_from_plain(msg_body): | |||||||
|  |  | ||||||
|     delimiter = get_delimiter(msg_body) |     delimiter = get_delimiter(msg_body) | ||||||
|     msg_body = preprocess(msg_body, delimiter) |     msg_body = preprocess(msg_body, delimiter) | ||||||
|     lines = msg_body.splitlines() |  | ||||||
|  |  | ||||||
|     # don't process too long messages |     # don't process too long messages | ||||||
|     if len(lines) > MAX_LINES_COUNT: |     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||||
|         return stripped_text |  | ||||||
|  |  | ||||||
|     markers = mark_message_lines(lines) |     markers = mark_message_lines(lines) | ||||||
|     lines = process_marked_lines(lines, markers) |     lines = process_marked_lines(lines, markers) | ||||||
|  |  | ||||||
| @@ -325,43 +343,28 @@ def extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     if msg_body.strip() == '': |     if msg_body.strip() == '': | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|  |     msg_body = msg_body.replace('\r\n', '').replace('\n', '') | ||||||
|     html_tree = html.document_fromstring( |     html_tree = html.document_fromstring( | ||||||
|         msg_body, |         msg_body, | ||||||
|         parser=html.HTMLParser(encoding="utf-8") |         parser=html.HTMLParser(encoding="utf-8") | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or |     cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or | ||||||
|  |                       html_quotations.cut_zimbra_quote(html_tree) or | ||||||
|                       html_quotations.cut_blockquote(html_tree) or |                       html_quotations.cut_blockquote(html_tree) or | ||||||
|                       html_quotations.cut_microsoft_quote(html_tree) or |                       html_quotations.cut_microsoft_quote(html_tree) or | ||||||
|                       html_quotations.cut_by_id(html_tree) or |                       html_quotations.cut_by_id(html_tree) or | ||||||
|                       html_quotations.cut_from_block(html_tree) |                       html_quotations.cut_from_block(html_tree) | ||||||
|                       ) |                       ) | ||||||
|  |  | ||||||
|     html_tree_copy = deepcopy(html_tree) |     html_tree_copy = deepcopy(html_tree) | ||||||
|  |  | ||||||
|     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) |     number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) | ||||||
|     quotation_checkpoints = [False for i in xrange(number_of_checkpoints)] |     quotation_checkpoints = [False] * number_of_checkpoints | ||||||
|     msg_with_checkpoints = html.tostring(html_tree) |     msg_with_checkpoints = html.tostring(html_tree) | ||||||
|  |     plain_text = html_to_text(msg_with_checkpoints) | ||||||
|     h = html2text.HTML2Text() |     plain_text = preprocess(plain_text, '\n', content_type='text/html') | ||||||
|     h.body_width = 0  # generate plain text without wrap |  | ||||||
|  |  | ||||||
|     # html2text adds unnecessary star symbols. Remove them. |  | ||||||
|     # Mask star symbols |  | ||||||
|     msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432') |  | ||||||
|     plain_text = h.handle(msg_with_checkpoints) |  | ||||||
|     # Remove created star symbols |  | ||||||
|     plain_text = plain_text.replace('*', '') |  | ||||||
|     # Unmask saved star symbols |  | ||||||
|     plain_text = plain_text.replace('3423oorkg432', '*') |  | ||||||
|  |  | ||||||
|     delimiter = get_delimiter(plain_text) |  | ||||||
|  |  | ||||||
|     plain_text = preprocess(plain_text, delimiter, content_type='text/html') |  | ||||||
|     lines = plain_text.splitlines() |     lines = plain_text.splitlines() | ||||||
|  |  | ||||||
|     # Don't process too long messages |     # Don't process too long messages | ||||||
| @@ -383,7 +386,6 @@ def extract_from_html(msg_body): | |||||||
|     return_flags = [] |     return_flags = [] | ||||||
|     process_marked_lines(lines, markers, return_flags) |     process_marked_lines(lines, markers, return_flags) | ||||||
|     lines_were_deleted, first_deleted, last_deleted = return_flags |     lines_were_deleted, first_deleted, last_deleted = return_flags | ||||||
|  |  | ||||||
|     if lines_were_deleted: |     if lines_were_deleted: | ||||||
|         #collect checkpoints from deleted lines |         #collect checkpoints from deleted lines | ||||||
|         for i in xrange(first_deleted, last_deleted): |         for i in xrange(first_deleted, last_deleted): | ||||||
|   | |||||||
| @@ -21,11 +21,9 @@ trained against, don't forget to regenerate: | |||||||
| """ | """ | ||||||
|  |  | ||||||
| import os | import os | ||||||
| import sys |  | ||||||
| from cStringIO import StringIO |  | ||||||
|  |  | ||||||
| from . import extraction | from . import extraction | ||||||
| from . extraction import extract | from . extraction import extract  #noqa | ||||||
| from . learning import classifier | from . learning import classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -36,13 +34,5 @@ EXTRACTOR_DATA = os.path.join(DATA_DIR, 'train.data') | |||||||
|  |  | ||||||
|  |  | ||||||
| def initialize(): | def initialize(): | ||||||
|     try: |  | ||||||
|         # redirect output |  | ||||||
|         so, sys.stdout = sys.stdout, StringIO() |  | ||||||
|  |  | ||||||
|     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, |     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME, | ||||||
|                                            EXTRACTOR_DATA) |                                            EXTRACTOR_DATA) | ||||||
|         sys.stdout = so |  | ||||||
|     except Exception, e: |  | ||||||
|         raise Exception( |  | ||||||
|             "Failed initializing signature parsing with classifiers", e) |  | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_01.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_01.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_02.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_02.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_03.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_03.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_04.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_04.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								talon/signature/data/classifier_05.npy
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								talon/signature/data/classifier_05.npy
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -1,14 +1,10 @@ | |||||||
| # -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||||
|  |  | ||||||
| import os |  | ||||||
| import logging | import logging | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
| from PyML import SparseDataSet | import numpy | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER |  | ||||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, |  | ||||||
|                                        TOO_LONG_SIGNATURE_LINE) |  | ||||||
| from talon.signature.learning.featurespace import features, build_pattern | from talon.signature.learning.featurespace import features, build_pattern | ||||||
| from talon.utils import get_delimiter | from talon.utils import get_delimiter | ||||||
| from talon.signature.bruteforce import get_signature_candidate | from talon.signature.bruteforce import get_signature_candidate | ||||||
| @@ -36,8 +32,8 @@ RE_REVERSE_SIGNATURE = re.compile(r''' | |||||||
|  |  | ||||||
| def is_signature_line(line, sender, classifier): | def is_signature_line(line, sender, classifier): | ||||||
|     '''Checks if the line belongs to signature. Returns True or False.''' |     '''Checks if the line belongs to signature. Returns True or False.''' | ||||||
|     data = SparseDataSet([build_pattern(line, features(sender))]) |     data = numpy.array(build_pattern(line, features(sender))) | ||||||
|     return classifier.decisionFunc(data, 0) > 0 |     return classifier.predict(data) > 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract(body, sender): | def extract(body, sender): | ||||||
| @@ -61,7 +57,7 @@ def extract(body, sender): | |||||||
|                 text = delimiter.join(text) |                 text = delimiter.join(text) | ||||||
|                 if text.strip(): |                 if text.strip(): | ||||||
|                     return (text, delimiter.join(signature)) |                     return (text, delimiter.join(signature)) | ||||||
|     except Exception, e: |     except Exception: | ||||||
|         log.exception('ERROR when extracting signature with classifiers') |         log.exception('ERROR when extracting signature with classifiers') | ||||||
|  |  | ||||||
|     return (body, None) |     return (body, None) | ||||||
|   | |||||||
| @@ -5,32 +5,27 @@ The classifier could be used to detect if a certain line of the message | |||||||
| body belongs to the signature. | body belongs to the signature. | ||||||
| """ | """ | ||||||
|  |  | ||||||
| import os | from numpy import genfromtxt | ||||||
| import sys | from sklearn.svm import LinearSVC | ||||||
|  | from sklearn.externals import joblib | ||||||
| from PyML import SparseDataSet, SVM |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def init(): | def init(): | ||||||
|     '''Inits classifier with optimal options.''' |     """Inits classifier with optimal options.""" | ||||||
|     return SVM(C=10, optimization='liblinear') |     return LinearSVC(C=10.0) | ||||||
|  |  | ||||||
|  |  | ||||||
| def train(classifier, train_data_filename, save_classifier_filename=None): | def train(classifier, train_data_filename, save_classifier_filename=None): | ||||||
|     '''Trains and saves classifier so that it could be easily loaded later.''' |     """Trains and saves classifier so that it could be easily loaded later.""" | ||||||
|     data = SparseDataSet(train_data_filename, labelsColumn=-1) |     file_data = genfromtxt(train_data_filename, delimiter=",") | ||||||
|     classifier.train(data) |     train_data, labels = file_data[:, :-1], file_data[:, -1] | ||||||
|  |     classifier.fit(train_data, labels) | ||||||
|  |  | ||||||
|     if save_classifier_filename: |     if save_classifier_filename: | ||||||
|         classifier.save(save_classifier_filename) |         joblib.dump(classifier, save_classifier_filename) | ||||||
|     return classifier |     return classifier | ||||||
|  |  | ||||||
|  |  | ||||||
| def load(saved_classifier_filename, train_data_filename): | def load(saved_classifier_filename, train_data_filename): | ||||||
|     """Loads saved classifier. |     """Loads saved classifier. """ | ||||||
|  |     return joblib.load(saved_classifier_filename) | ||||||
|     Classifier should be loaded with the same data it was trained against |  | ||||||
|     """ |  | ||||||
|     train_data = SparseDataSet(train_data_filename, labelsColumn=-1) |  | ||||||
|     classifier = init() |  | ||||||
|     classifier.load(saved_classifier_filename, train_data) |  | ||||||
|     return classifier |  | ||||||
|   | |||||||
| @@ -16,8 +16,8 @@ from talon.signature.constants import SIGNATURE_MAX_LINES | |||||||
|  |  | ||||||
| rc = re.compile | rc = re.compile | ||||||
|  |  | ||||||
| RE_EMAIL = rc('@') | RE_EMAIL = rc('\S@\S') | ||||||
| RE_RELAX_PHONE = rc('.*(\(? ?[\d]{2,3} ?\)?.{,3}){2,}') | RE_RELAX_PHONE = rc('(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}') | ||||||
| RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | RE_URL = rc(r'''https?://|www\.[\S]+\.[\S]''') | ||||||
|  |  | ||||||
| # Taken from: | # Taken from: | ||||||
| @@ -40,14 +40,6 @@ RE_SIGNATURE_WORDS = rc(('(T|t)hank.*,|(B|b)est|(R|r)egards|' | |||||||
| # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. | # Line contains a pattern like Vitor R. Carvalho or William W. Cohen. | ||||||
| RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') | RE_NAME = rc('[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+') | ||||||
|  |  | ||||||
| # Pattern to match if e.g. 'Sender:' header field has sender names. |  | ||||||
| SENDER_WITH_NAME_PATTERN = '([\s]*[\S]+,?)+[\s]*<.*>.*' |  | ||||||
| RE_SENDER_WITH_NAME = rc(SENDER_WITH_NAME_PATTERN) |  | ||||||
|  |  | ||||||
| # Reply line clue line endings, as in regular expression: |  | ||||||
| # " wrote:$" or " writes:$" |  | ||||||
| RE_CLUE_LINE_END = rc('.*(W|w)rotes?:$') |  | ||||||
|  |  | ||||||
| INVALID_WORD_START = rc('\(|\+|[\d]') | INVALID_WORD_START = rc('\(|\+|[\d]') | ||||||
|  |  | ||||||
| BAD_SENDER_NAMES = [ | BAD_SENDER_NAMES = [ | ||||||
| @@ -128,7 +120,7 @@ def contains_sender_names(sender): | |||||||
|     names = names or sender |     names = names or sender | ||||||
|     if names != '': |     if names != '': | ||||||
|         return binary_regex_search(re.compile(names)) |         return binary_regex_search(re.compile(names)) | ||||||
|     return lambda s: False |     return lambda s: 0 | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_names(sender): | def extract_names(sender): | ||||||
| @@ -142,7 +134,7 @@ def extract_names(sender): | |||||||
|     >>> extract_names('') |     >>> extract_names('') | ||||||
|     [] |     [] | ||||||
|     """ |     """ | ||||||
|     sender = to_unicode(sender) |     sender = to_unicode(sender, precise=True) | ||||||
|     # Remove non-alphabetical characters |     # Remove non-alphabetical characters | ||||||
|     sender = "".join([char if char.isalpha() else ' ' for char in sender]) |     sender = "".join([char if char.isalpha() else ' ' for char in sender]) | ||||||
|     # Remove too short words and words from "black" list i.e. |     # Remove too short words and words from "black" list i.e. | ||||||
| @@ -169,7 +161,7 @@ def categories_percent(s, categories): | |||||||
|     50.0 |     50.0 | ||||||
|     ''' |     ''' | ||||||
|     count = 0 |     count = 0 | ||||||
|     s = to_unicode(s) |     s = to_unicode(s, precise=True) | ||||||
|     for c in s: |     for c in s: | ||||||
|         if unicodedata.category(c) in categories: |         if unicodedata.category(c) in categories: | ||||||
|             count += 1 |             count += 1 | ||||||
| @@ -189,7 +181,7 @@ def punctuation_percent(s): | |||||||
|  |  | ||||||
| def capitalized_words_percent(s): | def capitalized_words_percent(s): | ||||||
|     '''Returns capitalized words percent.''' |     '''Returns capitalized words percent.''' | ||||||
|     s = to_unicode(s) |     s = to_unicode(s, precise=True) | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|     capitalized_words_counter = 0 |     capitalized_words_counter = 0 | ||||||
|   | |||||||
							
								
								
									
										119
									
								
								talon/utils.py
									
									
									
									
									
								
							
							
						
						
									
										119
									
								
								talon/utils.py
									
									
									
									
									
								
							| @@ -2,13 +2,16 @@ | |||||||
|  |  | ||||||
| import logging | import logging | ||||||
| from random import shuffle | from random import shuffle | ||||||
|  | import chardet | ||||||
|  | import cchardet | ||||||
|  | import regex as re | ||||||
|  |  | ||||||
|  | from lxml import html | ||||||
|  | from lxml.cssselect import CSSSelector | ||||||
|  |  | ||||||
| from talon.constants import RE_DELIMITER | from talon.constants import RE_DELIMITER | ||||||
|  |  | ||||||
|  |  | ||||||
| log = logging.getLogger(__name__) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def safe_format(format_string, *args, **kwargs): | def safe_format(format_string, *args, **kwargs): | ||||||
|     """ |     """ | ||||||
|     Helper: formats string with any combination of bytestrings/unicode |     Helper: formats string with any combination of bytestrings/unicode | ||||||
| @@ -42,12 +45,42 @@ def to_unicode(str_or_unicode, precise=False): | |||||||
|         u'привет' |         u'привет' | ||||||
|     If `precise` flag is True, tries to guess the correct encoding first. |     If `precise` flag is True, tries to guess the correct encoding first. | ||||||
|     """ |     """ | ||||||
|     encoding = detect_encoding(str_or_unicode) if precise else 'utf-8' |     encoding = quick_detect_encoding(str_or_unicode) if precise else 'utf-8' | ||||||
|     if isinstance(str_or_unicode, str): |     if isinstance(str_or_unicode, str): | ||||||
|         return unicode(str_or_unicode, encoding, 'replace') |         return unicode(str_or_unicode, encoding, 'replace') | ||||||
|     return str_or_unicode |     return str_or_unicode | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def detect_encoding(string): | ||||||
|  |     """ | ||||||
|  |     Tries to detect the encoding of the passed string. | ||||||
|  |  | ||||||
|  |     Defaults to UTF-8. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         detected = chardet.detect(string) | ||||||
|  |         if detected: | ||||||
|  |             return detected.get('encoding') or 'utf-8' | ||||||
|  |     except Exception, e: | ||||||
|  |         pass | ||||||
|  |     return 'utf-8' | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def quick_detect_encoding(string): | ||||||
|  |     """ | ||||||
|  |     Tries to detect the encoding of the passed string. | ||||||
|  |  | ||||||
|  |     Uses cchardet. Fallbacks to detect_encoding. | ||||||
|  |     """ | ||||||
|  |     try: | ||||||
|  |         detected = cchardet.detect(string) | ||||||
|  |         if detected: | ||||||
|  |             return detected.get('encoding') or detect_encoding(string) | ||||||
|  |     except Exception, e: | ||||||
|  |         pass | ||||||
|  |     return detect_encoding(string) | ||||||
|  |  | ||||||
|  |  | ||||||
| def to_utf8(str_or_unicode): | def to_utf8(str_or_unicode): | ||||||
|     """ |     """ | ||||||
|     Safely returns a UTF-8 version of a given string |     Safely returns a UTF-8 version of a given string | ||||||
| @@ -74,3 +107,81 @@ def get_delimiter(msg_body): | |||||||
|         delimiter = '\n' |         delimiter = '\n' | ||||||
|  |  | ||||||
|     return delimiter |     return delimiter | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def html_to_text(string): | ||||||
|  |     """ | ||||||
|  |     Dead-simple HTML-to-text converter: | ||||||
|  |         >>> html_to_text("one<br>two<br>three") | ||||||
|  |         >>> "one\ntwo\nthree" | ||||||
|  |  | ||||||
|  |     NOTES: | ||||||
|  |         1. the string is expected to contain UTF-8 encoded HTML! | ||||||
|  |         2. returns utf-8 encoded str (not unicode) | ||||||
|  |     """ | ||||||
|  |     s = _prepend_utf8_declaration(string) | ||||||
|  |     s = s.replace("\n", "") | ||||||
|  |  | ||||||
|  |     tree = html.fromstring(s) | ||||||
|  |  | ||||||
|  |     for style in CSSSelector('style')(tree): | ||||||
|  |         style.getparent().remove(style) | ||||||
|  |  | ||||||
|  |     for c in tree.xpath('//comment()'): | ||||||
|  |         c.getparent().remove(c) | ||||||
|  |  | ||||||
|  |     text   = "" | ||||||
|  |     for el in tree.iter(): | ||||||
|  |         el_text = (el.text or '') + (el.tail or '') | ||||||
|  |         if len(el_text) > 1: | ||||||
|  |             if el.tag in _BLOCKTAGS: | ||||||
|  |                 text += "\n" | ||||||
|  |             if el.tag == 'li': | ||||||
|  |                 text += "  * " | ||||||
|  |             text += el_text.strip() + " " | ||||||
|  |  | ||||||
|  |             # add href to the output | ||||||
|  |             href = el.attrib.get('href') | ||||||
|  |             if href: | ||||||
|  |                 text += "(%s) " % href | ||||||
|  |  | ||||||
|  |         if el.tag in _HARDBREAKS and text and not text.endswith("\n"): | ||||||
|  |             text += "\n" | ||||||
|  |  | ||||||
|  |     retval = _rm_excessive_newlines(text) | ||||||
|  |     return _encode_utf8(retval) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _contains_charset_spec(s): | ||||||
|  |     """Return True if the first 4KB contain charset spec | ||||||
|  |     """ | ||||||
|  |     return s.lower().find('html; charset=', 0, 4096) != -1 | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _prepend_utf8_declaration(s): | ||||||
|  |     """Prepend 'utf-8' encoding declaration if the first 4KB don't have any | ||||||
|  |     """ | ||||||
|  |     return s if _contains_charset_spec(s) else _UTF8_DECLARATION + s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _rm_excessive_newlines(s): | ||||||
|  |     """Remove excessive newlines that often happen due to tons of divs | ||||||
|  |     """ | ||||||
|  |     return _RE_EXCESSIVE_NEWLINES.sub("\n\n", s).strip() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def _encode_utf8(s): | ||||||
|  |     """Encode in 'utf-8' if unicode | ||||||
|  |     """ | ||||||
|  |     return s.encode('utf-8') if isinstance(s, unicode) else s | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;' | ||||||
|  |                      'charset=utf-8">') | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||||
|  | _HARDBREAKS = ['br', 'hr', 'tr'] | ||||||
|  |  | ||||||
|  |  | ||||||
|  | _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								tests/fixtures/html_replies/hotmail.html
									
									
									
									
										vendored
									
									
								
							| @@ -1,3 +1,4 @@ | |||||||
|  | <?xml version="1.0" encoding="UTF-8"?> | ||||||
| <html> | <html> | ||||||
| <head> | <head> | ||||||
| <style><!-- | <style><!-- | ||||||
|   | |||||||
							
								
								
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										87
									
								
								tests/fixtures/html_replies/ms_outlook_2010.html
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,87 @@ | |||||||
|  | <html> | ||||||
|  | <head> | ||||||
|  | <meta http-equiv="Content-Type" content="text/html; charset=iso-2022-jp"> | ||||||
|  | <meta name="Generator" content="Microsoft Word 14 (filtered medium)"> | ||||||
|  | <style><!-- | ||||||
|  | /* Font Definitions */ | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Calibri; | ||||||
|  | 	panose-1:2 15 5 2 2 2 4 3 2 4;} | ||||||
|  | @font-face | ||||||
|  | 	{font-family:Tahoma; | ||||||
|  | 	panose-1:2 11 6 4 3 5 4 4 2 4;} | ||||||
|  | /* Style Definitions */ | ||||||
|  | p.MsoNormal, li.MsoNormal, div.MsoNormal | ||||||
|  | 	{margin:0in; | ||||||
|  | 	margin-bottom:.0001pt; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | h3 | ||||||
|  | 	{mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3 Char"; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:13.5pt; | ||||||
|  | 	font-family:"Times New Roman","serif"; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | a:link, span.MsoHyperlink | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:blue; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | a:visited, span.MsoHyperlinkFollowed | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	color:purple; | ||||||
|  | 	text-decoration:underline;} | ||||||
|  | p | ||||||
|  | 	{mso-style-priority:99; | ||||||
|  | 	mso-margin-top-alt:auto; | ||||||
|  | 	margin-right:0in; | ||||||
|  | 	mso-margin-bottom-alt:auto; | ||||||
|  | 	margin-left:0in; | ||||||
|  | 	font-size:12.0pt; | ||||||
|  | 	font-family:"Times New Roman","serif";} | ||||||
|  | span.Heading3Char | ||||||
|  | 	{mso-style-name:"Heading 3 Char"; | ||||||
|  | 	mso-style-priority:9; | ||||||
|  | 	mso-style-link:"Heading 3"; | ||||||
|  | 	font-family:"Cambria","serif"; | ||||||
|  | 	color:#4F81BD; | ||||||
|  | 	font-weight:bold;} | ||||||
|  | span.EmailStyle19 | ||||||
|  | 	{mso-style-type:personal-reply; | ||||||
|  | 	font-family:"Calibri","sans-serif"; | ||||||
|  | 	color:#1F497D;} | ||||||
|  | .MsoChpDefault | ||||||
|  | 	{mso-style-type:export-only; | ||||||
|  | 	font-family:"Calibri","sans-serif";} | ||||||
|  | @page WordSection1 | ||||||
|  | 	{size:8.5in 11.0in; | ||||||
|  | 	margin:1.0in 1.0in 1.0in 1.0in;} | ||||||
|  | div.WordSection1 | ||||||
|  | 	{page:WordSection1;} | ||||||
|  | --></style><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapedefaults v:ext="edit" spidmax="1026" /> | ||||||
|  | </xml><![endif]--><!--[if gte mso 9]><xml> | ||||||
|  | <o:shapelayout v:ext="edit"> | ||||||
|  | <o:idmap v:ext="edit" data="1" /> | ||||||
|  | </o:shapelayout></xml><![endif]--> | ||||||
|  | </head> | ||||||
|  | <body lang="EN-US" link="blue" vlink="purple"> | ||||||
|  | <div class="WordSection1"> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Hi. I am fine.<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Thanks,<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><span style="font-size:11.0pt;font-family:"Calibri","sans-serif";color:#1F497D">Alex<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif"">From:</span></b><span style="font-size:10.0pt;font-family:"Tahoma","sans-serif""> Foo [mailto:foo@bar.com] | ||||||
|  | <b>On Behalf Of </b>baz@bar.com<br> | ||||||
|  | <b>Sent:</b> Monday, January 01, 2000 12:00 AM<br> | ||||||
|  | <b>To:</b> john@bar.com<br> | ||||||
|  | <b>Cc:</b> jane@bar.io<br> | ||||||
|  | <b>Subject:</b> Conversation<o:p></o:p></span></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | <p>Hello! How are you?<o:p></o:p></p> | ||||||
|  | <p class="MsoNormal"><o:p> </o:p></p> | ||||||
|  | </div> | ||||||
|  | </body> | ||||||
|  | </html> | ||||||
							
								
								
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								tests/fixtures/standard_replies/apple_mail_2.eml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,19 @@ | |||||||
|  | Content-Type: text/plain; | ||||||
|  | 	charset=us-ascii | ||||||
|  | Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2104\)) | ||||||
|  | Subject: Re: Hello there | ||||||
|  | X-Universally-Unique-Identifier: 85B1075D-5841-46A9-8565-FCB287A93AC4 | ||||||
|  | From: Adam Renberg <adam@tictail.com> | ||||||
|  | In-Reply-To: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | Date: Sat, 22 Aug 2015 19:22:20 +0200 | ||||||
|  | Content-Transfer-Encoding: 7bit | ||||||
|  | X-Smtp-Server: smtp.gmail.com:adam@tictail.com | ||||||
|  | Message-Id: <68001B29-8EA4-444C-A894-0537D2CA5208@tictail.com> | ||||||
|  | References: <CABzQGhkMXDxUt_tSVQcg=43aniUhtsVfCZVzu-PG0kwS_uzqMw@mail.gmail.com> | ||||||
|  | To: Adam Renberg <tgwizard@gmail.com> | ||||||
|  |  | ||||||
|  | Hello | ||||||
|  | > On 22 Aug 2015, at 19:21, Adam Renberg <tgwizard@gmail.com> wrote: | ||||||
|  | > | ||||||
|  | > Hi there! | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								tests/fixtures/standard_replies/iphone.eml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								tests/fixtures/standard_replies/iphone.eml
									
									
									
									
										vendored
									
									
								
							| @@ -9,7 +9,7 @@ To: bob <bob@example.com> | |||||||
| Content-Transfer-Encoding: quoted-printable | Content-Transfer-Encoding: quoted-printable | ||||||
| Mime-Version: 1.0 (1.0) | Mime-Version: 1.0 (1.0) | ||||||
|  |  | ||||||
| hello | Hello | ||||||
|  |  | ||||||
| Sent from my iPhone | Sent from my iPhone | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								tests/fixtures/standard_replies/iphone_reply_text
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								tests/fixtures/standard_replies/iphone_reply_text
									
									
									
									
										vendored
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | Hello | ||||||
|  |  | ||||||
|  | Sent from my iPhone | ||||||
| @@ -4,11 +4,8 @@ from . import * | |||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| import regex as re | import regex as re | ||||||
| from flanker import mime |  | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations, utils as u | ||||||
|  |  | ||||||
| import html2text |  | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| @@ -46,7 +43,25 @@ def test_quotation_splitter_outside_blockquote(): | |||||||
|   </div> |   </div> | ||||||
| </blockquote> | </blockquote> | ||||||
| """ | """ | ||||||
|     eq_("<html><body><p>Reply</p><div></div></body></html>", |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_regular_blockquote(): | ||||||
|  |     msg_body = """Reply | ||||||
|  | <blockquote>Regular</blockquote> | ||||||
|  |  | ||||||
|  | <div> | ||||||
|  |   On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  | </div> | ||||||
|  |  | ||||||
|  | <blockquote> | ||||||
|  |   <div> | ||||||
|  |     <blockquote>Nested</blockquote> | ||||||
|  |   </div> | ||||||
|  | </blockquote> | ||||||
|  | """ | ||||||
|  |     eq_("<html><body><p>Reply</p><blockquote>Regular</blockquote></body></html>", | ||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -116,6 +131,29 @@ def test_gmail_quote(): | |||||||
|         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_compact(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_quote_blockquote(): | ||||||
|  |     msg_body = """Message | ||||||
|  | <blockquote class="gmail_quote"> | ||||||
|  |   <div class="gmail_default"> | ||||||
|  |     My name is William Shakespeare. | ||||||
|  |     <br/> | ||||||
|  |   </div> | ||||||
|  | </blockquote>""" | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_unicode_in_reply(): | def test_unicode_in_reply(): | ||||||
|     msg_body = u"""Reply \xa0 \xa0 Text<br> |     msg_body = u"""Reply \xa0 \xa0 Text<br> | ||||||
|  |  | ||||||
| @@ -123,7 +161,7 @@ def test_unicode_in_reply(): | |||||||
|   <br> |   <br> | ||||||
| </div> | </div> | ||||||
|  |  | ||||||
| <blockquote class="gmail_quote"> | <blockquote> | ||||||
|   Quote |   Quote | ||||||
| </blockquote>""".encode("utf-8") | </blockquote>""".encode("utf-8") | ||||||
|  |  | ||||||
| @@ -224,10 +262,7 @@ def test_reply_shares_div_with_from_block(): | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_reply_quotations_share_block(): | def test_reply_quotations_share_block(): | ||||||
|     msg = mime.from_string(REPLY_QUOTATIONS_SHARE_BLOCK) |     stripped_html = quotations.extract_from_plain(REPLY_QUOTATIONS_SHARE_BLOCK) | ||||||
|     html_part = list(msg.walk())[1] |  | ||||||
|     assert html_part.content_type == 'text/html' |  | ||||||
|     stripped_html = quotations.extract_from_html(html_part.body) |  | ||||||
|     ok_(stripped_html) |     ok_(stripped_html) | ||||||
|     ok_('From' not in stripped_html) |     ok_('From' not in stripped_html) | ||||||
|  |  | ||||||
| @@ -244,26 +279,35 @@ def test_reply_separated_by_hr(): | |||||||
|             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) |             '', quotations.extract_from_html(REPLY_SEPARATED_BY_HR))) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_REPLY = re.compile(r"^Hi\. I am fine\.\s*\n\s*Thanks,\s*\n\s*Alex\s*$") | def test_from_block_and_quotations_in_separate_divs(): | ||||||
|  |     msg_body = ''' | ||||||
|  | Reply | ||||||
|  | <div> | ||||||
|  |   <hr/> | ||||||
|  |   <div> | ||||||
|  |     <font> | ||||||
|  |       <b>From: bob@example.com</b> | ||||||
|  |       <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> | ||||||
|  |     </font> | ||||||
|  |   </div> | ||||||
|  |   <div> | ||||||
|  |     Quoted message | ||||||
|  |   </div> | ||||||
|  | </div> | ||||||
|  | ''' | ||||||
|  |     eq_('<html><body><p>Reply</p><div><hr></div></body></html>', | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|  |  | ||||||
|  |  | ||||||
| def extract_reply_and_check(filename): | def extract_reply_and_check(filename): | ||||||
|     f = open(filename) |     f = open(filename) | ||||||
|  |  | ||||||
|     msg_body = f.read().decode("utf-8") |     msg_body = f.read() | ||||||
|     reply = quotations.extract_from_html(msg_body) |     reply = quotations.extract_from_html(msg_body) | ||||||
|  |     plain_reply = u.html_to_text(reply) | ||||||
|  |  | ||||||
|     h = html2text.HTML2Text() |     eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), | ||||||
|     h.body_width = 0 |         RE_WHITESPACE.sub('', plain_reply)) | ||||||
|     plain_reply = h.handle(reply) |  | ||||||
|  |  | ||||||
|     #remove   spaces |  | ||||||
|     plain_reply = plain_reply.replace(u'\xa0', u' ') |  | ||||||
|  |  | ||||||
|     if RE_REPLY.match(plain_reply): |  | ||||||
|         eq_(1, 1) |  | ||||||
|     else: |  | ||||||
|         eq_("Hi. I am fine.\n\nThanks,\nAlex", plain_reply) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_gmail_reply(): | def test_gmail_reply(): | ||||||
| @@ -286,6 +330,10 @@ def test_ms_outlook_2007_reply(): | |||||||
|     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2007.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_ms_outlook_2010_reply(): | ||||||
|  |     extract_reply_and_check("tests/fixtures/html_replies/ms_outlook_2010.html") | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_thunderbird_reply(): | def test_thunderbird_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") |     extract_reply_and_check("tests/fixtures/html_replies/thunderbird.html") | ||||||
|  |  | ||||||
| @@ -296,3 +344,37 @@ def test_windows_mail_reply(): | |||||||
|  |  | ||||||
| def test_yandex_ru_reply(): | def test_yandex_ru_reply(): | ||||||
|     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") |     extract_reply_and_check("tests/fixtures/html_replies/yandex_ru.html") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_CRLF(): | ||||||
|  |     """CR is not converted to '
' | ||||||
|  |     """ | ||||||
|  |     symbol = '
' | ||||||
|  |     extracted = quotations.extract_from_html('<html>\r\n</html>') | ||||||
|  |     assert_false(symbol in extracted) | ||||||
|  |     eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |     msg_body = """Reply | ||||||
|  | <blockquote> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote: | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  |   <div> | ||||||
|  |     Test | ||||||
|  |   </div> | ||||||
|  |  | ||||||
|  | </blockquote>""" | ||||||
|  |     msg_body = msg_body.replace('\n', '\r\n') | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     assert_false(symbol in extracted)     | ||||||
|  |     eq_("<html><body><p>Reply</p></body></html>", | ||||||
|  |         RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_gmail_forwarded_msg(): | ||||||
|  |     msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr"><<a href="mailto:bob@example.com">bob@example.com</a>></span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary <<a href="mailto:mary@example.com">mary@example.com</a>><br><br><br><div dir="ltr">eom</div> | ||||||
|  | </div><br></div>""" | ||||||
|  |     extracted = quotations.extract_from_html(msg_body) | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|   | |||||||
| @@ -3,8 +3,6 @@ | |||||||
| from . import * | from . import * | ||||||
| from . fixtures import * | from . fixtures import * | ||||||
|  |  | ||||||
| from flanker import mime |  | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -2,10 +2,6 @@ | |||||||
|  |  | ||||||
| from .. import * | from .. import * | ||||||
|  |  | ||||||
| import os |  | ||||||
|  |  | ||||||
| from flanker import mime |  | ||||||
|  |  | ||||||
| from talon.signature import bruteforce | from talon.signature import bruteforce | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -4,8 +4,6 @@ from .. import * | |||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from PyML import SparseDataSet |  | ||||||
|  |  | ||||||
| from talon.signature.learning import dataset | from talon.signature.learning import dataset | ||||||
| from talon import signature | from talon import signature | ||||||
| from talon.signature import extraction as e | from talon.signature import extraction as e | ||||||
|   | |||||||
| @@ -3,9 +3,8 @@ | |||||||
| from ... import * | from ... import * | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from PyML import SparseDataSet | from numpy import genfromtxt | ||||||
|  |  | ||||||
| from talon.utils import to_unicode |  | ||||||
| from talon.signature.learning import dataset as d | from talon.signature.learning import dataset as d | ||||||
|  |  | ||||||
| from talon.signature.learning.featurespace import features | from talon.signature.learning.featurespace import features | ||||||
| @@ -42,10 +41,13 @@ def test_build_extraction_dataset(): | |||||||
|     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), |     d.build_extraction_dataset(os.path.join(EMAILS_DIR, 'P'), | ||||||
|                                os.path.join(TMP_DIR, |                                os.path.join(TMP_DIR, | ||||||
|                                             'extraction.data'), 1) |                                             'extraction.data'), 1) | ||||||
|     test_data = SparseDataSet(os.path.join(TMP_DIR, 'extraction.data'), |  | ||||||
|                               labelsColumn=-1) |     filename = os.path.join(TMP_DIR, 'extraction.data') | ||||||
|  |     file_data = genfromtxt(filename, delimiter=",") | ||||||
|  |     test_data = file_data[:, :-1] | ||||||
|  |  | ||||||
|     # the result is a loadable signature extraction dataset |     # the result is a loadable signature extraction dataset | ||||||
|     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be |     # 32 comes from 3 emails in emails/P folder, 11 lines checked to be | ||||||
|     # a signature, one email has only 10 lines |     # a signature, one email has only 10 lines | ||||||
|     eq_(test_data.size(), 32) |     eq_(test_data.shape[0], 32) | ||||||
|     eq_(len(features('')), test_data.numFeatures) |     eq_(len(features('')), test_data.shape[1]) | ||||||
|   | |||||||
| @@ -6,7 +6,9 @@ from talon.signature.learning import featurespace as fs | |||||||
|  |  | ||||||
|  |  | ||||||
| def test_apply_features(): | def test_apply_features(): | ||||||
|     s = '''John Doe |     s = '''This is John Doe | ||||||
|  |  | ||||||
|  | Tuesday @3pm suits. I'll chat to you then. | ||||||
|  |  | ||||||
| VP Research and Development, Xxxx Xxxx Xxxxx | VP Research and Development, Xxxx Xxxx Xxxxx | ||||||
|  |  | ||||||
| @@ -19,11 +21,12 @@ john@example.com''' | |||||||
|     # note that we don't consider the first line because signatures don't |     # note that we don't consider the first line because signatures don't | ||||||
|     # usually take all the text, empty lines are not considered |     # usually take all the text, empty lines are not considered | ||||||
|     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], |     eq_(result, [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], | ||||||
|  |                  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], |                  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], |                  [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], | ||||||
|                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) |                  [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]) | ||||||
|  |  | ||||||
|     with patch.object(fs, 'SIGNATURE_MAX_LINES', 4): |     with patch.object(fs, 'SIGNATURE_MAX_LINES', 5): | ||||||
|         features = fs.features(sender) |         features = fs.features(sender) | ||||||
|         new_result = fs.apply_features(s, features) |         new_result = fs.apply_features(s, features) | ||||||
|         # result remains the same because we don't consider empty lines |         # result remains the same because we don't consider empty lines | ||||||
|   | |||||||
| @@ -43,7 +43,7 @@ VALID_PHONE_NUMBERS = [e.strip() for e in VALID.splitlines() if e.strip()] | |||||||
|  |  | ||||||
| def test_match_phone_numbers(): | def test_match_phone_numbers(): | ||||||
|     for phone in VALID_PHONE_NUMBERS: |     for phone in VALID_PHONE_NUMBERS: | ||||||
|         ok_(RE_RELAX_PHONE.match(phone), "{} should be matched".format(phone)) |         ok_(RE_RELAX_PHONE.search(phone), "{} should be matched".format(phone)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_match_names(): | def test_match_names(): | ||||||
| @@ -52,29 +52,6 @@ def test_match_names(): | |||||||
|         ok_(RE_NAME.match(name), "{} should be matched".format(name)) |         ok_(RE_NAME.match(name), "{} should be matched".format(name)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_sender_with_name(): |  | ||||||
|     ok_lines = ['Sergey Obukhov <serobnic@example.com>', |  | ||||||
|                 '\tSergey  <serobnic@example.com>', |  | ||||||
|                 ('"Doe, John (TX)"' |  | ||||||
|                  '<DowJ@example.com>@EXAMPLE' |  | ||||||
|                  '<IMCEANOTES-+22Doe+2C+20John+20' |  | ||||||
|                  '+28TX+29+22+20+3CDoeJ+40example+2Ecom+3E' |  | ||||||
|                  '+40EXAMPLE@EXAMPLE.com>'), |  | ||||||
|                 ('Company Sleuth <csleuth@email.xxx.com>' |  | ||||||
|                  '@EXAMPLE <XXX-Company+20Sleuth+20+3Ccsleuth' |  | ||||||
|                  '+40email+2Exxx+2Ecom+3E+40EXAMPLE@EXAMPLE.com>'), |  | ||||||
|                 ('Doe III, John ' |  | ||||||
|                  '</O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=jDOE5>')] |  | ||||||
|     for line in ok_lines: |  | ||||||
|         ok_(RE_SENDER_WITH_NAME.match(line), |  | ||||||
|             '{} should be matched'.format(line)) |  | ||||||
|  |  | ||||||
|     nok_lines = ['', '<serobnic@xxx.ru>', 'Sergey serobnic@xxx.ru'] |  | ||||||
|     for line in nok_lines: |  | ||||||
|         assert_false(RE_SENDER_WITH_NAME.match(line), |  | ||||||
|                      '{} should not be matched'.format(line)) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| # Now test helpers functions | # Now test helpers functions | ||||||
| def test_binary_regex_search(): | def test_binary_regex_search(): | ||||||
|     eq_(1, h.binary_regex_search(re.compile("12"))("12")) |     eq_(1, h.binary_regex_search(re.compile("12"))("12")) | ||||||
|   | |||||||
| @@ -5,19 +5,18 @@ from . fixtures import * | |||||||
|  |  | ||||||
| import os | import os | ||||||
|  |  | ||||||
| from flanker import mime | import email.iterators | ||||||
|  |  | ||||||
| from talon import quotations | from talon import quotations | ||||||
|  |  | ||||||
|  |  | ||||||
| @patch.object(quotations, 'MAX_LINES_COUNT', 1) | @patch.object(quotations, 'MAX_LINES_COUNT', 1) | ||||||
| def test_too_many_lines(): | def test_too_many_lines(): | ||||||
|     msg_body = """Test reply |     msg_body = """Test reply | ||||||
|  | Hi | ||||||
| -----Original Message----- | -----Original Message----- | ||||||
|  |  | ||||||
| Test""" | Test""" | ||||||
|     eq_(msg_body, quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote(): | def test_pattern_on_date_somebody_wrote(): | ||||||
| @@ -33,6 +32,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_pattern_sent_from_samsung_smb_wrote(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|  |  | ||||||
|  | > | ||||||
|  | > Test | ||||||
|  | > | ||||||
|  | > Roman""" | ||||||
|  |  | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_wrote_somebody(): | def test_pattern_on_date_wrote_somebody(): | ||||||
|     eq_('Lorem', quotations.extract_from_plain( |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|     """Lorem |     """Lorem | ||||||
| @@ -55,6 +67,18 @@ On 04/19/2011 07:10 AM, Roman Tkachenko wrote: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_date_time_email_splitter(): | ||||||
|  |     msg_body = """Test reply | ||||||
|  |  | ||||||
|  | 2014-10-17 11:28 GMT+03:00 Postmaster < | ||||||
|  | postmaster@sandboxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.mailgun.org>: | ||||||
|  |  | ||||||
|  | > First from site | ||||||
|  | > | ||||||
|  |     """ | ||||||
|  |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | def test_pattern_on_date_somebody_wrote_allows_space_in_front(): | ||||||
|     msg_body = """Thanks Thanmai |     msg_body = """Thanks Thanmai | ||||||
|  On Mar 8, 2012 9:59 AM, "Example.com" < |  On Mar 8, 2012 9:59 AM, "Example.com" < | ||||||
| @@ -312,6 +336,33 @@ Emne: The manager has commented on your Loop | |||||||
| Blah-blah-blah | Blah-blah-blah | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_block(): | ||||||
|  |     eq_('Allo! Follow up MIME!', quotations.extract_from_plain( | ||||||
|  |     u"""Allo! Follow up MIME! | ||||||
|  | Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com] | ||||||
|  | Skickat: den 26 augusti 2015 14:45 | ||||||
|  | Till: Isacson Leiff | ||||||
|  | Ämne: RE: Week 36 | ||||||
|  |  | ||||||
|  | Blah-blah-blah | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_swedish_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     """Lorem | ||||||
|  | Den 14 september, 2015 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
|  | def test_norwegian_from_line(): | ||||||
|  |     eq_('Lorem', quotations.extract_from_plain( | ||||||
|  |     u"""Lorem | ||||||
|  | På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev: | ||||||
|  |  | ||||||
|  | Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse. | ||||||
|  | """)) | ||||||
|  |  | ||||||
| def test_dutch_from_block(): | def test_dutch_from_block(): | ||||||
|     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( |     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain( | ||||||
|     """Gluten-free culpa lo-fi et nesciunt nostrud.  |     """Gluten-free culpa lo-fi et nesciunt nostrud.  | ||||||
| @@ -614,22 +665,21 @@ def test_preprocess_postprocess_2_links(): | |||||||
| def test_standard_replies(): | def test_standard_replies(): | ||||||
|     for filename in os.listdir(STANDARD_REPLIES): |     for filename in os.listdir(STANDARD_REPLIES): | ||||||
|         filename = os.path.join(STANDARD_REPLIES, filename) |         filename = os.path.join(STANDARD_REPLIES, filename) | ||||||
|         if os.path.isdir(filename): |         if not filename.endswith('.eml') or os.path.isdir(filename): | ||||||
|             continue |             continue | ||||||
|         with open(filename) as f: |         with open(filename) as f: | ||||||
|             msg = f.read() |             message = email.message_from_file(f) | ||||||
|             m = mime.from_string(msg) |             body = email.iterators.typed_subpart_iterator(message, subtype='plain').next() | ||||||
|             for part in m.walk(): |             text = ''.join(email.iterators.body_line_iterator(body, True)) | ||||||
|                 if part.content_type == 'text/plain': |  | ||||||
|                     text = part.body |  | ||||||
|             stripped_text = quotations.extract_from_plain(text) |             stripped_text = quotations.extract_from_plain(text) | ||||||
|             reply_text_fn = filename[:-4] + '_reply_text' |             reply_text_fn = filename[:-4] + '_reply_text' | ||||||
|             if os.path.isfile(reply_text_fn): |             if os.path.isfile(reply_text_fn): | ||||||
|                 with open(reply_text_fn) as f: |                 with open(reply_text_fn) as f: | ||||||
|                             reply_text = f.read() |                     reply_text = f.read().strip() | ||||||
|             else: |             else: | ||||||
|                 reply_text = 'Hello' |                 reply_text = 'Hello' | ||||||
|                     eq_(reply_text, stripped_text, |             yield eq_, reply_text, stripped_text, \ | ||||||
|                         "'%(reply)s' != %(stripped)s for %(fn)s" % |                 "'%(reply)s' != %(stripped)s for %(fn)s" % \ | ||||||
|                 {'reply': reply_text, 'stripped': stripped_text, |                 {'reply': reply_text, 'stripped': stripped_text, | ||||||
|                          'fn': filename}) |                  'fn': filename} | ||||||
|   | |||||||
| @@ -1,9 +1,107 @@ | |||||||
|  | # coding:utf-8 | ||||||
|  |  | ||||||
| from . import * | from . import * | ||||||
|  |  | ||||||
| from talon import utils | from talon import utils as u | ||||||
|  | import cchardet | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_get_delimiter(): | def test_get_delimiter(): | ||||||
|     eq_('\r\n', utils.get_delimiter('abc\r\n123')) |     eq_('\r\n', u.get_delimiter('abc\r\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc\n123')) |     eq_('\n', u.get_delimiter('abc\n123')) | ||||||
|     eq_('\n', utils.get_delimiter('abc')) |     eq_('\n', u.get_delimiter('abc')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_unicode(): | ||||||
|  |     eq_ (u'hi', u.to_unicode('hi')) | ||||||
|  |     eq_ (type(u.to_unicode('hi')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode(u'hi')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode('привет')), unicode ) | ||||||
|  |     eq_ (type(u.to_unicode(u'привет')), unicode ) | ||||||
|  |     eq_ (u"привет", u.to_unicode('привет')) | ||||||
|  |     eq_ (u"привет", u.to_unicode(u'привет')) | ||||||
|  |     # some latin1 stuff | ||||||
|  |     eq_ (u"Versión", u.to_unicode('Versi\xf3n', precise=True)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_detect_encoding(): | ||||||
|  |     eq_ ('ascii', u.detect_encoding('qwe').lower()) | ||||||
|  |     eq_ ('iso-8859-2', u.detect_encoding('Versi\xf3n').lower()) | ||||||
|  |     eq_ ('utf-8', u.detect_encoding('привет').lower()) | ||||||
|  |     # fallback to utf-8 | ||||||
|  |     with patch.object(u.chardet, 'detect') as detect: | ||||||
|  |         detect.side_effect = Exception | ||||||
|  |         eq_ ('utf-8', u.detect_encoding('qwe').lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_quick_detect_encoding(): | ||||||
|  |     eq_ ('ascii', u.quick_detect_encoding('qwe').lower()) | ||||||
|  |     eq_ ('windows-1252', u.quick_detect_encoding('Versi\xf3n').lower()) | ||||||
|  |     eq_ ('utf-8', u.quick_detect_encoding('привет').lower()) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(cchardet, 'detect') | ||||||
|  | @patch.object(u, 'detect_encoding') | ||||||
|  | def test_quick_detect_encoding_edge_cases(detect_encoding, cchardet_detect): | ||||||
|  |     cchardet_detect.return_value = {'encoding': 'ascii'} | ||||||
|  |     eq_('ascii', u.quick_detect_encoding("qwe")) | ||||||
|  |     cchardet_detect.assert_called_once_with("qwe") | ||||||
|  |  | ||||||
|  |     # fallback to detect_encoding | ||||||
|  |     cchardet_detect.return_value = {} | ||||||
|  |     detect_encoding.return_value = 'utf-8' | ||||||
|  |     eq_('utf-8', u.quick_detect_encoding("qwe")) | ||||||
|  |  | ||||||
|  |     # exception | ||||||
|  |     detect_encoding.reset_mock() | ||||||
|  |     cchardet_detect.side_effect = Exception() | ||||||
|  |     detect_encoding.return_value = 'utf-8' | ||||||
|  |     eq_('utf-8', u.quick_detect_encoding("qwe")) | ||||||
|  |     ok_(detect_encoding.called) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_html_to_text(): | ||||||
|  |     html = """<body> | ||||||
|  | <p>Hello world!</p> | ||||||
|  | <br> | ||||||
|  | <ul> | ||||||
|  | <li>One!</li> | ||||||
|  | <li>Two</li> | ||||||
|  | </ul> | ||||||
|  | <p> | ||||||
|  | Haha | ||||||
|  | </p> | ||||||
|  | </body>""" | ||||||
|  |     text = u.html_to_text(html) | ||||||
|  |     eq_("Hello world! \n\n  * One! \n  * Two \nHaha", text) | ||||||
|  |     eq_("привет!", u.html_to_text("<b>привет!</b>")) | ||||||
|  |  | ||||||
|  |     html = '<body><br/><br/>Hi</body>' | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """Hi | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | div, p, li { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style> | ||||||
|  |  | ||||||
|  | <style type="text/css"> | ||||||
|  |  | ||||||
|  | h1 { | ||||||
|  |  | ||||||
|  | font: 13px 'Lucida Grande', Arial, sans-serif; | ||||||
|  |  | ||||||
|  | } | ||||||
|  | </style>""" | ||||||
|  |     eq_ ('Hi', u.html_to_text(html)) | ||||||
|  |  | ||||||
|  |     html = """<div> | ||||||
|  | <!-- COMMENT 1 --> | ||||||
|  | <span>TEXT 1</span> | ||||||
|  | <p>TEXT 2 <!-- COMMENT 2 --></p> | ||||||
|  | </div>""" | ||||||
|  |     eq_('TEXT 1 \nTEXT 2', u.html_to_text(html)) | ||||||
|   | |||||||
							
								
								
									
										10
									
								
								train.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								train.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,10 @@ | |||||||
|  | from talon.signature import EXTRACTOR_FILENAME, EXTRACTOR_DATA | ||||||
|  | from talon.signature.learning.classifier import train, init | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def train_model(): | ||||||
|  |     """ retrain model and persist """ | ||||||
|  |     train(init(), EXTRACTOR_DATA, EXTRACTOR_FILENAME) | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     train_model() | ||||||
		Reference in New Issue
	
	Block a user