Compare commits
	
		
			32 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 1a5548f171 | ||
|  | 53c49b9121 | ||
|  | bd50872043 | ||
|  | d37c4fd551 | ||
|  | d9ed7cc6d1 | ||
|  | 0a0808c0a8 | ||
|  | 16354e3528 | ||
|  | 1018e88ec1 | ||
|  | 2916351517 | ||
|  | 46d4b02c81 | ||
|  | 58eac88a10 | ||
|  | 2ef3d8dfbe | ||
|  | 7cf4c29340 | ||
|  | cdd84563dd | ||
|  | 8138ea9a60 | ||
|  | c171f9a875 | ||
|  | 3f97a8b8ff | ||
|  | 1147767ff3 | ||
|  | 6a304215c3 | ||
|  | 31714506bd | ||
|  | 403d80cf3b | ||
|  | 7cf20f2877 | ||
|  | afff08b017 | ||
|  | 685abb1905 | ||
|  | 41990727a3 | ||
|  | b113d8ab33 | ||
|  | 7bd0e9cc2f | ||
|  | 1e030a51d4 | ||
|  | 238a5de5cc | ||
|  | 53b24ffb3d | ||
|  | a7404afbcb | ||
|  | 0e6d5f993c | 
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | |||||||
|  | FROM python:3.9-slim-buster AS deps | ||||||
|  |  | ||||||
|  | RUN apt-get update && \ | ||||||
|  |     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||||
|  |  | ||||||
|  | FROM deps AS testable | ||||||
|  | ARG REPORT_PATH | ||||||
|  |  | ||||||
|  | VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||||
|  |  | ||||||
|  | ADD . /app | ||||||
|  | WORKDIR /app | ||||||
|  | COPY wheel/* /wheel/ | ||||||
|  |  | ||||||
|  | RUN mkdir -p ${REPORT_PATH} | ||||||
|  |  | ||||||
|  | RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||||
|  |     pip install --no-deps /wheel/* | ||||||
|  |  | ||||||
|  | ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||||
							
								
								
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -54,3 +54,6 @@ _trial_temp | |||||||
|  |  | ||||||
| # OSX | # OSX | ||||||
| .DS_Store | .DS_Store | ||||||
|  |  | ||||||
|  | # vim-backup | ||||||
|  | *.bak | ||||||
|   | |||||||
| @@ -5,3 +5,10 @@ include classifier | |||||||
| include LICENSE | include LICENSE | ||||||
| include MANIFEST.in | include MANIFEST.in | ||||||
| include README.rst | include README.rst | ||||||
|  | include talon/signature/data/train.data | ||||||
|  | include talon/signature/data/classifier | ||||||
|  | include talon/signature/data/classifier_01.npy | ||||||
|  | include talon/signature/data/classifier_02.npy | ||||||
|  | include talon/signature/data/classifier_03.npy | ||||||
|  | include talon/signature/data/classifier_04.npy | ||||||
|  | include talon/signature/data/classifier_05.npy | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								README.rst
									
									
									
									
									
								
							| @@ -129,6 +129,22 @@ start using it for talon. | |||||||
| .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | ||||||
| .. _forge: https://github.com/mailgun/forge | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
|  | Training on your dataset | ||||||
|  | ------------------------ | ||||||
|  |  | ||||||
|  | talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: | ||||||
|  |  | ||||||
|  | .. code:: python | ||||||
|  |  | ||||||
|  |     from talon.signature.learning.dataset import build_extraction_dataset | ||||||
|  |     from talon.signature.learning import classifier as c  | ||||||
|  |      | ||||||
|  |     build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") | ||||||
|  |     c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") | ||||||
|  |  | ||||||
|  | Note that for signature extraction you only need the folder of positive samples with annotated signature lines (the P folder). | ||||||
|  |  | ||||||
|  | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
| Research | Research | ||||||
| -------- | -------- | ||||||
|   | |||||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | |||||||
|  | chardet>=1.0.1 | ||||||
|  | cchardet>=0.3.5 | ||||||
|  | cssselect | ||||||
|  | html5lib | ||||||
|  | joblib | ||||||
|  | lxml>=2.3.3 | ||||||
|  | numpy | ||||||
|  | regex>=1 | ||||||
|  | scikit-learn>=1.0.0 | ||||||
|  | scipy | ||||||
|  | six>=1.10.0 | ||||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | #!/usr/bin/env bash | ||||||
|  | set -ex | ||||||
|  | REPORT_PATH="${REPORT_PATH:-./}" | ||||||
|  | nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||||
							
								
								
									
										31
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								setup.py
									
									
									
									
									
								
							| @@ -19,17 +19,17 @@ class InstallCommand(install): | |||||||
|         if self.no_ml: |         if self.no_ml: | ||||||
|             dist = self.distribution |             dist = self.distribution | ||||||
|             dist.packages=find_packages(exclude=[ |             dist.packages=find_packages(exclude=[ | ||||||
|                 'tests', |                 "tests", | ||||||
|                 'tests.*', |                 "tests.*", | ||||||
|                 'talon.signature', |                 "talon.signature", | ||||||
|                 'talon.signature.*', |                 "talon.signature.*", | ||||||
|             ]) |             ]) | ||||||
|             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: |             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||||
|                 dist.install_requires.remove(not_required) |                 dist.install_requires.remove(not_required) | ||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.4', |       version='1.4.9', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
| @@ -44,20 +44,21 @@ setup(name='talon', | |||||||
|       include_package_data=True, |       include_package_data=True, | ||||||
|       zip_safe=True, |       zip_safe=True, | ||||||
|       install_requires=[ |       install_requires=[ | ||||||
|           "lxml>=2.3.3", |           "lxml", | ||||||
|           "regex>=1", |           "regex", | ||||||
|           "numpy", |           "numpy", | ||||||
|           "scipy", |           "scipy", | ||||||
|           "scikit-learn>=0.16.1", # pickled versions of classifier, else rebuild |           "scikit-learn>=1.0.0", | ||||||
|           'chardet>=1.0.1', |           "chardet", | ||||||
|           'cchardet>=0.3.5', |           "cchardet", | ||||||
|           'cssselect', |           "cssselect", | ||||||
|           'six>=1.10.0', |           "six", | ||||||
|           'html5lib' |           "html5lib", | ||||||
|  |           "joblib", | ||||||
|           ], |           ], | ||||||
|       tests_require=[ |       tests_require=[ | ||||||
|           "mock", |           "mock", | ||||||
|           "nose>=1.2.1", |           "nose", | ||||||
|           "coverage" |           "coverage" | ||||||
|           ] |           ] | ||||||
|       ) |       ) | ||||||
|   | |||||||
| @@ -87,23 +87,24 @@ def cut_gmail_quote(html_message): | |||||||
|  |  | ||||||
| def cut_microsoft_quote(html_message): | def cut_microsoft_quote(html_message): | ||||||
|     ''' Cuts splitter block and all following blocks. ''' |     ''' Cuts splitter block and all following blocks. ''' | ||||||
|  |     #use EXSLT extensions to have a regex match() function with lxml | ||||||
|  |     ns = {"re": "http://exslt.org/regular-expressions"} | ||||||
|  |  | ||||||
|  |     #general pattern: @style='border:none;border-top:solid <color> 1.0pt;padding:3.0pt 0<unit> 0<unit> 0<unit>' | ||||||
|  |     #outlook 2007, 2010 (international) <color=#B5C4DF> <unit=cm> | ||||||
|  |     #outlook 2007, 2010 (american)      <color=#B5C4DF> <unit=in> | ||||||
|  |     #outlook 2013       (international) <color=#E1E1E1> <unit=cm> | ||||||
|  |     #outlook 2013       (american)      <color=#E1E1E1> <unit=in> | ||||||
|  |     #also handles a variant with a space after the semicolon | ||||||
|     splitter = html_message.xpath( |     splitter = html_message.xpath( | ||||||
|         #outlook 2007, 2010 (international) |         #outlook 2007, 2010, 2013 (international, american) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?" | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |         "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]|" | ||||||
|         #outlook 2007, 2010 (american) |  | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |  | ||||||
|         "padding:3.0pt 0in 0in 0in']|" |  | ||||||
|         #outlook 2013 (international) |  | ||||||
|         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" |  | ||||||
|         "padding:3.0pt 0cm 0cm 0cm']|" |  | ||||||
|         #outlook 2013 (american) |  | ||||||
|         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" |  | ||||||
|         "padding:3.0pt 0in 0in 0in']|" |  | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
|         "border-top-width: 1px; border-top-style: solid;']" |         "border-top-width: 1px; border-top-style: solid;']" | ||||||
|  |         , namespaces=ns | ||||||
|     ) |     ) | ||||||
|  |  | ||||||
|     if splitter: |     if splitter: | ||||||
|   | |||||||
| @@ -22,7 +22,7 @@ import six | |||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
|  |  | ||||||
|  |  | ||||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+\s*$", re.I | re.M) | ||||||
|  |  | ||||||
| RE_ON_DATE_SMB_WROTE = re.compile( | RE_ON_DATE_SMB_WROTE = re.compile( | ||||||
|     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( |     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( | ||||||
| @@ -38,6 +38,8 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             'Op', |             'Op', | ||||||
|             # German |             # German | ||||||
|             'Am', |             'Am', | ||||||
|  |             # Portuguese | ||||||
|  |             'Em', | ||||||
|             # Norwegian |             # Norwegian | ||||||
|             u'På', |             u'På', | ||||||
|             # Swedish, Danish |             # Swedish, Danish | ||||||
| @@ -64,6 +66,8 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             'schreef','verzond','geschreven', |             'schreef','verzond','geschreven', | ||||||
|             # German |             # German | ||||||
|             'schrieb', |             'schrieb', | ||||||
|  |             # Portuguese | ||||||
|  |             'escreveu', | ||||||
|             # Norwegian, Swedish |             # Norwegian, Swedish | ||||||
|             'skrev', |             'skrev', | ||||||
|             # Vietnamese |             # Vietnamese | ||||||
| @@ -135,13 +139,17 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( | |||||||
|         'Oprindelig meddelelse', |         'Oprindelig meddelelse', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format( | RE_FROM_COLON_OR_DATE_COLON = re.compile(u'((_+\r?\n)?[\s]*:?[*]?({})[\s]?:([^\n$]+\n){{1,2}}){{2,}}'.format( | ||||||
|     u'|'.join(( |     u'|'.join(( | ||||||
|         # "From" in different languages. |         # "From" in different languages. | ||||||
|         'From', 'Van', 'De', 'Von', 'Fra', u'Från', |         'From', 'Van', 'De', 'Von', 'Fra', u'Från', | ||||||
|         # "Date" in different languages. |         # "Date" in different languages. | ||||||
|         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', |         'Date', '[S]ent', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Gesendet', | ||||||
|     ))), re.I) |         # "Subject" in different languages. | ||||||
|  |         'Subject', 'Betreff', 'Objet', 'Emne', u'Ämne', | ||||||
|  |         # "To" in different languages. | ||||||
|  |         'To', 'An', 'Til', u'À', 'Till' | ||||||
|  |     ))), re.I | re.M) | ||||||
|  |  | ||||||
| # ---- John Smith wrote ---- | # ---- John Smith wrote ---- | ||||||
| RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | ||||||
| @@ -165,15 +173,15 @@ SPLITTER_PATTERNS = [ | |||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|     # 02.04.2012 14:20 пользователь "bob@example.com" < |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|     # bob@xxx.mailgun.org> написал: |     # bob@xxx.mailgun.org> написал: | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S), | ||||||
|     # 2014-10-17 11:28 GMT+03:00 Bob < |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|     # bob@example.com>: |     # bob@example.com>: | ||||||
|     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S), | ||||||
|     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:'), |                '( \S+){3,6}@\S+:'), | ||||||
|     # Sent from Samsung MobileName <address@example.com> wrote: |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|     re.compile('Sent from Samsung .*@.*> wrote'), |     re.compile('Sent from Samsung.* \S+@\S+> wrote'), | ||||||
|     RE_ANDROID_WROTE, |     RE_ANDROID_WROTE, | ||||||
|     RE_POLYMAIL |     RE_POLYMAIL | ||||||
|     ] |     ] | ||||||
| @@ -286,7 +294,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | |||||||
|     # inlined reply |     # inlined reply | ||||||
|     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' |     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' | ||||||
|     # both 't' entries should be found |     # both 't' entries should be found | ||||||
|     for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): |     for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers): | ||||||
|         # long links could break sequence of quotation lines but they shouldn't |         # long links could break sequence of quotation lines but they shouldn't | ||||||
|         # be considered an inline reply |         # be considered an inline reply | ||||||
|         links = ( |         links = ( | ||||||
| @@ -430,6 +438,9 @@ def _extract_from_html(msg_body): | |||||||
|     Extract not quoted message from provided html message body |     Extract not quoted message from provided html message body | ||||||
|     using tags and plain text algorithm. |     using tags and plain text algorithm. | ||||||
|  |  | ||||||
|  |     First cut out encoding-related declarations such as the XML declaration | ||||||
|  |     and DOCTYPE, to avoid conflicts with unicode decoding. | ||||||
|  |  | ||||||
|     Cut out the 'blockquote', 'gmail_quote' tags. |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|     Cut Microsoft quotations. |     Cut Microsoft quotations. | ||||||
|  |  | ||||||
| @@ -445,6 +456,9 @@ def _extract_from_html(msg_body): | |||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') |     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||||
|  |  | ||||||
|  |     msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||||
|  |  | ||||||
|     html_tree = html_document_fromstring(msg_body) |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |  | ||||||
|     if html_tree is None: |     if html_tree is None: | ||||||
| @@ -502,9 +516,69 @@ def _extract_from_html(msg_body): | |||||||
|     if _readable_text_empty(html_tree_copy): |     if _readable_text_empty(html_tree_copy): | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|  |     # NOTE: We call remove_namespaces() because we are using an HTML5 parser, and | ||||||
|  |     # HTML parsers do not recognize namespaces in HTML tags. As a result, the | ||||||
|  |     # rendered tags are no longer recognizable HTML tags. Example: <o:p> becomes | ||||||
|  |     # <oU0003Ap>. When we port this to golang we should look into using an | ||||||
|  |     # XML parser, NOT an HTML5 parser, since we do not know what input a | ||||||
|  |     # customer will send us. Switching to a common XML parser in python | ||||||
|  |     # opens us up to a host of vulnerabilities. | ||||||
|  |     # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities | ||||||
|  |     # | ||||||
|  |     # The downside of removing the namespaces is that customers might | ||||||
|  |     # consider the XML namespaces important. If that is the case then support | ||||||
|  |     # should encourage customers to perform XML parsing of the un-stripped | ||||||
|  |     # body to get the full unmodified XML payload. | ||||||
|  |     # | ||||||
|  |     # Alternatives to this approach are | ||||||
|  |     # 1. Ignore the U0003A in tag names and let the customer deal with it. | ||||||
|  |     #    This is not ideal, as most customers use stripped-html for viewing | ||||||
|  |     #    emails sent from a recipient, as such they cannot control the HTML | ||||||
|  |     #    provided by a recipient. | ||||||
|  |     # 2. Perform a string replacement of 'U0003A' with ':' on the rendered HTML | ||||||
|  |     #    string. While this would solve the issue simply, it runs the risk | ||||||
|  |     #    of replacing data outside the <tag> which might be essential to | ||||||
|  |     #    the customer. | ||||||
|  |     remove_namespaces(html_tree_copy) | ||||||
|     return html.tostring(html_tree_copy) |     return html.tostring(html_tree_copy) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def remove_namespaces(root): | ||||||
|  |     """ | ||||||
|  |     Given the root of an HTML document iterate through all the elements | ||||||
|  |     and remove any namespaces that might have been provided and remove | ||||||
|  |     any attributes that contain a namespace | ||||||
|  |  | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office"> | ||||||
|  |     becomes | ||||||
|  |     <html> | ||||||
|  |  | ||||||
|  |     <o:p>Hi</o:p> | ||||||
|  |     becomes | ||||||
|  |     <p>Hi</p> | ||||||
|  |  | ||||||
|  |     Start tags do NOT have a namespace; COLON characters have no special meaning. | ||||||
|  |     If we don't remove the namespace, the parser translates the tag name into a | ||||||
|  |     unicode representation. For example, <o:p> becomes <oU0003Ap>. | ||||||
|  |  | ||||||
|  |     See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     """ | ||||||
|  |     for child in root.iter(): | ||||||
|  |         for key, value in child.attrib.items(): | ||||||
|  |             # If the attribute includes a colon | ||||||
|  |             if key.rfind("U0003A") != -1: | ||||||
|  |                 child.attrib.pop(key) | ||||||
|  |  | ||||||
|  |         # If the tag includes a colon | ||||||
|  |         idx = child.tag.rfind("U0003A") | ||||||
|  |         if idx != -1: | ||||||
|  |             child.tag = child.tag[idx+6:] | ||||||
|  |  | ||||||
|  |     return root | ||||||
|  |  | ||||||
|  |  | ||||||
| def split_emails(msg): | def split_emails(msg): | ||||||
|     """ |     """ | ||||||
|     Given a message (which may consist of an email conversation thread with |     Given a message (which may consist of an email conversation thread with | ||||||
| @@ -557,7 +631,6 @@ def _correct_splitlines_in_headers(markers, lines): | |||||||
|     updated_markers = "" |     updated_markers = "" | ||||||
|     i = 0 |     i = 0 | ||||||
|     in_header_block = False |     in_header_block = False | ||||||
|  |  | ||||||
|     for m in markers: |     for m in markers: | ||||||
|         # Only set in_header_block flag when we hit an 's' and line is a header |         # Only set in_header_block flag when we hit an 's' and line is a header | ||||||
|         if m == 's': |         if m == 's': | ||||||
|   | |||||||
							
								
								
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | |||||||
|  |  | ||||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -8,7 +8,7 @@ body belongs to the signature. | |||||||
| from __future__ import absolute_import | from __future__ import absolute_import | ||||||
|  |  | ||||||
| from numpy import genfromtxt | from numpy import genfromtxt | ||||||
| from sklearn.externals import joblib | import joblib | ||||||
| from sklearn.svm import LinearSVC | from sklearn.svm import LinearSVC | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -131,7 +131,7 @@ def html_tree_to_text(tree): | |||||||
|     for el in tree.iter(): |     for el in tree.iter(): | ||||||
|         el_text = (el.text or '') + (el.tail or '') |         el_text = (el.text or '') + (el.tail or '') | ||||||
|         if len(el_text) > 1: |         if len(el_text) > 1: | ||||||
|             if el.tag in _BLOCKTAGS: |             if el.tag in _BLOCKTAGS + _HARDBREAKS: | ||||||
|                 text += "\n" |                 text += "\n" | ||||||
|             if el.tag == 'li': |             if el.tag == 'li': | ||||||
|                 text += "  * " |                 text += "  * " | ||||||
| @@ -142,7 +142,8 @@ def html_tree_to_text(tree): | |||||||
|             if href: |             if href: | ||||||
|                 text += "(%s) " % href |                 text += "(%s) " % href | ||||||
|  |  | ||||||
|         if el.tag in _HARDBREAKS and text and not text.endswith("\n"): |         if (el.tag in _HARDBREAKS and text and | ||||||
|  |             not text.endswith("\n") and not el_text): | ||||||
|             text += "\n" |             text += "\n" | ||||||
|  |  | ||||||
|     retval = _rm_excessive_newlines(text) |     retval = _rm_excessive_newlines(text) | ||||||
|   | |||||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | |||||||
|  | coverage | ||||||
|  | mock | ||||||
|  | nose>=1.2.1 | ||||||
| @@ -8,6 +8,7 @@ import re | |||||||
| from talon import quotations, utils as u | from talon import quotations, utils as u | ||||||
| from . import * | from . import * | ||||||
| from .fixtures import * | from .fixtures import * | ||||||
|  | from lxml import html | ||||||
|  |  | ||||||
| RE_WHITESPACE = re.compile("\s") | RE_WHITESPACE = re.compile("\s") | ||||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") | RE_DOUBLE_WHITESPACE = re.compile("\s") | ||||||
| @@ -424,3 +425,23 @@ def test_readable_html_empty(): | |||||||
| def test_bad_html(): | def test_bad_html(): | ||||||
|     bad_html = "<html></html>" |     bad_html = "<html></html>" | ||||||
|     eq_(bad_html, quotations.extract_from_html(bad_html)) |     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_remove_namespaces(): | ||||||
|  |     msg_body = """ | ||||||
|  |     <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> | ||||||
|  |         <body> | ||||||
|  |             <o:p>Dear Sir,</o:p> | ||||||
|  |             <o:p>Thank you for the email.</o:p> | ||||||
|  |             <blockquote>thing</blockquote> | ||||||
|  |         </body> | ||||||
|  |     </html> | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     rendered = quotations.extract_from_html(msg_body) | ||||||
|  |  | ||||||
|  |     assert_true("<p>" in rendered) | ||||||
|  |     assert_true("xmlns" in rendered) | ||||||
|  |  | ||||||
|  |     assert_true("<o:p>" not in rendered) | ||||||
|  |     assert_true("<xmlns:o>" not in rendered) | ||||||
|   | |||||||
| @@ -119,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_appointment(): | ||||||
|  |     msg_body = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697 | ||||||
|  |  | ||||||
|  | From: from@example.com [mailto:from@example.com] | ||||||
|  | Sent: Wednesday, October 18, 2017 2:05 PM | ||||||
|  | To: John Doer - SIU <jd@example.com> | ||||||
|  | Subject: RE: Claim # 5551188-1 | ||||||
|  |  | ||||||
|  | Text""" | ||||||
|  |  | ||||||
|  |     expected = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697""" | ||||||
|  |     eq_(expected, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_line_starts_with_on(): | def test_line_starts_with_on(): | ||||||
|     msg_body = """Blah-blah-blah |     msg_body = """Blah-blah-blah | ||||||
| On blah-blah-blah""" | On blah-blah-blah""" | ||||||
| @@ -421,6 +453,7 @@ def test_link_closed_with_quotation_marker_on_new_line(): | |||||||
|     msg_body = '''8.45am-1pm |     msg_body = '''8.45am-1pm | ||||||
|  |  | ||||||
| From: somebody@example.com | From: somebody@example.com | ||||||
|  | Date: Wed, 16 May 2012 00:15:02 -0600 | ||||||
|   |   | ||||||
| <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT | <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT | ||||||
| >  <bob@example.com <mailto:bob@example.com> > | >  <bob@example.com <mailto:bob@example.com> > | ||||||
| @@ -462,7 +495,9 @@ def test_from_block_starts_with_date(): | |||||||
|     msg_body = """Blah |     msg_body = """Blah | ||||||
|  |  | ||||||
| Date: Wed, 16 May 2012 00:15:02 -0600 | Date: Wed, 16 May 2012 00:15:02 -0600 | ||||||
| To: klizhentas@example.com""" | To: klizhentas@example.com | ||||||
|  |  | ||||||
|  | """ | ||||||
|     eq_('Blah', quotations.extract_from_plain(msg_body)) |     eq_('Blah', quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -532,11 +567,12 @@ def test_mark_message_lines(): | |||||||
|              # next line should be marked as splitter |              # next line should be marked as splitter | ||||||
|              '_____________', |              '_____________', | ||||||
|              'From: foo@bar.com', |              'From: foo@bar.com', | ||||||
|  |              'Date: Wed, 16 May 2012 00:15:02 -0600', | ||||||
|              '', |              '', | ||||||
|              '> Hi', |              '> Hi', | ||||||
|              '', |              '', | ||||||
|              'Signature'] |              'Signature'] | ||||||
|     eq_('tessemet', quotations.mark_message_lines(lines)) |     eq_('tesssemet', quotations.mark_message_lines(lines)) | ||||||
|  |  | ||||||
|     lines = ['Just testing the email reply', |     lines = ['Just testing the email reply', | ||||||
|              '', |              '', | ||||||
| @@ -775,7 +811,7 @@ def test_split_email(): | |||||||
|         > |         > | ||||||
|         > |         > | ||||||
| """ | """ | ||||||
|     expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" |     expected_markers = "stttttsttttetesetesmmmmmmsmmmmmmmmmmmmmmmm" | ||||||
|     markers = quotations.split_emails(msg) |     markers = quotations.split_emails(msg) | ||||||
|     eq_(markers, expected_markers) |     eq_(markers, expected_markers) | ||||||
|  |  | ||||||
| @@ -790,4 +826,16 @@ The user experience was unparallelled. Please continue production. I'm sending p | |||||||
| that this line is intact.""" | that this line is intact.""" | ||||||
|  |  | ||||||
|     parsed = quotations.extract_from_plain(msg_body) |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|     eq_(msg_body, parsed.decode('utf8')) |     eq_(msg_body, parsed) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_appointment_2(): | ||||||
|  |     msg_body = """Invitation for an interview: | ||||||
|  |  | ||||||
|  | Date: Wednesday 3, October 2011  | ||||||
|  | Time: 7 : 00am  | ||||||
|  | Address: 130 Fox St | ||||||
|  |  | ||||||
|  | Please bring in your ID.""" | ||||||
|  |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|  |     eq_(msg_body, parsed) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user