Compare commits
	
		
			29 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | 6a304215c3 | ||
|  | 31714506bd | ||
|  | 403d80cf3b | ||
|  | 7cf20f2877 | ||
|  | 685abb1905 | ||
|  | 41990727a3 | ||
|  | b113d8ab33 | ||
|  | 7bd0e9cc2f | ||
|  | 1e030a51d4 | ||
|  | 53b24ffb3d | ||
|  | a7404afbcb | ||
|  | 0e6d5f993c | ||
|  | 60637ff13a | ||
|  | df8259e3fe | ||
|  | aab3b1cc75 | ||
|  | 9492b39f2d | ||
|  | b9ac866ea7 | ||
|  | 678517dd89 | ||
|  | 221774c6f8 | ||
|  | a2aa345712 | ||
|  | d998beaff3 | ||
|  | a379bc4e7c | ||
|  | b8e1894f3b | ||
|  | 0b5a44090f | ||
|  | b40835eca2 | ||
|  | b38562c7cc | ||
|  | 70e9fb415e | ||
|  | 64612099cd | ||
|  | e16dcf629e | 
							
								
								
									
										16
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								README.rst
									
									
									
									
									
								
							| @@ -129,6 +129,22 @@ start using it for talon. | |||||||
| .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | ||||||
| .. _forge: https://github.com/mailgun/forge | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
|  | Training on your dataset | ||||||
|  | ------------------------ | ||||||
|  |  | ||||||
|  | talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: | ||||||
|  |  | ||||||
|  | .. code:: python | ||||||
|  |  | ||||||
|  |     from talon.signature.learning.dataset import build_extraction_dataset | ||||||
|  |     from talon.signature.learning import classifier as c  | ||||||
|  |      | ||||||
|  |     build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") | ||||||
|  |     c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") | ||||||
|  |  | ||||||
|  | Note that signature extraction needs only the folder of positive samples with annotated signature lines (the P folder). | ||||||
|  |  | ||||||
|  | .. _forge: https://github.com/mailgun/forge | ||||||
|  |  | ||||||
| Research | Research | ||||||
| -------- | -------- | ||||||
|   | |||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @@ -29,7 +29,7 @@ class InstallCommand(install): | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.4.0', |       version='1.4.5', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
|   | |||||||
| @@ -94,6 +94,12 @@ def cut_microsoft_quote(html_message): | |||||||
|         #outlook 2007, 2010 (american) |         #outlook 2007, 2010 (american) | ||||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" |         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||||
|         "padding:3.0pt 0in 0in 0in']|" |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|  |         #outlook 2013 (international) | ||||||
|  |         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" | ||||||
|  |         "padding:3.0pt 0cm 0cm 0cm']|" | ||||||
|  |         #outlook 2013 (american) | ||||||
|  |         "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" | ||||||
|  |         "padding:3.0pt 0in 0in 0in']|" | ||||||
|         #windows mail |         #windows mail | ||||||
|         "//div[@style='padding-top: 5px; " |         "//div[@style='padding-top: 5px; " | ||||||
|         "border-top-color: rgb(229, 229, 229); " |         "border-top-color: rgb(229, 229, 229); " | ||||||
|   | |||||||
| @@ -38,10 +38,14 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             'Op', |             'Op', | ||||||
|             # German |             # German | ||||||
|             'Am', |             'Am', | ||||||
|  |             # Portuguese | ||||||
|  |             'Em', | ||||||
|             # Norwegian |             # Norwegian | ||||||
|             u'På', |             u'På', | ||||||
|             # Swedish, Danish |             # Swedish, Danish | ||||||
|             'Den', |             'Den', | ||||||
|  |             # Vietnamese | ||||||
|  |             u'Vào', | ||||||
|         )), |         )), | ||||||
|         # Date and sender separator |         # Date and sender separator | ||||||
|         u'|'.join(( |         u'|'.join(( | ||||||
| @@ -62,8 +66,12 @@ RE_ON_DATE_SMB_WROTE = re.compile( | |||||||
|             'schreef','verzond','geschreven', |             'schreef','verzond','geschreven', | ||||||
|             # German |             # German | ||||||
|             'schrieb', |             'schrieb', | ||||||
|  |             # Portuguese | ||||||
|  |             'escreveu', | ||||||
|             # Norwegian, Swedish |             # Norwegian, Swedish | ||||||
|             'skrev', |             'skrev', | ||||||
|  |             # Vietnamese | ||||||
|  |             u'đã viết', | ||||||
|         )) |         )) | ||||||
|     )) |     )) | ||||||
| # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | ||||||
| @@ -143,7 +151,7 @@ RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.* | |||||||
| RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | ||||||
|     u'|'.join(( |     u'|'.join(( | ||||||
|         # English |         # English | ||||||
|         'wrote' |         'wrote', | ||||||
|     ))), re.I) |     ))), re.I) | ||||||
|  |  | ||||||
| # Support polymail.io reply format | # Support polymail.io reply format | ||||||
| @@ -161,15 +169,15 @@ SPLITTER_PATTERNS = [ | |||||||
|     RE_FROM_COLON_OR_DATE_COLON, |     RE_FROM_COLON_OR_DATE_COLON, | ||||||
|     # 02.04.2012 14:20 пользователь "bob@example.com" < |     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||||
|     # bob@xxx.mailgun.org> написал: |     # bob@xxx.mailgun.org> написал: | ||||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), |     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S), | ||||||
|     # 2014-10-17 11:28 GMT+03:00 Bob < |     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||||
|     # bob@example.com>: |     # bob@example.com>: | ||||||
|     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), |     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S), | ||||||
|     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: |     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' |     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||||
|                '( \S+){3,6}@\S+:'), |                '( \S+){3,6}@\S+:'), | ||||||
|     # Sent from Samsung MobileName <address@example.com> wrote: |     # Sent from Samsung MobileName <address@example.com> wrote: | ||||||
|     re.compile('Sent from Samsung .*@.*> wrote'), |     re.compile('Sent from Samsung.* \S+@\S+> wrote'), | ||||||
|     RE_ANDROID_WROTE, |     RE_ANDROID_WROTE, | ||||||
|     RE_POLYMAIL |     RE_POLYMAIL | ||||||
|     ] |     ] | ||||||
| @@ -282,7 +290,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | |||||||
|     # inlined reply |     # inlined reply | ||||||
|     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' |     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' | ||||||
|     # both 't' entries should be found |     # both 't' entries should be found | ||||||
|     for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): |     for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers): | ||||||
|         # long links could break sequence of quotation lines but they shouldn't |         # long links could break sequence of quotation lines but they shouldn't | ||||||
|         # be considered an inline reply |         # be considered an inline reply | ||||||
|         links = ( |         links = ( | ||||||
| @@ -426,6 +434,9 @@ def _extract_from_html(msg_body): | |||||||
|     Extract not quoted message from provided html message body |     Extract not quoted message from provided html message body | ||||||
|     using tags and plain text algorithm. |     using tags and plain text algorithm. | ||||||
|  |  | ||||||
|  |     First cut out encoding-related HTML declarations such as xml and doctype | ||||||
|  |     to avoid conflicts with unicode decoding. | ||||||
|  |  | ||||||
|     Cut out the 'blockquote', 'gmail_quote' tags. |     Cut out the 'blockquote', 'gmail_quote' tags. | ||||||
|     Cut Microsoft quotations. |     Cut Microsoft quotations. | ||||||
|  |  | ||||||
| @@ -441,6 +452,9 @@ def _extract_from_html(msg_body): | |||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') |     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||||
|  |  | ||||||
|  |     msg_body = re.sub(r"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||||
|  |  | ||||||
|     html_tree = html_document_fromstring(msg_body) |     html_tree = html_document_fromstring(msg_body) | ||||||
|  |  | ||||||
|     if html_tree is None: |     if html_tree is None: | ||||||
|   | |||||||
| @@ -32,7 +32,7 @@ RE_REVERSE_SIGNATURE = re.compile(r''' | |||||||
|  |  | ||||||
| def is_signature_line(line, sender, classifier): | def is_signature_line(line, sender, classifier): | ||||||
|     '''Checks if the line belongs to signature. Returns True or False.''' |     '''Checks if the line belongs to signature. Returns True or False.''' | ||||||
|     data = numpy.array(build_pattern(line, features(sender))) |     data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1) | ||||||
|     return classifier.predict(data) > 0 |     return classifier.predict(data) > 0 | ||||||
|  |  | ||||||
|  |  | ||||||
|   | |||||||
| @@ -119,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent: | |||||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) |     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_appointment(): | ||||||
|  |     msg_body = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697 | ||||||
|  |  | ||||||
|  | From: from@example.com [mailto:from@example.com] | ||||||
|  | Sent: Wednesday, October 18, 2017 2:05 PM | ||||||
|  | To: John Doer - SIU <jd@example.com> | ||||||
|  | Subject: RE: Claim # 5551188-1 | ||||||
|  |  | ||||||
|  | Text""" | ||||||
|  |  | ||||||
|  |     expected = """Response | ||||||
|  |  | ||||||
|  | 10/19/2017 @ 9:30 am for physical therapy | ||||||
|  | Bla | ||||||
|  | 1517 4th Avenue Ste 300 | ||||||
|  | London CA 19129, 555-421-6780 | ||||||
|  |  | ||||||
|  | John Doe, FCLS | ||||||
|  | Mailgun Inc | ||||||
|  | 555-941-0697""" | ||||||
|  |     eq_(expected, quotations.extract_from_plain(msg_body)) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_line_starts_with_on(): | def test_line_starts_with_on(): | ||||||
|     msg_body = """Blah-blah-blah |     msg_body = """Blah-blah-blah | ||||||
| On blah-blah-blah""" | On blah-blah-blah""" | ||||||
| @@ -401,6 +433,14 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g | |||||||
| Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | ||||||
| """)) | """)) | ||||||
|  |  | ||||||
|  | def test_vietnamese_from_block(): | ||||||
|  |     eq_('Hello', quotations.extract_from_plain( | ||||||
|  |     u"""Hello | ||||||
|  |  | ||||||
|  | Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết: | ||||||
|  |  | ||||||
|  | > Xin chào | ||||||
|  | """)) | ||||||
|  |  | ||||||
| def test_quotation_marker_false_positive(): | def test_quotation_marker_false_positive(): | ||||||
|     msg_body = """Visit us now for assistance... |     msg_body = """Visit us now for assistance... | ||||||
| @@ -770,3 +810,16 @@ def test_split_email(): | |||||||
|     expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" |     expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" | ||||||
|     markers = quotations.split_emails(msg) |     markers = quotations.split_emails(msg) | ||||||
|     eq_(markers, expected_markers) |     eq_(markers, expected_markers) | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_feedback_below_left_unparsed(): | ||||||
|  |     msg_body = """Please enter your feedback below. Thank you. | ||||||
|  |  | ||||||
|  | ------------------------------------- Enter Feedback Below ------------------------------------- | ||||||
|  |  | ||||||
|  | The user experience was unparallelled. Please continue production. I'm sending payment to ensure | ||||||
|  | that this line is intact.""" | ||||||
|  |  | ||||||
|  |     parsed = quotations.extract_from_plain(msg_body) | ||||||
|  |     eq_(msg_body, parsed.decode('utf8')) | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user