Compare commits
	
		
			6 Commits
		
	
	
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | 5a9bc967f1 | ||
|  | a0d7236d0b | ||
|  | 21e9a31ffe | ||
|  | 4ee46c0a97 | ||
|  | 10d9a930f9 | ||
|  | a21ccdb21b | ||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							| @@ -29,7 +29,7 @@ class InstallCommand(install): | |||||||
|  |  | ||||||
|  |  | ||||||
| setup(name='talon', | setup(name='talon', | ||||||
|       version='1.2.11', |       version='1.2.14', | ||||||
|       description=("Mailgun library " |       description=("Mailgun library " | ||||||
|                    "to extract message quotations and signatures."), |                    "to extract message quotations and signatures."), | ||||||
|       long_description=open("README.rst").read(), |       long_description=open("README.rst").read(), | ||||||
|   | |||||||
| @@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://") | |||||||
|  |  | ||||||
| SPLITTER_MAX_LINES = 4 | SPLITTER_MAX_LINES = 4 | ||||||
| MAX_LINES_COUNT = 1000 | MAX_LINES_COUNT = 1000 | ||||||
|  | # an extensive research shows that exceeding this limit | ||||||
|  | # leads to excessive processing time | ||||||
|  | MAX_HTML_LEN = 2794202 | ||||||
|  |  | ||||||
| QUOT_PATTERN = re.compile('^>+ ?') | QUOT_PATTERN = re.compile('^>+ ?') | ||||||
| NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | NO_QUOT_LINE = re.compile('^[^>].*[\S].*') | ||||||
| @@ -382,6 +385,9 @@ def _extract_from_html(msg_body): | |||||||
|     then checking deleted checkpoints, |     then checking deleted checkpoints, | ||||||
|     then deleting necessary tags. |     then deleting necessary tags. | ||||||
|     """ |     """ | ||||||
|  |     if len(msg_body) > MAX_HTML_LEN: | ||||||
|  |         return msg_body | ||||||
|  |  | ||||||
|     if msg_body.strip() == b'': |     if msg_body.strip() == b'': | ||||||
|         return msg_body |         return msg_body | ||||||
|  |  | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -185,12 +185,13 @@ def capitalized_words_percent(s): | |||||||
|     s = to_unicode(s, precise=True) |     s = to_unicode(s, precise=True) | ||||||
|     words = re.split('\s', s) |     words = re.split('\s', s) | ||||||
|     words = [w for w in words if w.strip()] |     words = [w for w in words if w.strip()] | ||||||
|  |     words = [w for w in words if len(w) > 2]     | ||||||
|     capitalized_words_counter = 0 |     capitalized_words_counter = 0 | ||||||
|     valid_words_counter = 0 |     valid_words_counter = 0 | ||||||
|     for word in words: |     for word in words: | ||||||
|         if not INVALID_WORD_START.match(word): |         if not INVALID_WORD_START.match(word): | ||||||
|             valid_words_counter += 1 |             valid_words_counter += 1 | ||||||
|             if word[0].isupper(): |             if word[0].isupper() and not word[1].isupper(): | ||||||
|                 capitalized_words_counter += 1 |                 capitalized_words_counter += 1 | ||||||
|     if valid_words_counter > 0 and len(words) > 1: |     if valid_words_counter > 0 and len(words) > 1: | ||||||
|         return 100 * float(capitalized_words_counter) / valid_words_counter |         return 100 * float(capitalized_words_counter) / valid_words_counter | ||||||
|   | |||||||
| @@ -380,3 +380,15 @@ def test_gmail_forwarded_msg(): | |||||||
| </div><br></div>""" | </div><br></div>""" | ||||||
|     extracted = quotations.extract_from_html(msg_body) |     extracted = quotations.extract_from_html(msg_body) | ||||||
|     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) |     eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted)) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @patch.object(quotations, 'MAX_HTML_LEN', 1) | ||||||
|  | def test_too_large_html(): | ||||||
|  |     msg_body = 'Reply' \ | ||||||
|  |                '<div class="gmail_quote">' \ | ||||||
|  |                '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \ | ||||||
|  |                '<div>Test</div>' \ | ||||||
|  |                '</div>' \ | ||||||
|  |                '</div>' | ||||||
|  |     eq_(RE_WHITESPACE.sub('', msg_body), | ||||||
|  |         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body))) | ||||||
|   | |||||||
| @@ -77,6 +77,31 @@ def test_basic(): | |||||||
|         signature.extract(msg_body, 'Sergey')) |         signature.extract(msg_body, 'Sergey')) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def test_capitalized(): | ||||||
|  |     msg_body = """Hi Mary, | ||||||
|  |  | ||||||
|  | Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date. | ||||||
|  |  | ||||||
|  | DJ Doe  | ||||||
|  | http://example.com | ||||||
|  | Password: SUPERPASSWORD | ||||||
|  |  | ||||||
|  | Would you like to check out more? | ||||||
|  |  | ||||||
|  |  | ||||||
|  | At your service, | ||||||
|  |  | ||||||
|  | John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     sig = """John Smith | ||||||
|  | Doe Inc | ||||||
|  | 555-531-7967""" | ||||||
|  |  | ||||||
|  |     eq_(sig, signature.extract(msg_body, 'Doe')[1]) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_over_2_text_lines_after_signature(): | def test_over_2_text_lines_after_signature(): | ||||||
|     body = """Blah |     body = """Blah | ||||||
|  |  | ||||||
|   | |||||||
| @@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent): | |||||||
| def test_capitalized_words_percent(): | def test_capitalized_words_percent(): | ||||||
|     eq_(0.0, h.capitalized_words_percent('')) |     eq_(0.0, h.capitalized_words_percent('')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Example Corp')) |     eq_(100.0, h.capitalized_words_percent('Example Corp')) | ||||||
|     eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss')) |     eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) |     eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368')) | ||||||
|     eq_(100.0, h.capitalized_words_percent('8th Floor')) |     eq_(100.0, h.capitalized_words_percent('8th Floor')) | ||||||
|     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) |     eq_(0.0, h.capitalized_words_percent('(212) 230-9276')) | ||||||
|  |     eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE')) | ||||||
|  |  | ||||||
|  |  | ||||||
| def test_has_signature(): | def test_has_signature(): | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user