consider word capitalized only if it is camel case - not all upper case

This commit is contained in:
Sergey Obukhov
2016-07-19 16:22:04 -07:00
parent 7cdd7a8f35
commit a21ccdb21b
5 changed files with 2504 additions and 2670 deletions

View File

@@ -29,7 +29,7 @@ class InstallCommand(install):
setup(name='talon',
version='1.2.11',
version='1.2.12',
description=("Mailgun library "
"to extract message quotations and signatures."),
long_description=open("README.rst").read(),

File diff suppressed because it is too large Load Diff

View File

@@ -185,12 +185,13 @@ def capitalized_words_percent(s):
s = to_unicode(s, precise=True)
words = re.split('\s', s)
words = [w for w in words if w.strip()]
words = [w for w in words if len(w) > 2]
capitalized_words_counter = 0
valid_words_counter = 0
for word in words:
if not INVALID_WORD_START.match(word):
valid_words_counter += 1
if word[0].isupper():
if word[0].isupper() and not word[1].isupper():
capitalized_words_counter += 1
if valid_words_counter > 0 and len(words) > 1:
return 100 * float(capitalized_words_counter) / valid_words_counter

View File

@@ -77,6 +77,31 @@ def test_basic():
signature.extract(msg_body, 'Sergey'))
def test_capitalized():
msg_body = """Hi Mary,
Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
DJ Doe
http://example.com
Password: SUPERPASSWORD
Would you like to check out more?
At your service,
John Smith
Doe Inc
555-531-7967"""
sig = """John Smith
Doe Inc
555-531-7967"""
eq_(sig, signature.extract(msg_body, 'Doe')[1])
def test_over_2_text_lines_after_signature():
body = """Blah

View File

@@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent):
def test_capitalized_words_percent():
eq_(0.0, h.capitalized_words_percent(''))
eq_(100.0, h.capitalized_words_percent('Example Corp'))
eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))
eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss'))
eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
eq_(100.0, h.capitalized_words_percent('8th Floor'))
eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE'))
def test_has_signature():