Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5a9bc967f1 | ||
|
|
a0d7236d0b | ||
|
|
21e9a31ffe | ||
|
|
4ee46c0a97 | ||
|
|
10d9a930f9 | ||
|
|
a21ccdb21b |
2
setup.py
2
setup.py
@@ -29,7 +29,7 @@ class InstallCommand(install):
|
|||||||
|
|
||||||
|
|
||||||
setup(name='talon',
|
setup(name='talon',
|
||||||
version='1.2.11',
|
version='1.2.14',
|
||||||
description=("Mailgun library "
|
description=("Mailgun library "
|
||||||
"to extract message quotations and signatures."),
|
"to extract message quotations and signatures."),
|
||||||
long_description=open("README.rst").read(),
|
long_description=open("README.rst").read(),
|
||||||
|
|||||||
@@ -164,6 +164,9 @@ RE_PARENTHESIS_LINK = re.compile("\(https?://")
|
|||||||
|
|
||||||
SPLITTER_MAX_LINES = 4
|
SPLITTER_MAX_LINES = 4
|
||||||
MAX_LINES_COUNT = 1000
|
MAX_LINES_COUNT = 1000
|
||||||
|
# an extensive research shows that exceeding this limit
|
||||||
|
# leads to excessive processing time
|
||||||
|
MAX_HTML_LEN = 2794202
|
||||||
|
|
||||||
QUOT_PATTERN = re.compile('^>+ ?')
|
QUOT_PATTERN = re.compile('^>+ ?')
|
||||||
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
NO_QUOT_LINE = re.compile('^[^>].*[\S].*')
|
||||||
@@ -382,6 +385,9 @@ def _extract_from_html(msg_body):
|
|||||||
then checking deleted checkpoints,
|
then checking deleted checkpoints,
|
||||||
then deleting necessary tags.
|
then deleting necessary tags.
|
||||||
"""
|
"""
|
||||||
|
if len(msg_body) > MAX_HTML_LEN:
|
||||||
|
return msg_body
|
||||||
|
|
||||||
if msg_body.strip() == b'':
|
if msg_body.strip() == b'':
|
||||||
return msg_body
|
return msg_body
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -185,12 +185,13 @@ def capitalized_words_percent(s):
|
|||||||
s = to_unicode(s, precise=True)
|
s = to_unicode(s, precise=True)
|
||||||
words = re.split('\s', s)
|
words = re.split('\s', s)
|
||||||
words = [w for w in words if w.strip()]
|
words = [w for w in words if w.strip()]
|
||||||
|
words = [w for w in words if len(w) > 2]
|
||||||
capitalized_words_counter = 0
|
capitalized_words_counter = 0
|
||||||
valid_words_counter = 0
|
valid_words_counter = 0
|
||||||
for word in words:
|
for word in words:
|
||||||
if not INVALID_WORD_START.match(word):
|
if not INVALID_WORD_START.match(word):
|
||||||
valid_words_counter += 1
|
valid_words_counter += 1
|
||||||
if word[0].isupper():
|
if word[0].isupper() and not word[1].isupper():
|
||||||
capitalized_words_counter += 1
|
capitalized_words_counter += 1
|
||||||
if valid_words_counter > 0 and len(words) > 1:
|
if valid_words_counter > 0 and len(words) > 1:
|
||||||
return 100 * float(capitalized_words_counter) / valid_words_counter
|
return 100 * float(capitalized_words_counter) / valid_words_counter
|
||||||
|
|||||||
@@ -380,3 +380,15 @@ def test_gmail_forwarded_msg():
|
|||||||
</div><br></div>"""
|
</div><br></div>"""
|
||||||
extracted = quotations.extract_from_html(msg_body)
|
extracted = quotations.extract_from_html(msg_body)
|
||||||
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))
|
||||||
|
|
||||||
|
|
||||||
|
@patch.object(quotations, 'MAX_HTML_LEN', 1)
|
||||||
|
def test_too_large_html():
|
||||||
|
msg_body = 'Reply' \
|
||||||
|
'<div class="gmail_quote">' \
|
||||||
|
'<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <bob@example.com> wrote:' \
|
||||||
|
'<div>Test</div>' \
|
||||||
|
'</div>' \
|
||||||
|
'</div>'
|
||||||
|
eq_(RE_WHITESPACE.sub('', msg_body),
|
||||||
|
RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
|
||||||
|
|||||||
@@ -77,6 +77,31 @@ def test_basic():
|
|||||||
signature.extract(msg_body, 'Sergey'))
|
signature.extract(msg_body, 'Sergey'))
|
||||||
|
|
||||||
|
|
||||||
|
def test_capitalized():
|
||||||
|
msg_body = """Hi Mary,
|
||||||
|
|
||||||
|
Do you still need a DJ for your wedding? I've included a video demo of one of our DJs available for your wedding date.
|
||||||
|
|
||||||
|
DJ Doe
|
||||||
|
http://example.com
|
||||||
|
Password: SUPERPASSWORD
|
||||||
|
|
||||||
|
Would you like to check out more?
|
||||||
|
|
||||||
|
|
||||||
|
At your service,
|
||||||
|
|
||||||
|
John Smith
|
||||||
|
Doe Inc
|
||||||
|
555-531-7967"""
|
||||||
|
|
||||||
|
sig = """John Smith
|
||||||
|
Doe Inc
|
||||||
|
555-531-7967"""
|
||||||
|
|
||||||
|
eq_(sig, signature.extract(msg_body, 'Doe')[1])
|
||||||
|
|
||||||
|
|
||||||
def test_over_2_text_lines_after_signature():
|
def test_over_2_text_lines_after_signature():
|
||||||
body = """Blah
|
body = """Blah
|
||||||
|
|
||||||
|
|||||||
@@ -192,10 +192,11 @@ def test_punctuation_percent(categories_percent):
|
|||||||
def test_capitalized_words_percent():
|
def test_capitalized_words_percent():
|
||||||
eq_(0.0, h.capitalized_words_percent(''))
|
eq_(0.0, h.capitalized_words_percent(''))
|
||||||
eq_(100.0, h.capitalized_words_percent('Example Corp'))
|
eq_(100.0, h.capitalized_words_percent('Example Corp'))
|
||||||
eq_(50.0, h.capitalized_words_percent('Qqq qqq QQQ 123 sss'))
|
eq_(50.0, h.capitalized_words_percent('Qqq qqq Aqs 123 sss'))
|
||||||
eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
|
eq_(100.0, h.capitalized_words_percent('Cell 713-444-7368'))
|
||||||
eq_(100.0, h.capitalized_words_percent('8th Floor'))
|
eq_(100.0, h.capitalized_words_percent('8th Floor'))
|
||||||
eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
|
eq_(0.0, h.capitalized_words_percent('(212) 230-9276'))
|
||||||
|
eq_(50.0, h.capitalized_words_percent('Password: REMARKABLE'))
|
||||||
|
|
||||||
|
|
||||||
def test_has_signature():
|
def test_has_signature():
|
||||||
|
|||||||
Reference in New Issue
Block a user