Compare commits
	
		
			83 Commits
		
	
	
		
			v1.3.3
			...
			dietz/REP-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
|  | d37c4fd551 | ||
|  | d9ed7cc6d1 | ||
|  | 0a0808c0a8 | ||
|  | 16354e3528 | ||
|  | 1018e88ec1 | ||
|  | 2916351517 | ||
|  | 46d4b02c81 | ||
|  | 58eac88a10 | ||
|  | 2ef3d8dfbe | ||
|  | 7cf4c29340 | ||
|  | cdd84563dd | ||
|  | 8138ea9a60 | ||
|  | c171f9a875 | ||
|  | 3f97a8b8ff | ||
|  | 1147767ff3 | ||
|  | 6a304215c3 | ||
|  | 31714506bd | ||
|  | 403d80cf3b | ||
|  | 7cf20f2877 | ||
|  | afff08b017 | ||
|  | 685abb1905 | ||
|  | 41990727a3 | ||
|  | b113d8ab33 | ||
|  | 7bd0e9cc2f | ||
|  | 1e030a51d4 | ||
|  | 238a5de5cc | ||
|  | 53b24ffb3d | ||
|  | a7404afbcb | ||
|  | 0e6d5f993c | ||
|  | 60637ff13a | ||
|  | df8259e3fe | ||
|  | aab3b1cc75 | ||
|  | 9492b39f2d | ||
|  | b9ac866ea7 | ||
|  | 678517dd89 | ||
|  | 221774c6f8 | ||
|  | a2aa345712 | ||
|  | d998beaff3 | ||
|  | a379bc4e7c | ||
|  | b8e1894f3b | ||
|  | 0b5a44090f | ||
|  | b40835eca2 | ||
|  | b38562c7cc | ||
|  | 70e9fb415e | ||
|  | 64612099cd | ||
|  | 45c20f979d | ||
|  | 743c76f159 | ||
|  | bc5dad75d3 | ||
|  | 4acf05cf28 | ||
|  | f5f7264077 | ||
|  | 4364bebf38 | ||
|  | 15e61768f2 | ||
|  | dd0a0f5c4d | ||
|  | 086f5ba43b | ||
|  | e16dcf629e | ||
|  | f16ae5110b | ||
|  | ab5cbe5ec3 | ||
|  | be5da92f16 | ||
|  | 95954a65a0 | ||
|  | 0b55e8fa77 | ||
|  | 6f159e8959 | ||
|  | 5c413b4b00 | ||
|  | cca64d3ed1 | ||
|  | e11eaf6ff8 | ||
|  | 85a4c1d855 | ||
|  | 0f5e72623b | ||
|  | 061e549ad7 | ||
|  | 49d1a5d248 | ||
|  | 03d6b00db8 | ||
|  | a2eb0f7201 | ||
|  | 5c71a0ca07 | ||
|  | 489d16fad9 | ||
|  | a458707777 | ||
|  | a1d0a86305 | ||
|  | 29f1d21be7 | ||
|  | 34c5b526c3 | ||
|  | 3edb6578ba | ||
|  | 984c036b6e | ||
|  | a403ecb5c9 | ||
|  | a44713409c | ||
|  | 567467b8ed | ||
|  | 139edd6104 | ||
|  | e756d55abf | 
							
								
								
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										20
									
								
								.build/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,20 @@ | ||||
| FROM python:3.9-slim-buster AS deps | ||||
|  | ||||
| RUN apt-get update && \ | ||||
|     apt-get install -y build-essential git curl python3-dev libatlas3-base libatlas-base-dev liblapack-dev libxml2 libxml2-dev libffi6 libffi-dev musl-dev libxslt-dev | ||||
|  | ||||
| FROM deps AS testable | ||||
| ARG REPORT_PATH | ||||
|  | ||||
| VOLUME ["/var/mailgun", "/etc/mailgun/ssl", ${REPORT_PATH}] | ||||
|  | ||||
| ADD . /app | ||||
| WORKDIR /app | ||||
| COPY wheel/* /wheel/ | ||||
|  | ||||
| RUN mkdir -p ${REPORT_PATH} | ||||
|  | ||||
| RUN python ./setup.py build bdist_wheel -d /wheel && \ | ||||
|     pip install --no-deps /wheel/* | ||||
|  | ||||
| ENTRYPOINT ["/bin/sh", "/app/run_tests.sh"] | ||||
							
								
								
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -39,6 +39,8 @@ nosetests.xml | ||||
| /.emacs.desktop | ||||
| /.emacs.desktop.lock | ||||
| .elc | ||||
| .idea | ||||
| .cache | ||||
| auto-save-list | ||||
| tramp | ||||
| .\#* | ||||
| @@ -52,3 +54,6 @@ _trial_temp | ||||
|  | ||||
| # OSX | ||||
| .DS_Store | ||||
|  | ||||
| # vim-backup | ||||
| *.bak | ||||
|   | ||||
| @@ -5,3 +5,10 @@ include classifier | ||||
| include LICENSE | ||||
| include MANIFEST.in | ||||
| include README.rst | ||||
| include talon/signature/data/train.data | ||||
| include talon/signature/data/classifier | ||||
| include talon/signature/data/classifier_01.npy | ||||
| include talon/signature/data/classifier_02.npy | ||||
| include talon/signature/data/classifier_03.npy | ||||
| include talon/signature/data/classifier_04.npy | ||||
| include talon/signature/data/classifier_05.npy | ||||
|   | ||||
							
								
								
									
										16
									
								
								README.rst
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								README.rst
									
									
									
									
									
								
							| @@ -129,6 +129,22 @@ start using it for talon. | ||||
| .. _EDRM: http://www.edrm.net/resources/data-sets/edrm-enron-email-data-set | ||||
| .. _forge: https://github.com/mailgun/forge | ||||
|  | ||||
| Training on your dataset | ||||
| ------------------------ | ||||
|  | ||||
| talon comes with a pre-processed dataset and a pre-trained classifier. To retrain the classifier on your own dataset of raw emails, structure and annotate them in the same way the `forge`_ project does. Then do: | ||||
|  | ||||
| .. code:: python | ||||
|  | ||||
|     from talon.signature.learning.dataset import build_extraction_dataset | ||||
|     from talon.signature.learning import classifier as c  | ||||
|      | ||||
|     build_extraction_dataset("/path/to/your/P/folder", "/path/to/talon/signature/data/train.data") | ||||
|     c.train(c.init(), "/path/to/talon/signature/data/train.data", "/path/to/talon/signature/data/classifier") | ||||
|  | ||||
| Note that for signature extraction you need just the folder with the positive samples with annotated signature lines (P folder). | ||||
|  | ||||
| .. _forge: https://github.com/mailgun/forge | ||||
|  | ||||
| Research | ||||
| -------- | ||||
|   | ||||
							
								
								
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										11
									
								
								requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,11 @@ | ||||
| chardet>=1.0.1 | ||||
| cchardet>=0.3.5 | ||||
| cssselect | ||||
| html5lib | ||||
| joblib | ||||
| lxml>=2.3.3 | ||||
| numpy | ||||
| regex>=1 | ||||
| scikit-learn==0.24.1 # pickled versions of classifier, else rebuild | ||||
| scipy | ||||
| six>=1.10.0 | ||||
							
								
								
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										4
									
								
								run_tests.sh
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,4 @@ | ||||
| #!/usr/bin/env bash | ||||
| set -ex | ||||
| REPORT_PATH="${REPORT_PATH:-./}" | ||||
| nosetests --with-xunit --with-coverage --cover-xml --cover-xml-file $REPORT_PATH/coverage.xml --xunit-file=$REPORT_PATH/nosetests.xml --cover-package=talon . | ||||
							
								
								
									
										25
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										25
									
								
								setup.py
									
									
									
									
									
								
							| @@ -19,17 +19,17 @@ class InstallCommand(install): | ||||
|         if self.no_ml: | ||||
|             dist = self.distribution | ||||
|             dist.packages=find_packages(exclude=[ | ||||
|                 'tests', | ||||
|                 'tests.*', | ||||
|                 'talon.signature', | ||||
|                 'talon.signature.*', | ||||
|                 "tests", | ||||
|                 "tests.*", | ||||
|                 "talon.signature", | ||||
|                 "talon.signature.*", | ||||
|             ]) | ||||
|             for not_required in ['numpy', 'scipy', 'scikit-learn==0.16.1']: | ||||
|             for not_required in ["numpy", "scipy", "scikit-learn==0.24.1"]: | ||||
|                 dist.install_requires.remove(not_required) | ||||
|  | ||||
|  | ||||
| setup(name='talon', | ||||
|       version='1.3.3', | ||||
|       version='1.4.8', | ||||
|       description=("Mailgun library " | ||||
|                    "to extract message quotations and signatures."), | ||||
|       long_description=open("README.rst").read(), | ||||
| @@ -48,12 +48,13 @@ setup(name='talon', | ||||
|           "regex>=1", | ||||
|           "numpy", | ||||
|           "scipy", | ||||
|           "scikit-learn==0.16.1", # pickled versions of classifier, else rebuild | ||||
|           'chardet>=1.0.1', | ||||
|           'cchardet>=0.3.5', | ||||
|           'cssselect', | ||||
|           'six>=1.10.0', | ||||
|           'html5lib' | ||||
|           "scikit-learn==0.24.1", # pickled versions of classifier, else rebuild | ||||
|           "chardet>=1.0.1", | ||||
|           "cchardet>=0.3.5", | ||||
|           "cssselect", | ||||
|           "six>=1.10.0", | ||||
|           "html5lib", | ||||
|           "joblib", | ||||
|           ], | ||||
|       tests_require=[ | ||||
|           "mock", | ||||
|   | ||||
| @@ -87,17 +87,24 @@ def cut_gmail_quote(html_message): | ||||
|  | ||||
| def cut_microsoft_quote(html_message): | ||||
|     ''' Cuts splitter block and all following blocks. ''' | ||||
|     #use EXSLT extensions to have a regex match() function with lxml | ||||
|     ns = {"re": "http://exslt.org/regular-expressions"} | ||||
|  | ||||
|     #general pattern: @style='border:none;border-top:solid <color> 1.0pt;padding:3.0pt 0<unit> 0<unit> 0<unit>' | ||||
|     #outlook 2007, 2010 (international) <color=#B5C4DF> <unit=cm> | ||||
|     #outlook 2007, 2010 (american)      <color=#B5C4DF> <unit=pt> | ||||
|     #outlook 2013       (international) <color=#E1E1E1> <unit=cm> | ||||
|     #outlook 2013       (american)      <color=#E1E1E1> <unit=pt> | ||||
|     #also handles a variant with a space after the semicolon | ||||
|     splitter = html_message.xpath( | ||||
|         #outlook 2007, 2010 (international) | ||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||
|         "padding:3.0pt 0cm 0cm 0cm']|" | ||||
|         #outlook 2007, 2010 (american) | ||||
|         "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" | ||||
|         "padding:3.0pt 0in 0in 0in']|" | ||||
|         #outlook 2007, 2010, 2013 (international, american) | ||||
|         "//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?" | ||||
|         "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]|" | ||||
|         #windows mail | ||||
|         "//div[@style='padding-top: 5px; " | ||||
|         "border-top-color: rgb(229, 229, 229); " | ||||
|         "border-top-width: 1px; border-top-style: solid;']" | ||||
|         , namespaces=ns | ||||
|     ) | ||||
|  | ||||
|     if splitter: | ||||
|   | ||||
| @@ -22,7 +22,7 @@ import six | ||||
| log = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M) | ||||
| RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+\s*$", re.I | re.M) | ||||
|  | ||||
| RE_ON_DATE_SMB_WROTE = re.compile( | ||||
|     u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format( | ||||
| @@ -38,10 +38,14 @@ RE_ON_DATE_SMB_WROTE = re.compile( | ||||
|             'Op', | ||||
|             # German | ||||
|             'Am', | ||||
|             # Portuguese | ||||
|             'Em', | ||||
|             # Norwegian | ||||
|             u'På', | ||||
|             # Swedish, Danish | ||||
|             'Den', | ||||
|             # Vietnamese | ||||
|             u'Vào', | ||||
|         )), | ||||
|         # Date and sender separator | ||||
|         u'|'.join(( | ||||
| @@ -62,8 +66,12 @@ RE_ON_DATE_SMB_WROTE = re.compile( | ||||
|             'schreef','verzond','geschreven', | ||||
|             # German | ||||
|             'schrieb', | ||||
|             # Portuguese | ||||
|             'escreveu', | ||||
|             # Norwegian, Swedish | ||||
|             'skrev', | ||||
|             # Vietnamese | ||||
|             u'đã viết', | ||||
|         )) | ||||
|     )) | ||||
| # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:' | ||||
| @@ -131,14 +139,33 @@ RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format( | ||||
|         'Oprindelig meddelelse', | ||||
|     ))), re.I) | ||||
|  | ||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format( | ||||
| RE_FROM_COLON_OR_DATE_COLON = re.compile(u'((_+\r?\n)?[\s]*:?[*]?({})[\s]?:([^\n$]+\n){{1,2}}){{2,}}'.format( | ||||
|     u'|'.join(( | ||||
|         # "From" in different languages. | ||||
|         'From', 'Van', 'De', 'Von', 'Fra', u'Från', | ||||
|         # "Date" in different languages. | ||||
|         'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt', | ||||
|         'Date', '[S]ent', 'Datum', u'Envoyé', 'Skickat', 'Sendt', 'Gesendet', | ||||
|         # "Subject" in different languages. | ||||
|         'Subject', 'Betreff', 'Objet', 'Emne', u'Ämne', | ||||
|         # "To" in different languages. | ||||
|         'To', 'An', 'Til', u'À', 'Till' | ||||
|     ))), re.I | re.M) | ||||
|  | ||||
| # ---- John Smith wrote ---- | ||||
| RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format( | ||||
|     u'|'.join(( | ||||
|         # English | ||||
|         'wrote', | ||||
|     ))), re.I) | ||||
|  | ||||
| # Support polymail.io reply format | ||||
| # On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||
| # | ||||
| # < | ||||
| # mailto:John Smith <johnsmith@gmail.com> | ||||
| # > wrote: | ||||
| RE_POLYMAIL = re.compile('On.*\s{2}<\smailto:.*\s> wrote:', re.I) | ||||
|  | ||||
| SPLITTER_PATTERNS = [ | ||||
|     RE_ORIGINAL_MESSAGE, | ||||
|     RE_ON_DATE_SMB_WROTE, | ||||
| @@ -146,24 +173,25 @@ SPLITTER_PATTERNS = [ | ||||
|     RE_FROM_COLON_OR_DATE_COLON, | ||||
|     # 02.04.2012 14:20 пользователь "bob@example.com" < | ||||
|     # bob@xxx.mailgun.org> написал: | ||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*@", re.S), | ||||
|     re.compile("(\d+/\d+/\d+|\d+\.\d+\.\d+).*\s\S+@\S+", re.S), | ||||
|     # 2014-10-17 11:28 GMT+03:00 Bob < | ||||
|     # bob@example.com>: | ||||
|     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*@", re.S), | ||||
|     re.compile("\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+GMT.*\s\S+@\S+", re.S), | ||||
|     # Thu, 26 Jun 2014 14:00:51 +0400 Bob <bob@example.com>: | ||||
|     re.compile('\S{3,10}, \d\d? \S{3,10} 20\d\d,? \d\d?:\d\d(:\d\d)?' | ||||
|                '( \S+){3,6}@\S+:'), | ||||
|     # Sent from Samsung MobileName <address@example.com> wrote: | ||||
|     re.compile('Sent from Samsung .*@.*> wrote') | ||||
|     re.compile('Sent from Samsung.* \S+@\S+> wrote'), | ||||
|     RE_ANDROID_WROTE, | ||||
|     RE_POLYMAIL | ||||
|     ] | ||||
|  | ||||
|  | ||||
| RE_LINK = re.compile('<(http://[^>]*)>') | ||||
| RE_NORMALIZED_LINK = re.compile('@@(http://[^>@]*)@@') | ||||
|  | ||||
| RE_PARENTHESIS_LINK = re.compile("\(https?://") | ||||
|  | ||||
| SPLITTER_MAX_LINES = 4 | ||||
| SPLITTER_MAX_LINES = 6 | ||||
| MAX_LINES_COUNT = 1000 | ||||
| # extensive research shows that exceeding this limit | ||||
| # leads to excessive processing time | ||||
| @@ -188,6 +216,19 @@ def extract_from(msg_body, content_type='text/plain'): | ||||
|     return msg_body | ||||
|  | ||||
|  | ||||
| def remove_initial_spaces_and_mark_message_lines(lines): | ||||
|     """ | ||||
|     Removes the initial spaces in each line before marking message lines. | ||||
|  | ||||
|     This ensures headers can be identified if they are indented with spaces. | ||||
|     """ | ||||
|     i = 0 | ||||
|     while i < len(lines): | ||||
|         lines[i] = lines[i].lstrip(' ') | ||||
|         i += 1 | ||||
|     return mark_message_lines(lines) | ||||
|  | ||||
|  | ||||
| def mark_message_lines(lines): | ||||
|     """Mark message lines with markers to distinguish quotation lines. | ||||
|  | ||||
| @@ -253,7 +294,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]): | ||||
|     # inlined reply | ||||
|     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm' | ||||
|     # both 't' entries should be found | ||||
|     for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers): | ||||
|     for inline_reply in re.finditer('(?<=m)e*(t[te]*)m', markers): | ||||
|         # long links could break sequence of quotation lines but they shouldn't | ||||
|         # be considered an inline reply | ||||
|         links = ( | ||||
| @@ -290,9 +331,21 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | ||||
|  | ||||
|     Converts msg_body into a unicode. | ||||
|     """ | ||||
|     # normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||
|     # so that '>' closing the link couldn't be mistakenly taken for quotation | ||||
|     # marker. | ||||
|     msg_body = _replace_link_brackets(msg_body) | ||||
|  | ||||
|     msg_body = _wrap_splitter_with_newline(msg_body, delimiter, content_type) | ||||
|  | ||||
|     return msg_body | ||||
|  | ||||
|  | ||||
| def _replace_link_brackets(msg_body): | ||||
|     """ | ||||
|     Normalize links i.e. replace '<', '>' wrapping the link with some symbols | ||||
|     so that '>' closing the link couldn't be mistakenly taken for quotation | ||||
|     marker. | ||||
|  | ||||
|     Converts msg_body into a unicode | ||||
|     """ | ||||
|     if isinstance(msg_body, bytes): | ||||
|         msg_body = msg_body.decode('utf8') | ||||
|  | ||||
| @@ -304,7 +357,14 @@ def preprocess(msg_body, delimiter, content_type='text/plain'): | ||||
|             return "@@%s@@" % link.group(1) | ||||
|  | ||||
|     msg_body = re.sub(RE_LINK, link_wrapper, msg_body) | ||||
|     return msg_body | ||||
|  | ||||
|  | ||||
| def _wrap_splitter_with_newline(msg_body, delimiter, content_type='text/plain'): | ||||
|     """ | ||||
|     Splits line in two if splitter pattern preceded by some text on the same | ||||
|     line (done only for 'On <date> <person> wrote:' pattern). | ||||
|     """ | ||||
|     def splitter_wrapper(splitter): | ||||
|         """Wraps splitter with new line""" | ||||
|         if splitter.start() and msg_body[splitter.start() - 1] != '\n': | ||||
| @@ -378,6 +438,9 @@ def _extract_from_html(msg_body): | ||||
|     Extract not quoted message from provided html message body | ||||
|     using tags and plain text algorithm. | ||||
|  | ||||
|     Cut out first some encoding html tags such as xml and doctype | ||||
|     for avoiding conflict with unicode decoding | ||||
|  | ||||
|     Cut out the 'blockquote', 'gmail_quote' tags. | ||||
|     Cut Microsoft quotations. | ||||
|  | ||||
| @@ -393,6 +456,9 @@ def _extract_from_html(msg_body): | ||||
|         return msg_body | ||||
|  | ||||
|     msg_body = msg_body.replace(b'\r\n', b'\n') | ||||
|  | ||||
|     msg_body = re.sub(br"\<\?xml.+\?\>|\<\!DOCTYPE.+]\>", "", msg_body) | ||||
|  | ||||
|     html_tree = html_document_fromstring(msg_body) | ||||
|  | ||||
|     if html_tree is None: | ||||
| @@ -450,24 +516,87 @@ def _extract_from_html(msg_body): | ||||
|     if _readable_text_empty(html_tree_copy): | ||||
|         return msg_body | ||||
|  | ||||
|     # NOTE: We remove_namespaces() because we are using an HTML5 Parser, HTML | ||||
|     # parsers do not recognize namespaces in HTML tags. As such the rendered | ||||
|     # HTML tags are no longer recognizable HTML tags. Example: <o:p> becomes | ||||
|     # <oU0003Ap>. When we port this to golang we should look into using an | ||||
|     # XML Parser NOT an HTML5 Parser since we do not know what input a | ||||
|     # customer will send us. Switching to a common XML parser in python | ||||
|     # opens us up to a host of vulnerabilities. | ||||
|     # See https://docs.python.org/3/library/xml.html#xml-vulnerabilities | ||||
|     # | ||||
|     # The downside of removing the namespaces is that customers might | ||||
|     # judge the XML namespaces important. If that is the case then support | ||||
|     # should encourage customers to perform XML parsing of the un-stripped | ||||
|     # body to get the full unmodified XML payload. | ||||
|     # | ||||
|     # Alternatives to this approach are | ||||
|     # 1. Ignore the U0003A in tag names and let the customer deal with it. | ||||
|     #    This is not ideal, as most customers use stripped-html for viewing | ||||
|     #    emails sent from a recipient, as such they cannot control the HTML | ||||
|     #    provided by a recipient. | ||||
|     # 2. Perform a string replace of 'U0003A' to ':' on the rendered HTML | ||||
|     #    string. While this would solve the issue simply, it runs the risk | ||||
|     #    of replacing data outside the <tag> which might be essential to | ||||
|     #    the customer. | ||||
|     remove_namespaces(html_tree_copy) | ||||
|     return html.tostring(html_tree_copy) | ||||
|  | ||||
|  | ||||
| def remove_namespaces(root): | ||||
|     """ | ||||
|     Given the root of an HTML document iterate through all the elements | ||||
|     and remove any namespaces that might have been provided and remove | ||||
|     any attributes that contain a namespace | ||||
|  | ||||
|     <html xmlns:o="urn:schemas-microsoft-com:office:office"> | ||||
|     becomes | ||||
|     <html> | ||||
|  | ||||
|     <o:p>Hi</o:p> | ||||
|     becomes | ||||
|     <p>Hi</p> | ||||
|  | ||||
|     Start tags do NOT have a namespace; COLON characters have no special meaning. | ||||
|     If we don't remove the namespace, the parser translates the tag name into a | ||||
|     unicode representation. For example <o:p> becomes <oU0003Ap> | ||||
|  | ||||
|     See https://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#start-tags | ||||
|  | ||||
|  | ||||
|     """ | ||||
|     for child in root.iter(): | ||||
|         for key, value in child.attrib.items(): | ||||
|             # If the attribute includes a colon | ||||
|             if key.rfind("U0003A") != -1: | ||||
|                 child.attrib.pop(key) | ||||
|  | ||||
|         # If the tag includes a colon | ||||
|         idx = child.tag.rfind("U0003A") | ||||
|         if idx != -1: | ||||
|             child.tag = child.tag[idx+6:] | ||||
|  | ||||
|     return root | ||||
|  | ||||
|  | ||||
| def split_emails(msg): | ||||
|     """ | ||||
|     Given a message (which may consist of an email conversation thread with multiple emails), mark the lines to identify | ||||
|      split lines, content lines and empty lines. | ||||
|     Given a message (which may consist of an email conversation thread with | ||||
|     multiple emails), mark the lines to identify split lines, content lines and | ||||
|     empty lines. | ||||
|  | ||||
|     Correct the split line markers inside header blocks. Header blocks are identified by the regular expression | ||||
|     RE_HEADER. | ||||
|     Correct the split line markers inside header blocks. Header blocks are | ||||
|     identified by the regular expression RE_HEADER. | ||||
|  | ||||
|     Return the corrected markers | ||||
|     """ | ||||
|     delimiter = get_delimiter(msg) | ||||
|     msg_body = preprocess(msg, delimiter) | ||||
|     msg_body = _replace_link_brackets(msg) | ||||
|  | ||||
|     # don't process too long messages | ||||
|     lines = msg_body.splitlines()[:MAX_LINES_COUNT] | ||||
|     markers = mark_message_lines(lines) | ||||
|     markers = remove_initial_spaces_and_mark_message_lines(lines) | ||||
|  | ||||
|     markers = _mark_quoted_email_splitlines(markers, lines) | ||||
|  | ||||
|     # we don't want splitlines in header blocks | ||||
|     markers = _correct_splitlines_in_headers(markers, lines) | ||||
| @@ -475,20 +604,44 @@ def split_emails(msg): | ||||
|     return markers | ||||
|  | ||||
|  | ||||
| def _mark_quoted_email_splitlines(markers, lines): | ||||
|     """ | ||||
|     When there are headers indented with '>' characters, this method will | ||||
|     attempt to identify if the header is a splitline header. If it is, then we | ||||
|     mark it with 's' instead of leaving it as 'm' and return the new markers. | ||||
|     """ | ||||
|     # Create a list of markers to easily alter specific characters | ||||
|     markerlist = list(markers) | ||||
|     for i, line in enumerate(lines): | ||||
|         if markerlist[i] != 'm': | ||||
|             continue | ||||
|         for pattern in SPLITTER_PATTERNS: | ||||
|             matcher = re.search(pattern, line) | ||||
|             if matcher: | ||||
|                 markerlist[i] = 's' | ||||
|                 break | ||||
|  | ||||
|     return "".join(markerlist) | ||||
|  | ||||
|  | ||||
| def _correct_splitlines_in_headers(markers, lines): | ||||
|     """Corrects markers by removing splitlines deemed to be inside header blocks""" | ||||
|     """ | ||||
|     Corrects markers by removing splitlines deemed to be inside header blocks. | ||||
|     """ | ||||
|     updated_markers = "" | ||||
|     i = 0 | ||||
|     in_header_block = False | ||||
|  | ||||
|     for m in markers: | ||||
|         # Only set in_header_block flag true when we hit an 's' and the line is a header. | ||||
|         # Only set in_header_block flag when we hit an 's' and line is a header | ||||
|         if m == 's': | ||||
|             if not in_header_block: | ||||
|                 if bool(re.search(RE_HEADER, lines[i])): | ||||
|                     in_header_block = True | ||||
|             else: | ||||
|                 m = 't' | ||||
|                 if QUOT_PATTERN.match(lines[i]): | ||||
|                     m = 'm' | ||||
|                 else: | ||||
|                     m = 't' | ||||
|  | ||||
|         # If the line is not a header line, set in_header_block false. | ||||
|         if not bool(re.search(RE_HEADER, lines[i])): | ||||
|   | ||||
| @@ -1,15 +1,15 @@ | ||||
| from __future__ import absolute_import | ||||
|  | ||||
| import logging | ||||
|  | ||||
| import regex as re | ||||
|  | ||||
| from talon.utils import get_delimiter | ||||
| from talon.signature.constants import (SIGNATURE_MAX_LINES, | ||||
|                                        TOO_LONG_SIGNATURE_LINE) | ||||
| from talon.utils import get_delimiter | ||||
|  | ||||
| log = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| # regex to fetch signature based on common signature words | ||||
| RE_SIGNATURE = re.compile(r''' | ||||
|                ( | ||||
| @@ -28,7 +28,6 @@ RE_SIGNATURE = re.compile(r''' | ||||
|                ) | ||||
|                ''', re.I | re.X | re.M | re.S) | ||||
|  | ||||
|  | ||||
| # signatures appended by phone email clients | ||||
| RE_PHONE_SIGNATURE = re.compile(r''' | ||||
|                ( | ||||
| @@ -45,7 +44,6 @@ RE_PHONE_SIGNATURE = re.compile(r''' | ||||
|                ) | ||||
|                ''', re.I | re.X | re.M | re.S) | ||||
|  | ||||
|  | ||||
| # see _mark_candidate_indexes() for details | ||||
| # c - could be signature line | ||||
| # d - line starts with dashes (could be signature or list item) | ||||
| @@ -112,7 +110,7 @@ def extract_signature(msg_body): | ||||
|  | ||||
|             return (stripped_body.strip(), | ||||
|                     signature.strip()) | ||||
|     except Exception as e: | ||||
|     except Exception: | ||||
|         log.exception('ERROR extracting signature') | ||||
|         return (msg_body, None) | ||||
|  | ||||
| @@ -163,7 +161,7 @@ def _mark_candidate_indexes(lines, candidate): | ||||
|     'cdc' | ||||
|     """ | ||||
|     # at first consider everything to be potential signature lines | ||||
|     markers = bytearray('c'*len(candidate)) | ||||
|     markers = list('c' * len(candidate)) | ||||
|  | ||||
|     # mark lines starting from bottom up | ||||
|     for i, line_idx in reversed(list(enumerate(candidate))): | ||||
| @@ -174,7 +172,7 @@ def _mark_candidate_indexes(lines, candidate): | ||||
|             if line.startswith('-') and line.strip("-"): | ||||
|                 markers[i] = 'd' | ||||
|  | ||||
|     return markers | ||||
|     return "".join(markers) | ||||
|  | ||||
|  | ||||
| def _process_marked_candidate_indexes(candidate, markers): | ||||
|   | ||||
							
								
								
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								talon/signature/data/__init__.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1 @@ | ||||
|  | ||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,16 +1,15 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| from __future__ import absolute_import | ||||
|  | ||||
| import logging | ||||
|  | ||||
| import regex as re | ||||
| import numpy | ||||
|  | ||||
| from talon.signature.learning.featurespace import features, build_pattern | ||||
| from talon.utils import get_delimiter | ||||
| import regex as re | ||||
| from talon.signature.bruteforce import get_signature_candidate | ||||
| from talon.signature.learning.featurespace import features, build_pattern | ||||
| from talon.signature.learning.helpers import has_signature | ||||
|  | ||||
| from talon.utils import get_delimiter | ||||
|  | ||||
| log = logging.getLogger(__name__) | ||||
|  | ||||
| @@ -33,7 +32,7 @@ RE_REVERSE_SIGNATURE = re.compile(r''' | ||||
|  | ||||
| def is_signature_line(line, sender, classifier): | ||||
|     '''Checks if the line belongs to signature. Returns True or False.''' | ||||
|     data = numpy.array(build_pattern(line, features(sender))) | ||||
|     data = numpy.array(build_pattern(line, features(sender))).reshape(1, -1) | ||||
|     return classifier.predict(data) > 0 | ||||
|  | ||||
|  | ||||
| @@ -58,7 +57,7 @@ def extract(body, sender): | ||||
|                 text = delimiter.join(text) | ||||
|                 if text.strip(): | ||||
|                     return (text, delimiter.join(signature)) | ||||
|     except Exception: | ||||
|     except Exception as e: | ||||
|         log.exception('ERROR when extracting signature with classifiers') | ||||
|  | ||||
|     return (body, None) | ||||
| @@ -81,7 +80,7 @@ def _mark_lines(lines, sender): | ||||
|     candidate = get_signature_candidate(lines) | ||||
|  | ||||
|     # at first consider everything to be text no signature | ||||
|     markers = bytearray('t'*len(lines)) | ||||
|     markers = list('t' * len(lines)) | ||||
|  | ||||
|     # mark lines starting from bottom up | ||||
|     # mark only lines that belong to candidate | ||||
| @@ -96,7 +95,7 @@ def _mark_lines(lines, sender): | ||||
|         elif is_signature_line(line, sender, EXTRACTOR): | ||||
|             markers[j] = 's' | ||||
|  | ||||
|     return markers | ||||
|     return "".join(markers) | ||||
|  | ||||
|  | ||||
| def _process_marked_lines(lines, markers): | ||||
| @@ -111,3 +110,4 @@ def _process_marked_lines(lines, markers): | ||||
|         return (lines[:-signature.end()], lines[-signature.end():]) | ||||
|  | ||||
|     return (lines, None) | ||||
|  | ||||
|   | ||||
| @@ -6,9 +6,10 @@ body belongs to the signature. | ||||
| """ | ||||
|  | ||||
| from __future__ import absolute_import | ||||
|  | ||||
| from numpy import genfromtxt | ||||
| import joblib | ||||
| from sklearn.svm import LinearSVC | ||||
| from sklearn.externals import joblib | ||||
|  | ||||
|  | ||||
| def init(): | ||||
| @@ -29,4 +30,40 @@ def train(classifier, train_data_filename, save_classifier_filename=None): | ||||
|  | ||||
| def load(saved_classifier_filename, train_data_filename): | ||||
|     """Loads saved classifier. """ | ||||
|     return joblib.load(saved_classifier_filename) | ||||
|     try: | ||||
|         return joblib.load(saved_classifier_filename) | ||||
|     except Exception: | ||||
|         import sys | ||||
|         if sys.version_info > (3, 0): | ||||
|             return load_compat(saved_classifier_filename) | ||||
|  | ||||
|         raise | ||||
|  | ||||
|  | ||||
| def load_compat(saved_classifier_filename): | ||||
|     import os | ||||
|     import pickle | ||||
|     import tempfile | ||||
|  | ||||
|     # we need to switch to the data path to properly load the related _xx.npy files | ||||
|     cwd = os.getcwd() | ||||
|     os.chdir(os.path.dirname(saved_classifier_filename)) | ||||
|  | ||||
|     # convert encoding using pick.load and write to temp file which we'll tell joblib to use | ||||
|     pickle_file = open(saved_classifier_filename, 'rb') | ||||
|     classifier = pickle.load(pickle_file, encoding='latin1') | ||||
|  | ||||
|     try: | ||||
|         # save our conversion if permissions allow | ||||
|         joblib.dump(classifier, saved_classifier_filename) | ||||
|     except Exception: | ||||
|         # can't write to classifier, use a temp file | ||||
|         tmp = tempfile.SpooledTemporaryFile() | ||||
|         joblib.dump(classifier, tmp) | ||||
|         saved_classifier_filename = tmp | ||||
|  | ||||
|     # important, use joblib.load before switching back to original cwd | ||||
|     jb_classifier = joblib.load(saved_classifier_filename) | ||||
|     os.chdir(cwd) | ||||
|  | ||||
|     return jb_classifier | ||||
|   | ||||
| @@ -17,13 +17,14 @@ suffix which should be `_sender`. | ||||
| """ | ||||
|  | ||||
| from __future__ import absolute_import | ||||
|  | ||||
| import os | ||||
|  | ||||
| import regex as re | ||||
| from six.moves import range | ||||
|  | ||||
| from talon.signature.constants import SIGNATURE_MAX_LINES | ||||
| from talon.signature.learning.featurespace import build_pattern, features | ||||
| from six.moves import range | ||||
|  | ||||
|  | ||||
| SENDER_SUFFIX = '_sender' | ||||
| BODY_SUFFIX = '_body' | ||||
| @@ -57,9 +58,14 @@ def parse_msg_sender(filename, sender_known=True): | ||||
|     algorithm: | ||||
|     >>> parse_msg_sender(filename, False) | ||||
|     """ | ||||
|     import sys | ||||
|     kwargs = {} | ||||
|     if sys.version_info > (3, 0): | ||||
|         kwargs["encoding"] = "utf8" | ||||
|  | ||||
|     sender, msg = None, None | ||||
|     if os.path.isfile(filename) and not is_sender_filename(filename): | ||||
|         with open(filename) as f: | ||||
|         with open(filename, **kwargs) as f: | ||||
|             msg = f.read() | ||||
|             sender = u'' | ||||
|             if sender_known: | ||||
| @@ -147,7 +153,7 @@ def build_extraction_dataset(folder, dataset_filename, | ||||
|                 continue | ||||
|             lines = msg.splitlines() | ||||
|             for i in range(1, min(SIGNATURE_MAX_LINES, | ||||
|                                    len(lines)) + 1): | ||||
|                                   len(lines)) + 1): | ||||
|                 line = lines[-i] | ||||
|                 label = -1 | ||||
|                 if line[:len(SIGNATURE_ANNOTATION)] == \ | ||||
|   | ||||
| @@ -1,19 +1,18 @@ | ||||
| # coding:utf-8 | ||||
|  | ||||
| from __future__ import absolute_import | ||||
| import logging | ||||
|  | ||||
| from random import shuffle | ||||
| import chardet | ||||
|  | ||||
| import cchardet | ||||
| import regex as re | ||||
|  | ||||
| from lxml.html import html5parser | ||||
| from lxml.cssselect import CSSSelector | ||||
|  | ||||
| import chardet | ||||
| import html5lib | ||||
| import regex as re | ||||
| import six | ||||
| from lxml.cssselect import CSSSelector | ||||
| from lxml.html import html5parser | ||||
|  | ||||
| from talon.constants import RE_DELIMITER | ||||
| import six | ||||
|  | ||||
|  | ||||
| def safe_format(format_string, *args, **kwargs): | ||||
| @@ -128,11 +127,11 @@ def html_tree_to_text(tree): | ||||
|  | ||||
|         parent.remove(c) | ||||
|  | ||||
|     text   = "" | ||||
|     text = "" | ||||
|     for el in tree.iter(): | ||||
|         el_text = (el.text or '') + (el.tail or '') | ||||
|         if len(el_text) > 1: | ||||
|             if el.tag in _BLOCKTAGS: | ||||
|             if el.tag in _BLOCKTAGS + _HARDBREAKS: | ||||
|                 text += "\n" | ||||
|             if el.tag == 'li': | ||||
|                 text += "  * " | ||||
| @@ -143,7 +142,8 @@ def html_tree_to_text(tree): | ||||
|             if href: | ||||
|                 text += "(%s) " % href | ||||
|  | ||||
|         if el.tag in _HARDBREAKS and text and not text.endswith("\n"): | ||||
|         if (el.tag in _HARDBREAKS and text and | ||||
|             not text.endswith("\n") and not el_text): | ||||
|             text += "\n" | ||||
|  | ||||
|     retval = _rm_excessive_newlines(text) | ||||
| @@ -177,6 +177,8 @@ def html_to_text(string): | ||||
| def html_fromstring(s): | ||||
|     """Parse html tree from string. Return None if the string can't be parsed. | ||||
|     """ | ||||
|     if isinstance(s, six.text_type): | ||||
|         s = s.encode('utf8') | ||||
|     try: | ||||
|         if html_too_big(s): | ||||
|             return None | ||||
| @@ -189,6 +191,8 @@ def html_fromstring(s): | ||||
| def html_document_fromstring(s): | ||||
|     """Parse html tree from string. Return None if the string can't be parsed. | ||||
|     """ | ||||
|     if isinstance(s, six.text_type): | ||||
|         s = s.encode('utf8') | ||||
|     try: | ||||
|         if html_too_big(s): | ||||
|             return None | ||||
| @@ -203,7 +207,9 @@ def cssselect(expr, tree): | ||||
|  | ||||
|  | ||||
| def html_too_big(s): | ||||
|     return s.count('<') > _MAX_TAGS_COUNT | ||||
|     if isinstance(s, six.text_type): | ||||
|         s = s.encode('utf8') | ||||
|     return s.count(b'<') > _MAX_TAGS_COUNT | ||||
|  | ||||
|  | ||||
| def _contains_charset_spec(s): | ||||
| @@ -248,8 +254,7 @@ def _html5lib_parser(): | ||||
| _UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;' | ||||
|                      b'charset=utf-8">') | ||||
|  | ||||
|  | ||||
| _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||
| _BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3'] | ||||
| _HARDBREAKS = ['br', 'hr', 'tr'] | ||||
|  | ||||
| _RE_EXCESSIVE_NEWLINES = re.compile("\n{2,10}") | ||||
|   | ||||
							
								
								
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								test-requirements.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,3 @@ | ||||
| coverage | ||||
| mock | ||||
| nose>=1.2.1 | ||||
| @@ -1,13 +1,14 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| from __future__ import absolute_import | ||||
| from . import * | ||||
| from . fixtures import * | ||||
|  | ||||
| import regex as re | ||||
| # noinspection PyUnresolvedReferences | ||||
| import re | ||||
|  | ||||
| from talon import quotations, utils as u | ||||
|  | ||||
| from . import * | ||||
| from .fixtures import * | ||||
| from lxml import html | ||||
|  | ||||
| RE_WHITESPACE = re.compile("\s") | ||||
| RE_DOUBLE_WHITESPACE = re.compile("\s") | ||||
| @@ -303,7 +304,12 @@ Reply | ||||
|  | ||||
|  | ||||
| def extract_reply_and_check(filename): | ||||
|     f = open(filename) | ||||
|     import sys | ||||
|     kwargs = {} | ||||
|     if sys.version_info > (3, 0): | ||||
|         kwargs["encoding"] = "utf8" | ||||
|  | ||||
|     f = open(filename, **kwargs) | ||||
|  | ||||
|     msg_body = f.read() | ||||
|     reply = quotations.extract_from_html(msg_body) | ||||
| @@ -419,3 +425,23 @@ def test_readable_html_empty(): | ||||
| def test_bad_html(): | ||||
|     bad_html = "<html></html>" | ||||
|     eq_(bad_html, quotations.extract_from_html(bad_html)) | ||||
|  | ||||
|  | ||||
| def test_remove_namespaces(): | ||||
|     msg_body = """ | ||||
|     <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40"> | ||||
|         <body> | ||||
|             <o:p>Dear Sir,</o:p> | ||||
|             <o:p>Thank you for the email.</o:p> | ||||
|             <blockquote>thing</blockquote> | ||||
|         </body> | ||||
|     </html> | ||||
|     """ | ||||
|  | ||||
|     rendered = quotations.extract_from_html(msg_body) | ||||
|  | ||||
|     assert_true("<p>" in rendered) | ||||
|     assert_true("xmlns" in rendered) | ||||
|  | ||||
|     assert_true("<o:p>" not in rendered) | ||||
|     assert_true("<xmlns:o>" not in rendered) | ||||
|   | ||||
| @@ -1,16 +1,16 @@ | ||||
| # -*- coding: utf-8 -*- | ||||
|  | ||||
| from __future__ import absolute_import | ||||
| from .. import * | ||||
|  | ||||
| import os | ||||
|  | ||||
| from talon.signature.learning import dataset | ||||
| from talon import signature | ||||
| from talon.signature import extraction as e | ||||
| from talon.signature import bruteforce | ||||
| from six.moves import range | ||||
|  | ||||
| from talon.signature import bruteforce, extraction, extract | ||||
| from talon.signature import extraction as e | ||||
| from talon.signature.learning import dataset | ||||
| from .. import * | ||||
|  | ||||
|  | ||||
| def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||
|     sender = "bob@foo.bar" | ||||
| @@ -18,23 +18,28 @@ def test_message_shorter_SIGNATURE_MAX_LINES(): | ||||
|  | ||||
| Thanks in advance, | ||||
| Bob""" | ||||
|     text, extracted_signature = signature.extract(body, sender) | ||||
|     text, extracted_signature = extract(body, sender) | ||||
|     eq_('\n'.join(body.splitlines()[:2]), text) | ||||
|     eq_('\n'.join(body.splitlines()[-2:]), extracted_signature) | ||||
|  | ||||
|  | ||||
| def test_messages_longer_SIGNATURE_MAX_LINES(): | ||||
|     import sys | ||||
|     kwargs = {} | ||||
|     if sys.version_info > (3, 0): | ||||
|         kwargs["encoding"] = "utf8" | ||||
|  | ||||
|     for filename in os.listdir(STRIPPED): | ||||
|         filename = os.path.join(STRIPPED, filename) | ||||
|         if not filename.endswith('_body'): | ||||
|             continue | ||||
|         sender, body = dataset.parse_msg_sender(filename) | ||||
|         text, extracted_signature = signature.extract(body, sender) | ||||
|         text, extracted_signature = extract(body, sender) | ||||
|         extracted_signature = extracted_signature or '' | ||||
|         with open(filename[:-len('body')] + 'signature') as ms: | ||||
|         with open(filename[:-len('body')] + 'signature', **kwargs) as ms: | ||||
|             msg_signature = ms.read() | ||||
|             eq_(msg_signature.strip(), extracted_signature.strip()) | ||||
|             stripped_msg = body.strip()[:len(body.strip())-len(msg_signature)] | ||||
|             stripped_msg = body.strip()[:len(body.strip()) - len(msg_signature)] | ||||
|             eq_(stripped_msg.strip(), text.strip()) | ||||
|  | ||||
|  | ||||
| @@ -47,7 +52,7 @@ Thanks in advance, | ||||
| some text which doesn't seem to be a signature at all | ||||
| Bob""" | ||||
|  | ||||
|     text, extracted_signature = signature.extract(body, sender) | ||||
|     text, extracted_signature = extract(body, sender) | ||||
|     eq_('\n'.join(body.splitlines()[:2]), text) | ||||
|     eq_('\n'.join(body.splitlines()[-3:]), extracted_signature) | ||||
|  | ||||
| @@ -60,7 +65,7 @@ Thanks in advance, | ||||
| some long text here which doesn't seem to be a signature at all | ||||
| Bob""" | ||||
|  | ||||
|     text, extracted_signature = signature.extract(body, sender) | ||||
|     text, extracted_signature = extract(body, sender) | ||||
|     eq_('\n'.join(body.splitlines()[:-1]), text) | ||||
|     eq_('Bob', extracted_signature) | ||||
|  | ||||
| @@ -68,13 +73,13 @@ Bob""" | ||||
|  | ||||
|     some *long* text here which doesn't seem to be a signature at all | ||||
|     """ | ||||
|     ((body, None), signature.extract(body, "david@example.com")) | ||||
|     ((body, None), extract(body, "david@example.com")) | ||||
|  | ||||
|  | ||||
| def test_basic(): | ||||
|     msg_body = 'Blah\r\n--\r\n\r\nSergey Obukhov' | ||||
|     eq_(('Blah', '--\r\n\r\nSergey Obukhov'), | ||||
|         signature.extract(msg_body, 'Sergey')) | ||||
|         extract(msg_body, 'Sergey')) | ||||
|  | ||||
|  | ||||
| def test_capitalized(): | ||||
| @@ -99,7 +104,7 @@ Doe Inc | ||||
| Doe Inc | ||||
| 555-531-7967""" | ||||
|  | ||||
|     eq_(sig, signature.extract(msg_body, 'Doe')[1]) | ||||
|     eq_(sig, extract(msg_body, 'Doe')[1]) | ||||
|  | ||||
|  | ||||
| def test_over_2_text_lines_after_signature(): | ||||
| @@ -110,25 +115,25 @@ def test_over_2_text_lines_after_signature(): | ||||
|     2 non signature lines in the end | ||||
|     It's not signature | ||||
|     """ | ||||
|     text, extracted_signature = signature.extract(body, "Bob") | ||||
|     text, extracted_signature = extract(body, "Bob") | ||||
|     eq_(extracted_signature, None) | ||||
|  | ||||
|  | ||||
| def test_no_signature(): | ||||
|     sender, body = "bob@foo.bar", "Hello" | ||||
|     eq_((body, None), signature.extract(body, sender)) | ||||
|     eq_((body, None), extract(body, sender)) | ||||
|  | ||||
|  | ||||
| def test_handles_unicode(): | ||||
|     sender, body = dataset.parse_msg_sender(UNICODE_MSG) | ||||
|     text, extracted_signature = signature.extract(body, sender) | ||||
|     text, extracted_signature = extract(body, sender) | ||||
|  | ||||
|  | ||||
| @patch.object(signature.extraction, 'has_signature') | ||||
| @patch.object(extraction, 'has_signature') | ||||
| def test_signature_extract_crash(has_signature): | ||||
|     has_signature.side_effect = Exception('Bam!') | ||||
|     msg_body = u'Blah\r\n--\r\n\r\nСергей' | ||||
|     eq_((msg_body, None), signature.extract(msg_body, 'Сергей')) | ||||
|     eq_((msg_body, None), extract(msg_body, 'Сергей')) | ||||
|  | ||||
|  | ||||
| def test_mark_lines(): | ||||
| @@ -137,19 +142,19 @@ def test_mark_lines(): | ||||
|         # (starting from the bottom) because we don't count empty line | ||||
|         eq_('ttset', | ||||
|             e._mark_lines(['Bob Smith', | ||||
|                           'Bob Smith', | ||||
|                           'Bob Smith', | ||||
|                           '', | ||||
|                           'some text'], 'Bob Smith')) | ||||
|                            'Bob Smith', | ||||
|                            'Bob Smith', | ||||
|                            '', | ||||
|                            'some text'], 'Bob Smith')) | ||||
|  | ||||
|     with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3): | ||||
|         # we don't analyse the 1st line because | ||||
|         # signature cant start from the 1st line | ||||
|         eq_('tset', | ||||
|             e._mark_lines(['Bob Smith', | ||||
|                           'Bob Smith', | ||||
|                           '', | ||||
|                           'some text'], 'Bob Smith')) | ||||
|                            'Bob Smith', | ||||
|                            '', | ||||
|                            'some text'], 'Bob Smith')) | ||||
|  | ||||
|  | ||||
| def test_process_marked_lines(): | ||||
|   | ||||
| @@ -35,6 +35,19 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> wrote: | ||||
|  | ||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
| def test_pattern_on_date_polymail(): | ||||
|     msg_body = """Test reply | ||||
|  | ||||
| On Tue, Apr 11, 2017 at 10:07 PM John Smith | ||||
|  | ||||
| < | ||||
| mailto:John Smith <johnsmith@gmail.com> | ||||
| > wrote: | ||||
| Test quoted data | ||||
| """ | ||||
|  | ||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| def test_pattern_sent_from_samsung_smb_wrote(): | ||||
|     msg_body = """Test reply | ||||
| @@ -106,6 +119,38 @@ On 11-Apr-2011, at 6:54 PM, Roman Tkachenko <romant@example.com> sent: | ||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| def test_appointment(): | ||||
|     msg_body = """Response | ||||
|  | ||||
| 10/19/2017 @ 9:30 am for physical therapy | ||||
| Bla | ||||
| 1517 4th Avenue Ste 300 | ||||
| London CA 19129, 555-421-6780 | ||||
|  | ||||
| John Doe, FCLS | ||||
| Mailgun Inc | ||||
| 555-941-0697 | ||||
|  | ||||
| From: from@example.com [mailto:from@example.com] | ||||
| Sent: Wednesday, October 18, 2017 2:05 PM | ||||
| To: John Doer - SIU <jd@example.com> | ||||
| Subject: RE: Claim # 5551188-1 | ||||
|  | ||||
| Text""" | ||||
|  | ||||
|     expected = """Response | ||||
|  | ||||
| 10/19/2017 @ 9:30 am for physical therapy | ||||
| Bla | ||||
| 1517 4th Avenue Ste 300 | ||||
| London CA 19129, 555-421-6780 | ||||
|  | ||||
| John Doe, FCLS | ||||
| Mailgun Inc | ||||
| 555-941-0697""" | ||||
|     eq_(expected, quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| def test_line_starts_with_on(): | ||||
|     msg_body = """Blah-blah-blah | ||||
| On blah-blah-blah""" | ||||
| @@ -142,7 +187,8 @@ def _check_pattern_original_message(original_message_indicator): | ||||
| -----{}----- | ||||
|  | ||||
| Test""" | ||||
|     eq_('Test reply', quotations.extract_from_plain(msg_body.format(six.text_type(original_message_indicator)))) | ||||
|     eq_('Test reply', quotations.extract_from_plain( | ||||
|         msg_body.format(six.text_type(original_message_indicator)))) | ||||
|  | ||||
| def test_english_original_message(): | ||||
|     _check_pattern_original_message('Original Message') | ||||
| @@ -165,6 +211,17 @@ Test reply""" | ||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| def test_android_wrote(): | ||||
|     msg_body = """Test reply | ||||
|  | ||||
| ---- John Smith wrote ---- | ||||
|  | ||||
| > quoted | ||||
| > text | ||||
| """ | ||||
|     eq_("Test reply", quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| def test_reply_wraps_quotations(): | ||||
|     msg_body = """Test reply | ||||
|  | ||||
| @@ -376,6 +433,14 @@ Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende g | ||||
| Small batch beard laboris tempor, non listicle hella Tumblr heirloom. | ||||
| """)) | ||||
|  | ||||
| def test_vietnamese_from_block(): | ||||
|     eq_('Hello', quotations.extract_from_plain( | ||||
|     u"""Hello | ||||
|  | ||||
| Vào 14:24 8 tháng 6, 2017, Hùng Nguyễn <hungnguyen@xxx.com> đã viết: | ||||
|  | ||||
| > Xin chào | ||||
| """)) | ||||
|  | ||||
| def test_quotation_marker_false_positive(): | ||||
|     msg_body = """Visit us now for assistance... | ||||
| @@ -388,6 +453,7 @@ def test_link_closed_with_quotation_marker_on_new_line(): | ||||
|     msg_body = '''8.45am-1pm | ||||
|  | ||||
| From: somebody@example.com | ||||
| Date: Wed, 16 May 2012 00:15:02 -0600 | ||||
|   | ||||
| <http://email.example.com/c/dHJhY2tpbmdfY29kZT1mMDdjYzBmNzM1ZjYzMGIxNT | ||||
| >  <bob@example.com <mailto:bob@example.com> > | ||||
| @@ -429,7 +495,9 @@ def test_from_block_starts_with_date(): | ||||
|     msg_body = """Blah | ||||
|  | ||||
| Date: Wed, 16 May 2012 00:15:02 -0600 | ||||
| To: klizhentas@example.com""" | ||||
| To: klizhentas@example.com | ||||
|  | ||||
| """ | ||||
|     eq_('Blah', quotations.extract_from_plain(msg_body)) | ||||
|  | ||||
|  | ||||
| @@ -499,11 +567,12 @@ def test_mark_message_lines(): | ||||
|              # next line should be marked as splitter | ||||
|              '_____________', | ||||
|              'From: foo@bar.com', | ||||
|              'Date: Wed, 16 May 2012 00:15:02 -0600', | ||||
|              '', | ||||
|              '> Hi', | ||||
|              '', | ||||
|              'Signature'] | ||||
|     eq_('tessemet', quotations.mark_message_lines(lines)) | ||||
|     eq_('tesssemet', quotations.mark_message_lines(lines)) | ||||
|  | ||||
|     lines = ['Just testing the email reply', | ||||
|              '', | ||||
| @@ -700,23 +769,73 @@ def test_standard_replies(): | ||||
|  | ||||
| def test_split_email(): | ||||
|     msg = """From: Mr. X | ||||
| Date: 24 February 2016 | ||||
| To: Mr. Y | ||||
| Subject: Hi | ||||
| Attachments: none | ||||
| Goodbye. | ||||
| From: Mr. Y | ||||
| To: Mr. X | ||||
| Date: 24 February 2016 | ||||
| Subject: Hi | ||||
| Attachments: none | ||||
|     Date: 24 February 2016 | ||||
|     To: Mr. Y | ||||
|     Subject: Hi | ||||
|     Attachments: none | ||||
|     Goodbye. | ||||
|     From: Mr. Y | ||||
|     To: Mr. X | ||||
|     Date: 24 February 2016 | ||||
|     Subject: Hi | ||||
|     Attachments: none | ||||
|  | ||||
| Hello. | ||||
|     Hello. | ||||
|  | ||||
| -- Original Message -- | ||||
| On 24th February 2016 at 09.32am Conal Wrote: | ||||
| Hey! | ||||
|         On 24th February 2016 at 09.32am, Conal wrote: | ||||
|  | ||||
|         Hey! | ||||
|  | ||||
|         On Mon, 2016-10-03 at 09:45 -0600, Stangel, Dan wrote: | ||||
|         > Mohan, | ||||
|         > | ||||
|         > We have not yet migrated the systems. | ||||
|         > | ||||
|         > Dan | ||||
|         > | ||||
|         > > -----Original Message----- | ||||
|         > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||
|         > > Subject: Test | ||||
|         > > From: bob@xxx.mailgun.org | ||||
|         > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||
|         > > | ||||
|         > > Hi | ||||
|         > > | ||||
|         > > > From: bob@xxx.mailgun.org | ||||
|         > > > To: xxx@gmail.com; xxx@hotmail.com; xxx@yahoo.com; xxx@aol.com; xxx@comcast.net; xxx@nyc.rr.com | ||||
|         > > > Date: Mon, 2 Apr 2012 17:44:22 +0400 | ||||
|         > > > Subject: Test | ||||
|         > > > Hi | ||||
|         > > > | ||||
|         > > | ||||
|         > | ||||
|         > | ||||
| """ | ||||
|     expected_markers = "stttttsttttetestt" | ||||
|     expected_markers = "stttttsttttetesetesmmmmmmsmmmmmmmmmmmmmmmm" | ||||
|     markers = quotations.split_emails(msg) | ||||
|     eq_(markers, expected_markers) | ||||
|  | ||||
|  | ||||
|  | ||||
| def test_feedback_below_left_unparsed(): | ||||
|     msg_body = """Please enter your feedback below. Thank you. | ||||
|  | ||||
| ------------------------------------- Enter Feedback Below ------------------------------------- | ||||
|  | ||||
| The user experience was unparallelled. Please continue production. I'm sending payment to ensure | ||||
| that this line is intact.""" | ||||
|  | ||||
|     parsed = quotations.extract_from_plain(msg_body) | ||||
|     eq_(msg_body, parsed) | ||||
|  | ||||
|  | ||||
| def test_appointment_2(): | ||||
|     msg_body = """Invitation for an interview: | ||||
|  | ||||
| Date: Wednesday 3, October 2011  | ||||
| Time: 7 : 00am  | ||||
| Address: 130 Fox St | ||||
|  | ||||
| Please bring in your ID.""" | ||||
|     parsed = quotations.extract_from_plain(msg_body) | ||||
|     eq_(msg_body, parsed) | ||||
|   | ||||
| @@ -1,12 +1,12 @@ | ||||
| # coding:utf-8 | ||||
|  | ||||
| from __future__ import absolute_import | ||||
| from . import * | ||||
|  | ||||
| from talon import utils as u | ||||
| import cchardet | ||||
| import six | ||||
| from lxml import html | ||||
|  | ||||
| from talon import utils as u | ||||
| from . import * | ||||
|  | ||||
|  | ||||
| def test_get_delimiter(): | ||||
| @@ -16,31 +16,35 @@ def test_get_delimiter(): | ||||
|  | ||||
|  | ||||
| def test_unicode(): | ||||
|     eq_ (u'hi', u.to_unicode('hi')) | ||||
|     eq_ (type(u.to_unicode('hi')), six.text_type ) | ||||
|     eq_ (type(u.to_unicode(u'hi')), six.text_type ) | ||||
|     eq_ (type(u.to_unicode('привет')), six.text_type ) | ||||
|     eq_ (type(u.to_unicode(u'привет')), six.text_type ) | ||||
|     eq_ (u"привет", u.to_unicode('привет')) | ||||
|     eq_ (u"привет", u.to_unicode(u'привет')) | ||||
|     eq_(u'hi', u.to_unicode('hi')) | ||||
|     eq_(type(u.to_unicode('hi')), six.text_type) | ||||
|     eq_(type(u.to_unicode(u'hi')), six.text_type) | ||||
|     eq_(type(u.to_unicode('привет')), six.text_type) | ||||
|     eq_(type(u.to_unicode(u'привет')), six.text_type) | ||||
|     eq_(u"привет", u.to_unicode('привет')) | ||||
|     eq_(u"привет", u.to_unicode(u'привет')) | ||||
|     # some latin1 stuff | ||||
|     eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) | ||||
|     eq_(u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True)) | ||||
|  | ||||
|  | ||||
| def test_detect_encoding(): | ||||
|     eq_ ('ascii', u.detect_encoding(b'qwe').lower()) | ||||
|     eq_ ('iso-8859-2', u.detect_encoding(u'Versi\xf3n'.encode('iso-8859-2')).lower()) | ||||
|     eq_ ('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) | ||||
|     eq_('ascii', u.detect_encoding(b'qwe').lower()) | ||||
|     ok_(u.detect_encoding( | ||||
|         u'Versi\xf3n'.encode('iso-8859-2')).lower() in [ | ||||
|             'iso-8859-1', 'iso-8859-2']) | ||||
|     eq_('utf-8', u.detect_encoding(u'привет'.encode('utf8')).lower()) | ||||
|     # fallback to utf-8 | ||||
|     with patch.object(u.chardet, 'detect') as detect: | ||||
|         detect.side_effect = Exception | ||||
|         eq_ ('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) | ||||
|         eq_('utf-8', u.detect_encoding('qwe'.encode('utf8')).lower()) | ||||
|  | ||||
|  | ||||
| def test_quick_detect_encoding(): | ||||
|     eq_ ('ascii', u.quick_detect_encoding(b'qwe').lower()) | ||||
|     eq_ ('windows-1252', u.quick_detect_encoding(u'Versi\xf3n'.encode('windows-1252')).lower()) | ||||
|     eq_ ('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) | ||||
|     eq_('ascii', u.quick_detect_encoding(b'qwe').lower()) | ||||
|     ok_(u.quick_detect_encoding( | ||||
|         u'Versi\xf3n'.encode('windows-1252')).lower() in [ | ||||
|             'windows-1252', 'windows-1250']) | ||||
|     eq_('utf-8', u.quick_detect_encoding(u'привет'.encode('utf8')).lower()) | ||||
|  | ||||
|  | ||||
| @patch.object(cchardet, 'detect') | ||||
| @@ -80,7 +84,7 @@ Haha | ||||
|     eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8')) | ||||
|  | ||||
|     html = '<body><br/><br/>Hi</body>' | ||||
|     eq_ (b'Hi', u.html_to_text(html)) | ||||
|     eq_(b'Hi', u.html_to_text(html)) | ||||
|  | ||||
|     html = """Hi | ||||
| <style type="text/css"> | ||||
| @@ -100,7 +104,7 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | ||||
|  | ||||
| } | ||||
| </style>""" | ||||
|     eq_ (b'Hi', u.html_to_text(html)) | ||||
|     eq_(b'Hi', u.html_to_text(html)) | ||||
|  | ||||
|     html = """<div> | ||||
| <!-- COMMENT 1 --> | ||||
| @@ -111,15 +115,16 @@ font: 13px 'Lucida Grande', Arial, sans-serif; | ||||
|  | ||||
|  | ||||
| def test_comment_no_parent(): | ||||
|     s = "<!-- COMMENT 1 --> no comment" | ||||
|     s = b'<!-- COMMENT 1 --> no comment' | ||||
|     d = u.html_document_fromstring(s) | ||||
|     eq_("no comment", u.html_tree_to_text(d)) | ||||
|     eq_(b"no comment", u.html_tree_to_text(d)) | ||||
|  | ||||
|  | ||||
| @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception())) | ||||
| def test_html_fromstring_exception(): | ||||
|     eq_(None, u.html_fromstring("<html></html>")) | ||||
|  | ||||
|  | ||||
| @patch.object(u, 'html_too_big', Mock()) | ||||
| @patch.object(u.html5parser, 'fromstring') | ||||
| def test_html_fromstring_too_big(fromstring): | ||||
| @@ -154,5 +159,5 @@ def test_html_too_big(): | ||||
|  | ||||
| @patch.object(u, '_MAX_TAGS_COUNT', 3) | ||||
| def test_html_to_text(): | ||||
|     eq_("Hello", u.html_to_text("<div>Hello</div>")) | ||||
|     eq_(b"Hello", u.html_to_text("<div>Hello</div>")) | ||||
|     eq_(None, u.html_to_text("<div><span>Hi</span></div>")) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user