From 238a5de5ccbfa5962ba1eca852522f132fb95075 Mon Sep 17 00:00:00 2001 From: Esperat Julian Date: Fri, 31 Aug 2018 12:39:52 +0200 Subject: [PATCH 1/2] Use regex match to detect outlook 2007, 2010, 2013 I encountered a variant of the outlook quotations with a space after the semicolon. To prevent multiplying the number of rules, I implemented a regex match instead (I found out how to do this here: https://stackoverflow.com/a/34093801/211204). I documented all the different variants as cleanly as I could. --- talon/html_quotations.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 0728594..4001be5 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -87,23 +87,24 @@ def cut_gmail_quote(html_message): def cut_microsoft_quote(html_message): ''' Cuts splitter block and all following blocks. ''' + #use EXSLT extensions to have a regex match() function with lxml + ns = {"re": "http://exslt.org/regular-expressions"} + + #general pattern: @style='border:none;border-top:solid 1.0pt;padding:3.0pt 0 0 0' + #outlook 2007, 2010 (international) + #outlook 2007, 2010 (american) + #outlook 2013 (international) + #outlook 2013 (american) + #also handles a variant with a space after the semicolon splitter = html_message.xpath( - #outlook 2007, 2010 (international) - "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" - "padding:3.0pt 0cm 0cm 0cm']|" - #outlook 2007, 2010 (american) - "//div[@style='border:none;border-top:solid #B5C4DF 1.0pt;" - "padding:3.0pt 0in 0in 0in']|" - #outlook 2013 (international) - "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" - "padding:3.0pt 0cm 0cm 0cm']|" - #outlook 2013 (american) - "//div[@style='border:none;border-top:solid #E1E1E1 1.0pt;" - "padding:3.0pt 0in 0in 0in']|" + #outlook 2007, 2010, 2013 (international, american) + "//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?" 
+ "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]" #windows mail "//div[@style='padding-top: 5px; " "border-top-color: rgb(229, 229, 229); " "border-top-width: 1px; border-top-style: solid;']" + , namespaces=ns ) if splitter: From 1147767ff361c27965c8ef24573225213f799fc5 Mon Sep 17 00:00:00 2001 From: Esperat Julian Date: Sun, 4 Nov 2018 19:42:12 +0100 Subject: [PATCH 2/2] Fix regression: windows mail format was forgotten A | was missing at the end of the outlook regex string, so the windows mail selector on the next lines was concatenated onto it as a descendant step instead of being a separate alternative in the global search. --- talon/html_quotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/talon/html_quotations.py b/talon/html_quotations.py index 4001be5..a2db32d 100644 --- a/talon/html_quotations.py +++ b/talon/html_quotations.py @@ -99,7 +99,7 @@ def cut_microsoft_quote(html_message): splitter = html_message.xpath( #outlook 2007, 2010, 2013 (international, american) "//div[@style[re:match(., 'border:none; ?border-top:solid #(E1E1E1|B5C4DF) 1.0pt; ?" - "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]" + "padding:3.0pt 0(in|cm) 0(in|cm) 0(in|cm)')]]|" #windows mail "//div[@style='padding-top: 5px; " "border-top-color: rgb(229, 229, 229); "