Implemented Bogofilter tokeniser in Python for proper OCR analysis

master
Thor 3 years ago
parent 626d8cb4cf
commit 8309c83e6a
1 changed file with 25 changed lines:
cringebot.py

@@ -326,6 +326,24 @@ class CringeBotClient(BotClient):
self.log(traceback.format_exc())
self.enqueue_deletion(status_id, 300)
+ # Bogofilter lexer uses these regular expressions:
+ # FRONT_CHAR [^[:blank:][:cntrl:][:digit:][:punct:]]
+ # MID_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]]
+ # BACK_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-]
+ # TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?
+ # C standard library defines these character classes:
+ # ispunct() !"#$%&'()*+,-./:;<=>?@[\]^_`{|}
+ # iscntrl() Between ASCII codes 0x00 (NUL) and 0x1f (US), plus 0x7f (DEL)
+ # Construct regular expression for tokens based on the above:
+ PUNCT = r"""!:"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}"""
+ CNTRL = r"\x00-\x1f\x7f"
+ FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]"
+ MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]"""
+ BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]"""
+ TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)"
def toot_dict_to_mail(toot_dict):
flags = []
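As a quick sanity check of the pattern constructed above (a standalone sketch, not code from the repository): the definitions are repeated from the hunk so the snippet runs on its own, and the sample string is invented. Because TOKEN contains two capture groups, re.findall() returns one tuple per match: the first element is the whole token, the second is the optional {MID_CHAR}*{BACK_CHAR} tail, which is empty for single-character tokens.

import re

# Repeated from the hunk above so this snippet is self-contained.
PUNCT = r"""!:"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}"""
CNTRL = r"\x00-\x1f\x7f"
FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]"
MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]"""
BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]"""
TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)"

# Each match is a (whole_token, tail) tuple; the tail is "" for one-character tokens.
print(re.findall(TOKEN, "Total: $12.50 (incl. VAT)"))
# [('Total', 'otal'), ('incl', 'ncl'), ('VAT', 'AT')]

Digits and punctuation cannot start a token, so "$12.50" produces nothing at all, and the colon after "Total" is trimmed because both MID_CHAR and BACK_CHAR reject it.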
@@ -361,8 +379,11 @@ def toot_dict_to_mail(toot_dict):
try:
with urllib.request.urlopen(media["url"]) as image:
ocr_text = pytesseract.image_to_string(Image.open(image))
- words = re.findall(r"[^\s]+", ocr_text)
- tokens = ["ocr_" + word.lower() for word in words]
+ #print("ocr_text =", ocr_text)
+ words = [match[0] for match in filter(lambda match: len(match[1]) > 0, re.findall(TOKEN, ocr_text))]
+ #print("words =", words)
+ tokens = ["ocr`" + word.lower() for word in words]
+ #print("tokens =", tokens)
body += "\n\n" + " ".join(tokens)
except Exception:
print("Skipping OCR on attachment due to exception")

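For the OCR branch, a minimal sketch of the same filter-and-prefix step applied to an invented string. The ocr_tokens helper name and the sample text are illustrative only; TOKEN is assumed to be the pattern built in the first hunk (or in the snippet above), and the real input would come from pytesseract.image_to_string().

import re

def ocr_tokens(ocr_text, token_pattern):
    # Keep only matches whose tail group is non-empty, i.e. drop
    # single-character tokens, then lowercase and prefix the rest.
    words = [m[0] for m in re.findall(token_pattern, ocr_text) if len(m[1]) > 0]
    return ["ocr`" + word.lower() for word in words]

# TOKEN as constructed in the first hunk (or in the snippet above).
print(ocr_tokens("WIN A FREE iPhone", TOKEN))
# ['ocr`win', 'ocr`free', 'ocr`iphone']  -- the lone "A" is dropped

The comprehension with an if clause is equivalent to the filter(lambda match: len(match[1]) > 0, ...) expression in the diff; the ocr` prefix keeps OCR-derived words distinct from tokens taken from the toot text itself.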