diff --git a/cringebot.py b/cringebot.py index 3a960c3..9082504 100644 --- a/cringebot.py +++ b/cringebot.py @@ -326,6 +326,24 @@ class CringeBotClient(BotClient): self.log(traceback.format_exc()) self.enqueue_deletion(status_id, 300) +# Bogofilter lexer uses these regular expressions: +# FRONT_CHAR [^[:blank:][:cntrl:][:digit:][:punct:]] +# MID_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]] +# BACK_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-] +# TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})? + +# C standard library defines these character classes: +# ispunct() !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ +# iscntrl() Between ASCII codes 0x00 (NUL) and 0x1f (US), plus 0x7f (DEL) + +# Construct regular expression for tokens based on the above: +PUNCT = r"""!"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}~""" +CNTRL = r"\x00-\x1f\x7f" +FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]" +MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]""" +BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]""" +TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)" + def toot_dict_to_mail(toot_dict): flags = [] @@ -361,8 +379,11 @@ def toot_dict_to_mail(toot_dict): try: with urllib.request.urlopen(media["url"]) as image: ocr_text = pytesseract.image_to_string(Image.open(image)) - words = re.findall(r"[^\s]+", ocr_text) - tokens = ["ocr_" + word.lower() for word in words] + #print("ocr_text =", ocr_text) + words = [match[0] for match in re.findall(TOKEN, ocr_text) if match[1]] + #print("words =", words) + tokens = ["ocr`" + word.lower() for word in words] + #print("tokens =", tokens) body += "\n\n" + " ".join(tokens) except Exception: print("Skipping OCR on attachment due to exception")