Implemented Bogofilter tokeniser in Python for proper OCR analysis

master
Thor 3 years ago
parent 626d8cb4cf
commit 8309c83e6a
1 changed file with 25 changed lines:
cringebot.py

@@ -326,6 +326,24 @@ class CringeBotClient(BotClient):
self.log(traceback.format_exc())
self.enqueue_deletion(status_id, 300)
+ # Bogofilter lexer uses these regular expressions:
+ # FRONT_CHAR [^[:blank:][:cntrl:][:digit:][:punct:]]
+ # MID_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]]
+ # BACK_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-]
+ # TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?
+ # C standard library defines these character classes:
+ # ispunct() !"#$%&'()*+,-./:;<=>?@[\]^_`{|}
+ # iscntrl() Between ASCII codes 0x00 (NUL) and 0x1f (US), plus 0x7f (DEL)
+ # Construct regular expression for tokens based on the above:
+ PUNCT = r"""!:"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}"""
+ CNTRL = r"\x00-\x1f\x7f"
+ FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]"
+ MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]"""
+ BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]"""
+ TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)"
def toot_dict_to_mail(toot_dict):
flags = []
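As a quick sanity check of the pattern constructed above (a standalone sketch, not code from the repository): the definitions are repeated from the hunk so the snippet runs on its own, and the sample string is invented. Because TOKEN contains two capture groups, re.findall() returns one tuple per match: the first element is the whole token, the second is the optional {MID_CHAR}*{BACK_CHAR} tail, which is empty for single-character tokens.

import re

# Repeated from the hunk above so this snippet is self-contained.
PUNCT = r"""!:"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}"""
CNTRL = r"\x00-\x1f\x7f"
FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]"
MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]"""
BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]"""
TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)"

# Each match is a (whole_token, tail) tuple; the tail is "" for one-character tokens.
print(re.findall(TOKEN, "Total: $12.50 (incl. VAT)"))
# [('Total', 'otal'), ('incl', 'ncl'), ('VAT', 'AT')]

Digits and punctuation cannot start a token, so "$12.50" produces nothing at all, and the colon after "Total" is trimmed because both MID_CHAR and BACK_CHAR reject it.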
@@ -361,8 +379,11 @@ def toot_dict_to_mail(toot_dict):
try:
with urllib.request.urlopen(media["url"]) as image:
ocr_text = pytesseract.image_to_string(Image.open(image))
- words = re.findall(r"[^\s]+", ocr_text)
- tokens = ["ocr_" + word.lower() for word in words]
+ #print("ocr_text =", ocr_text)
+ words = [match[0] for match in filter(lambda match: len(match[1]) > 0, re.findall(TOKEN, ocr_text))]
+ #print("words =", words)
+ tokens = ["ocr`" + word.lower() for word in words]
+ #print("tokens =", tokens)
body += "\n\n" + " ".join(tokens)
except Exception:
print("Skipping OCR on attachment due to exception")

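For the OCR branch, a minimal sketch of the same filter-and-prefix step applied to an invented string. The ocr_tokens helper name and the sample text are illustrative only; TOKEN is assumed to be the pattern built in the first hunk (or in the snippet above), and the real input would come from pytesseract.image_to_string().

import re

def ocr_tokens(ocr_text, token_pattern):
    # Keep only matches whose tail group is non-empty, i.e. drop
    # single-character tokens, then lowercase and prefix the rest.
    words = [m[0] for m in re.findall(token_pattern, ocr_text) if len(m[1]) > 0]
    return ["ocr`" + word.lower() for word in words]

# TOKEN as constructed in the first hunk (or in the snippet above).
print(ocr_tokens("WIN A FREE iPhone", TOKEN))
# ['ocr`win', 'ocr`free', 'ocr`iphone']  -- the lone "A" is dropped

The comprehension with an if clause is equivalent to the filter(lambda match: len(match[1]) > 0, ...) expression in the diff; the ocr` prefix keeps OCR-derived words distinct from tokens taken from the toot text itself.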