|
|
|
@ -326,6 +326,24 @@ class CringeBotClient(BotClient): |
|
|
|
|
self.log(traceback.format_exc()) |
|
|
|
|
self.enqueue_deletion(status_id, 300) |
|
|
|
|
|
|
|
|
|
# Bogofilter lexer uses these regular expressions: |
|
|
|
|
# FRONT_CHAR [^[:blank:][:cntrl:][:digit:][:punct:]] |
|
|
|
|
# MID_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]] |
|
|
|
|
# BACK_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-] |
|
|
|
|
# TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})? |
|
|
|
|
|
|
|
|
|
# C standard library defines these character classes: |
|
|
|
|
# ispunct()    !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
|
|
|
|
# iscntrl() Between ASCII codes 0x00 (NUL) and 0x1f (US), plus 0x7f (DEL) |
|
|
|
|
|
|
|
|
|
# Construct regular expressions for tokens based on the above.
#
# PUNCT mirrors the C ispunct() class — including '~', which ispunct()
# matches (0x7E is printable, non-alphanumeric, non-space).
# CNTRL mirrors iscntrl(): ASCII 0x00-0x1f plus 0x7f (DEL).
# Python's \s stands in for [:blank:]; it is broader (it also matches
# \n, \r, \f, \v), but those are control characters excluded by CNTRL
# anyway.  NOTE(review): \d matches Unicode digits, slightly broader
# than [:digit:] — presumably acceptable here.
PUNCT = r"""!"#\$%&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}~"""

CNTRL = r"\x00-\x1f\x7f"

# First character of a token: not blank/control/digit/punctuation
# (bogofilter FRONT_CHAR).
FRONT_CHAR = rf"[^\s{CNTRL}\d{PUNCT}]"

# Interior characters: a looser class that tolerates some punctuation
# (bogofilter MID_CHAR, transcribed literally from the comment above).
MID_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]]"""

# Final character: like MID_CHAR but additionally rejects trailing
# ._~'`- (bogofilter BACK_CHAR).
BACK_CHAR = rf"""[^\s{CNTRL}:\$\*<>;=\(\)&%#@\+\|/\\\{{\}}\^"\?,\[\]\._~'`\-]"""

# A token: one FRONT_CHAR, optionally followed by MID_CHARs ending in a
# BACK_CHAR.  Group 0 of each re.findall() match is the whole token.
TOKEN = f"({FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?)"
|
|
|
|
|
|
|
|
|
def toot_dict_to_mail(toot_dict): |
|
|
|
|
flags = [] |
|
|
|
|
|
|
|
|
@ -361,8 +379,11 @@ def toot_dict_to_mail(toot_dict): |
|
|
|
|
try: |
|
|
|
|
with urllib.request.urlopen(media["url"]) as image: |
|
|
|
|
ocr_text = pytesseract.image_to_string(Image.open(image)) |
|
|
|
|
words = re.findall(r"[^\s]+", ocr_text) |
|
|
|
|
tokens = ["ocr_" + word.lower() for word in words] |
|
|
|
|
#print("ocr_text =", ocr_text) |
|
|
|
|
words = [match[0] for match in filter(lambda match: len(match[1]) > 0, re.findall(TOKEN, ocr_text))] |
|
|
|
|
#print("words =", words) |
|
|
|
|
tokens = ["ocr`" + word.lower() for word in words] |
|
|
|
|
#print("tokens =", tokens) |
|
|
|
|
body += "\n\n" + " ".join(tokens) |
|
|
|
|
except Exception: |
|
|
|
|
print("Skipping OCR on attachment due to exception") |
|
|
|
|