From 626d8cb4cf5bef8f6d6bfc397381f8757a951794 Mon Sep 17 00:00:00 2001 From: Thor Harald Johansen Date: Mon, 19 Jul 2021 11:06:09 +0200 Subject: [PATCH] Make OCR tokenise by non-whitespace instead of word characters --- bogofilter.py | 4 ++-- cringebot.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bogofilter.py b/bogofilter.py index 5ffcea0..24b2744 100644 --- a/bogofilter.py +++ b/bogofilter.py @@ -4,8 +4,8 @@ import quopri import os import re -MAX_HAM = 1.0 / 3.0 -MIN_SPAM = 2.0 / 3.0 +MAX_HAM = 1.0 / 4.0 +MIN_SPAM = 3.0 / 4.0 MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375 ROBS = 0.0178 ROBX = (MAX_HAM + MIN_SPAM) / 2 diff --git a/cringebot.py b/cringebot.py index 3deb8cb..3a960c3 100644 --- a/cringebot.py +++ b/cringebot.py @@ -361,7 +361,7 @@ def toot_dict_to_mail(toot_dict): try: with urllib.request.urlopen(media["url"]) as image: ocr_text = pytesseract.image_to_string(Image.open(image)) - words = re.findall(r"\w+", ocr_text) + words = re.findall(r"[^\s]+", ocr_text) tokens = ["ocr_" + word.lower() for word in words] body += "\n\n" + " ".join(tokens) except Exception: