From 626d8cb4cf5bef8f6d6bfc397381f8757a951794 Mon Sep 17 00:00:00 2001
From: Thor Harald Johansen <thj@thj.no>
Date: Mon, 19 Jul 2021 11:06:09 +0200
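Subject: [PATCH] Make OCR tokenise by non-whitespace instead of word
 characters

OCR text is now split with [^\s]+ instead of \w+, so punctuation and
symbols stay attached to each ocr_ token rather than being dropped.

Also lower MAX_HAM from 1/3 to 1/4 and raise MIN_SPAM from 2/3 to 3/4
in bogofilter.py.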
---
 bogofilter.py | 4 ++--
 cringebot.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bogofilter.py b/bogofilter.py
index 5ffcea0..24b2744 100644
--- a/bogofilter.py
+++ b/bogofilter.py
@@ -4,8 +4,8 @@ import quopri
 import os
 import re
 
-MAX_HAM = 1.0 / 3.0
-MIN_SPAM = 2.0 / 3.0
+MAX_HAM = 1.0 / 4.0
+MIN_SPAM = 3.0 / 4.0
 MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375
 ROBS = 0.0178 
 ROBX = (MAX_HAM + MIN_SPAM) / 2
diff --git a/cringebot.py b/cringebot.py
index 3deb8cb..3a960c3 100644
--- a/cringebot.py
+++ b/cringebot.py
@@ -361,7 +361,7 @@ def toot_dict_to_mail(toot_dict):
                 try:
                     with urllib.request.urlopen(media["url"]) as image:
                         ocr_text = pytesseract.image_to_string(Image.open(image))
-                        words = re.findall(r"\w+", ocr_text)
+                        words = re.findall(r"[^\s]+", ocr_text)
                         tokens = ["ocr_" + word.lower() for word in words]
                         body += "\n\n" + " ".join(tokens)
                 except Exception: