Make OCR tokenise by non-whitespace instead of word characters

3 years ago · 626d8cb4cf
parent 478514a9e3
commit 626d8cb4cf
2 changed files with 3 additions and 3 deletions
--- a/bogofilter.py
+++ b/bogofilter.py
@@ -4,8 +4,8 @@ import quopri
 import os
 import re

-MAX_HAM = 1.0 / 3.0
-MIN_SPAM = 2.0 / 3.0
+MAX_HAM = 1.0 / 4.0
+MIN_SPAM = 3.0 / 4.0
 MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375
 ROBS = 0.0178 
 ROBX = (MAX_HAM + MIN_SPAM) / 2
--- a/cringebot.py
+++ b/cringebot.py
@@ -361,7 +361,7 @@ def toot_dict_to_mail(toot_dict):
                try:
                    with urllib.request.urlopen(media["url"]) as image:
                        ocr_text = pytesseract.image_to_string(Image.open(image))
-                        words = re.findall(r"\w+", ocr_text)
+                        words = re.findall(r"[^\s]+", ocr_text)
                        tokens = ["ocr_" + word.lower() for word in words]
                        body += "\n\n" + " ".join(tokens)
                except Exception: