Make OCR tokenise by non-whitespace instead of word characters

master
Thor 3 years ago
parent 478514a9e3
commit 626d8cb4cf
  1. 4
      bogofilter.py
  2. 2
      cringebot.py

@@ -4,8 +4,8 @@ import quopri
import os
import re
-MAX_HAM = 1.0 / 3.0
-MIN_SPAM = 2.0 / 3.0
+MAX_HAM = 1.0 / 4.0
+MIN_SPAM = 3.0 / 4.0
MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375
ROBS = 0.0178
ROBX = (MAX_HAM + MIN_SPAM) / 2

@@ -361,7 +361,7 @@ def toot_dict_to_mail(toot_dict):
try:
with urllib.request.urlopen(media["url"]) as image:
ocr_text = pytesseract.image_to_string(Image.open(image))
-words = re.findall(r"\w+", ocr_text)
+words = re.findall(r"[^\s]+", ocr_text)
tokens = ["ocr_" + word.lower() for word in words]
body += "\n\n" + " ".join(tokens)
except Exception:

Loading…
Cancel
Save