Make OCR support optional; other OCR patches

master
Thor 3 years ago
parent 736480c156
commit 6ebf952dd8
  1. 7
      README.md
  2. 3
      bogofilter.py
  3. 3
      config.example.toml
  4. 83
      cringebot.py

@ -17,6 +17,13 @@ sudo -i -u cringebot
pip3 install Mastodon.py html2text
```
#### Install dependencies (OCR support)
```
apt install python3 bogofilter tesseract-ocr
sudo -i -u cringebot
pip3 install Mastodon.py html2text Pillow pytesseract
```
#### Create configuration
```
sudo -i -u cringebot

@ -49,6 +49,9 @@ class Mail:
self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""}
self.change_body(body)
def get_body(self):
return "\n".join(self.body)
def change_body(self, body):
if isinstance(body, str):
self.body = body.strip().split("\n")

@ -16,6 +16,9 @@ poll_interval = 15
# Minutes until cringe statuses are deleted
max_age = 600
# Enable OCR support for image attachments (requires the optional OCR
# dependencies listed in the README to be installed)
#ocr = false
[clients]
# The client name (in quotes below) is displayed in log messages, and is
# used as the default hostname of the server to connect to, and as the

@ -3,10 +3,22 @@ import sys
import time
import sched
from datetime import datetime, timezone, timedelta
import bogofilter
import html2text
from collections import deque
import toml
import re
import bogofilter
import html2text
import urllib
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
loaded_config = toml.load(config_path)
OCR = loaded_config.get("ocr", False)
if OCR:
from PIL import Image
import pytesseract
from mastodon import Mastodon, MastodonNotFoundError
@ -129,22 +141,27 @@ class CringeBotClient(BotClient):
# Fetch the target status
target_status = self.api.status(target_id)
target_mail_text = toot_dict_to_mail(target_status).format()
target_mail = toot_dict_to_mail(target_status)
target_mail_text = target_mail.format()
tokens = deque(command.split())
while True:
token = tokens.popleft()
if token == "cringe":
if event not in ["categorise", "learn"]:
if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break
elif self.is_cringe(target_id):
self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id})
break
elif self.is_based(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM])
else:
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_SPAM])
self.make_cringe(target_id, target_mail_text)
self.enqueue_deletion(target_id)
@ -153,16 +170,19 @@ class CringeBotClient(BotClient):
break
elif token == "based":
if event not in ["categorise", "learn"]:
if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break
elif self.is_based(target_id):
self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id})
break
elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM])
else:
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_HAM])
self.make_based(target_id, target_mail_text)
self.unqueue_deletion(target_id)
@ -171,16 +191,19 @@ class CringeBotClient(BotClient):
break
elif token == "unlearn":
if event not in ["categorise", "learn"]:
if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break
elif self.is_unsure(target_id):
self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id})
break
elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM])
elif self.is_based(target_id):
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir)
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM])
self.make_unsure(target_id, target_mail_text)
self.unqueue_deletion(target_id)
@ -211,23 +234,22 @@ class CringeBotClient(BotClient):
return
# Create faux HTML email of status
mail_text = toot_dict_to_mail(status).format()
# Format and log plain-text preview
md_text = self.h2t.handle(status["content"])
preview = toot_dict_to_mail(status)
preview.change_body(md_text)
preview_text = preview.format()
mail = toot_dict_to_mail(status)
mail_text = mail.format()
self.log()
self.log(preview_text)
self.log(mail_text)
self.log()
if len(mail.body) == 0:
self.log("Not classifying {} because it has no content".format(status_id))
return
# Process any commands
if self.process_commands(status):
return
result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir)
result = bogofilter.run(mail_text, db_dir = self.db_dir, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY])
bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score))
if result.category == bogofilter.SPAM:
self.log("CRINGE: Enqueuing status {} for deletion".format(status_id))
@ -323,10 +345,23 @@ def toot_dict_to_mail(toot_dict):
body = toot_dict["content"]
if OCR:
for media in toot_dict["media_attachments"]:
if media["type"] == "image":
try:
with urllib.request.urlopen(media["url"]) as image:
ocr_text = pytesseract.image_to_string(Image.open(image))
words = re.findall(r"\w+", ocr_text)
tokens = ["ocr_" + word.lower() for word in words]
body += "\n\n" + " ".join(tokens)
except Exception:
print("Skipping OCR on attachment due to exception")
print(traceback.format_exc())
return bogofilter.Mail(headers = headers, body = body)
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
bot = Bot(CringeBotClient, toml.load(config_path))
bot = Bot(CringeBotClient, loaded_config)
del loaded_config
bot.start()
while True:

Loading…
Cancel
Save