Make OCR support optional; other OCR patches

master
Thor 3 years ago
parent 736480c156
commit 6ebf952dd8
  1. 7
      README.md
  2. 3
      bogofilter.py
  3. 3
      config.example.toml
  4. 83
      cringebot.py

@ -17,6 +17,13 @@ sudo -i -u cringebot
pip3 install Mastodon.py html2text pip3 install Mastodon.py html2text
``` ```
#### Install dependencies (OCR support)
```
apt install python3 bogofilter tesseract-ocr
sudo -i -u cringebot
pip3 install Mastodon.py html2text Pillow pytesseract
```
#### Create configuration #### Create configuration
``` ```
sudo -i -u cringebot sudo -i -u cringebot

@ -49,6 +49,9 @@ class Mail:
self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""} self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""}
self.change_body(body) self.change_body(body)
def get_body(self):
return "\n".join(self.body)
def change_body(self, body): def change_body(self, body):
if isinstance(body, str): if isinstance(body, str):
self.body = body.strip().split("\n") self.body = body.strip().split("\n")

@ -16,6 +16,9 @@ poll_interval = 15
# Minutes until cringe statuses are deleted # Minutes until cringe statuses are deleted
max_age = 600 max_age = 600
# Enable OCR support for image attachments (requires tesseract-ocr and the
# Pillow and pytesseract Python packages to be installed)
#ocr = false
[clients] [clients]
# The client name (in quotes below) is displayed in log messages, and is # The client name (in quotes below) is displayed in log messages, and is
# used as the default hostname of the server to connect to, and as the # used as the default hostname of the server to connect to, and as the

@ -3,10 +3,22 @@ import sys
import time import time
import sched import sched
from datetime import datetime, timezone, timedelta from datetime import datetime, timezone, timedelta
import bogofilter
import html2text
from collections import deque from collections import deque
import toml import toml
import re
import bogofilter
import html2text
import urllib
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
loaded_config = toml.load(config_path)
OCR = loaded_config.get("ocr", False)
if OCR:
from PIL import Image
import pytesseract
from mastodon import Mastodon, MastodonNotFoundError from mastodon import Mastodon, MastodonNotFoundError
@ -129,22 +141,27 @@ class CringeBotClient(BotClient):
# Fetch the target status # Fetch the target status
target_status = self.api.status(target_id) target_status = self.api.status(target_id)
target_mail_text = toot_dict_to_mail(target_status).format() target_mail = toot_dict_to_mail(target_status)
target_mail_text = target_mail.format()
tokens = deque(command.split()) tokens = deque(command.split())
while True: while True:
token = tokens.popleft() token = tokens.popleft()
if token == "cringe": if token == "cringe":
if event not in ["categorise", "learn"]: if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break break
elif self.is_cringe(target_id): elif self.is_cringe(target_id):
self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id}) self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id})
break break
elif self.is_based(target_id): elif self.is_based(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM])
else: else:
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_SPAM])
self.make_cringe(target_id, target_mail_text) self.make_cringe(target_id, target_mail_text)
self.enqueue_deletion(target_id) self.enqueue_deletion(target_id)
@ -153,16 +170,19 @@ class CringeBotClient(BotClient):
break break
elif token == "based": elif token == "based":
if event not in ["categorise", "learn"]: if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break break
elif self.is_based(target_id): elif self.is_based(target_id):
self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id}) self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id})
break break
elif self.is_cringe(target_id): elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM])
else: else:
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_HAM])
self.make_based(target_id, target_mail_text) self.make_based(target_id, target_mail_text)
self.unqueue_deletion(target_id) self.unqueue_deletion(target_id)
@ -171,16 +191,19 @@ class CringeBotClient(BotClient):
break break
elif token == "unlearn": elif token == "unlearn":
if event not in ["categorise", "learn"]: if len(target_mail.body) == 0:
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id})
return True
elif event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
break break
elif self.is_unsure(target_id): elif self.is_unsure(target_id):
self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id}) self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id})
break break
elif self.is_cringe(target_id): elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM])
elif self.is_based(target_id): elif self.is_based(target_id):
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir) bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM])
self.make_unsure(target_id, target_mail_text) self.make_unsure(target_id, target_mail_text)
self.unqueue_deletion(target_id) self.unqueue_deletion(target_id)
@ -211,23 +234,22 @@ class CringeBotClient(BotClient):
return return
# Create faux HTML email of status # Create faux HTML email of status
mail_text = toot_dict_to_mail(status).format() mail = toot_dict_to_mail(status)
mail_text = mail.format()
# Format and log plain-text preview
md_text = self.h2t.handle(status["content"])
preview = toot_dict_to_mail(status)
preview.change_body(md_text)
preview_text = preview.format()
self.log() self.log()
self.log(preview_text) self.log(mail_text)
self.log() self.log()
if len(mail.body) == 0:
self.log("Not classifying {} because it has no content".format(status_id))
return
# Process any commands # Process any commands
if self.process_commands(status): if self.process_commands(status):
return return
result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir) result = bogofilter.run(mail_text, db_dir = self.db_dir, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY])
bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score)) bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score))
if result.category == bogofilter.SPAM: if result.category == bogofilter.SPAM:
self.log("CRINGE: Enqueuing status {} for deletion".format(status_id)) self.log("CRINGE: Enqueuing status {} for deletion".format(status_id))
@ -323,10 +345,23 @@ def toot_dict_to_mail(toot_dict):
body = toot_dict["content"] body = toot_dict["content"]
if OCR:
for media in toot_dict["media_attachments"]:
if media["type"] == "image":
try:
with urllib.request.urlopen(media["url"]) as image:
ocr_text = pytesseract.image_to_string(Image.open(image))
words = re.findall(r"\w+", ocr_text)
tokens = ["ocr_" + word.lower() for word in words]
body += "\n\n" + " ".join(tokens)
except Exception:
print("Skipping OCR on attachment due to exception")
print(traceback.format_exc())
return bogofilter.Mail(headers = headers, body = body) return bogofilter.Mail(headers = headers, body = body)
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") bot = Bot(CringeBotClient, loaded_config)
bot = Bot(CringeBotClient, toml.load(config_path)) del loaded_config
bot.start() bot.start()
while True: while True:

Loading…
Cancel
Save