diff --git a/README.md b/README.md index 28add28..7e15298 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,13 @@ sudo -i -u cringebot pip3 install Mastodon.py html2text ``` +#### Install dependencies (OCR support) +``` +apt install python3 bogofilter tesseract-ocr +sudo -i -u cringebot +pip3 install Mastodon.py html2text Pillow pytesseract +``` + #### Create configuration ``` sudo -i -u cringebot diff --git a/bogofilter.py b/bogofilter.py index f5cf841..5ffcea0 100644 --- a/bogofilter.py +++ b/bogofilter.py @@ -49,6 +49,9 @@ class Mail: self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""} self.change_body(body) + def get_body(self): + return "\n".join(self.body) + def change_body(self, body): if isinstance(body, str): self.body = body.strip().split("\n") diff --git a/config.example.toml b/config.example.toml index 2944a2c..b017c26 100644 --- a/config.example.toml +++ b/config.example.toml @@ -16,6 +16,9 @@ poll_interval = 15 # Minutes until cringe statuses are deleted max_age = 600 +# Enable OCR support for image attachments +#ocr = false + [clients] # The client name (in quotes below) is displayed in log messages, and is # used as the default hostname of the server to connect to, and as the diff --git a/cringebot.py b/cringebot.py index 6185344..e652205 100644 --- a/cringebot.py +++ b/cringebot.py @@ -3,10 +3,22 @@ import sys import time import sched from datetime import datetime, timezone, timedelta -import bogofilter -import html2text from collections import deque import toml +import re + +import bogofilter +import html2text +import urllib + +config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") +loaded_config = toml.load(config_path) + +OCR = loaded_config.get("ocr", False) + +if OCR: + from PIL import Image + import pytesseract from mastodon import Mastodon, MastodonNotFoundError @@ -129,22 +141,27 @@ class CringeBotClient(BotClient): # Fetch the target status target_status = self.api.status(target_id) - target_mail_text = 
toot_dict_to_mail(target_status).format() + target_mail = toot_dict_to_mail(target_status) + target_mail_text = target_mail.format() + tokens = deque(command.split()) while True: token = tokens.popleft() if token == "cringe": - if event not in ["categorise", "learn"]: + if len(target_mail.body) == 0: + self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) + return True + elif event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) break elif self.is_cringe(target_id): self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id}) break elif self.is_based(target_id): - bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM]) else: - bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_SPAM]) self.make_cringe(target_id, target_mail_text) self.enqueue_deletion(target_id) @@ -153,16 +170,19 @@ class CringeBotClient(BotClient): break elif token == "based": - if event not in ["categorise", "learn"]: + if len(target_mail.body) == 0: + self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) + return True + elif event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) break elif self.is_based(target_id): self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id}) break elif self.is_cringe(target_id): - bogofilter.run(target_mail_text, actions = 
[bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM]) else: - bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_HAM]) self.make_based(target_id, target_mail_text) self.unqueue_deletion(target_id) @@ -171,16 +191,19 @@ class CringeBotClient(BotClient): break elif token == "unlearn": - if event not in ["categorise", "learn"]: + if len(target_mail.body) == 0: + self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) + return True + elif event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) break elif self.is_unsure(target_id): self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id}) break elif self.is_cringe(target_id): - bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM]) elif self.is_based(target_id): - bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir) + bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM]) self.make_unsure(target_id, target_mail_text) self.unqueue_deletion(target_id) @@ -211,23 +234,22 @@ class CringeBotClient(BotClient): return # Create faux HTML email of status - mail_text = toot_dict_to_mail(status).format() - - # Format and log plain-text preview - md_text = self.h2t.handle(status["content"]) - preview = toot_dict_to_mail(status) - preview.change_body(md_text) - preview_text = preview.format() - + mail = toot_dict_to_mail(status) + mail_text = mail.format() + self.log() - 
self.log(preview_text) + self.log(mail_text) self.log() + if len(mail.body) == 0: + self.log("Not classifying {} because it has no content".format(status_id)) + return + # Process any commands if self.process_commands(status): return - result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir) + result = bogofilter.run(mail_text, db_dir = self.db_dir, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY]) bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score)) if result.category == bogofilter.SPAM: self.log("CRINGE: Enqueuing status {} for deletion".format(status_id)) @@ -323,10 +345,23 @@ def toot_dict_to_mail(toot_dict): body = toot_dict["content"] + if OCR: + for media in toot_dict["media_attachments"]: + if media["type"] == "image": + try: + with urllib.request.urlopen(media["url"]) as image: + ocr_text = pytesseract.image_to_string(Image.open(image)) + words = re.findall(r"\w+", ocr_text) + tokens = ["ocr_" + word.lower() for word in words] + body += "\n\n" + " ".join(tokens) + except Exception: + print("Skipping OCR on attachment due to exception") + print(traceback.format_exc()) + return bogofilter.Mail(headers = headers, body = body) -config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") -bot = Bot(CringeBotClient, toml.load(config_path)) +bot = Bot(CringeBotClient, loaded_config) +del loaded_config bot.start() while True: