|
|
|
@ -3,10 +3,22 @@ import sys |
|
|
|
|
import time |
|
|
|
|
import sched |
|
|
|
|
from datetime import datetime, timezone, timedelta |
|
|
|
|
import bogofilter |
|
|
|
|
import html2text |
|
|
|
|
from collections import deque |
|
|
|
|
import toml |
|
|
|
|
import re |
|
|
|
|
|
|
|
|
|
import bogofilter |
|
|
|
|
import html2text |
|
|
|
|
import urllib |
|
|
|
|
|
|
|
|
|
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") |
|
|
|
|
loaded_config = toml.load(config_path) |
|
|
|
|
|
|
|
|
|
OCR = loaded_config.get("ocr", False) |
|
|
|
|
|
|
|
|
|
if OCR: |
|
|
|
|
from PIL import Image |
|
|
|
|
import pytesseract |
|
|
|
|
|
|
|
|
|
from mastodon import Mastodon, MastodonNotFoundError |
|
|
|
|
|
|
|
|
@ -129,22 +141,27 @@ class CringeBotClient(BotClient): |
|
|
|
|
|
|
|
|
|
# Fetch the target status |
|
|
|
|
target_status = self.api.status(target_id) |
|
|
|
|
target_mail_text = toot_dict_to_mail(target_status).format() |
|
|
|
|
target_mail = toot_dict_to_mail(target_status) |
|
|
|
|
target_mail_text = target_mail.format() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokens = deque(command.split()) |
|
|
|
|
while True: |
|
|
|
|
token = tokens.popleft() |
|
|
|
|
if token == "cringe": |
|
|
|
|
if event not in ["categorise", "learn"]: |
|
|
|
|
if len(target_mail.body) == 0: |
|
|
|
|
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) |
|
|
|
|
return True |
|
|
|
|
elif event not in ["categorise", "learn"]: |
|
|
|
|
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_cringe(target_id): |
|
|
|
|
self.respond(status, "Status was already learned as cringe", {"event": "error", "type": "redundant", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_based(target_id): |
|
|
|
|
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM]) |
|
|
|
|
else: |
|
|
|
|
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_SPAM]) |
|
|
|
|
|
|
|
|
|
self.make_cringe(target_id, target_mail_text) |
|
|
|
|
self.enqueue_deletion(target_id) |
|
|
|
@ -153,16 +170,19 @@ class CringeBotClient(BotClient): |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
elif token == "based": |
|
|
|
|
if event not in ["categorise", "learn"]: |
|
|
|
|
if len(target_mail.body) == 0: |
|
|
|
|
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) |
|
|
|
|
return True |
|
|
|
|
elif event not in ["categorise", "learn"]: |
|
|
|
|
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_based(target_id): |
|
|
|
|
self.respond(status, "Status was already learned as based", {"event": "error", "type": "redundant", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_cringe(target_id): |
|
|
|
|
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM]) |
|
|
|
|
else: |
|
|
|
|
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.LEARN_HAM]) |
|
|
|
|
|
|
|
|
|
self.make_based(target_id, target_mail_text) |
|
|
|
|
self.unqueue_deletion(target_id) |
|
|
|
@ -171,16 +191,19 @@ class CringeBotClient(BotClient): |
|
|
|
|
break |
|
|
|
|
|
|
|
|
|
elif token == "unlearn": |
|
|
|
|
if event not in ["categorise", "learn"]: |
|
|
|
|
if len(target_mail.body) == 0: |
|
|
|
|
self.respond(status, "Status has no content to classify", {"event": "error", "type": "empty", "target": target_id}) |
|
|
|
|
return True |
|
|
|
|
elif event not in ["categorise", "learn"]: |
|
|
|
|
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_unsure(target_id): |
|
|
|
|
self.respond(status, "Status was already unsure", {"event": "error", "type": "redundant", "target": target_id}) |
|
|
|
|
break |
|
|
|
|
elif self.is_cringe(target_id): |
|
|
|
|
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_SPAM]) |
|
|
|
|
elif self.is_based(target_id): |
|
|
|
|
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir) |
|
|
|
|
bogofilter.run(target_mail_text, db_dir = self.db_dir, actions = [bogofilter.UNLEARN_HAM]) |
|
|
|
|
|
|
|
|
|
self.make_unsure(target_id, target_mail_text) |
|
|
|
|
self.unqueue_deletion(target_id) |
|
|
|
@ -211,23 +234,22 @@ class CringeBotClient(BotClient): |
|
|
|
|
return |
|
|
|
|
|
|
|
|
|
# Create faux HTML email of status |
|
|
|
|
mail_text = toot_dict_to_mail(status).format() |
|
|
|
|
|
|
|
|
|
# Format and log plain-text preview |
|
|
|
|
md_text = self.h2t.handle(status["content"]) |
|
|
|
|
preview = toot_dict_to_mail(status) |
|
|
|
|
preview.change_body(md_text) |
|
|
|
|
preview_text = preview.format() |
|
|
|
|
mail = toot_dict_to_mail(status) |
|
|
|
|
mail_text = mail.format() |
|
|
|
|
|
|
|
|
|
self.log() |
|
|
|
|
self.log(preview_text) |
|
|
|
|
self.log(mail_text) |
|
|
|
|
self.log() |
|
|
|
|
|
|
|
|
|
if len(mail.body) == 0: |
|
|
|
|
self.log("Not classifying {} because it has no content".format(status_id)) |
|
|
|
|
return |
|
|
|
|
|
|
|
|
|
# Process any commands |
|
|
|
|
if self.process_commands(status): |
|
|
|
|
return |
|
|
|
|
|
|
|
|
|
result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir) |
|
|
|
|
result = bogofilter.run(mail_text, db_dir = self.db_dir, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY]) |
|
|
|
|
bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score)) |
|
|
|
|
if result.category == bogofilter.SPAM: |
|
|
|
|
self.log("CRINGE: Enqueuing status {} for deletion".format(status_id)) |
|
|
|
@ -323,10 +345,23 @@ def toot_dict_to_mail(toot_dict): |
|
|
|
|
|
|
|
|
|
body = toot_dict["content"] |
|
|
|
|
|
|
|
|
|
if OCR: |
|
|
|
|
for media in toot_dict["media_attachments"]: |
|
|
|
|
if media["type"] == "image": |
|
|
|
|
try: |
|
|
|
|
with urllib.request.urlopen(media["url"]) as image: |
|
|
|
|
ocr_text = pytesseract.image_to_string(Image.open(image)) |
|
|
|
|
words = re.findall(r"\w+", ocr_text) |
|
|
|
|
tokens = ["ocr_" + word.lower() for word in words] |
|
|
|
|
body += "\n\n" + " ".join(tokens) |
|
|
|
|
except Exception: |
|
|
|
|
print("Skipping OCR on attachment due to exception") |
|
|
|
|
print(traceback.format_exc()) |
|
|
|
|
|
|
|
|
|
return bogofilter.Mail(headers = headers, body = body) |
|
|
|
|
|
|
|
|
|
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") |
|
|
|
|
bot = Bot(CringeBotClient, toml.load(config_path)) |
|
|
|
|
bot = Bot(CringeBotClient, loaded_config) |
|
|
|
|
del loaded_config |
|
|
|
|
bot.start() |
|
|
|
|
|
|
|
|
|
while True: |
|
|
|
|