diff --git a/.gitignore b/.gitignore index c939a2a..4e05105 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ config.toml wordlist.db +clients/ +cringe/ +based/ +unsure/ __pycache__ .* *.db diff --git a/bogofilter.py b/bogofilter.py index c2f0433..f5cf841 100644 --- a/bogofilter.py +++ b/bogofilter.py @@ -2,16 +2,15 @@ import subprocess from email.utils import format_datetime import quopri import os +import re -DB_DIR = "." -DB_PATH = os.path.join(DB_DIR, "wordlist.db") MAX_HAM = 1.0 / 3.0 MIN_SPAM = 2.0 / 3.0 MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375 ROBS = 0.0178 ROBX = (MAX_HAM + MIN_SPAM) / 2 -LOAD_COMMAND = ["bogoutil", "-l", DB_PATH] -COMMAND = ["bogofilter", "-T", "-c", "/dev/null", "-d", DB_DIR, "-o", "{},{}".format(MIN_SPAM, MAX_HAM), "-m", "{},{},{}".format(MIN_DEV, ROBS, ROBX)] +LOAD_COMMAND = ["bogoutil", "-l"] +COMMAND = ["bogofilter", "-T", "-c", "/dev/null", "-o", "{},{}".format(MIN_SPAM, MAX_HAM), "-m", "{},{},{}".format(MIN_DEV, ROBS, ROBX)] # Categories SPAM = "S" @@ -46,44 +45,62 @@ class BogofilterResult: self.score = score class Mail: - def __init__(self, headers = {}, body = None): - self.headers = {**{ - "Content-Type": "text/html; charset=\"UTF-8\""}, **headers} - self.body = body + def __init__(self, headers = {}, body = []): + self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""} + self.change_body(body) + + def change_body(self, body): + if isinstance(body, str): + self.body = body.strip().split("\n") + else: + self.body = list(body) + + def format(self, mbox = False): + lines = [] + + if mbox: + lines.append("From {} {}\n".format(self.headers["From"], format_datetime(datetime.now(timezone.utc), usegmt = True))) - def format(self): - text = str() for key, value in self.headers.items(): if key == "Subject": value = "=?utf-8?Q?{}?=".format(quopri.encodestring(bytes(value, "utf-8"), header = True).decode("utf-8")) + lines.append("{}: {}".format(key, value)) + lines.append("") - text += "{key}: {value}\n".format(key = key, value = value) - - text += "\n" + if mbox: + lines.extend([">" + line if re.match(r">*From ", line) else line for line in self.body]) + else: + lines.extend(self.body) - if self.body: - text += self.body + return "\n".join(lines) - text += "\n" + def deliver_to_mbox(path, mode = "a+"): + empty = not os.path.exists(path) or os.path.getsize(path) == 0 + with open(path, mode, encoding = "utf-8") as mbox_file: + if not empty: + mbox_file.write("\n") + mbox_file.write(self.format(mbox = True)) - return text - -def run(text, actions = [CLASSIFY], category = UNSURE): +def run(text, db_dir, actions = [CLASSIFY], category = UNSURE): args = [] for action in actions: args.extend(ACTIONS[action]) - if not os.path.exists(DB_PATH): - subprocess.run(LOAD_COMMAND, input = b'') + if not os.path.exists(os.path.join(db_dir, "wordlist.db")): + os.makedirs(db_dir, exist_ok = True) + subprocess.run(LOAD_COMMAND + [db_dir], input = b'') - cp = subprocess.run(COMMAND + args, capture_output = True, encoding = "utf-8", input = text) + cp = subprocess.run(COMMAND + ["-d", db_dir] + args, capture_output = True, encoding = "utf-8", input = text) arr = cp.stdout.strip().split(" ") if len(arr) == 2: (category, score) = arr return BogofilterResult(category, float(score)) else: if cp.returncode == 3: - print("Bogofilter error") + print("Bogofilter error:") if cp.stdout.strip(): print(cp.stderr.strip()) return None + +#mail = Mail({"From": "thor"}, "Hello, World\nHow are you?") +#print(mail.format()) diff --git a/bot.py b/bot.py index db8ab7f..a2ca2cf 100644 --- a/bot.py +++ b/bot.py @@ -22,9 +22,12 @@ class BotClient: self.bot = bot self.config = { "base_url": "https://{}".format(config["name"]), - "client_file": "secret/{}.client".format(config["name"]), - "user_file": "secret/{}.user".format(config["name"]), - "state_file": "state/{}.state".format(config["name"]), **config} + "client_file": os.path.join("clients", config["name"], "client.secret"), + "user_file": os.path.join("clients", config["name"], "user.secret"), + "state_file": os.path.join("clients", config["name"], "state.json"), + "cringe_dir": os.path.join("clients", config["name"], "cringe"), + "based_dir": os.path.join("clients", config["name"], "based"), + "unsure_dir": os.path.join("clients", config["name"], "unsure"), **config} self.load_state() @@ -43,6 +46,7 @@ class BotClient: def setup(self): client_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["client_file"]) + os.makedirs(os.path.dirname(client_file_path), exist_ok = True) if not os.path.exists(client_file_path): Mastodon.create_app( self.app_name, @@ -50,6 +54,7 @@ class BotClient: to_file = client_file_path) user_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["user_file"]) + os.makedirs(os.path.dirname(client_file_path), exist_ok = True) if not os.path.exists(user_file_path): api = Mastodon( api_base_url = self.config["base_url"], @@ -130,15 +135,17 @@ class BotClient: pass def on_load_state(self): - if os.path.exists(self.config["state_file"]): - state_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["state_file"]) - with open(self.config["state_file"]) as json_file: + state_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["state_file"]) + if os.path.exists(state_file_path): + os.makedirs(os.path.dirname(state_file_path), exist_ok = True) + with open(state_file_path) as json_file: return json.load(json_file) return copy.deepcopy(self.DEFAULT_STATE) def on_save_state(self, state): state_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["state_file"]) + os.makedirs(os.path.dirname(state_file_path), exist_ok = True) with open(state_file_path, "w") as json_file: json.dump(state, json_file, indent = 4) diff --git a/config.toml.example b/config.toml.example index a9aaffc..2944a2c 100644 --- a/config.toml.example +++ b/config.toml.example @@ -14,26 +14,38 @@ retry_rate = 60 poll_interval = 15 # Minutes until cringe statuses are deleted -max_age = 90 +max_age = 600 [clients] - # By default, the client name is used as the hostname of the server and - # the base name of the files associated with the client, unless overridden - # below. + # The client name (in quotes below) is displayed in log messages, and is + # used as the default hostname of the server to connect to, and as the + # default base name of the various files and folders that are used by + # each client. [clients."mastodon.social"] # The URL of the server that the client connects to #base_url = "https://mastodon.social" # Where to store the authorisation key for the client - #client_file = "secret/mastodon.social.client" + #client_file = "clients/mastodon.social/client.secret" # Where to store the authorisation key for the user account - #user_file = "secret/mastodon.social.user" + #user_file = "clients/mastodon.social/user.secret" # Where to store the persisted state of the client - #state_file = "state/mastodon.social.state" + #state_file = "clients/mastodon.social/state.json" + + # Where to store the training database + #db_dir = "." + # Where to store learned messages + #cringe_dir = "data/cringe" + #based_dir = "data/based" + #unsure_dir = "data/unsure" + + # All paths are relative to the location of cringebot.py unless + # explicitly absolute ones are given. + # Whether or not to register (learn from) each categorised status # Enabling this gives the bot more data to work with, but errors # in categorisation will self-reinforce over time if not corrected. diff --git a/cringebot.py b/cringebot.py index 182067e..cdd3d6b 100644 --- a/cringebot.py +++ b/cringebot.py @@ -27,6 +27,15 @@ TIME_OF_DAY = { class CringeBotClient(BotClient): def __init__(self, bot, config): + config = { + "db_dir": ".", + "cringe_dir": "data/cringe", + "based_dir": "data/based", + "unsure_dir": "data/unsure", + **config} + + self.db_dir = os.path.join(os.path.dirname(sys.argv[0]), config["db_dir"]) + # Initialise HTML-to-Markdown converter self.h2t = html2text.HTML2Text() self.h2t.ignore_links = True @@ -56,26 +65,46 @@ class CringeBotClient(BotClient): # Perform any scheduled deletes self.deletion_scheduler.run(blocking = False) - def set_cringe(self, status_id): - self.state["cringe"].add(status_id) - self.state["based"].discard(status_id) - self.state["unsure"].discard(status_id) + def get_cringe_path(self, status_id): + return os.path.join(os.path.dirname(sys.argv[0]), self.config["cringe_dir"], status_id) - def set_based(self, status_id): - self.state["cringe"].discard(status_id) - self.state["based"].add(status_id) - self.state["unsure"].discard(status_id) - - def set_unsure(self, status_id): - self.state["cringe"].discard(status_id) - self.state["based"].discard(status_id) - self.state["unsure"].add(status_id) - - def set_discard(self, status_id): - self.state["cringe"].discard(status_id) - self.state["based"].discard(status_id) - self.state["unsure"].discard(status_id) + def get_based_path(self, status_id): + return os.path.join(os.path.dirname(sys.argv[0]), self.config["based_dir"], status_id) + def get_unsure_path(self, status_id): + return os.path.join(os.path.dirname(sys.argv[0]), self.config["unsure_dir"], status_id) + + def is_cringe(self, status_id): + path = self.get_cringe_path(status_id) + return path if os.path.isfile(path) else None + + def is_based(self, status_id): + path = self.get_based_path(status_id) + return path if os.path.isfile(path) else None + + def is_unsure(self, status_id): + path = self.get_unsure_path(status_id) + return path if os.path.isfile(path) else None + + def get_category_path(self, status_id): + return self.is_cringe(status_id) or self.is_based(status_id) or self.is_unsure(status_id) or None + + def delete_and_write(self, delete_path, write_path, text, mode = "a+"): + if delete_path and os.path.isfile(delete_path): + os.remove(delete_path) + os.makedirs(os.path.dirname(write_path), exist_ok = True) + with open(write_path, mode, encoding = "utf-8") as text_file: + text_file.write(text) + + def make_cringe(self, status_id, text): + self.delete_and_write(self.get_category_path(status_id), self.get_cringe_path(status_id), text + "\n") + + def make_based(self, status_id, text): + self.delete_and_write(self.get_category_path(status_id), self.get_based_path(status_id), text + "\n") + + def make_unsure(self, status_id, text): + self.delete_and_write(self.get_category_path(status_id), self.get_unsure_path(status_id), text + "\n") + # Look for replies to the bot and return True if commands were processed def process_commands(self, status): status_id = status["id"] @@ -101,11 +130,6 @@ class CringeBotClient(BotClient): target_status = self.api.status(target_id) target_mail_text = toot_dict_to_mail(target_status).format() - # Check if target status was previously classified - was_cringe = target_id in self.state["cringe"] - was_based = target_id in self.state["based"] - was_unsure = target_id in self.state["unsure"] - tokens = deque(command.split()) while True: token = tokens.popleft() @@ -113,14 +137,14 @@ class CringeBotClient(BotClient): if event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) return True - elif was_cringe: + elif self.is_cringe(target_id): break - elif was_based: - bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM]) + elif self.is_based(target_id): + bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir) else: - bogofilter.run(target_mail_text, [bogofilter.LEARN_SPAM]) + bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir) - self.set_cringe(target_id) + self.make_cringe(target_id, target_mail_text) self.enqueue_deletion(target_id) self.respond(status, "Learned as cringe", {"event": "learn", "target": target_id}) @@ -130,14 +154,14 @@ class CringeBotClient(BotClient): if event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) return True - elif was_based: + elif self.is_based(target_id): break - elif was_cringe: - bogofilter.run(target_mail_text, [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM]) + elif self.is_cringe(target_id): + bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir) else: - bogofilter.run(target_mail_text, [bogofilter.LEARN_HAM]) + bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir) - self.set_based(target_id) + self.make_based(target_id, target_mail_text) self.unqueue_deletion(target_id) self.respond(status, "Learned as based", {"event": "learn", "target": target_id}) @@ -147,14 +171,14 @@ class CringeBotClient(BotClient): if event not in ["categorise", "learn"]: self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id}) return True - elif was_unsure: + elif self.is_unsure(target_id): break - elif was_cringe: - bogofilter.run(target_mail_text, [bogofilter.UNLEARN_SPAM]) - elif was_based: - bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM]) + elif self.is_cringe(target_id): + bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir) + elif self.is_based(target_id): + bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir) - self.set_unsure(target_id) + self.make_unsure(target_id, target_mail_text) self.unqueue_deletion(target_id) self.respond(status, "Unlearned", {"event": "learn", "target": target_id}) @@ -188,7 +212,7 @@ class CringeBotClient(BotClient): # Format and log plain-text preview md_text = self.h2t.handle(status["content"]) preview = toot_dict_to_mail(status) - preview.body = md_text + preview.change_body(md_text) preview_text = preview.format() self.log() @@ -199,23 +223,23 @@ class CringeBotClient(BotClient): if self.process_commands(status): return - result = bogofilter.run(mail_text, [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY]) + result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir) bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score)) if result.category == bogofilter.SPAM: self.log("CRINGE: Enqueuing status {} for deletion".format(status_id)) if self.config["register"]: - self.set_cringe(status_id) + self.make_cringe(status_id, mail_text) self.enqueue_deletion(status_id) self.respond(status, "Categorised as cringe\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id}) elif result.category == bogofilter.HAM: self.log("BASED: Not enqueueing status {} for deletion".format(status_id)) if self.config["register"]: - self.set_based(status_id) + self.make_based(status_id, mail_text) self.respond(status, "Categorised as based\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id}) else: self.log("UNSURE: Not enqueueing status {} for deletion".format(status_id)) if self.config["register"]: - self.set_unsure(status_id) + self.make_unsure(status_id, mail_text) self.respond(status, "Categorised as unsure\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id}) def on_load_state(self): @@ -269,12 +293,10 @@ class CringeBotClient(BotClient): try: self.log("Deleting status {}".format(status_id)) self.api.status_delete(status_id) - self.set_discard(status_id) if status_id in self.state["own"]: del self.state["own"][status_id] except MastodonNotFoundError: self.log("Cannot find status {} on server".format(status_id)) - self.set_discard(status_id) except Exception: self.log(traceback.format_exc()) self.enqueue_deletion(status_id, 300) diff --git a/secret/.gitignore b/secret/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/secret/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/state/.gitignore b/state/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/state/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore