Add data store for statuses; rearrange client files

master
Thor 3 years ago
parent 255126b1ee
commit e1b3e0ffb3
  1. 4
      .gitignore
  2. 63
      bogofilter.py
  3. 19
      bot.py
  4. 26
      config.toml.example
  5. 114
      cringebot.py
  6. 2
      secret/.gitignore
  7. 2
      state/.gitignore

4
.gitignore vendored

@ -1,5 +1,9 @@
config.toml
wordlist.db
clients/
cringe/
based/
unsure/
__pycache__
.*
*.db

@ -2,16 +2,15 @@ import subprocess
from email.utils import format_datetime
import quopri
import os
import re
DB_DIR = "."
DB_PATH = os.path.join(DB_DIR, "wordlist.db")
MAX_HAM = 1.0 / 3.0
MIN_SPAM = 2.0 / 3.0
MIN_DEV = 2 * (MIN_SPAM - MAX_HAM) * 0.375
ROBS = 0.0178
ROBX = (MAX_HAM + MIN_SPAM) / 2
LOAD_COMMAND = ["bogoutil", "-l", DB_PATH]
COMMAND = ["bogofilter", "-T", "-c", "/dev/null", "-d", DB_DIR, "-o", "{},{}".format(MIN_SPAM, MAX_HAM), "-m", "{},{},{}".format(MIN_DEV, ROBS, ROBX)]
LOAD_COMMAND = ["bogoutil", "-l"]
COMMAND = ["bogofilter", "-T", "-c", "/dev/null", "-o", "{},{}".format(MIN_SPAM, MAX_HAM), "-m", "{},{},{}".format(MIN_DEV, ROBS, ROBX)]
# Categories
SPAM = "S"
@ -46,44 +45,62 @@ class BogofilterResult:
self.score = score
class Mail:
def __init__(self, headers = {}, body = None):
self.headers = {**{
"Content-Type": "text/html; charset=\"UTF-8\""}, **headers}
self.body = body
def __init__(self, headers = {}, body = []):
self.headers = {**headers, "Content-Type": "text/html; charset=\"UTF-8\""}
self.change_body(body)
def change_body(self, body):
    """Replace the message body, storing it as a list of lines.

    A string is stripped of surrounding whitespace and split on newline
    characters; any other iterable is copied into a new list. ``None``
    is accepted and stored as an empty body instead of raising
    ``TypeError`` from ``list(None)``.
    """
    if body is None:
        self.body = []
    elif isinstance(body, str):
        self.body = body.strip().split("\n")
    else:
        # Copy so later changes to the caller's sequence do not leak in.
        self.body = list(body)
def format(self, mbox = False):
lines = []
if mbox:
lines.append("From {} {}\n".format(self.headers["From"], format_datetime(datetime.now(timezone.utc), usegmt = True)))
def format(self):
text = str()
for key, value in self.headers.items():
if key == "Subject":
value = "=?utf-8?Q?{}?=".format(quopri.encodestring(bytes(value, "utf-8"), header = True).decode("utf-8"))
lines.append("{}: {}".format(key, value))
lines.append("")
text += "{key}: {value}\n".format(key = key, value = value)
text += "\n"
if mbox:
lines.extend([">" + line if re.match(r">*From ", line) else line for line in self.body])
else:
lines.extend(self.body)
if self.body:
text += self.body
return "\n".join(lines)
text += "\n"
def deliver_to_mbox(self, path, mode = "a+"):
    """Write this message, formatted with ``self.format(mbox = True)``, to *path*.

    The file is opened with *mode* (append by default) as UTF-8. If the
    file already exists and is non-empty, a newline is written before the
    message so it starts on a fresh line.

    The original signature lacked ``self`` although the body calls
    ``self.format``; it is added here so the method can be invoked on an
    instance.
    """
    # Determine emptiness before opening: "a+" creates the file if missing.
    empty = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, mode, encoding = "utf-8") as mbox_file:
        if not empty:
            mbox_file.write("\n")
        mbox_file.write(self.format(mbox = True))
return text
def run(text, actions = [CLASSIFY], category = UNSURE):
def run(text, db_dir, actions = [CLASSIFY], category = UNSURE):
args = []
for action in actions:
args.extend(ACTIONS[action])
if not os.path.exists(DB_PATH):
subprocess.run(LOAD_COMMAND, input = b'')
if not os.path.exists(os.path.join(db_dir, "wordlist.db")):
os.makedirs(db_dir, exist_ok = True)
subprocess.run(LOAD_COMMAND + [db_dir], input = b'')
cp = subprocess.run(COMMAND + args, capture_output = True, encoding = "utf-8", input = text)
cp = subprocess.run(COMMAND + ["-d", db_dir] + args, capture_output = True, encoding = "utf-8", input = text)
arr = cp.stdout.strip().split(" ")
if len(arr) == 2:
(category, score) = arr
return BogofilterResult(category, float(score))
else:
if cp.returncode == 3:
print("Bogofilter error")
print("Bogofilter error:")
if cp.stdout.strip():
print(cp.stderr.strip())
return None
#mail = Mail({"From": "thor"}, "Hello, World\nHow are you?")
#print(mail.format())

@ -22,9 +22,12 @@ class BotClient:
self.bot = bot
self.config = {
"base_url": "https://{}".format(config["name"]),
"client_file": "secret/{}.client".format(config["name"]),
"user_file": "secret/{}.user".format(config["name"]),
"state_file": "state/{}.state".format(config["name"]), **config}
"client_file": os.path.join("clients", config["name"], "client.secret"),
"user_file": os.path.join("clients", config["name"], "user.secret"),
"state_file": os.path.join("clients", config["name"], "state.json"),
"cringe_dir": os.path.join("clients", config["name"], "cringe"),
"based_dir": os.path.join("clients", config["name"], "based"),
"unsure_dir": os.path.join("clients", config["name"], "unsure"), **config}
self.load_state()
@ -43,6 +46,7 @@ class BotClient:
def setup(self):
client_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["client_file"])
os.makedirs(os.path.dirname(client_file_path), exist_ok = True)
if not os.path.exists(client_file_path):
Mastodon.create_app(
self.app_name,
@ -50,6 +54,7 @@ class BotClient:
to_file = client_file_path)
user_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["user_file"])
os.makedirs(os.path.dirname(client_file_path), exist_ok = True)
if not os.path.exists(user_file_path):
api = Mastodon(
api_base_url = self.config["base_url"],
@ -130,15 +135,17 @@ class BotClient:
pass
def on_load_state(self):
if os.path.exists(self.config["state_file"]):
state_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["state_file"])
with open(self.config["state_file"]) as json_file:
state_file_path = os.path.join(os.path.dirname(sys.argv[0]), self.config["state_file"])
if os.path.exists(state_file_path):
os.makedirs(os.path.dirname(state_file_path), exist_ok = True)
with open(state_file_path) as json_file:
return json.load(json_file)
return copy.deepcopy(self.DEFAULT_STATE)
def on_save_state(self, state):
    """Serialise *state* as indented JSON into the configured state file.

    The ``state_file`` config entry is resolved against the directory of
    ``sys.argv[0]``; the parent directory is created if it is missing.
    """
    script_dir = os.path.dirname(sys.argv[0])
    target = os.path.join(script_dir, self.config["state_file"])
    parent = os.path.dirname(target)
    os.makedirs(parent, exist_ok = True)
    with open(target, "w") as handle:
        json.dump(state, handle, indent = 4)

@ -14,26 +14,38 @@ retry_rate = 60
poll_interval = 15
# Minutes until cringe statuses are deleted
max_age = 90
max_age = 600
[clients]
# By default, the client name is used as the hostname of the server and
# the base name of the files associated with the client, unless overridden
# below.
# The client name (in quotes below) is displayed in log messages, and is
# used as the default hostname of the server to connect to, and as the
# default base name of the various files and folders that are used by
# each client.
[clients."mastodon.social"]
# The URL of the server that the client connects to
#base_url = "https://mastodon.social"
# Where to store the authorisation key for the client
#client_file = "secret/mastodon.social.client"
#client_file = "clients/mastodon.social/client.secret"
# Where to store the authorisation key for the user account
#user_file = "secret/mastodon.social.user"
#user_file = "clients/mastodon.social/user.secret"
# Where to store the persisted state of the client
#state_file = "state/mastodon.social.state"
#state_file = "clients/mastodon.social/state.json"
# Where to store the training database
#db_dir = "."
# Where to store learned messages
#cringe_dir = "data/cringe"
#based_dir = "data/based"
#unsure_dir = "data/unsure"
# All paths are relative to the directory containing cringebot.py unless
# absolute paths are given.
# Whether or not to register (learn from) each categorised status
# Enabling this gives the bot more data to work with, but errors
# in categorisation will self-reinforce over time if not corrected.

@ -27,6 +27,15 @@ TIME_OF_DAY = {
class CringeBotClient(BotClient):
def __init__(self, bot, config):
config = {
"db_dir": ".",
"cringe_dir": "data/cringe",
"based_dir": "data/based",
"unsure_dir": "data/unsure",
**config}
self.db_dir = os.path.join(os.path.dirname(sys.argv[0]), config["db_dir"])
# Initialise HTML-to-Markdown converter
self.h2t = html2text.HTML2Text()
self.h2t.ignore_links = True
@ -56,26 +65,46 @@ class CringeBotClient(BotClient):
# Perform any scheduled deletes
self.deletion_scheduler.run(blocking = False)
def set_cringe(self, status_id):
    """Record *status_id* under "cringe" in the client state.

    The ID is added to the "cringe" set and discarded from the "based"
    and "unsure" sets.
    """
    for name in ("cringe", "based", "unsure"):
        bucket = self.state[name]
        if name == "cringe":
            bucket.add(status_id)
        else:
            bucket.discard(status_id)
def get_cringe_path(self, status_id):
    """Return the file path used to store *status_id* in the cringe directory.

    The path is built from the directory of ``sys.argv[0]``, the
    ``cringe_dir`` config entry and the status ID. The ID is converted
    with ``str()`` because ``os.path.join`` rejects non-string
    components such as integer IDs.
    """
    base_dir = os.path.dirname(sys.argv[0])
    return os.path.join(base_dir, self.config["cringe_dir"], str(status_id))
def set_based(self, status_id):
    """Record *status_id* under "based" in the client state.

    The ID is added to the "based" set and discarded from the "cringe"
    and "unsure" sets.
    """
    for name in ("cringe", "based", "unsure"):
        bucket = self.state[name]
        if name == "based":
            bucket.add(status_id)
        else:
            bucket.discard(status_id)
def set_unsure(self, status_id):
    """Record *status_id* under "unsure" in the client state.

    The ID is added to the "unsure" set and discarded from the "cringe"
    and "based" sets.
    """
    for name in ("cringe", "based", "unsure"):
        bucket = self.state[name]
        if name == "unsure":
            bucket.add(status_id)
        else:
            bucket.discard(status_id)
def set_discard(self, status_id):
    """Drop *status_id* from the "cringe", "based" and "unsure" sets."""
    for name in ("cringe", "based", "unsure"):
        self.state[name].discard(status_id)
def get_based_path(self, status_id):
    """Return the file path used to store *status_id* in the based directory.

    The path is built from the directory of ``sys.argv[0]``, the
    ``based_dir`` config entry and the status ID. The ID is converted
    with ``str()`` because ``os.path.join`` rejects non-string
    components such as integer IDs.
    """
    base_dir = os.path.dirname(sys.argv[0])
    return os.path.join(base_dir, self.config["based_dir"], str(status_id))
def get_unsure_path(self, status_id):
    """Return the file path used to store *status_id* in the unsure directory.

    The path is built from the directory of ``sys.argv[0]``, the
    ``unsure_dir`` config entry and the status ID. The ID is converted
    with ``str()`` because ``os.path.join`` rejects non-string
    components such as integer IDs.
    """
    base_dir = os.path.dirname(sys.argv[0])
    return os.path.join(base_dir, self.config["unsure_dir"], str(status_id))
def is_cringe(self, status_id):
    """Return the cringe-store path for *status_id* if that file exists, else None."""
    candidate = self.get_cringe_path(status_id)
    if not os.path.isfile(candidate):
        return None
    return candidate
def is_based(self, status_id):
    """Return the based-store path for *status_id* if that file exists, else None."""
    candidate = self.get_based_path(status_id)
    if not os.path.isfile(candidate):
        return None
    return candidate
def is_unsure(self, status_id):
    """Return the unsure-store path for *status_id* if that file exists, else None."""
    candidate = self.get_unsure_path(status_id)
    if not os.path.isfile(candidate):
        return None
    return candidate
def get_category_path(self, status_id):
    """Return the stored file path of *status_id* in whichever category holds it.

    The cringe, based and unsure checks are tried in that order and the
    first truthy result is returned; None is returned when none match.
    """
    for check_name in ("is_cringe", "is_based", "is_unsure"):
        found = getattr(self, check_name)(status_id)
        if found:
            return found
    return None
def delete_and_write(self, delete_path, write_path, text, mode = "a+"):
    """Remove one file (if present) and write *text* to another.

    *delete_path* is removed only when it is truthy and names an existing
    file. The parent directory of *write_path* is created as needed, then
    *text* is written to *write_path*, opened with *mode* as UTF-8.
    """
    stale = delete_path and os.path.isfile(delete_path)
    if stale:
        os.remove(delete_path)
    target_dir = os.path.dirname(write_path)
    os.makedirs(target_dir, exist_ok = True)
    with open(write_path, mode, encoding = "utf-8") as out_file:
        out_file.write(text)
def make_cringe(self, status_id, text):
    """Store *text* (plus a trailing newline) as the cringe copy of *status_id*.

    Any file currently found by ``get_category_path`` for this status is
    passed to ``delete_and_write`` for removal first.
    """
    previous = self.get_category_path(status_id)
    destination = self.get_cringe_path(status_id)
    self.delete_and_write(previous, destination, text + "\n")
def make_based(self, status_id, text):
    """Store *text* (plus a trailing newline) as the based copy of *status_id*.

    Any file currently found by ``get_category_path`` for this status is
    passed to ``delete_and_write`` for removal first.
    """
    previous = self.get_category_path(status_id)
    destination = self.get_based_path(status_id)
    self.delete_and_write(previous, destination, text + "\n")
def make_unsure(self, status_id, text):
    """Store *text* (plus a trailing newline) as the unsure copy of *status_id*.

    Any file currently found by ``get_category_path`` for this status is
    passed to ``delete_and_write`` for removal first.
    """
    previous = self.get_category_path(status_id)
    destination = self.get_unsure_path(status_id)
    self.delete_and_write(previous, destination, text + "\n")
# Look for replies to the bot and return True if commands were processed
def process_commands(self, status):
status_id = status["id"]
@ -101,11 +130,6 @@ class CringeBotClient(BotClient):
target_status = self.api.status(target_id)
target_mail_text = toot_dict_to_mail(target_status).format()
# Check if target status was previously classified
was_cringe = target_id in self.state["cringe"]
was_based = target_id in self.state["based"]
was_unsure = target_id in self.state["unsure"]
tokens = deque(command.split())
while True:
token = tokens.popleft()
@ -113,14 +137,14 @@ class CringeBotClient(BotClient):
if event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
return True
elif was_cringe:
elif self.is_cringe(target_id):
break
elif was_based:
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM])
elif self.is_based(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_HAM, bogofilter.LEARN_SPAM], db_dir = self.db_dir)
else:
bogofilter.run(target_mail_text, [bogofilter.LEARN_SPAM])
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_SPAM], db_dir = self.db_dir)
self.set_cringe(target_id)
self.make_cringe(target_id, target_mail_text)
self.enqueue_deletion(target_id)
self.respond(status, "Learned as cringe", {"event": "learn", "target": target_id})
@ -130,14 +154,14 @@ class CringeBotClient(BotClient):
if event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
return True
elif was_based:
elif self.is_based(target_id):
break
elif was_cringe:
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM])
elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM, bogofilter.LEARN_HAM], db_dir = self.db_dir)
else:
bogofilter.run(target_mail_text, [bogofilter.LEARN_HAM])
bogofilter.run(target_mail_text, actions = [bogofilter.LEARN_HAM], db_dir = self.db_dir)
self.set_based(target_id)
self.make_based(target_id, target_mail_text)
self.unqueue_deletion(target_id)
self.respond(status, "Learned as based", {"event": "learn", "target": target_id})
@ -147,14 +171,14 @@ class CringeBotClient(BotClient):
if event not in ["categorise", "learn"]:
self.respond(status, "Status is not learnable", {"event": "error", "type": "learnable", "target": target_id})
return True
elif was_unsure:
elif self.is_unsure(target_id):
break
elif was_cringe:
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_SPAM])
elif was_based:
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM])
elif self.is_cringe(target_id):
bogofilter.run(target_mail_text, actions = [bogofilter.UNLEARN_SPAM], db_dir = self.db_dir)
elif self.is_based(target_id):
bogofilter.run(target_mail_text, [bogofilter.UNLEARN_HAM], db_dir = self.db_dir)
self.set_unsure(target_id)
self.make_unsure(target_id, target_mail_text)
self.unqueue_deletion(target_id)
self.respond(status, "Unlearned", {"event": "learn", "target": target_id})
@ -188,7 +212,7 @@ class CringeBotClient(BotClient):
# Format and log plain-text preview
md_text = self.h2t.handle(status["content"])
preview = toot_dict_to_mail(status)
preview.body = md_text
preview.change_body(md_text)
preview_text = preview.format()
self.log()
@ -199,23 +223,23 @@ class CringeBotClient(BotClient):
if self.process_commands(status):
return
result = bogofilter.run(mail_text, [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY])
result = bogofilter.run(mail_text, actions = [bogofilter.CLASSIFY, bogofilter.REGISTER] if self.config["register"] else [bogofilter.CLASSIFY], db_dir = self.db_dir)
bogo_report = "Bogofilter: Category={}, Score={}".format(result.category, "{:.4f}".format(result.score))
if result.category == bogofilter.SPAM:
self.log("CRINGE: Enqueuing status {} for deletion".format(status_id))
if self.config["register"]:
self.set_cringe(status_id)
self.make_cringe(status_id, mail_text)
self.enqueue_deletion(status_id)
self.respond(status, "Categorised as cringe\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id})
elif result.category == bogofilter.HAM:
self.log("BASED: Not enqueueing status {} for deletion".format(status_id))
if self.config["register"]:
self.set_based(status_id)
self.make_based(status_id, mail_text)
self.respond(status, "Categorised as based\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id})
else:
self.log("UNSURE: Not enqueueing status {} for deletion".format(status_id))
if self.config["register"]:
self.set_unsure(status_id)
self.make_unsure(status_id, mail_text)
self.respond(status, "Categorised as unsure\n{}".format(bogo_report), context = {"event": "categorise", "target": status_id})
def on_load_state(self):
@ -269,12 +293,10 @@ class CringeBotClient(BotClient):
try:
self.log("Deleting status {}".format(status_id))
self.api.status_delete(status_id)
self.set_discard(status_id)
if status_id in self.state["own"]:
del self.state["own"][status_id]
except MastodonNotFoundError:
self.log("Cannot find status {} on server".format(status_id))
self.set_discard(status_id)
except Exception:
self.log(traceback.format_exc())
self.enqueue_deletion(status_id, 300)

2
secret/.gitignore vendored

@ -1,2 +0,0 @@
*
!.gitignore

2
state/.gitignore vendored

@ -1,2 +0,0 @@
*
!.gitignore
Loading…
Cancel
Save