From 54c31581d59df2a9c0ff50a95d91601b20964bcf Mon Sep 17 00:00:00 2001 From: Thor Harald Johansen Date: Tue, 3 Aug 2021 08:38:02 +0200 Subject: [PATCH] Move dicts around and adjust algorithm --- dict.csv => dicts/main.csv | 0 {sourcedicts => dicts}/mhyph.txt | 0 {sourcedicts => dicts}/unigram_freq.csv | 0 userdict.csv => dicts/user.dict | 0 mergedicts.py | 4 +- portbot.py | 55 +++++++++++++++++-------- 6 files changed, 39 insertions(+), 20 deletions(-) rename dict.csv => dicts/main.csv (100%) rename {sourcedicts => dicts}/mhyph.txt (100%) rename {sourcedicts => dicts}/unigram_freq.csv (100%) rename userdict.csv => dicts/user.dict (100%) diff --git a/dict.csv b/dicts/main.csv similarity index 100% rename from dict.csv rename to dicts/main.csv diff --git a/sourcedicts/mhyph.txt b/dicts/mhyph.txt similarity index 100% rename from sourcedicts/mhyph.txt rename to dicts/mhyph.txt diff --git a/sourcedicts/unigram_freq.csv b/dicts/unigram_freq.csv similarity index 100% rename from sourcedicts/unigram_freq.csv rename to dicts/unigram_freq.csv diff --git a/userdict.csv b/dicts/user.dict similarity index 100% rename from userdict.csv rename to dicts/user.dict diff --git a/mergedicts.py b/mergedicts.py index ab3e870..386d862 100644 --- a/mergedicts.py +++ b/mergedicts.py @@ -1,9 +1,9 @@ import itertools -with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f: +with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f: hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines()) -with open("sourcedicts/unigram_freq.csv", "r") as f: +with open("dicts/unigram_freq.csv", "r") as f: goog_words = set((w, int(f)) for w, f in itertools.islice((l.split(",") for l in f.read().splitlines()), 1, None)) diff --git a/portbot.py b/portbot.py index c4794bc..48d9019 100644 --- a/portbot.py +++ b/portbot.py @@ -1,23 +1,24 @@ import os import sys import time +from datetime import datetime, timedelta, timezone import toml import random import re import sched import math import string -import itertools -from datetime import datetime, timedelta, timezone from mastodon import Mastodon, MastodonNotFoundError from fedbot.bot import Bot, BotClient -POST_INTERVAL = timedelta(seconds = 15) TEST = "test" in sys.argv[1:] -PORT_PCT = 30 -MAX_DICT_LEN = 100 -MAX_GEN_LEN = 15 +USER_PCT = 35 +MIN_MAIN_LEN = 3 +MAX_PORT_LEN = 14 +MAIN_DICT_PATH = "dicts/main.csv" +USER_DICT_PATH = "dicts/user.dict" +USED_DICT_PATH = "dicts/used.dict" def next_dt(): dt = datetime.now(timezone.utc) @@ -27,7 +28,6 @@ def next_dt(): microseconds = dt.microsecond) return dt - config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml") loaded_config = { "name": "portmanteaubot", @@ -42,7 +42,9 @@ def overlap_words(left_word, right_word): if left_word == right_word: return set() - offset = 2 + min_shift = 2 + + offset = min_shift attempts = set() while offset + 2 <= len(left_word[0]): if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]): @@ -52,7 +54,7 @@ def overlap_words(left_word, right_word): offset += 1 offset = len(right_word[0]) - 2 - while offset >= 0: + while offset >= 2: if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]): word_str = left_word[0] + right_word[0][offset + 2:] if len(word_str) >= 6 and not is_affixed(word_str): @@ -74,26 +76,35 @@ def pick_one_word(words): return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0] +def word_diff(a, b): + seq = difflib.SequenceMatcher(None, a, b) + return seq.ratio() + class WordMaker: def __init__(self): print("Loading dictionaries") illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace)) - with open ("dict.csv", "r") as f: + with open (MAIN_DICT_PATH, "r") as f: self.main_words = { sl[0] : (int(sl[1]), tuple(sl[2].split("="))) for sl in (tuple(l.split(",")) for l in f.read().splitlines()) if - len(sl[0]) >= 3 and - len(sl[0]) < MAX_DICT_LEN and + len(sl[0]) >= MIN_MAIN_LEN and not any(c in illegal for c in sl[0])} - with open("userdict.csv", "r") as f: + with open(USER_DICT_PATH, "r") as f: self.user_words = {l : (1, None) for l in f.read().splitlines()} - self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()} + if os.path.exists(USED_DICT_PATH): + with open("dicts/used.dict", "r") as f: + used_words = {l : (1, None) for l in f.read().splitlines()} + else: + used_words = dict() + + self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()} def extend_word(self, prev_word): - user_dict = random.randint(0, 100) < PORT_PCT + user_dict = random.randint(0, 100) < USER_PCT if user_dict: next_dict = self.user_words else: @@ -108,7 +119,7 @@ class WordMaker: max_len = max(len(w) for w in new_words) for w in new_words: - new_words[w] = (max_len + 1 - len(w), None) + new_words[w] = (math.pow(max_len + 1 - len(w), 3), None) while len(new_words) > 0: new_word = pick_one_word(new_words) @@ -120,7 +131,7 @@ class WordMaker: return None def get_portmanteau(self, target_times = 1): - user_dict = random.randint(0, 100) < PORT_PCT + user_dict = random.randint(0, 100) < USER_PCT if user_dict: words = self.user_words else: @@ -142,7 +153,7 @@ class WordMaker: if times == 0: break - if len(word[0]) < MAX_GEN_LEN: + if len(word[0]) < MAX_PORT_LEN: break word_str = word[0].lower() @@ -154,11 +165,19 @@ class WordMaker: def get_portmanteaus(self, count = 10): words = set() + used_words = dict() while count > 0: word_str = self.get_portmanteau() if word_str not in words: words.add(word_str) + used_words[word_str] = (1, None) count -= 1 + + self.all_words.update(used_words) + + if not TEST: + with open("dicts/used.dict", "a") as f: + f.write("\n".join(used_words.keys()) + "\n") return words