Move dicts around and adjust algorithm

Branch: master
Author: Thor, 3 years ago
Parent: 3ceb937911
Commit: 54c31581d5
Changed files:
  dicts/main.csv          (0 lines changed)
  dicts/mhyph.txt         (0 lines changed)
  dicts/unigram_freq.csv  (0 lines changed)
  dicts/user.dict         (0 lines changed)
  mergedicts.py           (4 lines changed)
  portbot.py              (55 lines changed)

--- a/mergedicts.py
+++ b/mergedicts.py
@@ -1,9 +1,9 @@
 import itertools
-with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
+with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
     hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines())
-with open("sourcedicts/unigram_freq.csv", "r") as f:
+with open("dicts/unigram_freq.csv", "r") as f:
     goog_words = set((w, int(f)) for w, f in
         itertools.islice((l.split(",") for
             l in f.read().splitlines()), 1, None))

--- a/portbot.py
+++ b/portbot.py
@@ -1,23 +1,24 @@
 import os
 import sys
 import time
-from datetime import datetime, timedelta, timezone
 import toml
 import random
 import re
 import sched
 import math
 import string
+import itertools
+from datetime import datetime, timedelta, timezone
 from mastodon import Mastodon, MastodonNotFoundError
 from fedbot.bot import Bot, BotClient
+POST_INTERVAL = timedelta(seconds = 15)
 TEST = "test" in sys.argv[1:]
-PORT_PCT = 30
-MAX_DICT_LEN = 100
-MAX_GEN_LEN = 15
+USER_PCT = 35
+MIN_MAIN_LEN = 3
+MAX_PORT_LEN = 14
+MAIN_DICT_PATH = "dicts/main.csv"
+USER_DICT_PATH = "dicts/user.dict"
+USED_DICT_PATH = "dicts/used.dict"
 def next_dt():
     dt = datetime.now(timezone.utc)
@@ -27,7 +28,6 @@ def next_dt():
         microseconds = dt.microsecond)
     return dt
 config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
 loaded_config = {
     "name": "portmanteaubot",
@ -42,7 +42,9 @@ def overlap_words(left_word, right_word):
if left_word == right_word: if left_word == right_word:
return set() return set()
offset = 2 min_shift = 2
offset = min_shift
attempts = set() attempts = set()
while offset + 2 <= len(left_word[0]): while offset + 2 <= len(left_word[0]):
if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]): if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
@@ -52,7 +54,7 @@ def overlap_words(left_word, right_word):
         offset += 1
     offset = len(right_word[0]) - 2
-    while offset >= 0:
+    while offset >= 2:
         if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
             word_str = left_word[0] + right_word[0][offset + 2:]
             if len(word_str) >= 6 and not is_affixed(word_str):
@@ -74,26 +76,35 @@ def pick_one_word(words):
     return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]
+def word_diff(a, b):
+    seq = difflib.SequenceMatcher(None, a, b)
+    return seq.ratio()
 class WordMaker:
     def __init__(self):
         print("Loading dictionaries")
         illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
-        with open ("dict.csv", "r") as f:
+        with open (MAIN_DICT_PATH, "r") as f:
             self.main_words = {
                 sl[0] :
                 (int(sl[1]), tuple(sl[2].split("="))) for sl in
                 (tuple(l.split(",")) for l in f.read().splitlines()) if
-                len(sl[0]) >= 3 and
-                len(sl[0]) < MAX_DICT_LEN and
+                len(sl[0]) >= MIN_MAIN_LEN and
                 not any(c in illegal for c in sl[0])}
-        with open("userdict.csv", "r") as f:
+        with open(USER_DICT_PATH, "r") as f:
             self.user_words = {l : (1, None) for l in f.read().splitlines()}
-        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
+        if os.path.exists(USED_DICT_PATH):
+            with open("dicts/used.dict", "r") as f:
+                used_words = {l : (1, None) for l in f.read().splitlines()}
+        else:
+            used_words = dict()
+        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()}
     def extend_word(self, prev_word):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
         if user_dict:
             next_dict = self.user_words
         else:
@@ -108,7 +119,7 @@ class WordMaker:
         max_len = max(len(w) for w in new_words)
         for w in new_words:
-            new_words[w] = (max_len + 1 - len(w), None)
+            new_words[w] = (math.pow(max_len + 1 - len(w), 3), None)
         while len(new_words) > 0:
             new_word = pick_one_word(new_words)
@@ -120,7 +131,7 @@ class WordMaker:
         return None
     def get_portmanteau(self, target_times = 1):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
         if user_dict:
             words = self.user_words
         else:
@@ -142,7 +153,7 @@ class WordMaker:
             if times == 0:
                 break
-            if len(word[0]) < MAX_GEN_LEN:
+            if len(word[0]) < MAX_PORT_LEN:
                 break
             word_str = word[0].lower()
@@ -154,11 +165,19 @@ class WordMaker:
     def get_portmanteaus(self, count = 10):
         words = set()
+        used_words = dict()
         while count > 0:
             word_str = self.get_portmanteau()
             if word_str not in words:
                 words.add(word_str)
+                used_words[word_str] = (1, None)
                 count -= 1
+        self.all_words.update(used_words)
+        if not TEST:
+            with open("dicts/used.dict", "a") as f:
+                f.write("\n".join(used_words.keys()) + "\n")
         return words
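
Note on the algorithm change: the candidate weight fed to pick_one_word goes from the linear max_len + 1 - len(w) to its cube, which skews the random pick much more strongly toward short candidate words. The following is a minimal standalone sketch of that effect, not part of the commit; the example candidates dict and the printouts are made up for illustration.

    import math
    import random

    def pick_one_word(words):
        # same shape as in portbot.py: word -> (weight, parts)
        return random.choices(list(words.items()), weights=[v[0] for v in words.values()])[0]

    candidates = {"cat": None, "catalog": None, "catastrophe": None}
    max_len = max(len(w) for w in candidates)

    linear = {w: (max_len + 1 - len(w), None) for w in candidates}               # old weighting
    cubed  = {w: (math.pow(max_len + 1 - len(w), 3), None) for w in candidates}  # new weighting

    # "cat" outweighs "catastrophe" 9:1 under the old scheme and 729:1 under the
    # new one, so the cubed weight pushes picks much harder toward short words.
    print({w: (linear[w][0], cubed[w][0]) for w in candidates})
    print(pick_one_word(cubed)[0])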
