Move dicts around and adjust algorithm

3 years ago · 54c31581d5
parent 3ceb937911
commit 54c31581d5
6 changed files with 39 additions and 20 deletions
--- a/dicts/main.csv
+++ b/dicts/main.csv
--- a/sourcedicts/mhyph.txt
+++ b/sourcedicts/mhyph.txt
--- a/sourcedicts/unigram_freq.csv
+++ b/sourcedicts/unigram_freq.csv
--- a/dicts/user.dict
+++ b/dicts/user.dict
--- a/mergedicts.py
+++ b/mergedicts.py
@@ -1,9 +1,9 @@
 import itertools

-with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
+with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
    hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines())

-with open("sourcedicts/unigram_freq.csv", "r") as f:
+with open("dicts/unigram_freq.csv", "r") as f:
    goog_words = set((w, int(f)) for w, f in
        itertools.islice((l.split(",") for
            l in f.read().splitlines()), 1, None))
--- a/portbot.py
+++ b/portbot.py
@@ -1,23 +1,24 @@
 import os
 import sys
 import time
+from datetime import datetime, timedelta, timezone
 import toml
 import random
 import re
 import sched
 import math
 import string
-import itertools
-from datetime import datetime, timedelta, timezone

 from mastodon import Mastodon, MastodonNotFoundError
 from fedbot.bot import Bot, BotClient

-POST_INTERVAL = timedelta(seconds = 15)
 TEST = "test" in sys.argv[1:]
-PORT_PCT = 30
-MAX_DICT_LEN = 100
-MAX_GEN_LEN = 15
+USER_PCT = 35
+MIN_MAIN_LEN = 3
+MAX_PORT_LEN = 14
+MAIN_DICT_PATH = "dicts/main.csv"
+USER_DICT_PATH = "dicts/user.dict"
+USED_DICT_PATH = "dicts/used.dict"

 def next_dt():
    dt = datetime.now(timezone.utc)
@@ -27,7 +28,6 @@ def next_dt():
                    microseconds = dt.microsecond)
    return dt

-
 config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
 loaded_config = {
    "name": "portmanteaubot",
@@ -42,7 +42,9 @@ def overlap_words(left_word, right_word):
    if left_word == right_word:
        return set()

-    offset = 2
+    min_shift = 2
+
+    offset = min_shift
    attempts = set()
    while offset + 2 <= len(left_word[0]):
        if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
@@ -52,7 +54,7 @@ def overlap_words(left_word, right_word):
        offset += 1

    offset = len(right_word[0]) - 2
-    while offset >= 0:
+    while offset >= 2:
        if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
            word_str = left_word[0] + right_word[0][offset + 2:]
            if len(word_str) >= 6 and not is_affixed(word_str):
@@ -74,26 +76,35 @@ def pick_one_word(words):
   
    return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]

+def word_diff(a, b):  # similarity ratio of sequences a and b, from difflib.SequenceMatcher
+    import difflib  # imported here because portbot.py's top-level import block has no difflib
+    return difflib.SequenceMatcher(None, a, b).ratio()  # None: no junk-element filter
+
 class WordMaker:
    def __init__(self):
        print("Loading dictionaries")
        illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
-        with open ("dict.csv", "r") as f:
+        with open (MAIN_DICT_PATH, "r") as f:
            self.main_words = {
                sl[0] :
                    (int(sl[1]), tuple(sl[2].split("="))) for sl in
                        (tuple(l.split(",")) for l in f.read().splitlines()) if 
-                    len(sl[0]) >= 3 and
-                    len(sl[0]) < MAX_DICT_LEN and
+                    len(sl[0]) >= MIN_MAIN_LEN and
                    not any(c in illegal for c in sl[0])}
        
-        with open("userdict.csv", "r") as f:
+        with open(USER_DICT_PATH, "r") as f:
            self.user_words = {l : (1, None) for l in f.read().splitlines()}
 
-        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
+        if os.path.exists(USED_DICT_PATH):  # the used-words file is optional; fall back to an empty dict
+            with open(USED_DICT_PATH, "r") as f:  # open the same path the exists() check just tested
+                used_words = {l : (1, None) for l in f.read().splitlines()}
+        else:
+            used_words = dict()
+
+        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()}

    def extend_word(self, prev_word):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
        if user_dict:
            next_dict = self.user_words
        else:
@@ -108,7 +119,7 @@ class WordMaker:

        max_len = max(len(w) for w in new_words)
        for w in new_words:
-            new_words[w] = (max_len + 1 - len(w), None)
+            new_words[w] = (math.pow(max_len + 1 - len(w), 3), None)

        while len(new_words) > 0:
            new_word = pick_one_word(new_words)
@@ -120,7 +131,7 @@ class WordMaker:
        return None

    def get_portmanteau(self, target_times = 1):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
        if user_dict:
            words = self.user_words
        else:
@@ -142,7 +153,7 @@ class WordMaker:
                if times == 0:
                    break

-            if len(word[0]) < MAX_GEN_LEN:
+            if len(word[0]) < MAX_PORT_LEN:
                break

        word_str = word[0].lower()
@@ -154,11 +165,19 @@ class WordMaker:
    def get_portmanteaus(self, count = 10):
        words = set()

+        used_words = dict()  # words produced by this call; values use the same (1, None) shape as user_words
        while count > 0:
            word_str = self.get_portmanteau()
            if word_str not in words:
                words.add(word_str)
+                used_words[word_str] = (1, None)
                count -= 1
+        
+        self.all_words.update(used_words)  # fold this call's results into the in-memory word table
+        
+        if not TEST and used_words:  # skip when empty: "\n".join([]) + "\n" would append a blank entry
+            with open(USED_DICT_PATH, "a") as f:  # same file __init__ reads back through USED_DICT_PATH
+                f.write("\n".join(used_words.keys()) + "\n")

        return words