From 54c31581d59df2a9c0ff50a95d91601b20964bcf Mon Sep 17 00:00:00 2001
From: Thor Harald Johansen <thj@thj.no>
Date: Tue, 3 Aug 2021 08:38:02 +0200
Subject: [PATCH] Move dicts around and adjust algorithm

---
 dict.csv => dicts/main.csv              |  0
 {sourcedicts => dicts}/mhyph.txt        |  0
 {sourcedicts => dicts}/unigram_freq.csv |  0
 userdict.csv => dicts/user.dict         |  0
 mergedicts.py                           |  4 +-
 portbot.py                              | 55 +++++++++++++++++--------
 6 files changed, 39 insertions(+), 20 deletions(-)
 rename dict.csv => dicts/main.csv (100%)
 rename {sourcedicts => dicts}/mhyph.txt (100%)
 rename {sourcedicts => dicts}/unigram_freq.csv (100%)
 rename userdict.csv => dicts/user.dict (100%)

diff --git a/dict.csv b/dicts/main.csv
similarity index 100%
rename from dict.csv
rename to dicts/main.csv
diff --git a/sourcedicts/mhyph.txt b/dicts/mhyph.txt
similarity index 100%
rename from sourcedicts/mhyph.txt
rename to dicts/mhyph.txt
diff --git a/sourcedicts/unigram_freq.csv b/dicts/unigram_freq.csv
similarity index 100%
rename from sourcedicts/unigram_freq.csv
rename to dicts/unigram_freq.csv
diff --git a/userdict.csv b/dicts/user.dict
similarity index 100%
rename from userdict.csv
rename to dicts/user.dict
diff --git a/mergedicts.py b/mergedicts.py
index ab3e870..386d862 100644
--- a/mergedicts.py
+++ b/mergedicts.py
@@ -1,9 +1,9 @@
 import itertools
 
-with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
+with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
     hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines())
 
-with open("sourcedicts/unigram_freq.csv", "r") as f:
+with open("dicts/unigram_freq.csv", "r") as f:
     goog_words = set((w, int(f)) for w, f in
         itertools.islice((l.split(",") for
             l in f.read().splitlines()), 1, None))
diff --git a/portbot.py b/portbot.py
index c4794bc..48d9019 100644
--- a/portbot.py
+++ b/portbot.py
@@ -1,23 +1,24 @@
 import os
 import sys
 import time
+from datetime import datetime, timedelta, timezone
 import toml
 import random
 import re
 import sched
 import math
 import string
-import itertools
-from datetime import datetime, timedelta, timezone
 
 from mastodon import Mastodon, MastodonNotFoundError
 from fedbot.bot import Bot, BotClient
 
-POST_INTERVAL = timedelta(seconds = 15)
 TEST = "test" in sys.argv[1:]
-PORT_PCT = 30
-MAX_DICT_LEN = 100
-MAX_GEN_LEN = 15
+USER_PCT = 35
+MIN_MAIN_LEN = 3
+MAX_PORT_LEN = 14
+MAIN_DICT_PATH = "dicts/main.csv"
+USER_DICT_PATH = "dicts/user.dict"
+USED_DICT_PATH = "dicts/used.dict"
 
 def next_dt():
     dt = datetime.now(timezone.utc)
@@ -27,7 +28,6 @@ def next_dt():
                     microseconds = dt.microsecond)
     return dt
 
-
 config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
 loaded_config = {
     "name": "portmanteaubot",
@@ -42,7 +42,9 @@ def overlap_words(left_word, right_word):
     if left_word == right_word:
         return set()
 
-    offset = 2
+    min_shift = 2
+
+    offset = min_shift
     attempts = set()
     while offset + 2 <= len(left_word[0]):
         if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
@@ -52,7 +54,7 @@ def overlap_words(left_word, right_word):
         offset += 1
 
     offset = len(right_word[0]) - 2
-    while offset >= 0:
+    while offset >= 2:
         if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
             word_str = left_word[0] + right_word[0][offset + 2:]
             if len(word_str) >= 6 and not is_affixed(word_str):
@@ -74,26 +76,35 @@ def pick_one_word(words):
    
     return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]
 
+def word_diff(a, b):  # returns difflib's similarity ratio of a and b (1.0 means identical)
+    import difflib  # imported locally: the import block shown in this patch does not import difflib
+    return difflib.SequenceMatcher(None, a, b).ratio()
+
 class WordMaker:
     def __init__(self):
         print("Loading dictionaries")
         illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
-        with open ("dict.csv", "r") as f:
+        with open (MAIN_DICT_PATH, "r") as f:
             self.main_words = {
                 sl[0] :
                     (int(sl[1]), tuple(sl[2].split("="))) for sl in
                         (tuple(l.split(",")) for l in f.read().splitlines()) if 
-                    len(sl[0]) >= 3 and
-                    len(sl[0]) < MAX_DICT_LEN and
+                    len(sl[0]) >= MIN_MAIN_LEN and
                     not any(c in illegal for c in sl[0])}
         
-        with open("userdict.csv", "r") as f:
+        with open(USER_DICT_PATH, "r") as f:
             self.user_words = {l : (1, None) for l in f.read().splitlines()}
  
-        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
+        if os.path.exists(USED_DICT_PATH):  # the used-words file is optional; fall back to an empty dict
+            with open(USED_DICT_PATH, "r") as f:  # open the same path that was just checked
+                used_words = {l : (1, None) for l in f.read().splitlines()}
+        else:
+            used_words = dict()
+        # On duplicate keys the later dict wins (main < user < used); keys are lower-cased.
+        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()}
 
     def extend_word(self, prev_word):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
         if user_dict:
             next_dict = self.user_words
         else:
@@ -108,7 +119,7 @@ class WordMaker:
 
         max_len = max(len(w) for w in new_words)
         for w in new_words:
-            new_words[w] = (max_len + 1 - len(w), None)
+            new_words[w] = (math.pow(max_len + 1 - len(w), 3), None)
 
         while len(new_words) > 0:
             new_word = pick_one_word(new_words)
@@ -120,7 +131,7 @@ class WordMaker:
         return None
 
     def get_portmanteau(self, target_times = 1):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
         if user_dict:
             words = self.user_words
         else:
@@ -142,7 +153,7 @@ class WordMaker:
                 if times == 0:
                     break
 
-            if len(word[0]) < MAX_GEN_LEN:
+            if len(word[0]) < MAX_PORT_LEN:
                 break
 
         word_str = word[0].lower()
@@ -154,11 +165,19 @@ class WordMaker:
     def get_portmanteaus(self, count = 10):
         words = set()
 
+        used_words = dict()
         while count > 0:
             word_str = self.get_portmanteau()
             if word_str not in words:
                 words.add(word_str)
+                used_words[word_str] = (1, None)
                 count -= 1
+        # Merge this batch into all_words using the same (1, None) entry shape as user words.
+        self.all_words.update(used_words)
+        # Persist the batch unless in test mode; skip the write when nothing was generated.
+        if not TEST and used_words:
+            with open(USED_DICT_PATH, "a") as f:
+                f.write("\n".join(used_words.keys()) + "\n")
 
         return words