Edits to the user dictionary

Update location of used words dict in .gitignore
Add barebones README file
10 changed files with 135264 additions and 25 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,4 +2,5 @@
 !.gitignore
 !.gitmodules
 config.toml
+dicts/used.dict
 clients/
--- a/README.md
+++ b/README.md
@ -0,0 +1,12 @@
+# PortmanteauBot
+
+### Dictionaries
+
+#### Word frequency
+https://www.kaggle.com/rtatman/english-word-frequency
+
+#### Hyphenation
+https://www.gutenberg.org/files/3204/files/
+
+#### Pronunciation
+http://www.speech.cs.cmu.edu/cgi-bin/cmudict
--- a/dicts/cmudict.dict
+++ b/dicts/cmudict.dict
--- a/dicts/cmudict.phones
+++ b/dicts/cmudict.phones
@ -0,0 +1,39 @@
+AA	vowel
+AE	vowel
+AH	vowel
+AO	vowel
+AW	vowel
+AY	vowel
+B	stop
+CH	affricate
+D	stop
+DH	fricative
+EH	vowel
+ER	vowel
+EY	vowel
+F	fricative
+G	stop
+HH	aspirate
+IH	vowel
+IY	vowel
+JH	affricate
+K	stop
+L	liquid
+M	nasal
+N	nasal
+NG	nasal
+OW	vowel
+OY	vowel
+P	stop
+R	liquid
+S	fricative
+SH	fricative
+T	stop
+TH	fricative
+UH	vowel
+UW	vowel
+V	fricative
+W	semivowel
+Y	semivowel
+Z	fricative
+ZH	fricative
--- a/dicts/main.csv
+++ b/dicts/main.csv
--- a/sourcedicts/mhyph.txt
+++ b/sourcedicts/mhyph.txt
--- a/sourcedicts/unigram_freq.csv
+++ b/sourcedicts/unigram_freq.csv
--- a/dicts/user.dict
+++ b/dicts/user.dict
@ -30,6 +30,7 @@ blaseball
 blaze
 bloatware
 blob
+blobby
 blobcat
 blockchain
 blogger
@ -39,6 +40,7 @@ Bluetooth
 boner
 boob
 boomer
+boost
 boot
 bootstrap
 booty
@ -83,6 +85,7 @@ cookie
 cool
 coomer
 crab
+cranky
 crap
 creeper
 crew
@ -102,8 +105,6 @@ dank
 darkweb
 Debian
 Democrat
-derp
-derpina
 desu
 dildo
 doge
@ -145,6 +146,10 @@ fangirl
 fappable
 fapworthy
 fart
+fav
+fave
+favorite
+favourite
 fedi
 fediverse
 fedora
@ -167,6 +172,7 @@ futa
 futurama
 game
 gamer
+Gamergate
 gang
 gangsta
 gangster
@ -199,7 +205,6 @@ hellthread
 hentai
 herp
 hipster
-hodl
 hoodie
 horny
 huge
@ -224,6 +229,7 @@ kalm
 Kickstarter
 Kirby
 Knuckles
+kuudere
 Kraftwerk
 lame
 lamer
@ -231,6 +237,7 @@ laptop
 legend
 lesbian
 liberal
+like
 Link
 LinkedIn
 livestream
@ -297,6 +304,8 @@ photo
 photobomb
 Pikachu
 pirate
+pissed
+pissy
 Pixelfed
 player
 PlayStation
@ -335,6 +344,8 @@ roleplayer
 sage
 sapphic
 satan
+scream
+screamer
 selfie
 senpai
 sex
@ -355,7 +366,6 @@ smol
 smug
 smut
 Snapchat
-Snowden
 snug
 snuggle
 software
@ -369,12 +379,14 @@ spook
 spooky
 Spotify
 squad
-Squidward
 startup
 steampunk
 stim
 stonk
 stonks
+sub
+submit
+submissive
 subtweet
 swole
 tentacle
@ -400,6 +412,7 @@ trigger
 triggered
 trollface
 trolltard
+troon
 tsundere
 Twitch
 Uber
--- a/mergedicts.py
+++ b/mergedicts.py
@ -1,9 +1,9 @@
 import itertools

-with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
+with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
    hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines())

-with open("sourcedicts/unigram_freq.csv", "r") as f:
+with open("dicts/unigram_freq.csv", "r") as f:
    goog_words = set((w, int(f)) for w, f in
        itertools.islice((l.split(",") for
            l in f.read().splitlines()), 1, None))
--- a/portbot.py
+++ b/portbot.py
@ -1,23 +1,24 @@
 import os
 import sys
 import time
+from datetime import datetime, timedelta, timezone
 import toml
 import random
 import re
 import sched
 import math
 import string
-import itertools
-from datetime import datetime, timedelta, timezone

 from mastodon import Mastodon, MastodonNotFoundError
 from fedbot.bot import Bot, BotClient

-POST_INTERVAL = timedelta(seconds = 15)
 TEST = "test" in sys.argv[1:]
-PORT_PCT = 30
-MAX_DICT_LEN = 100
-MAX_GEN_LEN = 15
+USER_PCT = 35
+MIN_MAIN_LEN = 3
+MAX_PORT_LEN = 14
+MAIN_DICT_PATH = "dicts/main.csv"
+USER_DICT_PATH = "dicts/user.dict"
+USED_DICT_PATH = "dicts/used.dict"

 def next_dt():
    dt = datetime.now(timezone.utc)
@ -27,7 +28,6 @@ def next_dt():
                    microseconds = dt.microsecond)
    return dt

-
 config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
 loaded_config = {
    "name": "portmanteaubot",
@ -42,7 +42,9 @@ def overlap_words(left_word, right_word):
    if left_word == right_word:
        return set()

-    offset = 2
+    min_shift = 2
+
+    offset = min_shift
    attempts = set()
    while offset + 2 <= len(left_word[0]):
        if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
@ -52,7 +54,7 @@ def overlap_words(left_word, right_word):
        offset += 1

    offset = len(right_word[0]) - 2
-    while offset >= 0:
+    while offset >= 2:
        if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
            word_str = left_word[0] + right_word[0][offset + 2:]
            if len(word_str) >= 6 and not is_affixed(word_str):
@ -74,26 +76,35 @@ def pick_one_word(words):
   
    return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]

+def word_diff(a, b):
+    """Return the similarity ratio of two sequences.
+
+    Despite the name, a higher value means the inputs are MORE alike:
+    the result is difflib.SequenceMatcher(None, a, b).ratio(), a float
+    in [0, 1] where 1.0 means the sequences are identical.
+    """
+    # Imported locally: difflib is not in this module's import block.
+    import difflib
+
+    seq = difflib.SequenceMatcher(None, a, b)
+    return seq.ratio()
+
 class WordMaker:
    def __init__(self):
+        """Load the main, user and (optional) used-word dictionaries.
+
+        self.main_words maps word -> (int(column 2), tuple of column 3
+        split on "=") for rows of MAIN_DICT_PATH whose word is at least
+        MIN_MAIN_LEN characters long and contains no uppercase letter,
+        punctuation, digit or whitespace.
+        self.user_words maps every line of USER_DICT_PATH to (1, None).
+        self.all_words is the lower-cased-key merge of both plus the
+        words listed in USED_DICT_PATH, when that file exists.
+        """
        print("Loading dictionaries")
        illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
-        with open ("dict.csv", "r") as f:
+        with open (MAIN_DICT_PATH, "r") as f:
            self.main_words = {
                sl[0] :
                    (int(sl[1]), tuple(sl[2].split("="))) for sl in
                        (tuple(l.split(",")) for l in f.read().splitlines()) if 
-                    len(sl[0]) >= 3 and
-                    len(sl[0]) < MAX_DICT_LEN and
+                    len(sl[0]) >= MIN_MAIN_LEN and
                    not any(c in illegal for c in sl[0])}
        
-        with open("userdict.csv", "r") as f:
+        with open(USER_DICT_PATH, "r") as f:
            self.user_words = {l : (1, None) for l in f.read().splitlines()}
 
-        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
+        # Open the same path that is tested for existence, so the check and
+        # the read cannot drift apart if USED_DICT_PATH is ever changed.
+        if os.path.exists(USED_DICT_PATH):
+            with open(USED_DICT_PATH, "r") as f:
+                used_words = {l : (1, None) for l in f.read().splitlines()}
+        else:
+            used_words = dict()
+
+        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()}

    def extend_word(self, prev_word):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
        if user_dict:
            next_dict = self.user_words
        else:
@ -108,7 +119,7 @@ class WordMaker:

        max_len = max(len(w) for w in new_words)
        for w in new_words:
-            new_words[w] = (max_len + 1 - len(w), None)
+            new_words[w] = (math.pow(max_len + 1 - len(w), 3), None)

        while len(new_words) > 0:
            new_word = pick_one_word(new_words)
@ -120,7 +131,7 @@ class WordMaker:
        return None

    def get_portmanteau(self, target_times = 1):
-        user_dict = random.randint(0, 100) < PORT_PCT
+        user_dict = random.randint(0, 100) < USER_PCT
        if user_dict:
            words = self.user_words
        else:
@ -142,7 +153,7 @@ class WordMaker:
                if times == 0:
                    break

-            if len(word[0]) < MAX_GEN_LEN:
+            if len(word[0]) < MAX_PORT_LEN:
                break

        word_str = word[0].lower()
@ -154,11 +165,19 @@ class WordMaker:
    def get_portmanteaus(self, count = 10):
+        """Generate `count` distinct portmanteaus and record them as used.
+
+        Each new word is added to self.all_words and, unless the
+        module-level TEST flag is set, appended to the file at
+        USED_DICT_PATH, one word per line.
+
+        Returns the set of generated word strings.
+        """
        words = set()

+        used_words = dict()
        while count > 0:
            word_str = self.get_portmanteau()
            if word_str not in words:
                words.add(word_str)
+                used_words[word_str] = (1, None)
                count -= 1
+
+        self.all_words.update(used_words)
+
+        # Skip the write when nothing was generated (count <= 0); otherwise a
+        # lone "\n" would be appended and later loaded as an empty word.
+        if used_words and not TEST:
+            with open(USED_DICT_PATH, "a") as f:
+                f.write("\n".join(used_words) + "\n")

        return words
Author	SHA1	Message	Date
Thor	cd2933759e	Edits to the user dictionary	3 years ago
Thor	0fc1304a5d	Update location of used words dict in .gitignore	3 years ago
Thor	03d8419a74	Add barebones README file	3 years ago
Thor	40f1496160	Add CMU pronunciation dictionary files	3 years ago
Thor	54c31581d5	Move dicts around and adjust algorithm	3 years ago