Compare commits

...

4 Commits

6 changed files:

  1. dict.csv (51361 changed lines)
  2. mergedicts.py (20 changed lines)
  3. portbot.py (170 changed lines)
  4. sourcedicts/mhyph.txt (0 changed lines)
  5. sourcedicts/unigram_freq.csv (0 changed lines)
  6. userdict.csv (55 changed lines)

dict.csv (51361 changed lines)

File diff suppressed because it is too large.

mergedicts.py (20 changed lines)

@@ -0,0 +1,20 @@
+import itertools
+
+with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
+    hyph_words = set(tuple(l.split("•")) for l in f.read().splitlines())
+
+with open("sourcedicts/unigram_freq.csv", "r") as f:
+    goog_words = set((w, int(f)) for w, f in
+        itertools.islice((l.split(",") for
+            l in f.read().splitlines()), 1, None))
+
+hyph_map = {"".join(x) : x for x in hyph_words}
+
+words = dict()
+for w, f in sorted(goog_words, key = lambda w: w[1], reverse = True):
+    hw = hyph_map.get(w)
+    if hw:
+        words[w] = (f, hw)
+
+for w, (f, hw) in words.items():
+    print("{},{},{}".format(w, f, "=".join(hw)))

portbot.py (170 changed lines)

@@ -7,6 +7,7 @@ import re
 import sched
 import math
 import string
+import itertools
 from datetime import datetime, timedelta, timezone
 from mastodon import Mastodon, MastodonNotFoundError
@@ -14,12 +15,14 @@ from fedbot.bot import Bot, BotClient
 POST_INTERVAL = timedelta(seconds = 15)
 TEST = "test" in sys.argv[1:]
-PORT_PCT = 67
+PORT_PCT = 30
+MAX_DICT_LEN = 100
+MAX_GEN_LEN = 15
 
 def next_dt():
     dt = datetime.now(timezone.utc)
-    dt -= timedelta(hours = 0,
-        minutes = (dt.minute % 15) - 15,
+    dt -= timedelta(hours = -1,
+        minutes = dt.minute,
         seconds = dt.second,
         microseconds = dt.microsecond)
     return dt
@@ -30,110 +33,106 @@ loaded_config = {
     "name": "portmanteaubot",
     **toml.load(config_path)}
 
-SUFFIXES = [
-    'ly$',
-    'ing$',
-    '[bdklmptw]?est$',
-    '[^ious]s$',
-    'ted$',
-    '[ei]ty$']
+AFFIXES = []
 
-def is_suffixed(word):
-    return any(re.fullmatch(suf, word) for suf in SUFFIXES)
+def is_affixed(word):
+    return any(re.search(suf, word) for suf in AFFIXES)
 
 def overlap_words(left_word, right_word):
     if left_word == right_word:
-        return None
+        return set()
 
     offset = 2
-    attempts = []
-    while offset + 2 <= len(left_word):
-        if right_word.lower().startswith(left_word.lower()[offset : offset + 2]):
-            attempts.append(left_word[:offset] + right_word)
-            #break
+    attempts = set()
+    while offset + 2 <= len(left_word[0]):
+        if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
+            word_str = left_word[0][:offset] + right_word[0]
+            if len(word_str) >= 6 and not is_affixed(word_str):
+                attempts.add(word_str)
         offset += 1
 
-    offset = len(right_word) - 2
+    offset = len(right_word[0]) - 2
     while offset >= 0:
-        if left_word.lower().endswith(right_word.lower()[offset : offset + 2]):
-            attempts.append(left_word + right_word[offset + 2:])
-            #break
+        if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
+            word_str = left_word[0] + right_word[0][offset + 2:]
+            if len(word_str) >= 6 and not is_affixed(word_str):
+                attempts.add(word_str)
         offset -= 1
 
-    attempts = sorted(attempts, key = lambda w: len(w), reverse = True)
-    if len(attempts) == 0:
-        return None
-    return pick_one_word(attempts)
+    return attempts
 
-def word_weight(index, length, power = 2):
+def word_weight(index, length, power):
     a = pow((index + 1) / length, 2)
-    return int(100000 * a)
+    return int(350000 * a)
 
-def pick_one_word(words, power = 2, max_len = 12):
-    words = list(filter(lambda w: len(w) <= max_len, words))
+def weights_for(words, power):
+    return [word_weight(i, len(words), power = power) for i in range(0, len(words))]
+
+def pick_one_word(words):
     if len(words) == 0:
         return None
-    weights = [word_weight(i, len(words), power = power) for i in range(0, len(words))]
-    return random.choices(words, weights = weights)[0]
+    return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]
 
 class WordMaker:
     def __init__(self):
         print("Loading dictionaries")
         illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
-        with open ("mhyph.txt", "r", encoding = "mac-roman") as f:
-            lines = [l.strip() for l in f.readlines()]
-            lines = filter(lambda w: len(w) > 0 and not any(ch in illegal for ch in w), lines)
-            words = [l.replace("•", "") for l in lines]
-        self.all_words = words
-        words = list(set(sorted(words, key = lambda w: len(w), reverse = True)))
-        self.first_words = list(filter(lambda w: not is_suffixed(w), words))
-        self.next_words = words
-        with open("porthyph.txt", "r") as f:
-            lines = [line.strip() for line in f.readlines()]
-            words = list(filter(lambda l: len(l) > 0, lines))
-        self.all_words = list(set(sorted([w.lower() for w in [*self.all_words, *words]], key = lambda w: len(w), reverse = True)))
-        self.port_words = list(set(sorted(words, key = lambda w: len(w), reverse = True)))
+        with open ("dict.csv", "r") as f:
+            self.main_words = {
+                sl[0] :
+                (int(sl[1]), tuple(sl[2].split("="))) for sl in
+                (tuple(l.split(",")) for l in f.read().splitlines()) if
+                len(sl[0]) >= 3 and
+                len(sl[0]) < MAX_DICT_LEN and
+                not any(c in illegal for c in sl[0])}
+        with open("userdict.csv", "r") as f:
+            self.user_words = {l : (1, None) for l in f.read().splitlines()}
+        self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
 
-    def extend_word2(self, prev_word):
-        port_dict = random.randint(0, 100) < PORT_PCT
-        if port_dict:
-            next_dict = self.port_words
+    def extend_word(self, prev_word):
+        user_dict = random.randint(0, 100) < PORT_PCT
+        if user_dict:
+            next_dict = self.user_words
         else:
-            next_dict = self.next_words
+            next_dict = self.main_words
 
-        new_words = [overlap_words(prev_word, w) for w in next_dict if overlap_words(prev_word, w)]
+        new_words = dict()
+        for w in next_dict.items():
+            new_words.update(dict.fromkeys(overlap_words(prev_word, w)))
+
         if len(new_words) == 0:
             return None
 
+        max_len = max(len(w) for w in new_words)
+        for w in new_words:
+            new_words[w] = (max_len + 1 - len(w), None)
+
         while len(new_words) > 0:
-            new_word = pick_one_word(new_words, power = 2 if port_dict else 4)
+            new_word = pick_one_word(new_words)
             if not new_word:
                 return None
-            new_words.remove(new_word)
-            if new_word.lower() not in self.all_words:
+            del new_words[new_word[0]]
+            if new_word[0].lower() not in self.all_words:
                 return new_word
         return None
 
-    def get_portmanteau(self):
-        target_times = 1
-        port_dict = random.randint(0, 100) < PORT_PCT
-        if port_dict:
-            words = self.port_words
+    def get_portmanteau(self, target_times = 1):
+        user_dict = random.randint(0, 100) < PORT_PCT
+        if user_dict:
+            words = self.user_words
         else:
-            words = self.first_words
+            words = self.main_words
 
         while True:
             while True:
-                word = pick_one_word(words, power = 2 if port_dict else 4)
+                word = pick_one_word(words)
                 times = target_times
                 while times > 0:
-                    ext_word = self.extend_word2(word)
+                    ext_word = self.extend_word(word)
                     if ext_word is None:
                         break
@@ -143,17 +142,25 @@ class WordMaker:
                 if times == 0:
                     break
 
-            if len(word) < 15:
+            if len(word[0]) < MAX_GEN_LEN:
                 break
 
-        word = word.lower()
-        print(word)
-        return word
+        word_str = word[0].lower()
+        print(word_str)
+        return word_str
 
     def get_portmanteaus(self, count = 10):
-        return [self.get_portmanteau() for x in range(0, count)]
+        words = set()
+        while count > 0:
+            word_str = self.get_portmanteau()
+            if word_str not in words:
+                words.add(word_str)
+                count -= 1
+        return words
class PortBotClient(BotClient):
def __init__(self, bot, config):
@@ -180,14 +187,6 @@ class PortBotClient(BotClient):
         if status["account"]["id"] != self.my_id:
             return
 
-        #if status["created_at"] < datetime.now(timezone.utc) - timedelta(hours = 24) and status["reblogs_count"] == 0 and status["favourites_count"] == 0:
-        #    try:
-        #        print("Deleting", status["created_at"], status["content"])
-        #        self.api.status_delete(status["id"])
-        #        time.sleep(2)
-        #    except MastodonNotFoundError:
-        #        pass
 
 def post():
     for client_name, client in bot.clients.items():
         words = wm.get_portmanteaus(3)
@@ -204,7 +203,7 @@ def post():
     print("Scheduling at", dt)
     if TEST:
-        scheduler.enter(1, 1, post)
+        scheduler.enter(2, 1, post)
     else:
         scheduler.enterabs(dt.timestamp(), 1, post)
@@ -218,8 +217,9 @@ bot.start()
 print("Running")
 dt = next_dt()
 if TEST:
-    scheduler.enter(1, 1, post)
+    scheduler.enter(2, 1, post)
 else:
     print("Scheduling at", dt)
     scheduler.enterabs(dt.timestamp(), 1, post)
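
Two behavioral notes on the portbot.py changes above. First, the reworked next_dt now returns the top of the next hour: subtracting a timedelta with hours = -1 adds an hour, while the minutes, seconds, and microseconds terms zero out the sub-hour part (the old version rounded to the next 15-minute mark). Second, word selection now operates on dicts of the shape {word: (weight, hyphenation)} rather than sorted lists. A minimal runnable sketch of that selection scheme, reusing the word_weight and pick_one_word definitions from the diff with a hypothetical candidates dict (the weight values are made up):

    import random

    def word_weight(index, length, power):
        # As committed, `power` is accepted but the exponent stays hard-coded at 2.
        a = pow((index + 1) / length, 2)
        return int(350000 * a)

    def pick_one_word(words):
        if len(words) == 0:
            return None
        # Weight each entry by the first element of its value tuple and
        # return a single (word, (weight, hyphenation)) item.
        return random.choices(list(words.items()), weights = [v[0] for v in words.values()])[0]

    # extend_word scores candidates as (max_len + 1 - len(word)), so shorter
    # portmanteau candidates are weighted higher, as in this example:
    candidates = {"chonkster": (1, None), "smolblob": (2, None), "gaymer": (4, None)}
    print(pick_one_word(candidates)[0])  # "gaymer", most of the time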

userdict.csv (55 changed lines)

@@ -1,6 +1,5 @@
4chan
4channer
acquihire
ActivityPub
amazon
angst
@@ -9,6 +8,7 @@ anime
animu
Anon
antifa
anus
Apple
Asperger
aspie
@@ -20,6 +20,7 @@ Barbie
based
basic
Batman
belly
birb
bisexual
bitch
@@ -28,8 +29,13 @@ blåhaj
blaseball
blaze
bloatware
blob
blobcat
blockchain
blogger
blue
blueballs
Bluetooth
boner
boob
boomer
@@ -48,6 +54,7 @@ bummer
buttcrack
buttload
Buzzfeed
cancel
cancer
cat
catgirl
@@ -63,6 +70,9 @@ chill
chonk
chonker
choomer
chub
chubby
chungus
clickbait
cocaine
cock
@@ -72,8 +82,10 @@ conservative
cookie
cool
coomer
crab
crap
creeper
crew
cringe
crush
crypto
@@ -137,11 +149,16 @@ fedi
fediverse
fedora
Fedora
fiddle
fidget
fire
fired
Firefox
flame
flamewar
fluff
fluffy
fondle
freemium
fuck
fucker
@@ -150,6 +167,9 @@ futa
futurama
game
gamer
gang
gangsta
gangster
gay
gaydar
gaymer
@@ -182,6 +202,8 @@ hipster
hodl
hoodie
horny
huge
humble
hung
husbando
hyperlink
@@ -197,6 +219,7 @@ jam
JavaScript
jerk
jiggle
jumbo
kalm
Kickstarter
Kirby
@@ -221,6 +244,7 @@ Luigi
lulz
lurk
lurker
mad
mafia
malware
mama
@@ -229,6 +253,7 @@ manscape
Mario
Mastodon
masturbate
mega
Megaman
meme
metal
@@ -239,6 +264,7 @@ mista
modem
moe
mother
multiplayer
Murican
mutual
Nazi
@@ -264,12 +290,15 @@ ozone
panik
pedo
Pedobear
penis
Pepe
pet
photo
photobomb
Pikachu
pirate
Pixelfed
player
PlayStation
Pleroma
plugin
@@ -277,6 +306,7 @@ Pokémon
porn
PornHub
pornography
possum
post
potat
potet
@@ -284,13 +314,14 @@ pregnant
protecc
pspsps
psyop
pube
pubes
pussy
pwned
rageface
reboot
Reddit
redditor
RedHat
reboot
ree
reee
reeee
@@ -299,8 +330,11 @@ Republican
retweet
rickroll
Robocop
roleplay
roleplayer
sage
sapphic
satan
selfie
senpai
sex
@@ -312,14 +346,18 @@ shitton
shroom
sick
sista
skeleton
Slenderman
Slendermen
small
smartphone
smol
smug
smut
Snapchat
Snowden
snug
snuggle
software
Sonic
soyboy
@@ -327,11 +365,14 @@ spam
sperg
Spiderman
Spongebob
spook
spooky
Spotify
squad
Squidward
startup
steampunk
stim
stonk
stonks
subtweet
@@ -344,6 +385,9 @@ thread
tiger
Tiktok
Tinder
tiny
tit
tits
tlap
Tor
totes
@@ -352,6 +396,8 @@ transsex
transsexual
trap
trending
trigger
triggered
trollface
trolltard
tsundere
@@ -360,8 +406,10 @@ Uber
Ubuntu
unfappable
Unix
vagina
Valhalla
vegan
videogame
viral
virgin
virus
@@ -381,6 +429,7 @@ weeaboo
weeb
weed
WhatsApp
wiener
Wikipedia
Windows
WordPress
sourcedicts/mhyph.txt, sourcedicts/unigram_freq.csv: unable to load file from base commit.