Compare commits

...

5 Commits

  1. 1
      .gitignore
  2. 12
      README.md
  3. 135155
      dicts/cmudict.dict
  4. 39
      dicts/cmudict.phones
  5. 0
      dicts/main.csv
  6. 0
      dicts/mhyph.txt
  7. 0
      dicts/unigram_freq.csv
  8. 23
      dicts/user.dict
  9. 4
      mergedicts.py
  10. 55
      portbot.py

1
.gitignore vendored

@ -2,4 +2,5 @@
!.gitignore
!.gitmodules
config.toml
dicts/used.dict
clients/

@ -0,0 +1,12 @@
# PortmanteauBot
### Dictionaries
#### Word frequency
https://www.kaggle.com/rtatman/english-word-frequency
#### Hyphenation
https://www.gutenberg.org/files/3204/files/
#### Pronunciation
http://www.speech.cs.cmu.edu/cgi-bin/cmudict

File diff suppressed because it is too large Load Diff

@ -0,0 +1,39 @@
AA vowel
AE vowel
AH vowel
AO vowel
AW vowel
AY vowel
B stop
CH affricate
D stop
DH fricative
EH vowel
ER vowel
EY vowel
F fricative
G stop
HH aspirate
IH vowel
IY vowel
JH affricate
K stop
L liquid
M nasal
N nasal
NG nasal
OW vowel
OY vowel
P stop
R liquid
S fricative
SH fricative
T stop
TH fricative
UH vowel
UW vowel
V fricative
W semivowel
Y semivowel
Z fricative
ZH fricative

unable to load file from base commit

@ -30,6 +30,7 @@ blaseball
blaze
bloatware
blob
blobby
blobcat
blockchain
blogger
@ -39,6 +40,7 @@ Bluetooth
boner
boob
boomer
boost
boot
bootstrap
booty
@ -83,6 +85,7 @@ cookie
cool
coomer
crab
cranky
crap
creeper
crew
@ -102,8 +105,6 @@ dank
darkweb
Debian
Democrat
derp
derpina
desu
dildo
doge
@ -145,6 +146,10 @@ fangirl
fappable
fapworthy
fart
fav
fave
favorite
favourite
fedi
fediverse
fedora
@ -167,6 +172,7 @@ futa
futurama
game
gamer
Gamergate
gang
gangsta
gangster
@ -199,7 +205,6 @@ hellthread
hentai
herp
hipster
hodl
hoodie
horny
huge
@ -224,6 +229,7 @@ kalm
Kickstarter
Kirby
Knuckles
kuudere
Kraftwerk
lame
lamer
@ -231,6 +237,7 @@ laptop
legend
lesbian
liberal
like
Link
LinkedIn
livestream
@ -297,6 +304,8 @@ photo
photobomb
Pikachu
pirate
pissed
pissy
Pixelfed
player
PlayStation
@ -335,6 +344,8 @@ roleplayer
sage
sapphic
satan
scream
screamer
selfie
senpai
sex
@ -355,7 +366,6 @@ smol
smug
smut
Snapchat
Snowden
snug
snuggle
software
@ -369,12 +379,14 @@ spook
spooky
Spotify
squad
Squidward
startup
steampunk
stim
stonk
stonks
sub
submit
submissive
subtweet
swole
tentacle
@ -400,6 +412,7 @@ trigger
triggered
trollface
trolltard
troon
tsundere
Twitch
Uber

@ -1,9 +1,9 @@
import itertools
with open("sourcedicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
with open("dicts/mhyph.txt", "r", encoding = "macroman", newline = "\r\n") as f:
hyph_words = set(tuple(l.split("")) for l in f.read().splitlines())
with open("sourcedicts/unigram_freq.csv", "r") as f:
with open("dicts/unigram_freq.csv", "r") as f:
goog_words = set((w, int(f)) for w, f in
itertools.islice((l.split(",") for
l in f.read().splitlines()), 1, None))

@ -1,23 +1,24 @@
import os
import sys
import time
from datetime import datetime, timedelta, timezone
import toml
import random
import re
import sched
import math
import string
import itertools
from datetime import datetime, timedelta, timezone
from mastodon import Mastodon, MastodonNotFoundError
from fedbot.bot import Bot, BotClient
POST_INTERVAL = timedelta(seconds = 15)
TEST = "test" in sys.argv[1:]
PORT_PCT = 30
MAX_DICT_LEN = 100
MAX_GEN_LEN = 15
USER_PCT = 35
MIN_MAIN_LEN = 3
MAX_PORT_LEN = 14
MAIN_DICT_PATH = "dicts/main.csv"
USER_DICT_PATH = "dicts/user.dict"
USED_DICT_PATH = "dicts/used.dict"
def next_dt():
dt = datetime.now(timezone.utc)
@ -27,7 +28,6 @@ def next_dt():
microseconds = dt.microsecond)
return dt
config_path = os.path.join(os.path.dirname(sys.argv[0]), "config.toml")
loaded_config = {
"name": "portmanteaubot",
@ -42,7 +42,9 @@ def overlap_words(left_word, right_word):
if left_word == right_word:
return set()
offset = 2
min_shift = 2
offset = min_shift
attempts = set()
while offset + 2 <= len(left_word[0]):
if right_word[0].lower().startswith(left_word[0].lower()[offset : offset + 2]):
@ -52,7 +54,7 @@ def overlap_words(left_word, right_word):
offset += 1
offset = len(right_word[0]) - 2
while offset >= 0:
while offset >= 2:
if left_word[0].lower().endswith(right_word[0].lower()[offset : offset + 2]):
word_str = left_word[0] + right_word[0][offset + 2:]
if len(word_str) >= 6 and not is_affixed(word_str):
@ -74,26 +76,35 @@ def pick_one_word(words):
return random.choices(list(words.items()), weights = (v[0] for v in words.values()))[0]
def word_diff(a, b):
    """Return the similarity ratio of two sequences.

    Despite the name, a higher value means the inputs are MORE alike.

    Args:
        a: First sequence (for example a word string).
        b: Second sequence, compared against ``a``.

    Returns:
        The ``difflib.SequenceMatcher(None, a, b).ratio()`` value: a float in
        [0, 1], where 1.0 means the sequences are identical.
    """
    # Imported locally because the module's top-level imports do not bring in
    # difflib; without this the bare reference raises NameError when called.
    import difflib

    return difflib.SequenceMatcher(None, a, b).ratio()
class WordMaker:
def __init__(self):
# Builds the word tables used by the generator: self.main_words (from the
# main CSV), self.user_words (from the user word list) and self.all_words
# (their lower-cased union, plus previously used words when that file exists).
# NOTE(review): this span appears to come from a rendered diff, so several
# adjacent near-duplicate lines below look like before/after pairs
# (hard-coded path vs. *_PATH constant, old vs. new length filter) —
# confirm against the real file which of each pair is live.
print("Loading dictionaries")
# Characters that disqualify a main-dictionary word: uppercase ASCII letters,
# punctuation, digits and whitespace.
illegal = set(ch for ch in (string.ascii_uppercase + string.punctuation + string.digits + string.whitespace))
with open ("dict.csv", "r") as f:
with open (MAIN_DICT_PATH, "r") as f:
# Each line is split on ","; the first field is the key, the value is
# (int(second field), third field split on "=").
# presumably (frequency weight, hyphenation parts) — TODO confirm against the CSV.
self.main_words = {
sl[0] :
(int(sl[1]), tuple(sl[2].split("="))) for sl in
(tuple(l.split(",")) for l in f.read().splitlines()) if
len(sl[0]) >= 3 and
len(sl[0]) < MAX_DICT_LEN and
len(sl[0]) >= MIN_MAIN_LEN and
not any(c in illegal for c in sl[0])}
with open("userdict.csv", "r") as f:
with open(USER_DICT_PATH, "r") as f:
# One word per line; every user word gets the same value (1, None).
self.user_words = {l : (1, None) for l in f.read().splitlines()}
self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words}.items()}
# Previously used words are optional: the file is only read if it exists.
# NOTE(review): existence is checked via USED_DICT_PATH but the file is opened
# through a literal path; the two agree only while the constant stays
# "dicts/used.dict".
if os.path.exists(USED_DICT_PATH):
with open("dicts/used.dict", "r") as f:
used_words = {l : (1, None) for l in f.read().splitlines()}
else:
used_words = dict()
# Later mappings win on key clashes: used words override user words, which
# override main words; keys are lower-cased after merging.
self.all_words = {k.lower() : v for k, v in {**self.main_words, **self.user_words, **used_words}.items()}
def extend_word(self, prev_word):
user_dict = random.randint(0, 100) < PORT_PCT
user_dict = random.randint(0, 100) < USER_PCT
if user_dict:
next_dict = self.user_words
else:
@ -108,7 +119,7 @@ class WordMaker:
max_len = max(len(w) for w in new_words)
for w in new_words:
new_words[w] = (max_len + 1 - len(w), None)
new_words[w] = (math.pow(max_len + 1 - len(w), 3), None)
while len(new_words) > 0:
new_word = pick_one_word(new_words)
@ -120,7 +131,7 @@ class WordMaker:
return None
def get_portmanteau(self, target_times = 1):
user_dict = random.randint(0, 100) < PORT_PCT
user_dict = random.randint(0, 100) < USER_PCT
if user_dict:
words = self.user_words
else:
@ -142,7 +153,7 @@ class WordMaker:
if times == 0:
break
if len(word[0]) < MAX_GEN_LEN:
if len(word[0]) < MAX_PORT_LEN:
break
word_str = word[0].lower()
@ -154,11 +165,19 @@ class WordMaker:
def get_portmanteaus(self, count = 10):
    """Generate ``count`` distinct portmanteaus and record them as used.

    Repeatedly calls ``self.get_portmanteau()`` until ``count`` different
    results have been collected. Every result is added to ``self.all_words``
    with the value ``(1, None)`` and, unless the module-level ``TEST`` flag
    is set, appended to the used-word file, one word per line.

    Args:
        count: Number of distinct portmanteaus to produce. Values <= 0
            produce an empty set and leave the used-word file untouched.

    Returns:
        A set containing the generated portmanteau strings.
    """
    # A dict doubles as an insertion-ordered set, so the used-word file is
    # written in generation order.
    used_words = dict()
    while len(used_words) < count:
        word_str = self.get_portmanteau()
        if word_str not in used_words:
            used_words[word_str] = (1, None)

    self.all_words.update(used_words)

    # Skip the write when nothing was generated: appending a lone "\n" would
    # leave a blank line that is later loaded back as an empty-string word.
    if used_words and not TEST:
        # Use the shared constant rather than repeating the literal path.
        with open(USED_DICT_PATH, "a") as f:
            f.writelines(word + "\n" for word in used_words)

    return set(used_words)

Loading…
Cancel
Save