"""Merge a hyphenation word list with a word-frequency table.

Reads ``sourcedicts/mhyph.txt`` (one word per line, parts separated by
``•``) and ``sourcedicts/unigram_freq.csv`` (``word,count`` rows after a
header row), then prints one ``word,count,part=part=...`` line for every
word present in both inputs, most frequent first.
"""

import csv
import itertools

HYPH_PATH = "sourcedicts/mhyph.txt"
FREQ_PATH = "sourcedicts/unigram_freq.csv"
SYLLABLE_SEP = "•"


def parse_hyphenations(lines):
    """Map each plain word to its tuple of hyphenation parts.

    Args:
        lines: iterable of strings without line terminators, each a word
            whose parts are separated by ``SYLLABLE_SEP``.

    Returns:
        dict mapping the joined word to the tuple of its parts. When several
        lines join to the same word, the first one in input order is kept so
        that the result does not depend on hash ordering.
    """
    hyph_map = {}
    for line in lines:
        parts = tuple(line.split(SYLLABLE_SEP))
        hyph_map.setdefault("".join(parts), parts)
    return hyph_map


def parse_frequencies(lines):
    """Parse ``word,count`` CSV lines into a list of ``(word, int)`` pairs.

    Args:
        lines: iterable of CSV lines (with or without line terminators).
            The first row is treated as a header and skipped.

    Returns:
        list of ``(word, count)`` tuples in input order.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields or
            its count is not an integer.
    """
    freqs = []
    for row in itertools.islice(csv.reader(lines), 1, None):
        if not row:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word, count = row
        freqs.append((word, int(count)))
    return freqs


def merge(hyph_map, freqs):
    """Combine frequencies with hyphenations, most frequent word first.

    Args:
        hyph_map: dict of word -> tuple of parts, as returned by
            ``parse_hyphenations``.
        freqs: iterable of ``(word, count)`` pairs.

    Returns:
        dict of word -> ``(count, parts)`` whose insertion order is
        descending count. Words without a hyphenation are dropped. Ties keep
        their input order (``sorted`` is stable, also with ``reverse=True``),
        and a word that occurs more than once keeps its highest count.
    """
    merged = {}
    for word, count in sorted(freqs, key=lambda pair: pair[1], reverse=True):
        parts = hyph_map.get(word)
        if parts is None or word in merged:
            continue
        merged[word] = (count, parts)
    return merged


def main():
    """Read both source dictionaries and print the merged CSV to stdout."""
    # newline="\r\n" leaves the raw line endings untranslated on read();
    # splitlines() then does the actual splitting.
    with open(HYPH_PATH, "r", encoding="macroman", newline="\r\n") as hyph_file:
        hyph_map = parse_hyphenations(hyph_file.read().splitlines())

    # Explicit encoding so the result does not depend on the platform
    # default; newline="" is what the csv module expects.
    with open(FREQ_PATH, "r", encoding="utf-8", newline="") as freq_file:
        freqs = parse_frequencies(freq_file)

    for word, (count, parts) in merge(hyph_map, freqs).items():
        print("{},{},{}".format(word, count, "=".join(parts)))


if __name__ == "__main__":
    main()