Ok, so you saw the output of your generator idea. Now, let me show you the difference when my ledger is used to create new words:
Now you can see that the random words being generated are a good bit more Voynich-like. But they still have a lot of issues. They follow weights from the Voynich when picking letters, but that's still not enough.
I took my basic "don't look stupid rules" from my generator and added them to yours. Now look at the difference.
Now we have some control in those random words over vowel and consonant structure, as well as over repeated vowels and repeated consonants. It's starting to look more like Voynich instead of word soup, but those random words still don't look Voynich. Not enough rules. In order to get that right, I'd have to do some machine learning: learning which letters follow which other letters and which bigrams follow which other bigrams, and adding weighting for that.
py minimal_phrase_generator_v3.py --ledger Ledger_scribe1.json --pages 100 --print-all-pages
Code:
# minimal_phrase_generator_v3.py
#
# "Helped" version of the forum proposal:
#
# 1) fixed common words
# 2) fixed common combinations
# 3) mix those with occasional longer words
# 4) longer words are generated by using existing Ledger_scribe1.json
# 5) generated longer words may repeat locally
#
# Required external file:
# Ledger_scribe1.json
#
# Example:
# py minimal_phrase_generator_v3.py --ledger Ledger_scribe1.json --pages 100 --print-all-pages
import argparse
import json
import random
from collections import Counter
from pathlib import Path
# ============================================================
# SETTINGS
# ============================================================
# Command-line defaults (see main()).
DEFAULT_SEED = 42
DEFAULT_PAGES = 100
DEFAULT_WORDS_PER_PAGE = 95
DEFAULT_WORDS_PER_LINE = 8

# Production modes. generate_page() draws one of these with rng.choice(),
# i.e. uniformly, so no hidden weighting exists between the three modes.
MODES = ["common_word", "common_combination", "ledger_word"]

# Smallest operational version of
# "random longer words which occasionally repeat":
# chance of re-using a recent ledger word, and how many are remembered.
LEDGER_WORD_REPEAT_CHANCE = 0.25
RECENT_LEDGER_WORD_WINDOW = 10

# The ledger does not encode word length, so the length range is the one
# explicit external assumption.
MIN_LEDGER_WORD_LEN = 5
MAX_LEDGER_WORD_LEN = 12
MAX_ATTEMPTS_PER_LEDGER_WORD = 200

GALLOWS = {"k", "t", "p", "f"}
VOWELS = {"a", "e", "i", "o", "y"}

# Basic "don't look stupid" guards copied from the v11 generator idea.
# Deliberately simple visual plausibility filters, not grammar.
MAX_VOWEL_RUN = 4
MAX_CONSONANT_RUN = 4
MAX_TOKEN_LEN = 12

# Repeat/family guards for newly generated ledger words.
RECENT_TOKEN_REPEAT_LIMIT = 3
MAX_LEDGER_TOKEN_PAGE_COUNT = 5
MAX_LEDGER_FAMILY_PAGE_COUNT = 9
# ============================================================
# FIXED COMMON WORDS
# ============================================================
# Fixed vocabulary; generate_page() picks one entry uniformly in
# "common_word" mode.
COMMON_WORDS = [
    "daiin", "dain", "ol", "or", "chol",
    "chedy", "qokain", "qokeedy", "qotedy", "otedy",
    "shey", "cthey", "cthol", "shol", "chor",
    "dair", "qokair", "saiin", "aiin", "okain",
    "sary", "okol", "qol", "qokal", "chdy",
]
# ============================================================
# FIXED COMMON COMBINATIONS
# ============================================================
# Fixed two-word phrases; generate_page() appends both words of one
# uniformly chosen entry in "common_combination" mode.
COMMON_COMBINATIONS = [
    ["qokain", "daiin"],
    ["chol", "chedy"],
    ["qokeedy", "qokedy"],
    ["ol", "chedy"],
    ["dain", "chol"],
    ["shey", "qokain"],
    ["cthey", "daiin"],
    ["shol", "chor"],
    ["qotedy", "qokain"],
    ["otedy", "ol"],
    ["saiin", "daiin"],
    ["qokair", "dair"],
    ["chol", "daiin"],
    ["cthol", "chedy"],
    ["or", "chol"],
]
# ============================================================
# BASIC HELPERS
# ============================================================
def weighted_choice(rng, values, weights=None):
    """Pick one element of ``values`` using ``rng``.

    Returns None when ``values`` is empty. Without ``weights`` the pick is
    uniform (``rng.choice``); with ``weights`` it is a single weighted draw
    (``rng.choices`` with ``k=1``).
    """
    if not values:
        return None

    options = list(values)
    if weights is not None:
        (picked,) = rng.choices(options, weights=list(weights), k=1)
        return picked
    return rng.choice(options)
def char_ngrams(words, n):
    """Count every length-``n`` character window across all ``words``.

    Words shorter than ``n`` contribute nothing. Returns a Counter keyed by
    the n-gram string.
    """
    return Counter(
        word[start:start + n]
        for word in words
        for start in range(len(word) - n + 1)
    )
def max_run(token, charset):
    """Return the length of the longest unbroken run of ``charset`` members
    in ``token`` (0 when none of its characters belong to ``charset``)."""
    longest = 0
    streak = 0
    for glyph in token:
        # A non-member resets the streak; a member extends it.
        streak = streak + 1 if glyph in charset else 0
        if streak > longest:
            longest = streak
    return longest
def family_form(token):
    """Return ``token`` with every GALLOWS glyph removed, so words that
    differ only in their gallows compare equal."""
    kept = [glyph for glyph in token if glyph not in GALLOWS]
    return "".join(kept)
def passes_dls(token, ledger, page):
    """Basic v11-style "don't look stupid" filter.

    Used for ledger words only (fresh ones in Ledger.generate_word and
    repeat candidates in generate_page), never for the fixed
    common-word/common-combination scaffolding.

    Args:
        token: candidate word.
        ledger: object exposing ``alphabet`` and ``validate(token)``.
        page: list of words already placed on the current page.

    Returns:
        True when the token passes every shape, repetition and family
        guard and ``ledger.validate`` accepts it; False otherwise.
    """
    # --- shape guards -------------------------------------------------
    if not token or len(token) > MAX_TOKEN_LEN:
        return False
    if max_run(token, VOWELS) > MAX_VOWEL_RUN:
        return False
    # Everything in the ledger alphabet that is not a vowel counts as a
    # consonant for the run-length check.
    non_vowels = set(ledger.alphabet).difference(VOWELS)
    if max_run(token, non_vowels) > MAX_CONSONANT_RUN:
        return False

    # --- repetition guards --------------------------------------------
    if page and page[-1] == token:
        return False  # never identical to the immediately preceding word
    nearby = page[-RECENT_LEDGER_WORD_WINDOW:]
    if nearby.count(token) >= RECENT_TOKEN_REPEAT_LIMIT:
        return False
    if page.count(token) >= MAX_LEDGER_TOKEN_PAGE_COUNT:
        return False

    # --- family guard: same word once gallows glyphs are stripped -----
    skeleton = family_form(token)
    if skeleton:
        relatives = [word for word in page if family_form(word) == skeleton]
        if len(relatives) >= MAX_LEDGER_FAMILY_PAGE_COUNT:
            return False

    return ledger.validate(token)
# ============================================================
# YOUR LEDGER FORMAT
# ============================================================
class Ledger:
    """Weighted glyph-transition table loaded from a ledger JSON file.

    The JSON document must contain a ``"ledger"`` object mapping each glyph
    to ``{column: {tier: [follower glyphs]}}``. ``"metadata"`` and
    ``"alphabet"`` are optional. Columns default to prefix/midfix/suffix
    and tiers to "80"/"18"/"2".
    """

    def __init__(self, path):
        """Load the ledger at ``path`` and precompute follower tables.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
            KeyError: if the JSON document has no ``"ledger"`` entry.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(
                f"Ledger file not found: {path}\n"
                "Put Ledger_scribe1.json beside this script or pass --ledger PATH."
            )
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)

        self.metadata = data.get("metadata", {})
        self.rows = data["ledger"]
        self.alphabet = list(data.get("alphabet", self.rows.keys()))
        self.columns = tuple(self.metadata.get("columns", ["prefix", "midfix", "suffix"]))
        # Tier labels double as keys into each row. JSON object keys are
        # always strings, so labels written as numbers in the metadata
        # (e.g. [80, 18, 2]) would never match and every follower table
        # would silently come out empty. Normalise them to strings.
        self.tiers = tuple(
            str(tier) for tier in self.metadata.get("tiers", ["80", "18", "2"])
        )
        self.short_tokens = set((self.metadata.get("short_tokens") or {}).keys())
        self.tier_weights = self._tier_weights(self.tiers)
        self.first_token_values, self.first_token_weights = self._load_first_token_weights()

        # followers[glyph][column] -> (candidate glyphs, parallel weights).
        # Each tier's weight is shared equally among the glyphs in it.
        self.followers = {}
        for glyph, row in self.rows.items():
            per_column = {}
            for column in self.columns:
                values = []
                weights = []
                for tier, tier_weight in self.tier_weights:
                    glyphs = row.get(column, {}).get(tier, [])
                    if not glyphs:
                        continue
                    each_weight = tier_weight / len(glyphs)
                    for follower in glyphs:
                        values.append(follower)
                        weights.append(each_weight)
                per_column[column] = (values, weights)
            self.followers[glyph] = per_column

    def _tier_weights(self, tiers):
        """Return ``[(tier, weight), ...]`` with weights normalised to sum to 1.

        A tier label that parses as a number is read as a percentage; any
        other label gets a raw weight of 1.0 before normalisation.
        """
        numeric = []
        for tier in tiers:
            try:
                numeric.append((tier, float(tier) / 100.0))
            except (TypeError, ValueError):
                # Non-numeric (or non-string, non-number) label.
                numeric.append((tier, 1.0))
        total = sum(weight for _, weight in numeric) or 1.0
        return [(tier, weight / total) for tier, weight in numeric]

    def _load_first_token_weights(self):
        """Return ``(glyphs, weights)`` used to pick a word's first glyph.

        Looks, in order, at metadata "first_token_weights" /
        "start_token_weights", then "first_tokens" / "start_tokens". Only
        glyphs that have a ledger row and a positive value are kept,
        sorted by glyph. If nothing usable is found, every ledger glyph
        is returned with ``weights=None`` (uniform choice).
        """
        sources = (
            ("first_token_weights", "start_token_weights"),
            ("first_tokens", "start_tokens"),
        )
        for primary, alias in sources:
            raw = self.metadata.get(primary) or self.metadata.get(alias)
            if not raw:
                continue
            pairs = sorted(
                (glyph, float(weight))
                for glyph, weight in raw.items()
                if glyph in self.rows and float(weight) > 0
            )
            if pairs:
                values = [glyph for glyph, _ in pairs]
                weights = [weight for _, weight in pairs]
                return values, weights
        return list(self.rows.keys()), None

    def choose_start_glyph(self, rng):
        """Draw the first glyph of a word (weighted if weights were loaded)."""
        return weighted_choice(
            rng,
            self.first_token_values,
            self.first_token_weights,
        )

    def choose_follower(self, rng, left, column):
        """Draw a glyph to follow ``left`` from the given column's table.

        Returns None when ``left`` has no followers in that column.
        """
        values, weights = self.followers.get(left, {}).get(column, ([], []))
        if not values:
            return None
        return weighted_choice(rng, values, weights)

    def legal_transition(self, left, right, column):
        """Return True if ``right`` is listed after ``left`` in any tier of
        ``column``."""
        row = self.rows.get(left, {}).get(column, {})
        return any(right in row.get(tier, []) for tier in self.tiers)

    def has_multiple_gallows(self, token):
        """Return True if ``token`` contains more than one GALLOWS glyph."""
        return sum(1 for ch in token if ch in GALLOWS) > 1

    def validate(self, token):
        """Check ``token`` against the ledger's transition tables.

        Rules: non-empty; at most one gallows glyph; a single glyph must be
        a listed short token; otherwise the first glyph must have a row,
        the 1st->2nd transition must be legal in "prefix", the final
        transition in "suffix", and every transition in between in
        "midfix".
        """
        if not token:
            return False
        if self.has_multiple_gallows(token):
            return False
        if len(token) == 1:
            return token in self.short_tokens
        if token[0] not in self.rows:
            return False
        if not self.legal_transition(token[0], token[1], "prefix"):
            return False
        for index in range(2, len(token) - 1):
            if not self.legal_transition(token[index - 1], token[index], "midfix"):
                return False
        # NOTE(review): for a 2-glyph token this requires the same pair to
        # be legal in both "prefix" and "suffix", whereas generate_word
        # draws the second glyph of a 2-glyph word from "prefix" only --
        # confirm which is intended.
        return self.legal_transition(token[-2], token[-1], "suffix")

    def generate_word(self, rng, page=None, min_len=MIN_LEDGER_WORD_LEN, max_len=MAX_LEDGER_WORD_LEN):
        """Generate one new word that passes ``passes_dls`` for ``page``.

        Each attempt draws a target length in ``[min_len, max_len]``, a
        start glyph, then followers column by column ("prefix" for the
        second glyph, "suffix" for the last, "midfix" otherwise). Attempts
        that dead-end or fail the filter are discarded.

        Raises:
            RuntimeError: if no acceptable word is produced within
                MAX_ATTEMPTS_PER_LEDGER_WORD attempts.
        """
        page = page or []
        for _ in range(MAX_ATTEMPTS_PER_LEDGER_WORD):
            # Length is drawn before the start glyph; keep this order so a
            # given seed keeps producing the same text.
            target_len = rng.randint(min_len, max_len)
            chars = [self.choose_start_glyph(rng)]
            if not chars[0]:
                continue
            while len(chars) < target_len:
                left = chars[-1]
                if len(chars) == 1:
                    column = "prefix"
                elif len(chars) == target_len - 1:
                    column = "suffix"
                else:
                    column = "midfix"
                right = self.choose_follower(rng, left, column)
                if right is None:
                    break  # dead end: no follower listed for this glyph
                chars.append(right)
            token = "".join(chars)
            if len(token) == target_len and passes_dls(token, self, page):
                return token
        raise RuntimeError(
            "Could not generate a ledger-valid DLS-passing word after "
            f"{MAX_ATTEMPTS_PER_LEDGER_WORD} attempts."
        )
# ============================================================
# GENERATION
# ============================================================
def generate_page(rng, ledger, words_per_page):
    """Build one page (a list of words) of at most ``words_per_page`` words.

    Each step picks a production mode uniformly from MODES:
      * "common_word"        -> one word from COMMON_WORDS
      * "common_combination" -> both words of one COMMON_COMBINATIONS entry
      * "ledger_word"        -> a ledger-generated word, or with probability
        LEDGER_WORD_REPEAT_CHANCE a re-used recent one that still passes
        passes_dls.
    A combination may overshoot the target, so the result is truncated.
    """
    page = []
    remembered = []  # most recent ledger words, oldest first

    while len(page) < words_per_page:
        mode = rng.choice(MODES)

        if mode == "common_word":
            page.append(rng.choice(COMMON_WORDS))
            continue

        if mode == "common_combination":
            page.extend(rng.choice(COMMON_COMBINATIONS))
            continue

        if mode != "ledger_word":
            continue

        chosen = None
        # The random draw only happens when there is something to repeat.
        if remembered and rng.random() < LEDGER_WORD_REPEAT_CHANCE:
            eligible = [
                earlier for earlier in remembered
                if passes_dls(earlier, ledger, page)
            ]
            if eligible:
                chosen = rng.choice(eligible)
        if chosen is None:
            chosen = ledger.generate_word(rng, page=page)

        remembered.append(chosen)
        if len(remembered) > RECENT_LEDGER_WORD_WINDOW:
            del remembered[0]
        page.append(chosen)

    return page[:words_per_page]
# ============================================================
# OUTPUT / ANALYSIS
# ============================================================
def fixed_vocab():
    """Return the set of every word appearing in COMMON_WORDS or in any
    COMMON_COMBINATIONS entry."""
    return set(COMMON_WORDS).union(*COMMON_COMBINATIONS)
def print_page(page_num, page, words_per_line):
    """Print a banner for page ``page_num`` followed by its words,
    ``words_per_line`` space-separated words per output line."""
    rule = "=" * 60
    print("", rule, f"PAGE {page_num}", rule, "", sep="\n")
    for start in range(0, len(page), words_per_line):
        row = page[start:start + words_per_line]
        print(" ".join(row))
def analyze(all_words):
    """Print corpus statistics for ``all_words`` to stdout.

    Reports token/type counts, type-token ratio, hapax count, figures for
    ledger-generated words, the most common words, and the most common
    character bigrams and trigrams. A word counts as "ledger-generated"
    when it is not in the fixed vocabulary, so a generated word that
    happens to equal a fixed word is not counted.

    An empty ``all_words`` is accepted: the ratio is reported as 0.0000
    instead of raising ZeroDivisionError.
    """
    vocab = fixed_vocab()
    freq = Counter(all_words)
    generated_occurrences = [
        word for word in all_words
        if word not in vocab
    ]
    generated_freq = Counter(generated_occurrences)
    # Guard the ratio: nothing generated means nothing to divide by.
    ttr = len(freq) / len(all_words) if all_words else 0.0

    print()
    print("=" * 60)
    print("GLOBAL STATISTICS")
    print("=" * 60)
    print()
    print(f"Tokens : {len(all_words)}")
    print(f"Types : {len(freq)}")
    print(f"TTR : {ttr:.4f}")
    print(f"Hapax : {sum(1 for c in freq.values() if c == 1)}")
    print(f"Fixed vocabulary size : {len(vocab)}")
    print(f"Ledger-word occurrences : {len(generated_occurrences)}")
    print(f"Ledger-word types : {len(generated_freq)}")
    print(f"Repeated ledger-word types : {sum(1 for c in generated_freq.values() if c > 1)}")
    print(f"Max ledger-word repeat : {max(generated_freq.values()) if generated_freq else 0}")

    print()
    print("Top 25 word tokens")
    print()
    for word, count in freq.most_common(25):
        print(f"{word:20} {count}")

    print()
    print("Top 30 character bigrams")
    print()
    for bg, count in char_ngrams(all_words, 2).most_common(30):
        print(f"{bg:5} {count}")

    print()
    print("Top 30 character trigrams")
    print()
    for tg, count in char_ngrams(all_words, 3).most_common(30):
        print(f"{tg:5} {count}")

    print()
    print("Top repeated ledger-generated words")
    print()
    shown = 0
    for word, count in generated_freq.most_common():
        if count <= 1:
            break  # sorted by count, so everything after is unrepeated
        print(f"{word:20} {count}")
        shown += 1
        if shown >= 25:
            break
def main():
    """Command-line entry point.

    Parses arguments, loads the ledger, generates the requested pages,
    prints the fixed vocabulary and selected pages, then prints global
    statistics. With --output, the vocabulary and page text (not the
    statistics) are also written to that file.
    """
    parser = argparse.ArgumentParser(
        description="Minimal phrase + common word generator using the existing Scribe 1 ledger for new words."
    )
    parser.add_argument("--ledger", default="Ledger_scribe1.json")
    parser.add_argument("--pages", type=int, default=DEFAULT_PAGES)
    parser.add_argument("--words-per-page", type=int, default=DEFAULT_WORDS_PER_PAGE)
    parser.add_argument("--words-per-line", type=int, default=DEFAULT_WORDS_PER_LINE)
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
    parser.add_argument("--print-all-pages", action="store_true")
    parser.add_argument("--output", default=None, help="Optional text output file.")
    args = parser.parse_args()

    # Reject non-positive counts up front with a clear usage error.
    # Otherwise --words-per-line 0 crashes inside range(), negative values
    # silently print nothing, and an empty corpus reaches the statistics.
    for flag, value in (
        ("--pages", args.pages),
        ("--words-per-page", args.words_per_page),
        ("--words-per-line", args.words_per_line),
    ):
        if value < 1:
            parser.error(f"{flag} must be a positive integer (got {value})")

    rng = random.Random(args.seed)
    ledger = Ledger(args.ledger)
    lines = []
    all_words = []

    # Capture print output if --output is requested.
    # Simpler than redirecting stdout externally.
    def emit(text=""):
        print(text)
        if args.output is not None:
            lines.append(text)

    emit("Common words:")
    emit(", ".join(COMMON_WORDS))
    emit()
    emit("Common combinations:")
    for combo in COMMON_COMBINATIONS:
        emit(" ".join(combo))

    for page_num in range(1, args.pages + 1):
        page = generate_page(rng, ledger, args.words_per_page)
        all_words.extend(page)
        # Unless every page was requested, show only the first three and
        # the last one.
        if args.print_all_pages or page_num <= 3 or page_num == args.pages:
            emit()
            emit("=" * 60)
            emit(f"PAGE {page_num}")
            emit("=" * 60)
            emit()
            for i in range(0, len(page), args.words_per_line):
                emit(" ".join(page[i:i + args.words_per_line]))

    # Analysis prints to stdout directly.
    analyze(all_words)

    if args.output is not None:
        Path(args.output).write_text("\n".join(lines) + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()
Disclaimer: I had codex create and modify this code from a prompt.