oshfdk > 26-05-2026, 05:04 PM
(26-05-2026, 04:44 PM)Dunsel Wrote: You are not allowed to view links. Register or Login to view.I think the Voynich is implausible. If it is copy mutate, the best you can hope for is statistically comparable
Everyone hopes for an exact explanation or reproduction. What if there isn't one?
(26-05-2026, 04:44 PM)Dunsel Wrote: You are not allowed to view links. Register or Login to view.Try my challenge in a previous reply. See if you can make statistically correct Voynich with it.
Dunsel > 26-05-2026, 05:19 PM
(26-05-2026, 05:04 PM)oshfdk Wrote: You are not allowed to view links. Register or Login to view.I think I did exactly that in my reply, it broke down immediately. I took a very common word "dain" and I identified that introducing some simple changes to it that pass the ledger produces words that not only are unattested in the manuscript, but also don't conform to CLS framework (which is ok in rare cases, but not generally). This challenge doesn't produce statistically correct Voynichese.
oshfdk > 26-05-2026, 06:09 PM
(26-05-2026, 05:19 PM)Dunsel Wrote: You are not allowed to view links. Register or Login to view.No, you're trying to make it look Voynich. Don't. Just try to make it look like a language. Create your own visually appealing language.
oshfdk > 26-05-2026, 06:14 PM
Dunsel > 26-05-2026, 08:42 PM
(26-05-2026, 06:14 PM)oshfdk Wrote: You are not allowed to view links. Register or Login to view.In fact, this way of creating a fake language is much simpler and produces a much more plausible result:
1) create a list of common words, about 20-30, write them down
2) create 10-20 common combinations of these common words, write them down
3) start writing mixing common words and common word combinations with random longer words, which would occasionally repeat
4) done
There is nothing wrong for a text in a natural language to have some repetitions, it's expected. In some styles of texts (recipes, log entries) it's even mandatory. There is nothing wrong with that, there is no need to fight against this and the result looks very much like an unknown language. The statistics would differ significantly from Voynichese, but maybe they would be closer to real languages, and what is more, no-one in the XV century would go over the text with a scientific calculator and measure token type ratios.
# oshfdk_literal_generator.py
# No external files required.
#
# Literal version of:
# 1) create 20-30 common words
# 2) create 10-20 common combinations
# 3) mix common words + common combinations + random longer words
# 4) random longer words occasionally repeat
#
# No mutation.
# No ledger.
# No syllables.
# No hidden morphology.
import random
from collections import Counter
# Seed passed to random.seed() in main(), so repeated runs print the same text.
SEED = 42
# Number of pages generated by main().
PAGES = 100
# Every page is cut to exactly this many words by generate_page().
WORDS_PER_PAGE = 95
# Words printed per output line by print_page().
WORDS_PER_LINE = 8
# Standard EVA-ish alphabet, sorted alphabetically.
VOYNICH_ALPHABET = "acdefhiklnoqrstxy"
# Inclusive length bounds for words built by make_random_long_word().
LONG_WORD_MIN_LEN = 5
LONG_WORD_MAX_LEN = 12
# Probability that a "long word" slot reuses a recently generated long word
# instead of creating a new one (only applies once at least one exists).
LONG_WORD_REPEAT_CHANCE = 0.25
# How many recently generated long words are remembered per page for reuse.
RECENT_LONG_WORD_WINDOW = 10
# Fixed list of "common words"; generate_page() picks from it uniformly.
COMMON_WORDS = [
"daiin", "dain", "ol", "or", "chol",
"chedy", "qokain", "qokeedy", "qotedy", "otedy",
"shey", "cthey", "cthol", "shol", "chor",
"dair", "qokair", "saiin", "aiin", "okain",
"sary", "okol", "qol", "qokal", "chdy",
]
# Fixed two-word combinations; generate_page() emits a whole pair at once.
# NOTE: "qokedy" appears here but not in COMMON_WORDS, so it can only ever be
# produced as the second half of the "qokeedy qokedy" pair.
COMMON_COMBINATIONS = [
["qokain", "daiin"],
["chol", "chedy"],
["qokeedy", "qokedy"],
["ol", "chedy"],
["dain", "chol"],
["shey", "qokain"],
["cthey", "daiin"],
["shol", "chor"],
["qotedy", "qokain"],
["otedy", "ol"],
["saiin", "daiin"],
["qokair", "dair"],
["chol", "daiin"],
["cthol", "chedy"],
["or", "chol"],
]
def make_random_long_word():
    """Build one "long" word out of independently chosen random letters.

    The length is drawn uniformly from LONG_WORD_MIN_LEN..LONG_WORD_MAX_LEN
    (both inclusive); each letter is then drawn uniformly from
    VOYNICH_ALPHABET with no dependence on its position or neighbours.
    """
    size = random.randint(LONG_WORD_MIN_LEN, LONG_WORD_MAX_LEN)
    letters = [random.choice(VOYNICH_ALPHABET) for _ in range(size)]
    return "".join(letters)
def generate_page():
    """Generate one page as a list of exactly WORDS_PER_PAGE words.

    Each step picks one of three modes with equal probability: a single
    common word, a two-word common combination, or a long word.  A long word
    is either a reuse of one of the page's recent long words (with
    probability LONG_WORD_REPEAT_CHANCE, once at least one exists) or a
    freshly generated one.  A combination may overshoot the page size by one
    word, so the result is truncated before returning.
    """
    words = []
    recent = []  # most recent newly generated long words, oldest first

    while len(words) < WORDS_PER_PAGE:
        mode = random.choice(["common_word", "combination", "long_word"])

        if mode == "common_word":
            words.append(random.choice(COMMON_WORDS))
            continue

        if mode == "combination":
            words.extend(random.choice(COMMON_COMBINATIONS))
            continue

        # mode == "long_word"
        reuse = bool(recent) and random.random() < LONG_WORD_REPEAT_CHANCE
        if reuse:
            word = random.choice(recent)
        else:
            # NOTE(review): the pasted source lost its indentation; this
            # assumes only newly generated words enter the recent window.
            word = make_random_long_word()
            recent.append(word)
            if len(recent) > RECENT_LONG_WORD_WINDOW:
                del recent[0]
        words.append(word)

    return words[:WORDS_PER_PAGE]
def char_ngrams(words, n):
    """Count every contiguous run of n characters inside each word.

    N-grams never span word boundaries; words shorter than n contribute
    nothing.  Returns a Counter mapping n-gram -> number of occurrences.
    """
    return Counter(
        word[start:start + n]
        for word in words
        for start in range(len(word) - n + 1)
    )
def analyze(all_words):
    """Print corpus-level statistics for the generated text to stdout.

    Reports token and type counts, the type/token ratio (TTR), the number of
    hapax legomena (words occurring exactly once), the 25 most frequent
    words, and the 30 most frequent character bigrams and trigrams.

    An empty corpus is reported with a TTR of 0.0000 instead of raising
    ZeroDivisionError.
    """
    rule = "=" * 60
    word_freq = Counter(all_words)
    token_count = len(all_words)
    type_count = len(word_freq)
    # Guard the ratio: an empty corpus has no meaningful TTR.
    ttr = type_count / token_count if token_count else 0.0
    char_bigram_counts = char_ngrams(all_words, 2)
    char_trigram_counts = char_ngrams(all_words, 3)

    print()
    print(rule)
    print("GLOBAL WORD STATISTICS")
    print(rule)
    print()
    print(f"Tokens : {token_count}")
    print(f"Types : {type_count}")
    print(f"TTR : {ttr:.4f}")
    print(f"Hapax : {sum(1 for c in word_freq.values() if c == 1)}")
    print()
    print("Top 25 word tokens")
    print()
    for word, count in word_freq.most_common(25):
        print(f"{word:15} {count}")

    print()
    print(rule)
    print("CHARACTER BIGRAMS")
    print(rule)
    print()
    for bg, count in char_bigram_counts.most_common(30):
        print(f"{bg:5} {count}")

    print()
    print(rule)
    print("CHARACTER TRIGRAMS")
    print(rule)
    print()
    for tg, count in char_trigram_counts.most_common(30):
        print(f"{tg:5} {count}")
def print_page(page_num, page):
    """Print one page: a banner with the page number, then the words.

    Words are laid out WORDS_PER_LINE per line, separated by single spaces.
    """
    banner = "=" * 60
    print()
    print(banner)
    print(f"PAGE {page_num}")
    print(banner)
    print()
    line_starts = range(0, len(page), WORDS_PER_LINE)
    for start in line_starts:
        line_words = page[start:start + WORDS_PER_LINE]
        print(" ".join(line_words))
def main():
    """Seed the RNG, generate and print PAGES pages, then print statistics."""
    random.seed(SEED)
    corpus = []
    for index in range(PAGES):
        page = generate_page()
        corpus += page
        print_page(index + 1, page)
    analyze(corpus)
if __name__ == "__main__":
    main()
============================================================
GLOBAL WORD STATISTICS
============================================================
Tokens : 9500
Types : 1800
TTR : 0.1895
Hapax : 1330
Top 25 word tokens
chol 748
daiin 736
chedy 589
qokain 530
ol 427
qokeedy 289
or 277
saiin 276
otedy 275
cthol 268
dair 268
shey 247
cthey 246
qokair 246
shol 242
qotedy 231
chor 230
dain 227
qokedy 185
aiin 107
qokal 98
okol 93
sary 89
chdy 84
okain 80
============================================================
CHARACTER BIGRAMS
============================================================
ai 2538
in 2011
ol 1901
dy 1729
ch 1711
qo 1704
ed 1632
ok 1615
ho 1546
da 1291
ii 1179
he 1136
ka 1029
ir 580
sh 573
te 571
ot 569
th 561
or 561
ey 557
ct 550
ke 538
sa 433
ee 346
ar 179
al 159
ko 154
ry 147
hd 139
fa 85
============================================================
CHARACTER TRIGRAMS
============================================================
edy 1571
qok 1348
hol 1260
dai 1237
iin 1126
aii 1122
cho 986
oka 961
kai 863
ain 839
hed 594
che 591
air 522
cth 516
ote 515
ted 510
hey 493
oke 485
eed 293
kee 290
sai 277
tho 268
she 256
the 248
sho 244
qot 232
hor 230
ked 190
kal 102
kol 96
bi3mw > 26-05-2026, 10:55 PM
"".join(random.choice(VOYNICH_ALPHABET) for _ in range(length))
Dunsel > 26-05-2026, 11:28 PM
(26-05-2026, 10:55 PM)bi3mw Wrote: You are not allowed to view links. Register or Login to view.I would say the biggest problem in the script is on line 60.
Code:"".join(random.choice(VOYNICH_ALPHABET) for _ in range(length))
Each character is generated independently there. However, Voynich words exhibit strong positional dependencies. Certain characters only appear at the beginning or end of a word, such as qo-, -dy, and -in for example. Statistically, the result is therefore immediately recognizable as random noise.
(26-05-2026, 10:55 PM)bi3mw Wrote: You are not allowed to view links. Register or Login to view.One possible solution might be to assign weights to the individual parts of the word.
oshfdk > 26-05-2026, 11:57 PM
(26-05-2026, 08:42 PM)Dunsel Wrote: You are not allowed to view links. Register or Login to view.Your python generator per your exact specification:
# mixwords_literal_generator.py
# No external files required.
#
# Literal version of:
# 1) write down 20-30 common words
# 2) create 10-20 random common pairings from these words
# 3) choose either a common word or a common combination
# 4) after that, add 1-3 bigram-seeded, trigram-built longer words
# 5) once 20 long words exist in the pipeline, repeat one of them 25% of the time
#
# No mutation.
# No ledger.
# No syllables.
# No hidden morphology.
import argparse
import random
from collections import Counter
# Seed passed to random.seed() in main(), so repeated runs start identically.
SEED = 42
# main() generates PAGES * WORDS_PER_PAGE words in one stream and then slices
# the stream into pages of WORDS_PER_PAGE words.
PAGES = 100
WORDS_PER_PAGE = 95
# Words printed per output line by print_page().
WORDS_PER_LINE = 8
# Inclusive bounds for how many random word pairings are created.
COMMON_COMBINATION_MIN_COUNT = 10
COMMON_COMBINATION_MAX_COUNT = 20
# Number of words sampled (without replacement) for each combination.
COMMON_COMBINATION_WORDS = 2
# Inclusive target-length bounds for generated long words.
LONG_WORD_MIN_LEN = 5
LONG_WORD_MAX_LEN = 11
# Once RECENT_LONG_WORD_WINDOW long words are remembered, each new long-word
# slot repeats one of them with this probability.
LONG_WORD_REPEAT_CHANCE = 0.25
RECENT_LONG_WORD_WINDOW = 20
# Inclusive bounds for how many long words follow each common word/pairing.
LONG_WORDS_PER_MIX_MIN = 1
LONG_WORDS_PER_MIX_MAX = 3
# Probability of emitting a single common word rather than a pairing.
COMMON_WORD_CHANCE = 0.5
# Selection weights fall linearly from TOP (first list item) to BOTTOM (last).
COMMON_RANK_TOP_WEIGHT = 3.0
COMMON_RANK_BOTTOM_WEIGHT = 1.0
# Retry budgets for make_trigram_long_word() and make_mixed_long_word().
MAX_TRIGRAM_WORD_ATTEMPTS = 64
MAX_MIXED_LONG_WORD_ATTEMPTS = MAX_TRIGRAM_WORD_ATTEMPTS * 8
# Default common-word list; earlier entries receive higher selection weight.
COMMON_WORDS = [
"daiin", "ol", "or", "qoky", "dary",
"chol", "qokair", "sair", "opdor", "ockhol",
"chedy", "qokchy", "ofain", "ykal", "oteedy",
"sheey", "otey", "cphol", "sholdaly", "cheol",
"saral", "doroldal", "qolky", "qokeedy", "chdy",
]
# Alternative list that main() swaps in for COMMON_WORDS when --english is set.
COMMON_WORDS_ENGLISH = [
"the", "and", "are", "a", "this",
"is", "unless", "more", "each",
"together", "well", "anything", "right", "wrong",
"stay", "move", "wish", "however", "two",
"one", "four", "because", "always", "trouble",
]
# Everything below is a placeholder that initialize_common_sources() rebuilds
# from COMMON_WORDS; the values are empty until that function has run.
COMMON_COMBINATIONS = []
COMMON_SOURCE_WORD_TOKENS = []
COMMON_SOURCE_WORD_SET = set()
COMMON_WORD_SET = set()
COMMON_WORD_WEIGHTS = []
COMMON_COMBINATION_WEIGHTS = []
WORD_START_BIGRAMS = []
BIGRAM_OPTIONS = {}
TRIGRAMS = []
TRIGRAM_OPTIONS = {}
NONFINAL_TRIGRAMS = set()
FINAL_BIGRAMS = set()
FINAL_TRIGRAMS = set()
WORD_FINAL_SUFFIXES = {}
LONG_WORD_FALLBACKS = []
def make_common_combinations():
    """Create a random list of distinct ordered word pairings.

    The number of pairings is drawn uniformly from
    COMMON_COMBINATION_MIN_COUNT..COMMON_COMBINATION_MAX_COUNT and capped at
    the number of ordered pairs available in COMMON_WORDS.  Each pairing is a
    list of COMMON_COMBINATION_WORDS words sampled without replacement, so a
    word is never paired with itself (e.g. no "daiin daiin").
    """
    word_count = len(COMMON_WORDS)
    wanted = random.randint(
        COMMON_COMBINATION_MIN_COUNT,
        COMMON_COMBINATION_MAX_COUNT,
    )
    wanted = min(wanted, word_count * (word_count - 1))

    chosen = []
    already_used = set()
    while len(chosen) < wanted:
        pair = tuple(random.sample(COMMON_WORDS, COMMON_COMBINATION_WORDS))
        if pair not in already_used:
            already_used.add(pair)
            chosen.append(list(pair))
    return chosen
def make_rank_weights(items):
    """Return one selection weight per item, falling linearly with rank.

    The first item gets COMMON_RANK_TOP_WEIGHT and the last gets
    COMMON_RANK_BOTTOM_WEIGHT.  With zero or one item every weight is
    COMMON_RANK_BOTTOM_WEIGHT.
    """
    count = len(items)
    if count <= 1:
        return [COMMON_RANK_BOTTOM_WEIGHT] * count

    span = COMMON_RANK_BOTTOM_WEIGHT - COMMON_RANK_TOP_WEIGHT
    step = span / (count - 1)
    weights = []
    for rank in range(count):
        weights.append(COMMON_RANK_TOP_WEIGHT + (step * rank))
    return weights
def weighted_choice(items, weights):
    """Pick one element of items using the given relative weights."""
    (picked,) = random.choices(items, weights=weights, k=1)
    return picked
def initialize_common_sources():
    """(Re)build every module-level lookup table from COMMON_WORDS.

    Must run after COMMON_WORDS has its final value and after the RNG is
    seeded, because the word pairings are chosen randomly here.  It fills:

    * COMMON_COMBINATIONS and the rank weights for words and pairings;
    * COMMON_SOURCE_WORD_TOKENS: the common words plus every word of every
      pairing, so words used in pairings count more often in the statistics;
    * WORD_START_BIGRAMS, BIGRAM_OPTIONS, TRIGRAMS, TRIGRAM_OPTIONS: the
      character n-gram material long words are assembled from;
    * FINAL_BIGRAMS, FINAL_TRIGRAMS, NONFINAL_TRIGRAMS, WORD_FINAL_SUFFIXES:
      which n-grams may or may not end a word;
    * LONG_WORD_FALLBACKS: source words long enough to stand in for a
      generated long word.

    LONG_WORD_FALLBACKS is sorted.  It was previously built by iterating a
    set of strings, whose order changes between interpreter runs under hash
    randomization, so a random pick from it was not reproducible even with
    a fixed SEED.
    """
    global COMMON_COMBINATIONS, COMMON_SOURCE_WORD_TOKENS
    global COMMON_SOURCE_WORD_SET, COMMON_WORD_SET
    global COMMON_WORD_WEIGHTS, COMMON_COMBINATION_WEIGHTS
    global WORD_START_BIGRAMS, BIGRAM_OPTIONS
    global TRIGRAMS, TRIGRAM_OPTIONS, NONFINAL_TRIGRAMS
    global FINAL_BIGRAMS, FINAL_TRIGRAMS
    global WORD_FINAL_SUFFIXES, LONG_WORD_FALLBACKS

    # The only step that consumes random numbers.
    COMMON_COMBINATIONS = make_common_combinations()
    COMMON_WORD_SET = set(COMMON_WORDS)
    COMMON_WORD_WEIGHTS = make_rank_weights(COMMON_WORDS)
    COMMON_COMBINATION_WEIGHTS = make_rank_weights(COMMON_COMBINATIONS)

    COMMON_SOURCE_WORD_TOKENS = COMMON_WORDS + [
        word for combo in COMMON_COMBINATIONS for word in combo
    ]
    COMMON_SOURCE_WORD_SET = set(COMMON_SOURCE_WORD_TOKENS)

    WORD_START_BIGRAMS = [
        word[:2] for word in COMMON_SOURCE_WORD_TOKENS if len(word) >= 2
    ]
    TRIGRAMS = [
        word[i:i + 3]
        for word in COMMON_SOURCE_WORD_TOKENS
        for i in range(len(word) - 2)
    ]

    BIGRAM_OPTIONS = {}
    TRIGRAM_OPTIONS = {}
    NONFINAL_TRIGRAMS = set()
    FINAL_BIGRAMS = set()
    FINAL_TRIGRAMS = set()
    WORD_FINAL_SUFFIXES = {}

    for word in COMMON_SOURCE_WORD_TOKENS:
        if len(word) >= 2:
            FINAL_BIGRAMS.add(word[-2:])
        if len(word) >= 3:
            FINAL_TRIGRAMS.add(word[-3:])

        # Bigrams are keyed by their first character; duplicates are kept on
        # purpose so frequent transitions are picked more often.
        for i in range(len(word) - 1):
            bigram = word[i:i + 2]
            BIGRAM_OPTIONS.setdefault(bigram[0], []).append(bigram)

        # Trigrams are keyed by their first two characters.
        for i in range(len(word) - 2):
            trigram = word[i:i + 3]
            TRIGRAM_OPTIONS.setdefault(trigram[:2], []).append(trigram)
            if i < len(word) - 3:
                NONFINAL_TRIGRAMS.add(trigram)
            # NOTE(review): the pasted source lost its indentation; this
            # assumes a word tail is recorded for every trigram position,
            # not only for non-final ones.
            suffix = word[i:]
            WORD_FINAL_SUFFIXES.setdefault(suffix[:2], []).append(suffix)

    for prefix, suffixes in WORD_FINAL_SUFFIXES.items():
        WORD_FINAL_SUFFIXES[prefix] = sorted(suffixes, key=len)

    # Sorted so the list order does not depend on string hash randomization.
    LONG_WORD_FALLBACKS = sorted(
        word for word in COMMON_SOURCE_WORD_SET
        if len(word) >= LONG_WORD_MIN_LEN
    )
def continue_with_bigram(word, target_length, disallowed_words):
    """Extend word by one character chosen from the known bigrams.

    Candidates are the bigrams that start with the word's last character.
    While the extended word would still be shorter than target_length any
    candidate is acceptable.  When the extension would be the final
    character, the result must not be in disallowed_words and must end in a
    known word-final bigram and trigram.

    Returns the extended word, or None when no candidate qualifies.
    """
    is_last_step = len(word) + 1 >= target_length
    allowed = []
    for bigram in BIGRAM_OPTIONS.get(word[-1], []):
        if is_last_step:
            candidate = word + bigram[1]
            if (
                candidate in disallowed_words
                or candidate[-2:] not in FINAL_BIGRAMS
                or candidate[-3:] not in FINAL_TRIGRAMS
            ):
                continue
        allowed.append(bigram)

    if not allowed:
        return None
    return word + random.choice(allowed)[1]
def make_trigram_long_word(disallowed_words=None):
    """Assemble one long word from n-grams of the common source words.

    A target length is drawn once.  Each attempt starts from a random
    word-start bigram and grows the word until it reaches that length:

    1. if a recorded word tail beginning with the last two characters lands
       exactly on the target length (and the result is not disallowed),
       finish with it;
    2. otherwise extend by one character via a non-final trigram;
    3. otherwise fall back to continue_with_bigram().

    The first completed word that is not itself a source word is returned.
    If every attempt yields a source word, the first completed word is
    returned; if nothing was completed at all, a random entry of
    LONG_WORD_FALLBACKS is used (preferring ones that are not disallowed).
    """
    target_length = random.randint(LONG_WORD_MIN_LEN, LONG_WORD_MAX_LEN)
    banned = set(disallowed_words or ())

    def attempt():
        """Run one build attempt; return the finished word or None."""
        word = random.choice(WORD_START_BIGRAMS)
        while len(word) < target_length:
            tail = word[-2:]
            endings = [
                suffix
                for suffix in WORD_FINAL_SUFFIXES.get(tail, [])
                if len(word) + len(suffix) - 2 == target_length
                and word + suffix[2:] not in banned
            ]
            if endings:
                return word + random.choice(endings)[2:]

            steps = [
                trigram
                for trigram in (TRIGRAM_OPTIONS.get(tail) or [])
                if trigram in NONFINAL_TRIGRAMS
            ]
            if steps:
                word += random.choice(steps)[2]
                continue

            word = continue_with_bigram(word, target_length, banned)
            if word is None:
                return None
            if len(word) == target_length:
                return word
        # Reached the target length through a non-final trigram: rejected.
        return None

    first_completed = None
    for _ in range(MAX_TRIGRAM_WORD_ATTEMPTS):
        word = attempt()
        if word is None or len(word) < LONG_WORD_MIN_LEN:
            continue
        if first_completed is None:
            first_completed = word
        if word not in COMMON_SOURCE_WORD_SET:
            return word

    if first_completed is not None:
        return first_completed

    spare = [word for word in LONG_WORD_FALLBACKS if word not in banned]
    return random.choice(spare or LONG_WORD_FALLBACKS)
def append_recent_long_word(recent_long_words, word):
    """Append word to the recent list in place, dropping the oldest entry
    when the list grows past RECENT_LONG_WORD_WINDOW."""
    recent_long_words.append(word)
    overflow = len(recent_long_words) > RECENT_LONG_WORD_WINDOW
    if overflow:
        del recent_long_words[0]
def make_mixed_long_word(recent_long_words, previous_word=None):
    """Return the next long word: a repeat of a recent one or a new one.

    Once the recent window is full, with probability LONG_WORD_REPEAT_CHANCE
    a recent word other than previous_word is repeated.  Otherwise new words
    are generated until one is neither a common word, nor a recent long
    word, nor equal to previous_word.  If the attempt budget runs out, a
    suitable entry of LONG_WORD_FALLBACKS is used, then the first acceptable
    generated word.

    Raises:
        RuntimeError: if no acceptable word could be produced at all.
    """
    window_full = len(recent_long_words) >= RECENT_LONG_WORD_WINDOW
    if window_full and random.random() < LONG_WORD_REPEAT_CHANCE:
        repeatable = [w for w in recent_long_words if w != previous_word]
        if repeatable:
            return random.choice(repeatable)

    recent_set = set(recent_long_words)
    disallowed = COMMON_WORD_SET | recent_set
    first_acceptable = None

    for _ in range(MAX_MIXED_LONG_WORD_ATTEMPTS):
        candidate = make_trigram_long_word(disallowed)
        if candidate in disallowed:
            continue
        if first_acceptable is None:
            first_acceptable = candidate
        if candidate != previous_word:
            return candidate

    if first_acceptable is not None and first_acceptable != previous_word:
        return first_acceptable

    spare = [
        word
        for word in LONG_WORD_FALLBACKS
        if word != previous_word and word not in disallowed
    ]
    if spare:
        return random.choice(spare)

    if first_acceptable is None:
        raise RuntimeError(
            "Unable to generate a valid long word that is not a common word "
            "or one of the recent long words."
        )
    return first_acceptable
def append_marked_word(words, marker, word):
    """Append a (marker, word) tuple to words in place."""
    entry = (marker, word)
    words.append(entry)
def generate_words(total_words, recent_long_words):
    """Generate total_words entries as (marker, word) tuples.

    Each round emits either one weighted common word (marker "A") or one
    weighted common pairing (marker "B" on each of its words), followed by
    LONG_WORDS_PER_MIX_MIN..LONG_WORDS_PER_MIX_MAX long words (marker "C").
    Every long word is also recorded in recent_long_words, which is mutated
    in place.  Output never exceeds total_words entries.
    """
    words = []
    while len(words) < total_words:
        if random.random() < COMMON_WORD_CHANCE:
            single = weighted_choice(COMMON_WORDS, COMMON_WORD_WEIGHTS)
            words.append(("A", single))
        else:
            combo = weighted_choice(
                COMMON_COMBINATIONS,
                COMMON_COMBINATION_WEIGHTS,
            )
            # Only take as many words of the pairing as still fit.
            room = total_words - len(words)
            words.extend(("B", part) for part in combo[:room])

        long_word_count = random.randint(
            LONG_WORDS_PER_MIX_MIN,
            LONG_WORDS_PER_MIX_MAX,
        )
        for _ in range(long_word_count):
            if len(words) >= total_words:
                break
            previous_word = words[-1][1] if words else None
            long_word = make_mixed_long_word(recent_long_words, previous_word)
            words.append(("C", long_word))
            append_recent_long_word(recent_long_words, long_word)

    return words[:total_words]
def char_ngrams(words, n):
    """Count every contiguous run of n characters inside each word.

    N-grams never span word boundaries; words shorter than n contribute
    nothing.  Returns a Counter mapping n-gram -> number of occurrences.
    """
    return Counter(
        word[start:start + n]
        for word in words
        for start in range(len(word) - n + 1)
    )
def analyze(all_words):
    """Print corpus-level statistics for the generated text to stdout.

    Reports token and type counts, the type/token ratio (TTR), the number of
    hapax legomena (words occurring exactly once), the 25 most frequent
    words, and the 30 most frequent character bigrams and trigrams.

    An empty corpus is reported with a TTR of 0.0000 instead of raising
    ZeroDivisionError.
    """
    rule = "=" * 60
    word_freq = Counter(all_words)
    token_count = len(all_words)
    type_count = len(word_freq)
    # Guard the ratio: an empty corpus has no meaningful TTR.
    ttr = type_count / token_count if token_count else 0.0
    char_bigram_counts = char_ngrams(all_words, 2)
    char_trigram_counts = char_ngrams(all_words, 3)

    print()
    print(rule)
    print("GLOBAL WORD STATISTICS")
    print(rule)
    print()
    print(f"Tokens : {token_count}")
    print(f"Types : {type_count}")
    print(f"TTR : {ttr:.4f}")
    print(f"Hapax : {sum(1 for c in word_freq.values() if c == 1)}")
    print()
    print("Top 25 word tokens")
    print()
    for word, count in word_freq.most_common(25):
        print(f"{word:15} {count}")

    print()
    print(rule)
    print("CHARACTER BIGRAMS")
    print(rule)
    print()
    for bg, count in char_bigram_counts.most_common(30):
        print(f"{bg:5} {count}")

    print()
    print(rule)
    print("CHARACTER TRIGRAMS")
    print(rule)
    print()
    for tg, count in char_trigram_counts.most_common(30):
        print(f"{tg:5} {count}")
def format_token(entry, debug=False):
    """Render a (marker, word) entry for printing.

    Returns just the word, or "marker word" when debug is true.
    """
    marker, word = entry
    return f"{marker} {word}" if debug else word
def print_page(page_num, page, debug=False):
    """Print one page: a banner with the page number, then the entries.

    Entries are (marker, word) tuples, rendered with format_token() and laid
    out WORDS_PER_LINE per line, separated by single spaces.
    """
    banner = "=" * 60
    print()
    print(banner)
    print(f"PAGE {page_num}")
    print(banner)
    print()
    for start in range(0, len(page), WORDS_PER_LINE):
        row = page[start:start + WORDS_PER_LINE]
        rendered = [format_token(entry, debug) for entry in row]
        print(" ".join(rendered))
def main():
    """Command-line entry point.

    Flags: --debug prefixes each printed word with its source marker;
    --english swaps the common-word list for COMMON_WORDS_ENGLISH.  The whole
    text is generated as one stream, printed page by page, then analysed.
    """
    global COMMON_WORDS

    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--english", action="store_true")
    args = parser.parse_args()

    if args.english:
        COMMON_WORDS = COMMON_WORDS_ENGLISH

    # Seed before building the tables: the pairings are chosen randomly.
    random.seed(SEED)
    initialize_common_sources()

    all_entries = generate_words(PAGES * WORDS_PER_PAGE, [])

    for page_index in range(PAGES):
        first = page_index * WORDS_PER_PAGE
        page = all_entries[first:first + WORDS_PER_PAGE]
        print_page(page_index + 1, page, debug=args.debug)

    analyze([word for _, word in all_entries])
if __name__ == "__main__":
    main()
Dunsel > Yesterday, 12:00 AM
(26-05-2026, 06:14 PM)oshfdk Wrote: You are not allowed to view links. Register or Login to view.In fact, this way of creating a fake language is much simpler and produces a much more plausible result:
1) create a list of common words, about 20-30, write them down
2) create 10-20 common combinations of these common words, write them down
3) start writing mixing common words and common word combinations with random longer words, which would occasionally repeat
4) done
# minimal_phrase_generator_v3.py
#
# "Helped" version of the forum proposal:
#
# 1) fixed common words
# 2) fixed common combinations
# 3) mix those with occasional longer words
# 4) longer words are generated by using existing Ledger_scribe1.json
# 5) generated longer words may repeat locally
#
# Required external file:
# Ledger_scribe1.json
#
# Example:
# py minimal_phrase_generator_v3.py --ledger Ledger_scribe1.json --pages 100 --print-all-pages
import argparse
import json
import random
from collections import Counter
from pathlib import Path
# ============================================================
# SETTINGS
# ============================================================
# Defaults for the --seed, --pages, --words-per-page and --words-per-line
# command-line options defined in main().
DEFAULT_SEED = 42
DEFAULT_PAGES = 100
DEFAULT_WORDS_PER_PAGE = 95
DEFAULT_WORDS_PER_LINE = 8
# Uniform branch selection.
# This avoids hidden weighting between the three production modes.
MODES = [
"common_word",
"common_combination",
"ledger_word",
]
# This is the smallest operational version of
# "random longer words which occasionally repeat."
# RECENT_LEDGER_WORD_WINDOW is also the look-back distance (in page words)
# used by the short-range repeat check in passes_dls().
LEDGER_WORD_REPEAT_CHANCE = 0.25
RECENT_LEDGER_WORD_WINDOW = 10
# Ledger does not encode word length, so this is the one explicit external assumption.
MIN_LEDGER_WORD_LEN = 5
MAX_LEDGER_WORD_LEN = 12
# Retry budget of Ledger.generate_word() before it raises RuntimeError.
MAX_ATTEMPTS_PER_LEDGER_WORD = 200
# GALLOWS: at most one per valid token (Ledger.validate); also the characters
# stripped by family_form().  VOWELS: used for the run-length limits below.
GALLOWS = set("ktpf")
VOWELS = set("aeioy")
# Basic "don't look stupid" guards copied from the v11 generator idea.
# These are deliberately simple visual plausibility filters, not grammar.
MAX_VOWEL_RUN = 4
MAX_CONSONANT_RUN = 4
MAX_TOKEN_LEN = 12
# Repeat/family guards for newly generated ledger words.
# A candidate is rejected once it (or its gallows-stripped family) already
# occurs this many times on the page / in the recent window.
RECENT_TOKEN_REPEAT_LIMIT = 3
MAX_LEDGER_TOKEN_PAGE_COUNT = 5
MAX_LEDGER_FAMILY_PAGE_COUNT = 9
# ============================================================
# FIXED COMMON WORDS
# ============================================================
COMMON_WORDS = [
"daiin", "dain", "ol", "or", "chol",
"chedy", "qokain", "qokeedy", "qotedy", "otedy",
"shey", "cthey", "cthol", "shol", "chor",
"dair", "qokair", "saiin", "aiin", "okain",
"sary", "okol", "qol", "qokal", "chdy",
]
# ============================================================
# FIXED COMMON COMBINATIONS
# ============================================================
# NOTE: "qokedy" appears here but not in COMMON_WORDS; fixed_vocab() therefore
# collects words from both lists.
COMMON_COMBINATIONS = [
["qokain", "daiin"],
["chol", "chedy"],
["qokeedy", "qokedy"],
["ol", "chedy"],
["dain", "chol"],
["shey", "qokain"],
["cthey", "daiin"],
["shol", "chor"],
["qotedy", "qokain"],
["otedy", "ol"],
["saiin", "daiin"],
["qokair", "dair"],
["chol", "daiin"],
["cthol", "chedy"],
["or", "chol"],
]
# ============================================================
# BASIC HELPERS
# ============================================================
def weighted_choice(rng, values, weights=None):
    """Pick one element of values using rng.

    Returns None when values is empty.  Without weights the pick is uniform;
    otherwise weights gives the relative likelihood of each value.
    """
    if not values:
        return None
    pool = list(values)
    if weights is None:
        return rng.choice(pool)
    (picked,) = rng.choices(pool, weights=list(weights), k=1)
    return picked
def char_ngrams(words, n):
    """Count every contiguous run of n characters inside each word.

    N-grams never span word boundaries; words shorter than n contribute
    nothing.  Returns a Counter mapping n-gram -> number of occurrences.
    """
    return Counter(
        word[start:start + n]
        for word in words
        for start in range(len(word) - n + 1)
    )
def max_run(token, charset):
    """Return the length of the longest unbroken run of characters in token
    that all belong to charset (0 if there is none)."""
    longest = 0
    streak = 0
    for ch in token:
        streak = streak + 1 if ch in charset else 0
        if streak > longest:
            longest = streak
    return longest
def family_form(token):
    """Return token with every GALLOWS character removed.

    Tokens that differ only in their gallows characters share a family form.
    """
    kept = [ch for ch in token if ch not in GALLOWS]
    return "".join(kept)
def passes_dls(token, ledger, page):
    """Decide whether token may be added to page ("don't look stupid" filter).

    Only applied to ledger-generated words, never to the fixed common words
    or combinations.  A token is rejected when it is empty or longer than
    MAX_TOKEN_LEN, contains too long a run of vowels or of non-vowel
    alphabet characters, equals the last word on the page, already occurs
    too often in the recent window or on the whole page, or when its
    gallows-stripped family is already over-represented on the page.
    Anything that survives must also pass ledger.validate().
    """
    if not token or len(token) > MAX_TOKEN_LEN:
        return False

    if max_run(token, VOWELS) > MAX_VOWEL_RUN:
        return False
    non_vowels = set(ledger.alphabet) - VOWELS
    if max_run(token, non_vowels) > MAX_CONSONANT_RUN:
        return False

    # No immediate repetition of the previous word.
    if page and page[-1] == token:
        return False

    recent_window = page[-RECENT_LEDGER_WORD_WINDOW:]
    if recent_window.count(token) >= RECENT_TOKEN_REPEAT_LIMIT:
        return False
    if page.count(token) >= MAX_LEDGER_TOKEN_PAGE_COUNT:
        return False

    family = family_form(token)
    if family:
        relatives = [family_form(existing) for existing in page].count(family)
        if relatives >= MAX_LEDGER_FAMILY_PAGE_COUNT:
            return False

    return ledger.validate(token)
# ============================================================
# YOUR LEDGER FORMAT
# ============================================================
class Ledger:
    """Glyph-transition table loaded from a ledger JSON file.

    The file must contain a "ledger" object mapping each glyph to a row.  A
    row maps a column name (by default "prefix", "midfix", "suffix") to tiers
    (by default "80", "18", "2"), each tier listing the glyphs allowed to
    follow.  Optional "metadata" may override the column and tier names and
    supply single-glyph "short_tokens" and start-glyph weights.
    """

    def __init__(self, path):
        """Load the ledger at path and precompute weighted follower tables.

        Raises:
            FileNotFoundError: if path does not exist.
        """
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(
                f"Ledger file not found: {path}\n"
                "Put Ledger_scribe1.json beside this script or pass --ledger PATH."
            )
        with path.open("r", encoding="utf-8") as handle:
            data = json.load(handle)

        metadata = data.get("metadata", {})
        self.metadata = metadata
        self.rows = data["ledger"]
        self.alphabet = list(data.get("alphabet", self.rows.keys()))
        self.columns = tuple(metadata.get("columns", ["prefix", "midfix", "suffix"]))
        self.tiers = tuple(metadata.get("tiers", ["80", "18", "2"]))
        self.short_tokens = set((metadata.get("short_tokens") or {}).keys())
        self.tier_weights = self._tier_weights(self.tiers)
        self.first_token_values, self.first_token_weights = self._load_first_token_weights()

        # followers[glyph][column] -> (candidate glyphs, matching weights)
        self.followers = {
            glyph: {
                column: self._column_followers(row, column)
                for column in self.columns
            }
            for glyph, row in self.rows.items()
        }

    def _column_followers(self, row, column):
        """Flatten one row/column into parallel (values, weights) lists.

        Each tier's weight is shared equally among the glyphs listed in it;
        empty or missing tiers contribute nothing.
        """
        values = []
        weights = []
        tiers_in_column = row.get(column, {})
        for tier, tier_weight in self.tier_weights:
            glyphs = tiers_in_column.get(tier, [])
            if not glyphs:
                continue
            share = tier_weight / len(glyphs)
            for follower in glyphs:
                values.append(follower)
                weights.append(share)
        return (values, weights)

    def _tier_weights(self, tiers):
        """Turn tier names into normalised (tier, weight) pairs.

        A numeric name such as "80" is read as a percentage; a non-numeric
        name gets weight 1.0.  The weights are then scaled to sum to 1.
        """
        raw = []
        for tier in tiers:
            try:
                weight = float(tier) / 100.0
            except ValueError:
                weight = 1.0
            raw.append((tier, weight))
        total = sum(weight for _, weight in raw) or 1.0
        return [(tier, weight / total) for tier, weight in raw]

    def _load_first_token_weights(self):
        """Return (start glyphs, weights) taken from the metadata.

        Explicit weights ("first_token_weights" / "start_token_weights") are
        preferred over counts ("first_tokens" / "start_tokens").  Only glyphs
        that have a ledger row and a positive value are kept, sorted by
        glyph.  Without usable metadata every ledger glyph is returned with
        weights None, meaning a uniform choice.
        """
        key_pairs = (
            ("first_token_weights", "start_token_weights"),
            ("first_tokens", "start_tokens"),
        )
        for primary, alias in key_pairs:
            raw = self.metadata.get(primary) or self.metadata.get(alias)
            if not raw:
                continue
            pairs = sorted(
                (glyph, float(value))
                for glyph, value in raw.items()
                if glyph in self.rows and float(value) > 0
            )
            if pairs:
                return [p[0] for p in pairs], [p[1] for p in pairs]
        return list(self.rows.keys()), None

    def choose_start_glyph(self, rng):
        """Pick the first glyph of a word (None if there are no candidates)."""
        return weighted_choice(
            rng,
            self.first_token_values,
            self.first_token_weights,
        )

    def choose_follower(self, rng, left, column):
        """Pick a glyph to follow left in the given column, or None when the
        ledger lists no follower there."""
        by_column = self.followers.get(left, {})
        values, weights = by_column.get(column, ([], []))
        if not values:
            return None
        return weighted_choice(rng, values, weights)

    def legal_transition(self, left, right, column):
        """Return True if right is listed after left in any tier of column."""
        row = self.rows.get(left, {}).get(column, {})
        return any(right in row.get(tier, []) for tier in self.tiers)

    def has_multiple_gallows(self, token):
        """Return True if token contains more than one GALLOWS character."""
        return len([ch for ch in token if ch in GALLOWS]) > 1

    def validate(self, token):
        """Check token against the ledger.

        A valid token is non-empty and has at most one gallows character.  A
        single glyph must be a listed short token.  Otherwise the first glyph
        needs a ledger row, the first transition must be a legal "prefix"
        transition, the last a legal "suffix" transition, and every
        transition in between a legal "midfix" transition.
        """
        if not token or self.has_multiple_gallows(token):
            return False
        if len(token) == 1:
            return token in self.short_tokens
        if token[0] not in self.rows:
            return False
        if not self.legal_transition(token[0], token[1], "prefix"):
            return False
        middle_ok = all(
            self.legal_transition(token[index - 1], token[index], "midfix")
            for index in range(2, len(token) - 1)
        )
        return middle_ok and self.legal_transition(token[-2], token[-1], "suffix")

    def generate_word(self, rng, page=None, min_len=MIN_LEDGER_WORD_LEN, max_len=MAX_LEDGER_WORD_LEN):
        """Generate one new word that passes passes_dls() for the given page.

        Each attempt draws a target length in min_len..max_len, picks a start
        glyph and then adds followers: the second glyph from the "prefix"
        column, the final glyph from "suffix", all others from "midfix".  An
        attempt is dropped if it dead-ends or the result fails the filter.

        Raises:
            RuntimeError: after MAX_ATTEMPTS_PER_LEDGER_WORD failed attempts.
        """
        page = page or []
        for _ in range(MAX_ATTEMPTS_PER_LEDGER_WORD):
            target_len = rng.randint(min_len, max_len)
            first = self.choose_start_glyph(rng)
            if not first:
                continue
            chars = [first]
            while len(chars) < target_len:
                position = len(chars)
                if position == 1:
                    column = "prefix"
                elif position == target_len - 1:
                    column = "suffix"
                else:
                    column = "midfix"
                follower = self.choose_follower(rng, chars[-1], column)
                if follower is None:
                    break  # dead end: no follower listed for this glyph
                chars.append(follower)
            token = "".join(chars)
            if len(token) == target_len and passes_dls(token, self, page):
                return token
        raise RuntimeError(
            "Could not generate a ledger-valid DLS-passing word after "
            f"{MAX_ATTEMPTS_PER_LEDGER_WORD} attempts."
        )
# ============================================================
# GENERATION
# ============================================================
def generate_page(rng, ledger, words_per_page):
    """Generate one page as a list of exactly words_per_page words.

    Each step picks one of MODES uniformly: a common word, a common
    combination (both words at once), or a ledger word.  A ledger word is
    either a reuse of a recent ledger word that still passes passes_dls()
    (tried with probability LEDGER_WORD_REPEAT_CHANCE) or a newly generated
    one.  A combination may overshoot the page size by one word, so the
    result is truncated before returning.
    """
    page = []
    recent = []  # recent newly generated ledger words, oldest first

    while len(page) < words_per_page:
        mode = rng.choice(MODES)

        if mode == "common_word":
            page.append(rng.choice(COMMON_WORDS))

        elif mode == "common_combination":
            page += rng.choice(COMMON_COMBINATIONS)

        elif mode == "ledger_word":
            word = None
            if recent and rng.random() < LEDGER_WORD_REPEAT_CHANCE:
                reusable = [
                    candidate for candidate in recent
                    if passes_dls(candidate, ledger, page)
                ]
                if reusable:
                    word = rng.choice(reusable)

            if word is None:
                # NOTE(review): the pasted source lost its indentation; this
                # assumes only newly generated words enter the recent window.
                word = ledger.generate_word(rng, page=page)
                recent.append(word)
                if len(recent) > RECENT_LEDGER_WORD_WINDOW:
                    del recent[0]

            page.append(word)

    return page[:words_per_page]
# ============================================================
# OUTPUT / ANALYSIS
# ============================================================
def fixed_vocab():
    """Return the set of all fixed words: every entry of COMMON_WORDS plus
    every word that appears in a COMMON_COMBINATIONS pairing."""
    vocab = set(COMMON_WORDS)
    vocab.update(word for combo in COMMON_COMBINATIONS for word in combo)
    return vocab
def print_page(page_num, page, words_per_line):
    """Print one page: a banner with the page number, then the words laid
    out words_per_line per line, separated by single spaces."""
    banner = "=" * 60
    print()
    print(banner)
    print(f"PAGE {page_num}")
    print(banner)
    print()
    for start in range(0, len(page), words_per_line):
        row = page[start:start + words_per_line]
        print(" ".join(row))
def analyze(all_words):
    """Print corpus-level statistics for the generated text to stdout.

    Reports overall token/type counts, type/token ratio (TTR) and hapax
    count; the size of the fixed vocabulary; counts for ledger-generated
    words (every word not in the fixed vocabulary); the 25 most frequent
    words; the 30 most frequent character bigrams and trigrams; and up to 25
    ledger-generated words that occur more than once.

    An empty corpus (e.g. --pages 0) is reported with a TTR of 0.0000
    instead of raising ZeroDivisionError.
    """
    rule = "=" * 60
    vocab = fixed_vocab()
    freq = Counter(all_words)
    generated_occurrences = [
        word for word in all_words
        if word not in vocab
    ]
    generated_freq = Counter(generated_occurrences)
    # Guard the ratio: an empty corpus has no meaningful TTR.
    ttr = len(freq) / len(all_words) if all_words else 0.0

    print()
    print(rule)
    print("GLOBAL STATISTICS")
    print(rule)
    print()
    print(f"Tokens : {len(all_words)}")
    print(f"Types : {len(freq)}")
    print(f"TTR : {ttr:.4f}")
    print(f"Hapax : {sum(1 for c in freq.values() if c == 1)}")
    print(f"Fixed vocabulary size : {len(vocab)}")
    print(f"Ledger-word occurrences : {len(generated_occurrences)}")
    print(f"Ledger-word types : {len(generated_freq)}")
    print(f"Repeated ledger-word types : {sum(1 for c in generated_freq.values() if c > 1)}")
    print(f"Max ledger-word repeat : {max(generated_freq.values()) if generated_freq else 0}")

    print()
    print("Top 25 word tokens")
    print()
    for word, count in freq.most_common(25):
        print(f"{word:20} {count}")

    print()
    print("Top 30 character bigrams")
    print()
    for bg, count in char_ngrams(all_words, 2).most_common(30):
        print(f"{bg:5} {count}")

    print()
    print("Top 30 character trigrams")
    print()
    for tg, count in char_ngrams(all_words, 3).most_common(30):
        print(f"{tg:5} {count}")

    print()
    print("Top repeated ledger-generated words")
    print()
    # most_common() is sorted by count, so the first singleton ends the list
    # of repeated words; at most 25 are shown.
    shown = 0
    for word, count in generated_freq.most_common():
        if count <= 1:
            break
        print(f"{word:20} {count}")
        shown += 1
        if shown >= 25:
            break
def main():
    """Command-line entry point.

    Loads the ledger, prints the fixed vocabulary, generates the requested
    number of pages, prints the first three pages and the last one (or all
    of them with --print-all-pages), and prints the analysis.  With --output
    the vocabulary and page text are also written to that file; the analysis
    goes to stdout only.
    """
    parser = argparse.ArgumentParser(
        description="Minimal phrase + common word generator using the existing Scribe 1 ledger for new words."
    )
    parser.add_argument("--ledger", default="Ledger_scribe1.json")
    parser.add_argument("--pages", type=int, default=DEFAULT_PAGES)
    parser.add_argument("--words-per-page", type=int, default=DEFAULT_WORDS_PER_PAGE)
    parser.add_argument("--words-per-line", type=int, default=DEFAULT_WORDS_PER_LINE)
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED)
    parser.add_argument("--print-all-pages", action="store_true")
    parser.add_argument("--output", default=None, help="Optional text output file.")
    args = parser.parse_args()

    rng = random.Random(args.seed)
    ledger = Ledger(args.ledger)
    keep_copy = args.output is not None
    captured = []
    corpus = []

    def emit(text=""):
        # Print the line and, when --output was given, remember it too.
        print(text)
        if keep_copy:
            captured.append(text)

    emit("Common words:")
    emit(", ".join(COMMON_WORDS))
    emit()
    emit("Common combinations:")
    for combo in COMMON_COMBINATIONS:
        emit(" ".join(combo))

    banner = "=" * 60
    for page_num in range(1, args.pages + 1):
        page = generate_page(rng, ledger, args.words_per_page)
        corpus.extend(page)

        show_page = (
            args.print_all_pages
            or page_num <= 3
            or page_num == args.pages
        )
        if not show_page:
            continue

        emit()
        emit(banner)
        emit(f"PAGE {page_num}")
        emit(banner)
        emit()
        for start in range(0, len(page), args.words_per_line):
            emit(" ".join(page[start:start + args.words_per_line]))

    # Analysis prints to stdout directly.
    analyze(corpus)

    if keep_copy:
        Path(args.output).write_text("\n".join(captured) + "\n", encoding="utf-8")
if __name__ == "__main__":
    main()
bi3mw > Yesterday, 12:04 AM
(26-05-2026, 11:28 PM)Dunsel Wrote: You are not allowed to view links. Register or Login to view.PRECISELY... which is what my ledger does. Weighted letters.
I was thinking more along the lines of an extension for “random.” Just for a quick suggestion, I decided to ask the AI for the first time. The basic idea seems viable, but of course the accuracy of the weighting would need to be verified.
# oshfdk_literal_generator_modified.py
# No external files required.
#
# Literal version of:
# 1) create 20-30 common words
# 2) create 10-20 common combinations
# 3) mix common words + common combinations + random longer words
# 4) random longer words occasionally repeat
#
# No mutation.
# No ledger.
# No syllables.
# No hidden morphology.
#
# FIX for Problem:
# Long words are no longer generated by flat random character sampling.
# Instead, positional character pools (INITIAL / MEDIAL / TERMINAL) with
# weights give each word a plausible frame:
# INITIAL – q, d, s, c dominate word-starts
# MEDIAL – o, e, a, i are the backbone of word bodies
# TERMINAL – n, y, r, l, m are the most common word-endings
import random
from collections import Counter
# Seed handed to random.seed() in main(); a fixed seed makes runs repeatable.
SEED = 42
# Number of pages main() generates and prints.
PAGES = 100
# Target word count per page; generate_page() truncates each page to this.
WORDS_PER_PAGE = 95
# Number of words print_page() joins onto one output line.
WORDS_PER_LINE = 8
# ---------------------------------------------------------------------------
# Standard EVA-ish alphabet, sorted alphabetically.
# NOTE(review): nothing in the code shown here reads this constant, and it
# contains no "m" although TERMINAL_CHARS can emit one -- confirm intent.
# ---------------------------------------------------------------------------
VOYNICH_ALPHABET = "acdefhiklnoqrstxy"
# Inclusive bounds for the length of a generated long word (random.randint).
LONG_WORD_MIN_LEN = 5
LONG_WORD_MAX_LEN = 12
# Probability that a long-word slot reuses a remembered long word, provided
# at least one is remembered on the current page.
LONG_WORD_REPEAT_CHANCE = 0.25
# Maximum number of long words remembered per page for possible reuse.
RECENT_LONG_WORD_WINDOW = 10
# ---------------------------------------------------------------------------
# Common words and combinations (unchanged)
# ---------------------------------------------------------------------------
# Fixed vocabulary of 25 frequent words; generate_page() draws from it
# uniformly with random.choice().
COMMON_WORDS = [
    "daiin", "dain", "ol", "or", "chol",
    "chedy", "qokain", "qokeedy", "qotedy", "otedy",
    "shey", "cthey", "cthol", "shol", "chor",
    "dair", "qokair", "saiin", "aiin", "okain",
    "sary", "okol", "qol", "qokal", "chdy",
]
# Fixed two-word sequences; generate_page() picks one uniformly and appends
# both words in order.
# NOTE(review): "qokedy" (third entry) does not appear in COMMON_WORDS, while
# every other word used here does -- confirm whether that is deliberate.
COMMON_COMBINATIONS = [
    ["qokain", "daiin"],
    ["chol", "chedy"],
    ["qokeedy","qokedy"],
    ["ol", "chedy"],
    ["dain", "chol"],
    ["shey", "qokain"],
    ["cthey", "daiin"],
    ["shol", "chor"],
    ["qotedy", "qokain"],
    ["otedy", "ol"],
    ["saiin", "daiin"],
    ["qokair", "dair"],
    ["chol", "daiin"],
    ["cthol", "chedy"],
    ["or", "chol"],
]
# ---------------------------------------------------------------------------
# Weights are intentionally skewed to reflect the strong positional biases
# observed in Voynich script:
# INITIAL – q, d, s, c dominate word-starts
# MEDIAL – o, e, a, i are the backbone of word bodies
# TERMINAL – n, y, r, l, m are the most common word-endings
# ---------------------------------------------------------------------------
# Each pool maps a character to a relative integer weight; the weights need
# not sum to anything in particular because weighted_choice() hands them to
# random.choices() as-is.

# Pool for the first character of a generated long word.
INITIAL_CHARS = {
    "q": 8, "d": 7, "s": 6, "c": 6,
    "o": 4, "a": 3, "f": 2, "r": 1,
    "k": 1, "t": 1,
}
# Pool for every character between the first and the last.
MEDIAL_CHARS = {
    "o": 9, "e": 7, "a": 6, "i": 6,
    "l": 4, "n": 3, "k": 3, "h": 3,
    "r": 2, "t": 2, "d": 1, "s": 1,
}
# Pool for the last character of a generated long word.
TERMINAL_CHARS = {
    "n": 8, "y": 8, "r": 6, "l": 5,
    "m": 4, "s": 3, "d": 3,
}
def weighted_choice(weight_dict: dict) -> str:
    """Draw one key from ``weight_dict``, favouring keys with larger weights.

    Args:
        weight_dict: Mapping of candidate character to its relative weight.

    Returns:
        The selected key.
    """
    candidates = [*weight_dict]
    relative_weights = [weight_dict[c] for c in candidates]
    # random.choices always returns a list; k=1 gives exactly one element.
    (picked,) = random.choices(candidates, weights=relative_weights, k=1)
    return picked
def make_random_long_word() -> str:
    """Assemble one long word from the three positional character pools.

    The word is one INITIAL_CHARS draw, then ``length - 2`` MEDIAL_CHARS
    draws, then one TERMINAL_CHARS draw, where ``length`` is chosen uniformly
    from LONG_WORD_MIN_LEN..LONG_WORD_MAX_LEN (inclusive).

    Returns:
        The generated word.
    """
    total_len = random.randint(LONG_WORD_MIN_LEN, LONG_WORD_MAX_LEN)
    # Draws happen strictly first-to-last so a seeded run stays reproducible.
    letters = [weighted_choice(INITIAL_CHARS)]
    letters.extend(weighted_choice(MEDIAL_CHARS) for _ in range(total_len - 2))
    letters.append(weighted_choice(TERMINAL_CHARS))
    return "".join(letters)
# ---------------------------------------------------------------------------
# Page generation (unchanged logic, fixed long-word generator plugged in)
# ---------------------------------------------------------------------------
def generate_page() -> list[str]:
    """Produce the word list for a single page.

    Each step picks one of three modes uniformly: a single common word, a
    two-word common combination, or a long word. A long-word step reuses a
    remembered long word with probability LONG_WORD_REPEAT_CHANCE (when any
    is remembered); otherwise it generates a new one.

    Returns:
        Exactly WORDS_PER_PAGE words. A combination landing on the last slot
        is cut after its first word by the final truncation.
    """
    modes = ["common_word", "combination", "long_word"]
    words: list[str] = []
    # Long words remembered on this page only; the memory restarts per call.
    remembered: list[str] = []

    while len(words) < WORDS_PER_PAGE:
        mode = random.choice(modes)

        if mode == "common_word":
            words.append(random.choice(COMMON_WORDS))
            continue

        if mode == "combination":
            words.extend(random.choice(COMMON_COMBINATIONS))
            continue

        # mode == "long_word"
        # random.random() is only consulted when something is remembered.
        if remembered and random.random() < LONG_WORD_REPEAT_CHANCE:
            words.append(random.choice(remembered))
            continue

        # NOTE(review): assumes only freshly generated words enter the
        # memory (reused ones are not re-appended) -- the pasted source lost
        # its indentation, so confirm against the original script.
        fresh = make_random_long_word()
        remembered.append(fresh)
        if len(remembered) > RECENT_LONG_WORD_WINDOW:
            del remembered[0]
        words.append(fresh)

    return words[:WORDS_PER_PAGE]
# ---------------------------------------------------------------------------
# Analysis helpers (unchanged)
# ---------------------------------------------------------------------------
def char_ngrams(words: list[str], n: int) -> Counter:
    """Count every contiguous ``n``-character substring within each word.

    N-grams never span word boundaries, and words shorter than ``n``
    contribute nothing.

    Args:
        words: Words to scan.
        n: Length of the substrings to count.

    Returns:
        Counter mapping each n-gram to its number of occurrences.
    """
    # For a word shorter than n the range below is empty, so no explicit
    # length guard is needed.
    return Counter(
        token[start:start + n]
        for token in words
        for start in range(len(token) - n + 1)
    )
def analyze(all_words: list[str]) -> None:
    """Print word-level and character n-gram statistics for a corpus.

    Writes three sections to stdout: global word statistics (token count,
    type count, type/token ratio, hapax count, top 25 words), the 30 most
    frequent character bigrams, and the 30 most frequent character trigrams.

    Args:
        all_words: Every word of the generated text, in order. May be empty,
            in which case the type/token ratio is reported as 0.0000 rather
            than raising ZeroDivisionError.
    """

    def banner(title: str) -> None:
        # Blank line, ruled title, blank line -- shared by all three sections.
        print()
        print("=" * 60)
        print(title)
        print("=" * 60)
        print()

    word_freq = Counter(all_words)
    token_count = len(all_words)
    type_count = len(word_freq)
    # Guard the ratio: an empty corpus has no tokens to divide by.
    ttr = type_count / token_count if token_count else 0.0
    hapax_count = sum(1 for c in word_freq.values() if c == 1)
    char_bigram_counts = char_ngrams(all_words, 2)
    char_trigram_counts = char_ngrams(all_words, 3)

    banner("GLOBAL WORD STATISTICS")
    print(f"Tokens : {token_count}")
    print(f"Types : {type_count}")
    print(f"TTR : {ttr:.4f}")
    print(f"Hapax : {hapax_count}")
    print()
    print("Top 25 word tokens")
    print()
    for word, count in word_freq.most_common(25):
        print(f"{word:15} {count}")

    banner("CHARACTER BIGRAMS")
    for bg, count in char_bigram_counts.most_common(30):
        print(f"{bg:5} {count}")

    banner("CHARACTER TRIGRAMS")
    for tg, count in char_trigram_counts.most_common(30):
        print(f"{tg:5} {count}")
def print_page(page_num: int, page: list[str]) -> None:
    """Print one page: a ruled "PAGE n" header, then the words in rows.

    Args:
        page_num: Number shown in the header.
        page: Words of the page; printed WORDS_PER_LINE per line, separated
            by single spaces.
    """
    rule = "=" * 60
    # One call emits: blank line, rule, title, rule, blank line.
    print("", rule, f"PAGE {page_num}", rule, "", sep="\n")

    row_start = 0
    while row_start < len(page):
        row = page[row_start:row_start + WORDS_PER_LINE]
        print(" ".join(row))
        row_start += WORDS_PER_LINE
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Generate PAGES pages, print each one, then print corpus statistics.

    The global random generator is seeded with SEED first, so the output is
    the same on every run.
    """
    random.seed(SEED)
    corpus: list[str] = []
    for page_index in range(PAGES):
        page_words = generate_page()
        corpus += page_words
        # Pages are numbered from 1 in the printed header.
        print_page(page_index + 1, page_words)
    analyze(corpus)
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()