Dunsel > Today, 04:22 AM
Code:
#!/usr/bin/env python3
# Takahashi Voynich Parser — LOCKED, CLEAN VERSION
import sys, re, hashlib
from collections import defaultdict, OrderedDict
import inspect as _inspect_internal_guard

# -------------------------
# TAG + NORMALIZATION
# -------------------------
TAG_LINE = re.compile(
    r'^<(?P<folio>f\d+[rv](\d*)?)\.(?P<tag>[A-Z]+)(?P<idx>\d+)?(?:\.(?P<line>\d+))?;H>(?P<payload>.*)$'
)
A_Z_SPACE = re.compile(r'[^a-z ]+')

def normalize_payload(s: str) -> str:
    s = re.sub(r'\{[^}]*\}', '', s)
    s = re.sub(r'<![^>]*>', '', s)
    s = s.replace('<->', ' ')
    s = s.replace('\t', ' ').replace('.', ' ')
    s = s.lower()
    s = A_Z_SPACE.sub(' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s
# -------------------------
# INTERNAL ITERATOR (raw)
# -------------------------
def _orig_iter_h_records(path, wanted_folio=None):
    current = None
    buf = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for raw in f:
            line = raw.rstrip('\n')
            if not line:
                continue
            if line.startswith('<'):
                if current and buf:
                    folio, tag, idx, ln = current
                    payload = ''.join(buf)
                    yield (folio, tag, idx, ln, payload)
                m = TAG_LINE.match(line)
                if m:
                    folio = m.group('folio')
                    if (wanted_folio is None) or (folio == wanted_folio):
                        tag = m.group('tag')
                        idx = m.group('idx') or '0'
                        ln = m.group('line') or '1'
                        payload = m.group('payload')
                        current = (folio, tag, idx, ln)
                        buf = [payload]
                    else:
                        current = None
                        buf = []
                else:
                    current = None
                    buf = []
            else:
                if current is not None:
                    buf.append(line)
        if current and buf:
            folio, tag, idx, ln = current
            payload = ''.join(buf)
            yield (folio, tag, idx, ln, payload)
# -------------------------
# API GUARD
# -------------------------
_API_GUARD_ENABLED = True

def allow_internal_helpers():
    global _API_GUARD_ENABLED
    _API_GUARD_ENABLED = False

def _guard_internal(name: str):
    if not _API_GUARD_ENABLED:
        return
    frame = _inspect_internal_guard.currentframe()
    # Walk two frames back: _guard_internal -> guarded wrapper -> real caller.
    caller = frame.f_back.f_back if frame and frame.f_back else None
    mod = caller.f_globals.get("__name__", "") if caller else __name__
    if mod == __name__:
        return
    raise RuntimeError(f"{name} is INTERNAL. Use public API only.")

def iter_h_records(*args, **kwargs):
    _guard_internal("iter_h_records")
    return _orig_iter_h_records(*args, **kwargs)
# -------------------------
# CORPUS PARSING
# -------------------------
def _parse_folio_corpus_for_sanity(path, folio):
    fid = str(folio).lower()
    if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
        return ''
    pieces = []
    for _folio, _tag, _idx, _ln, payload in _orig_iter_h_records(path, folio):
        norm = normalize_payload(payload)
        if norm:
            pieces.append(norm)
    return ' '.join(pieces).strip()

def parse_folio_corpus(path, folio):
    """Deprecated: this mixes body, label, marginal, and special into one blob.
    Kept only for CLI/sanity compatibility. Do NOT use for analysis."""
    raise RuntimeError(
        "parse_folio_corpus is deprecated and disabled. Use parse_folio_structured "
        "and iter_corpus_words/profile-aware helpers instead."
    )

def parse_folio_structured(path, folio):
    assert_folio_exists(path, folio)
    fid = str(folio).lower()
    if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
        return {}
    groups = defaultdict(lambda: defaultdict(list))
    for _folio, tag, idx, _ln, payload in _orig_iter_h_records(path, folio):
        norm = normalize_payload(payload)
        if norm:
            groups[tag][idx].append(norm)
    out = {}
    for tag, by_idx in groups.items():
        od = OrderedDict()
        for idx in sorted(by_idx, key=lambda x: int(x)):
            od[f"{tag}{idx}"] = ' '.join(by_idx[idx]).strip()
        out[tag] = od
    out = sort_structured(out)
    return out
# -------------------------
# TAG CATEGORIES & CORPUS ITERATOR
# -------------------------
# Expected mapping from tag letter to high-level category.
TAG_CATEGORY = {
    # BODY TEXT
    "P": "body",
    "C": "body",
    "T": "body",
    # LABELS
    "R": "label",
    "S": "label",
    "L": "label",
    "X": "label",
    "Y": "label",
    "Z": "label",
    "B": "label",
    "K": "label",
    "I": "label",
    "F": "label",
    "U": "label",
    "O": "label",
    # MARGINAL / SCRIBBLES
    "M": "marginal",
    "N": "marginal",
    "W": "marginal",
    # SPECIAL STRUCTURE
    "Q": "special",
}

def classify_tag(tag: str) -> str:
    """Return high-level category for a tag, or 'unknown'."""
    return TAG_CATEGORY.get(str(tag).upper(), "unknown")

def assert_folio_exists(pages_path: str, folio: str) -> None:
    """Raise if folio is not present in the transcription."""
    folio_norm = str(folio).lower()
    existing = set(get_folio_list(pages_path, include_excluded=True))
    if folio_norm not in existing:
        # Suggest similar foldouts if any share the prefix.
        prefix = folio_norm
        similar = sorted(f for f in existing if f.startswith(prefix))
        msg = [f"Folio '{folio}' not found in {pages_path}."]
        if similar:
            msg.append("Did you mean: " + ", ".join(similar))
        raise ValueError(" ".join(msg))

def iter_corpus_words(pages_path: str,
                      profile: str,
                      include_excluded: bool = False):
    """
    Yield (folio, tag, unit_key, word) for a given high-level profile.
    profile:
        'body'     -> tags classified as body
        'labels'   -> tags classified as label
        'marginal' -> tags classified as marginal
        'special'  -> tags classified as special
    """
    profile = str(profile).lower()
    valid = {"body", "labels", "marginal", "special"}
    if profile not in valid:
        raise ValueError(f"Unknown profile: {profile!r}; expected one of {sorted(valid)}")
    wanted_category = {
        "body": "body",
        "labels": "label",
        "marginal": "marginal",
        "special": "special",
    }[profile]
    for folio in get_folio_list(pages_path, include_excluded=include_excluded):
        struct = parse_folio_structured(pages_path, folio)
        for tag, blocks in struct.items():
            cat = classify_tag(tag)
            if cat != wanted_category:
                continue
            if isinstance(blocks, dict):
                for unit_key, text in blocks.items():
                    for word in text.split():
                        yield folio, tag, unit_key, word
# -------------------------
# SANITY CHECK (kept)
# -------------------------
import hashlib as _hashlib

def sha256(text: str) -> str:
    return _hashlib.sha256(text.encode('utf-8')).hexdigest()

SENTINELS = {
    'f49v':  {'tokens': 151, 'sha256': '172a8f2b7f06e12de9e69a73509a570834b93808d81c79bb17e5d93ebb0ce0d0'},
    'f68r3': {'tokens': 104, 'sha256': '8e9aa4f9c9ed68f55ab2283c85581c82ec1f85377043a6ad9eff6550ba790f61'},
}

def sanity_check(path):
    results = {}
    for folio, exp in SENTINELS.items():
        line = _parse_folio_corpus_for_sanity(path, folio)
        toks = len(line.split())
        dig = sha256(line)
        results[folio] = {
            'ok': (toks == exp['tokens'] and dig == exp['sha256']),
            'tokens': toks,
            'sha256': dig
        }
    all_ok = all(v['ok'] for v in results.values())
    return all_ok, results

# -------------------------
# EXCLUSIONS (kept)
# -------------------------
_EXCLUDE_EMPTY_FOLIOS_ENABLED = True
_EMPTY_FOLIOS = set([
    'f101r2','f109r','f109v','f110r','f110v','f116v','f12r','f12v',
    'f59r','f59v','f60r','f60v','f61r','f61v','f62r','f62v',
    'f63r','f63v','f64r','f64v','f74r','f74v','f91r','f91v',
    'f92r','f92v','f97r','f97v','f98r','f98v'
])

def set_exclude_empty_folios(flag: bool):
    global _EXCLUDE_EMPTY_FOLIOS_ENABLED
    _EXCLUDE_EMPTY_FOLIOS_ENABLED = bool(flag)

def get_exclude_empty_folios() -> bool:
    return _EXCLUDE_EMPTY_FOLIOS_ENABLED

def get_excluded_folios() -> list:
    return sorted(_EMPTY_FOLIOS)
# -------------------------
# SORTING + ORDER
# -------------------------
_REGION_ORDER = {"P": 0, "T": 1, "C": 2, "R": 3}

def sort_structured(struct):
    try:
        out = OrderedDict()
        def region_key(tag):
            return (_REGION_ORDER.get(tag, 99), tag)
        for tag in sorted(struct.keys(), key=region_key):
            blocks = struct[tag]
            if isinstance(blocks, dict):
                od = OrderedDict()
                def idx_key(k):
                    m = re.search(r"(\d+)$", str(k))
                    return int(m.group(1)) if m else 999999
                for k in sorted(blocks.keys(), key=idx_key):
                    od[k] = blocks[k]
                out[tag] = od
            else:
                out[tag] = blocks
        return out
    except Exception:
        return struct

def folio_sort_key(fid: str):
    fid = fid.lower().strip()
    m = re.match(r"^f(\d+)(r|v)(\d+)?$", fid)
    if not m:
        return (999999, 9, 999999)
    return (int(m.group(1)), 0 if m.group(2) == 'r' else 1, int(m.group(3) or 0))

# -------------------------
# ASTRO REMAP (kept)
# -------------------------
_ASTRO_START, _ASTRO_END = 67, 73
_KEEP_AS_IS = {"C", "R", "P", "T"}
_folio_re_ast = re.compile(r"^f(\d+)(r|v)(\d+)?$")

def _is_astro_folio(folio: str) -> bool:
    m = _folio_re_ast.match(str(folio).lower())
    if not m:
        return False
    num = int(m.group(1))
    return _ASTRO_START <= num <= _ASTRO_END

def _astro_remap_unknown_to_R(folio: str, out: dict) -> dict:
    if not isinstance(out, dict) or not _is_astro_folio(folio):
        return sort_structured(out)
    if not out:
        return sort_structured(out)
    out = dict(out)  # shallow copy
    out.setdefault("R", OrderedDict())
    unknown = [t for t in list(out.keys()) if t not in _KEEP_AS_IS]
    for tag in unknown:
        blocks = out.get(tag, {})
        if isinstance(blocks, dict):
            for unit_key, text in blocks.items():
                new_key = f"R_from_{tag}_{unit_key}"
                if new_key in out["R"]:
                    out["R"][new_key] += " " + (text or "")
                else:
                    out["R"][new_key] = text
        out.pop(tag, None)
    return sort_structured(out)
# -------------------------
# PUBLIC: Folio list
# -------------------------
def get_folio_list(pages_path: str, include_excluded: bool = False) -> list:
    ids = set()
    for folio, _t, _i, _l, _p in _orig_iter_h_records(pages_path, None):
        ids.add(folio.lower())
    ids = sorted(ids, key=folio_sort_key)
    if include_excluded:
        return ids
    return [fid for fid in ids if fid not in _EMPTY_FOLIOS]

# -------------------------
# PUBLIC: Lines in natural order (NEW)
# -------------------------
def get_folio_lines_ordered(pages_path: str, folio: str) -> list:
    folio = str(folio).lower()
    lines = []
    for _folio, tag, idx, ln, payload in _orig_iter_h_records(pages_path, folio):
        norm = normalize_payload(payload)
        if norm:
            region_rank = _REGION_ORDER.get(tag, 99)
            lines.append((region_rank, tag, int(idx), int(ln), norm))
    lines.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
    return [x[4] for x in lines]

# -------------------------
# CLI — ONLY sanity + parse
# -------------------------
USAGE = '''
Usage:
    python TParser.py sanity PagesH.txt
    python TParser.py parse PagesH.txt <folio> corpus
    python TParser.py parse PagesH.txt <folio> structured
'''

def main(argv):
    if len(argv) < 3:
        print(USAGE); sys.exit(1)
    cmd = argv[1].lower()
    path = argv[2]
    if cmd == 'sanity':
        ok, res = sanity_check(path)
        print("PRECHECK:", "PASS" if ok else "FAIL")
        for f, info in res.items():
            print(f"  {f}: ok={info['ok']} tokens={info['tokens']} sha256={info['sha256']}")
        sys.exit(0 if ok else 2)
    if cmd == 'parse':
        if len(argv) != 5:
            print(USAGE); sys.exit(1)
        folio = argv[3].lower()
        mode = argv[4].lower()
        if mode == 'corpus':
            print(parse_folio_corpus(path, folio))
        elif mode == 'structured':
            data = parse_folio_structured(path, folio)
            order = ['P','T','C','R','X','N','S','V','L']
            for grp in order + [k for k in sorted(data) if k not in order]:
                if grp in data and data[grp]:
                    print(f'[{grp}]')
                    for k, v in data[grp].items():
                        print(f'{k}: {v}')
                    print()
        else:
            print(USAGE); sys.exit(1)
        sys.exit(0)
    print(USAGE); sys.exit(1)

if __name__ == '__main__':
    main(sys.argv)
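A minimal usage sketch of the public API above, assuming the script is saved as TParser.py (the name used in USAGE) next to the Takahashi transcription file PagesH.txt:
Code:
# Run the sentinel sanity check, then peek at the body units of a few folios.
import TParser as tp

ok, report = tp.sanity_check("PagesH.txt")
print("sanity:", "PASS" if ok else "FAIL")

for folio in tp.get_folio_list("PagesH.txt")[:3]:
    struct = tp.parse_folio_structured("PagesH.txt", folio)
    # Body text lives under the P/C/T tags; each unit is one paragraph-like block.
    body_units = {k: v for tag in ("P", "C", "T") for k, v in struct.get(tag, {}).items()}
    print(folio, "body units:", list(body_units))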
[i]Jorge_Stolfi > Today, 06:10 AM
(Today, 04:22 AM)Dunsel Wrote: Voynich words peak sharply at 5–6 glyphs, with a long but low-frequency tail.
Dunsel > Today, 06:37 AM
(Today, 06:10 AM)Jorge_Stolfi Wrote: (Today, 04:22 AM)Dunsel Wrote: Voynich words peak sharply at 5–6 glyphs, with a long but low-frequency tail.
That "long-frequency tail" is probably transcription errors, where multiple words are strung together. Hunt them down and check with the page images.
You seem to be computing the length distribution of tokens (word occurrences). Try instead that of lexemes ("word types").
Be sure to map everything to lowercase.
For a double surprise, try also Chinese or Vietnamese...
All the best, --stolfi
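To make the token-versus-lexeme distinction concrete, here is a small sketch of the two counts (the example words are placeholders; any lowercased word list works the same way):
Code:
# Word-length distribution over tokens (every occurrence) vs. types (distinct words).
from collections import Counter

def length_distributions(words):
    words = [w.lower() for w in words]                 # map everything to lowercase
    token_counts = Counter(len(w) for w in words)      # each occurrence counted
    type_counts = Counter(len(w) for w in set(words))  # each distinct word counted once
    def normalize(counts):
        total = sum(counts.values())
        return {n: counts[n] / total for n in sorted(counts)}
    return normalize(token_counts), normalize(type_counts)

tokens, types = length_distributions(["daiin", "daiin", "chedy", "qokeedy"])
print(tokens)   # {5: 0.75, 7: 0.25}   -> 'daiin' counted twice
print(types)    # {5: 0.67, 7: 0.33}   -> 'daiin' counted once (approx.)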
Philipp Harland > Today, 08:31 AM
(Today, 04:22 AM)Dunsel Wrote: Again, not one to give up, I stumbled onto another method that halfway works. Here are the details:
But first, why am I insistent on getting AI to work with the Voynich? Well, it's simple. First, it's fast: it can create code to parse and generate charts faster than I could hope to. Second, it's damned creative: it can devise new methods to analyze the Voynich that I could only imagine. Third, its resources are vast: it has intimate knowledge (when it's not lying) about the history of medieval texts and the encryption methods used during that time.
So, if I CAN get something reliable out of it, it can do years' worth of work in weeks.
So, here's the current method. It involves using a free Google tool called Colab. Colab is a website where you can paste chunks of Python, run the Python to generate tables and charts, and combine that with text to form a complete 'runnable' document.
GPT understands Colab, and it can generate Python scripts that you can copy, paste, and run in Colab.
So, here's what I did. I went over my parser code and had GPT refine and clean it up. It's in the code section below. I then uploaded that to a 'Voynich' directory on my Google Drive. I also uploaded my 4 sample text files from Gutenberg. I uploaded the Takahashi transcription. And I had GPT create another parser for my Gutenberg texts to strip out the header and parse the words.
I also uploaded all of these files to GPT so that it would understand them. I told it where on Google Drive the files were located and that it should create all Python files with those directories in mind. Its basic instructions were to use the parsers on Google Drive to parse the Voynich and the 4 control corpora, and to create code to upload to Colab and run various tests.
Here, let me show you the results:
At the top is a cell that contains text describing what the charts are and the theory used to create them. Each "cell" below that is a chunk of Python code. There is a button at the top left of each cell where you can 'run' the Python code and produce the chart results.
So, what I do is discuss with GPT what I'd like to do. I have it create a 'sample' chart that I can view in the chat. If it's something I want to keep, I ask it to produce the Colab code for it. I then copy and paste the code into Colab, run it, and verify the results match what GPT produced.
CAVEATS: GPT will still lie, it will still try to find shortcuts, it will still fabricate. So, you have to 'eyeball' the code when you paste it in Colab to make sure it's not manufacturing fake data. Colab also has a button where you can have Gemini review the code if you don't write it yourself.
So, here are the results, and these are not from GPT; they are from code it produced that was then run on Colab. The captions are GPT-created.
TLDR;
By using Google Colab, a Python parser, and Takahashi's transcription, you can get ChatGPT to create Python files that can be pasted into Colab, which allows you to perform some pretty complex tests on the Voynich and display the results. This essentially turns GPT into a resource that creates the code and helps generate ideas on what type of work you'd like to perform. However, it takes the results of that code out of its hands and places them in a location where YOU have deterministic control. This is, so far, the only way I've found to use GPT to analyze the Voynich but keep it from fabricating results.
Is this an ultimate solution? No, it can still generate bogus code that will work perfectly on Colab. You'll have to verify it.
FIGURE 1 — Raw Word-Length Distributions: Voynich and Four Corpora
Caption:
Normalized word-length distributions for the Voynich Manuscript (TParser-extracted body text) and four Gutenberg corpora (English, Finnish, Latin, Esperanto). Voynich words peak sharply at 5–6 glyphs, with a long but low-frequency tail. Natural languages show distinct characteristic peaks: English around 4–5 letters, Esperanto around 2–3, Finnish around 6, and Latin around 5–7. This chart establishes the baseline behavior of each corpus before applying any cipher or encoding transformations.
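For orientation, the Voynich side of such a chart reduces to roughly the following; a sketch that assumes the TParser module posted above is importable, that PagesH.txt is the transcription path, and that matplotlib is available (it is preinstalled in Colab):
Code:
# Normalized word-length distribution for Voynich body text (P/C/T tags only).
from collections import Counter
import matplotlib.pyplot as plt
import TParser as tp

counts = Counter(len(w) for _f, _t, _u, w in tp.iter_corpus_words("PagesH.txt", "body"))
total = sum(counts.values())
xs = sorted(counts)
ys = [counts[x] / total for x in xs]   # normalize so the bars sum to 1

plt.bar(xs, ys)
plt.xlabel("word length (transcription characters)")
plt.ylabel("relative frequency")
plt.title("Voynich body text word-length distribution")
plt.show()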
FIGURE 2 — Chunking Model: Voynich vs English vs Bigram-Chunked English (k = 2)
Caption:
Word-length distributions comparing Voynich, raw English, and English encoded by a simple bigram chunking method (k = 2). The chunked-English distribution demonstrates how dividing text into fixed-size units (bigrams) drastically alters word length statistics—producing a sharp peak at 1–2 chunks and an exponential decay thereafter. This serves as a simple toy cipher model illustrating how unit-based encoding compresses or expands word-length structure.
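The bigram-chunking toy model amounts to something like the following sketch (a reconstruction from the caption, not the actual notebook code): each word is cut into fixed-size pieces of k characters, and its new "length" is the number of pieces.
Code:
# Fixed-size chunking of words into k-character units (k = 2 gives bigrams).
from collections import Counter

def chunk_length(word, k=2):
    # "theory" -> ["th", "eo", "ry"] -> 3 chunks
    return (len(word) + k - 1) // k   # ceil(len / k)

def chunked_distribution(words, k=2):
    counts = Counter(chunk_length(w, k) for w in words)
    total = sum(counts.values())
    return {n: counts[n] / total for n in sorted(counts)}

print(chunked_distribution(["the", "quick", "brown", "fox", "a"]))
# {1: 0.2, 2: 0.4, 3: 0.4}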
FIGURE 3 — SP-System Word-Length Distributions: English, Finnish, Latin, Esperanto
Caption:
SP-System (Syllabic–Polygraphic System) token-length distributions for four languages, using the SP tri-/bi-gram inventory built separately for each corpus. The SP transformation compresses common digraphs and trigraphs into single units while preserving rare sequences at the character level. Each language retains its unique “signature,” but the SP system generally shifts distributions toward mid-range token lengths (3–6 units) and generates long-tail behavior similar to some cipher traditions. This figure demonstrates the effect of polygraphic encoding on natural-language word-length structure.
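The caption does not give the SP inventory-building details, so treat the following as a guess at the general shape of such a transform: take the most frequent trigrams and bigrams of a corpus as the unit inventory, then tokenize each word greedily, longest match first, leaving rare letters as single-character units.
Code:
# Sketch: polygraphic ("SP-style") tokenization with a per-corpus tri-/bi-gram inventory.
from collections import Counter

def build_inventory(words, n_tri=50, n_bi=50):
    tri = Counter(w[i:i+3] for w in words for i in range(len(w) - 2))
    bi  = Counter(w[i:i+2] for w in words for i in range(len(w) - 1))
    return set(g for g, _ in tri.most_common(n_tri)) | set(g for g, _ in bi.most_common(n_bi))

def sp_tokenize(word, inventory):
    units, i = [], 0
    while i < len(word):
        for size in (3, 2, 1):                 # longest match first
            piece = word[i:i+size]
            if size == 1 or piece in inventory:
                units.append(piece)
                i += size
                break
    return units

words = ["quidquid", "latine", "dictum", "altum", "videtur"]
inv = build_inventory(words)
units = sp_tokenize("quidquid", inv)
print(units, len(units))   # the unit count is the "SP word length"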
FIGURE 4 — SP3-A Word-Length Distributions for Four Languages
Caption:
Normalized SP3-A token-length distributions for four Gutenberg corpora (English, Finnish, Latin, Esperanto). Under the SP3-A model, high-frequency function words (e.g., “the,” “of,” “in,” Latin “et,” “ad,” “in,” etc.) are fused into the following word before encoding. Each fused word is then transformed using the SP2 polygraphic system: commonly occurring trigrams and bigrams are compressed into single units, with remaining letters encoded individually. The resulting SP3-A token-length profiles show how short-word fusion—an attested practice in several 15th-century cipher traditions—shifts natural-language distributions toward longer, mid-range token lengths (4–6 units) while reducing the number of 1-unit tokens. This chart evaluates how historically plausible pre-cipher transformations alter the statistical “shape” of English, Finnish, Latin, and Esperanto before any comparison to the Voynich Manuscript is made.
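The fusion step described for SP3-A would look roughly like the sketch below (again a reconstruction from the caption; the function-word list is illustrative only, and the sp_tokenize helper from the previous sketch is assumed for the encoding step): a short function word is glued onto the word that follows it before the polygraphic encoding is applied.
Code:
# Sketch: SP3-A-style short-word fusion before polygraphic encoding.
FUNCTION_WORDS = {"the", "of", "in", "et", "ad"}   # illustrative list only

def fuse_function_words(words):
    fused, carry = [], ""
    for w in words:
        if w in FUNCTION_WORDS:
            carry += w                 # hold it and glue it onto the next word
        else:
            fused.append(carry + w)
            carry = ""
    if carry:
        fused.append(carry)            # trailing function word, kept as-is
    return fused

print(fuse_function_words("et in arcadia ego".split()))
# ['etinarcadia', 'ego']  -> each fused word is then encoded with sp_tokenize(...)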
FIGURE 5 — SP3-A Latin vs Voynich
Caption:
Comparison between Voynich word-length frequencies and Latin transformed through the SP3-A method (short-word fusion + SP2-style polygraphic encoding). Although not a direct match, the SP3-A model shifts Latin toward mid-range token lengths—narrowing the gap between Latin and Voynich compared to raw text. This figure evaluates whether historically plausible pre-cipher transformations (e.g., function-word fusion) increase resemblance to Voynich distributions.
And just for comparison to the one in my OP, here's another letter count heatmap where GPT created the code and Colab executed it and produced the chart.
Jorge_Stolfi > Today, 10:28 AM
nablator > 11 hours ago
(Today, 10:28 AM)Jorge_Stolfi Wrote: (Today, 06:37 AM)Dunsel Wrote: like chokeeokychokoran in 49v exist where Takahashi jammed 2 words together
That looks like three words smashed together. A Voynichese word normally has at most one gallows...
All the best, --stolfi
nablator > 11 hours ago
(Today, 08:31 AM)Philipp Harland Wrote: Why don't you just learn how to do this analysis on your own and apply it to the Voynich? In the end, it'll be more reliable and you will have more control. It's harder than typing a bunch of prompts, but it's very rewarding.
Jorge_Stolfi > 10 hours ago
(11 hours ago)nablator Wrote: With several (maybe) half-spaces (that are as large as full spaces elsewhere), how do you decide if there are 2, 3, 4 or even (shocking idea) 5 words? ... For word length statistics: garbage in, garbage out.