Dunsel > 09-11-2025, 01:17 AM
#!/usr/bin/env python3
# Takahashi Voynich Parser — LOCKED, SELF-CONTAINED (v2025-11-05)
# Author: You + “stop breaking my parser” mode // yes, it gave it that name after a lot of yelling at it.
import sys, re, hashlib
from collections import defaultdict, Counter, OrderedDict
TAG_LINE = re.compile(r'^<(?P<folio>f\d+[rv](\d*)?)\.(?P<tag>[A-Z]+)(?P<idx>\d+)?(?:\.(?P<line>\d+))?;H>(?P<payload>.*)$')
A_Z_SPACE = re.compile(r'[^a-z ]+')
def normalize_payload(s: str) -> str:
s = re.sub(r'\{[^}]*\}', '', s)
s = re.sub(r'<![^>]*>', '', s)
s = s.replace('<->', ' ')
s = s.replace('\t', ' ').replace('.', ' ')
s = s.lower()
s = A_Z_SPACE.sub(' ', s)
s = re.sub(r'\s+', ' ', s).strip()
return s
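# Illustrative example (hypothetical payload, not a real transcription line):
#   normalize_payload('fachys.ykal.ar{plant}.ataiin<->otal<!gap>')
#   -> 'fachys ykal ar ataiin otal'
# i.e. {...} and <!...> annotations are dropped, '<->' and '.' become spaces,
# everything is lowercased, and anything outside a-z collapses to single spaces.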
def iter_h_records(path, wanted_folio=None):
current = None
buf = []
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
for raw in f:
line = raw.rstrip('\n')
if not line:
continue
if line.startswith('<'):
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
m = TAG_LINE.match(line)
if m:
folio = m.group('folio')
if (wanted_folio is None) or (folio == wanted_folio):
tag = m.group('tag')
idx = m.group('idx') or '0'
ln = m.group('line') or '1'
payload = m.group('payload')
current = (folio, tag, idx, ln)
buf = [payload]
else:
current = None
buf = []
else:
current = None
buf = []
else:
if current is not None:
buf.append(line)
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
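# Example record (hypothetical transliteration line):
#   <f49v.P1.2;H>daiin.shey.qokeey
# yields ('f49v', 'P', '1', '2', 'daiin.shey.qokeey'); header lines with a
# different code after the semicolon do not match TAG_LINE and are skipped.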
def parse_folio_corpus(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return ''
pieces = []
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, folio):
norm = normalize_payload(payload)
if norm:
pieces.append(norm)
return ' '.join(pieces).strip()
def parse_folio_structured(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return {}
groups = defaultdict(lambda: defaultdict(list))
for _folio, tag, idx, _ln, payload in iter_h_records(path, folio):
norm = normalize_payload(payload)
if norm:
groups[tag][idx].append(norm)
out = {}
for tag, by_idx in groups.items():
od = OrderedDict()
for idx in sorted(by_idx, key=lambda x: int(x)):
od[f"{tag}{idx}"] = ' '.join(by_idx[idx]).strip()
out[tag] = od
return sort_structured(out)
def sha256(text: str) -> str:
return hashlib.sha256(text.encode('utf-8')).hexdigest()
SENTINELS = {
'f49v': {'tokens': 151, 'sha256': '172a8f2b7f06e12de9e69a73509a570834b93808d81c79bb17e5d93ebb0ce0d0'},
'f68r3': {'tokens': 104, 'sha256': '8e9aa4f9c9ed68f55ab2283c85581c82ec1f85377043a6ad9eff6550ba790f61'},
}
def sanity_check(path):
results = {}
for folio, exp in SENTINELS.items():
line = parse_folio_corpus(path, folio)
toks = len(line.split())
dig = sha256(line)
ok = (toks == exp['tokens']) and (dig == exp['sha256'])
results[folio] = {'ok': ok, 'tokens': toks, 'sha256': dig, 'expected': exp}
all_ok = all(v['ok'] for v in results.values())
return all_ok, results
def most_common_words(path, topn=10):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
counts.update(norm.split())
return counts.most_common(topn)
def single_letter_counts(path):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
for w in norm.split():
if len(w) == 1:
counts[w] += 1
return dict(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))
USAGE = '''
Usage:
python takahashi_parser_locked.py sanity PagesH.txt
python takahashi_parser_locked.py parse PagesH.txt <folio> corpus
python takahashi_parser_locked.py parse PagesH.txt <folio> structured
python takahashi_parser_locked.py foliohash PagesH.txt <folio>
python takahashi_parser_locked.py most_common PagesH.txt [topN]
python takahashi_parser_locked.py singles PagesH.txt
'''
def main(argv):
if len(argv) < 3:
print(USAGE); sys.exit(1)
cmd = argv[1].lower()
path = argv[2]
if cmd == 'sanity':
ok, res = sanity_check(path)
status = 'PASS' if ok else 'FAIL'
print(f'PRECHECK: {status}')
for folio, info in res.items():
print(f" {folio}: ok={info['ok']} tokens={info['tokens']} sha256={info['sha256']}")
sys.exit(0 if ok else 2)
if cmd == 'parse':
if len(argv) != 5:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
mode = argv[4].lower()
if mode == 'corpus':
line = parse_folio_corpus(path, folio)
print(line)
elif mode == 'structured':
data = parse_folio_structured(path, folio)
order = ['P','C','V','L','R','X','N','S']
for grp in order + sorted([k for k in data.keys() if k not in order]):
if grp in data and data[grp]:
print(f'[{grp}]')
for k,v in data[grp].items():
print(f'{k}: {v}')
print()
else:
print(USAGE); sys.exit(1)
sys.exit(0)
if cmd == 'foliohash':
if len(argv) != 4:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
line = parse_folio_corpus(path, folio)
print('Token count:', len(line.split()))
print('SHA-256:', sha256(line))
sys.exit(0)
if cmd == 'most_common':
topn = int(argv[3]) if len(argv) >= 4 else 10
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
for word, cnt in most_common_words(path, topn):
print(f'{word}\t{cnt}')
sys.exit(0)
if cmd == 'singles':
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
d = single_letter_counts(path)
for k,v in d.items():
print(f'{k}\t{v}')
sys.exit(0)
print(USAGE); sys.exit(1)
# ==== BEGIN ASTRO REMAP (LOCKED RULE) ====
import re as _re_ast
# === Exclusion controls injected ===
_EXCLUDE_EMPTY_FOLIOS_ENABLED = True
_EMPTY_FOLIOS = set(['f101r2', 'f109r', 'f109v', 'f110r', 'f110v', 'f116v', 'f12r', 'f12v', 'f59r', 'f59v', 'f60r', 'f60v', 'f61r', 'f61v', 'f62r', 'f62v', 'f63r', 'f63v', 'f64r', 'f64v', 'f74r', 'f74v', 'f91r', 'f91v', 'f92r', 'f92v', 'f97r', 'f97v', 'f98r', 'f98v'])
def set_exclude_empty_folios(flag: bool) -> None:
"""Enable/disable skipping known-empty folios globally."""
global _EXCLUDE_EMPTY_FOLIOS_ENABLED
_EXCLUDE_EMPTY_FOLIOS_ENABLED = bool(flag)
def get_exclude_empty_folios() -> bool:
"""Return current global skip setting."""
return _EXCLUDE_EMPTY_FOLIOS_ENABLED
def get_excluded_folios() -> list:
"""Return the sorted list of folios that are skipped when exclusion is enabled."""
return sorted(_EMPTY_FOLIOS)
# === End exclusion controls ===
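# Usage sketch for the toggle (hedged):
#   get_exclude_empty_folios()        -> True   (default: skip known-empty folios)
#   set_exclude_empty_folios(False)   # include them again for this process
#   'f12r' in get_excluded_folios()   -> True   (the list itself never changes,
#                                                only whether it is applied)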
_ASTRO_START, _ASTRO_END = 67, 73
_KEEP_AS_IS = {"C", "R", "P", "T"}
_folio_re_ast = _re_ast.compile(r"^f(\d+)([rv])(?:([0-9]+))?$")
def _is_astro_folio_ast(folio: str) -> bool:
m = _folio_re_ast.match(folio or "")
if not m:
return False
num = int(m.group(1))
return _ASTRO_START <= num <= _ASTRO_END
def _remap_unknown_to_R_ast(folio: str, out: dict) -> dict:
if not isinstance(out, dict) or not _is_astro_folio_ast(folio):
return sort_structured(out)
if not out:
return sort_structured(out)
out.setdefault("R", {})
unknown_tags = [t for t in list(out.keys()) if t not in _KEEP_AS_IS]
for tag in unknown_tags:
units = out.get(tag, {})
if isinstance(units, dict):
for unit_key, text in units.items():
new_unit = f"R_from_{tag}_{unit_key}"
if new_unit in out["R"]:
out["R"][new_unit] += " " + (text or "")
else:
out["R"][new_unit] = text
out.pop(tag, None)
return sort_structured(out)
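# Worked example (hypothetical structured dict for an astro folio such as f68r1):
#   {'P': {'P1': 'otaiin chol ...'}, 'S': {'S1': 'otol daiin'}}
# becomes, after the remap,
#   {'P': {'P1': 'otaiin chol ...'}, 'R': {'R_from_S_S1': 'otol daiin'}}
# i.e. any tag outside {C, R, P, T} is folded into region R under a unit name
# that records its origin; folios outside f67-f73 pass through unchanged.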
# Wrap only once
try:
parse_folio_structured_original
except NameError:
parse_folio_structured_original = parse_folio_structured
def parse_folio_structured(pages_path: str, folio: str):
out = parse_folio_structured_original(pages_path, folio)
return _remap_unknown_to_R_ast(folio, out)
# ==== END ASTRO REMAP (LOCKED RULE) ====
def effective_folio_ids(pages_path: str) -> list:
    """Return folio ids found in PagesH headers. Respects exclusion toggle for known-empty folios."""
    ids = set()
    for folio, _tag, _idx, _ln, _payload in iter_h_records(pages_path, None):
        if folio:
            ids.add(folio)
    if _EXCLUDE_EMPTY_FOLIOS_ENABLED:
        ids -= _EMPTY_FOLIOS
    return sort_folio_ids(list(ids))
# === Sorting utilities (injected) ===
def folio_sort_key(fid: str):
"""Return a numeric sort key for folio ids like f9r, f10v, f68r3 (recto before verso)."""
s = (fid or "").strip().lower()
m = re.match(r"^f(\d{1,3})(r|v)(\d+)?$", s)
if not m:
# Place unknown patterns at the end in stable order
return (10**6, 9, 10**6, s)
num = int(m.group(1))
side = 0 if m.group(2) == "r" else 1
sub = int(m.group(3)) if m.group(3) else 0
return (num, side, sub, s)
def sort_folio_ids(ids):
"""Sort a sequence of folio ids in natural numeric order using folio_sort_key."""
try:
return sorted(ids, key=folio_sort_key)
except Exception:
# Fallback to stable original order on any error
return list(ids)
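# Example (illustrative ids):
#   sort_folio_ids(['f10v', 'f9r', 'f68r3', 'f68r1', 'f68v'])
#   -> ['f9r', 'f10v', 'f68r1', 'f68r3', 'f68v']
# i.e. numeric folio order, recto before verso, subpages in numeric order,
# whereas a plain string sort would put 'f10v' first and 'f9r' last.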
_REGION_ORDER = {"P": 0, "T": 1, "C": 2, "R": 3}
def sort_structured(struct):
"""Return an OrderedDict-like mapping with regions sorted P,T,C,R and units numerically."""
try:
from collections import OrderedDict
out = OrderedDict()
# Sort regions by our preferred order; unknown tags go after known ones alphabetically
def region_key(tag):
return (_REGION_ORDER.get(tag, 99), tag)
if not isinstance(struct, dict):
return struct
for tag in sorted(struct.keys(), key=region_key):
blocks = struct[tag]
if isinstance(blocks, dict):
od = OrderedDict()
# Unit keys are expected to be numeric strings (idx), or tag+idx; try to extract int
def idx_key(k):
m = re.search(r"(\d+)$", str(k))
return int(m.group(1)) if m else float("inf")
for k in sorted(blocks.keys(), key=idx_key):
od[k] = blocks[k]
out[tag] = od
else:
out[tag] = blocks
return out
except Exception:
return struct
def english_sort_description() -> str:
"""Describe the default sorting rules in plain English."""
return ("ordered numerically by folio number with recto before verso and subpages in numeric order; "
"within each folio, regions are P, then T, then C, then R, and their units are sorted by number.")
def english_receipt(heard: str, did: str) -> None:
"""Print a two-line audit receipt with the plain-English command heard and what was executed."""
if heard is None:
heard = ''
if did is None:
did = ''
print(f"Heard: {heard}")
    print(f"Did: {did}")

if __name__ == '__main__':
    main(sys.argv)

Dunsel > 09-11-2025, 01:54 AM
qoltedy > 10-11-2025, 10:40 AM
Dunsel > 10-11-2025, 10:33 PM
(10-11-2025, 10:40 AM)qoltedy Wrote: Not gonna lie, this is all pretty dense and I didn't read all of it and don't quite get the point, but those graphs are pretty interesting. Do you have any that are normalized to the amount of letters per page? I see certain "bands" where certain letters like "e" seem less common on some pages but more common on others; I'm curious whether a normalized letter count per page aligns with the "languages" of Currier or the "topic analysis" of quimqu on the forums.
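For what it's worth, here is a minimal sketch of that kind of per-page normalization (hedged: not necessarily the exact script behind the plots, and it assumes the parser is saved as a module exposing get_folio_list / parse_folio_corpus; the import name below is illustrative):

from collections import Counter
import takahashi_parser_locked_export_API_GUARDED as vp  # illustrative module name

def normalized_letter_freqs(pages_path):
    # per folio: count of each letter divided by total letters on that folio,
    # so long and short pages become directly comparable
    freqs = {}
    for fid in vp.get_folio_list(pages_path):
        letters = vp.parse_folio_corpus(pages_path, fid).replace(' ', '')
        if not letters:
            continue
        counts = Counter(letters)
        total = sum(counts.values())
        freqs[fid] = {ch: n / total for ch, n in counts.items()}
    return freqs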
Dunsel > 10-11-2025, 10:56 PM
#!/usr/bin/env python3
# Takahashi Voynich Parser — LOCKED, SELF-CONTAINED (v2025-11-05)
# Author: You + “stop breaking my parser” mode
import sys, re, hashlib
from collections import defaultdict, Counter, OrderedDict
TAG_LINE = re.compile(r'^<(?P<folio>f\d+[rv](\d*)?)\.(?P<tag>[A-Z]+)(?P<idx>\d+)?(?:\.(?P<line>\d+))?;H>(?P<payload>.*)$')
A_Z_SPACE = re.compile(r'[^a-z ]+')
def normalize_payload(s: str) -> str:
s = re.sub(r'\{[^}]*\}', '', s)
s = re.sub(r'<![^>]*>', '', s)
s = s.replace('<->', ' ')
s = s.replace('\t', ' ').replace('.', ' ')
s = s.lower()
s = A_Z_SPACE.sub(' ', s)
s = re.sub(r'\s+', ' ', s).strip()
return s
def iter_h_records(path, wanted_folio=None):
current = None
buf = []
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
for raw in f:
line = raw.rstrip('\n')
if not line:
continue
if line.startswith('<'):
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
m = TAG_LINE.match(line)
if m:
folio = m.group('folio')
if (wanted_folio is None) or (folio == wanted_folio):
tag = m.group('tag')
idx = m.group('idx') or '0'
ln = m.group('line') or '1'
payload = m.group('payload')
current = (folio, tag, idx, ln)
buf = [payload]
else:
current = None
buf = []
else:
current = None
buf = []
else:
if current is not None:
buf.append(line)
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
def parse_folio_corpus(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return ''
pieces = []
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, folio):
norm = normalize_payload(payload)
if norm:
pieces.append(norm)
return ' '.join(pieces).strip()
def parse_folio_structured(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return {}
groups = defaultdict(lambda: defaultdict(list))
for _folio, tag, idx, _ln, payload in iter_h_records(path, folio):
norm = normalize_payload(payload)
if norm:
groups[tag][idx].append(norm)
out = {}
for tag, by_idx in groups.items():
od = OrderedDict()
for idx in sorted(by_idx, key=lambda x: int(x)):
od[f"{tag}{idx}"] = ' '.join(by_idx[idx]).strip()
out[tag] = od
return sort_structured(out)
def sha256(text: str) -> str:
return hashlib.sha256(text.encode('utf-8')).hexdigest()
SENTINELS = {
'f49v': {'tokens': 151, 'sha256': '172a8f2b7f06e12de9e69a73509a570834b93808d81c79bb17e5d93ebb0ce0d0'},
'f68r3': {'tokens': 104, 'sha256': '8e9aa4f9c9ed68f55ab2283c85581c82ec1f85377043a6ad9eff6550ba790f61'},
}
def sanity_check(path):
results = {}
for folio, exp in SENTINELS.items():
line = parse_folio_corpus(path, folio)
toks = len(line.split())
dig = sha256(line)
ok = (toks == exp['tokens']) and (dig == exp['sha256'])
results[folio] = {'ok': ok, 'tokens': toks, 'sha256': dig, 'expected': exp}
all_ok = all(v['ok'] for v in results.values())
return all_ok, results
def most_common_words(path, topn=10):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
counts.update(norm.split())
return counts.most_common(topn)
def single_letter_counts(path):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
for w in norm.split():
if len(w) == 1:
counts[w] += 1
return dict(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))
USAGE = '''
Usage:
python takahashi_parser_locked.py sanity PagesH.txt
python takahashi_parser_locked.py parse PagesH.txt <folio> corpus
python takahashi_parser_locked.py parse PagesH.txt <folio> structured
python takahashi_parser_locked.py foliohash PagesH.txt <folio>
python takahashi_parser_locked.py most_common PagesH.txt [topN]
python takahashi_parser_locked.py singles PagesH.txt
'''
def main(argv):
if len(argv) < 3:
print(USAGE); sys.exit(1)
cmd = argv[1].lower()
path = argv[2]
if cmd == 'sanity':
ok, res = sanity_check(path)
status = 'PASS' if ok else 'FAIL'
print(f'PRECHECK: {status}')
for folio, info in res.items():
print(f" {folio}: ok={info['ok']} tokens={info['tokens']} sha256={info['sha256']}")
sys.exit(0 if ok else 2)
if cmd == 'parse':
if len(argv) != 5:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
mode = argv[4].lower()
if mode == 'corpus':
line = parse_folio_corpus(path, folio)
print(line)
elif mode == 'structured':
data = parse_folio_structured(path, folio)
order = ['P','C','V','L','R','X','N','S']
for grp in order + sorted([k for k in data.keys() if k not in order]):
if grp in data and data[grp]:
print(f'[{grp}]')
for k,v in data[grp].items():
print(f'{k}: {v}')
print()
else:
print(USAGE); sys.exit(1)
sys.exit(0)
if cmd == 'foliohash':
if len(argv) != 4:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
line = parse_folio_corpus(path, folio)
print('Token count:', len(line.split()))
print('SHA-256:', sha256(line))
sys.exit(0)
if cmd == 'most_common':
topn = int(argv[3]) if len(argv) >= 4 else 10
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
for word, cnt in most_common_words(path, topn):
print(f'{word}\t{cnt}')
sys.exit(0)
if cmd == 'singles':
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
d = single_letter_counts(path)
for k,v in d.items():
print(f'{k}\t{v}')
sys.exit(0)
print(USAGE); sys.exit(1)
# ==== BEGIN ASTRO REMAP (LOCKED RULE) ====
import re as _re_ast
# === Exclusion controls injected ===
_EXCLUDE_EMPTY_FOLIOS_ENABLED = True
_EMPTY_FOLIOS = set(['f101r2', 'f109r', 'f109v', 'f110r', 'f110v', 'f116v', 'f12r', 'f12v', 'f59r', 'f59v', 'f60r', 'f60v', 'f61r', 'f61v', 'f62r', 'f62v', 'f63r', 'f63v', 'f64r', 'f64v', 'f74r', 'f74v', 'f91r', 'f91v', 'f92r', 'f92v', 'f97r', 'f97v', 'f98r', 'f98v'])
def set_exclude_empty_folios(flag: bool) -> None:
"""Enable/disable skipping known-empty folios globally."""
global _EXCLUDE_EMPTY_FOLIOS_ENABLED
_EXCLUDE_EMPTY_FOLIOS_ENABLED = bool(flag)
def get_exclude_empty_folios() -> bool:
"""Return current global skip setting."""
return _EXCLUDE_EMPTY_FOLIOS_ENABLED
def get_excluded_folios() -> list:
"""Return the sorted list of folios that are skipped when exclusion is enabled."""
return sorted(_EMPTY_FOLIOS)
# === End exclusion controls ===
_ASTRO_START, _ASTRO_END = 67, 73
_KEEP_AS_IS = {"C", "R", "P", "T"}
_folio_re_ast = _re_ast.compile(r"^f(\d+)([rv])(?:([0-9]+))?$")
def _is_astro_folio_ast(folio: str) -> bool:
m = _folio_re_ast.match(folio or "")
if not m:
return False
num = int(m.group(1))
return _ASTRO_START <= num <= _ASTRO_END
def _remap_unknown_to_R_ast(folio: str, out: dict) -> dict:
if not isinstance(out, dict) or not _is_astro_folio_ast(folio):
return sort_structured(out)
if not out:
return sort_structured(out)
out.setdefault("R", {})
unknown_tags = [t for t in list(out.keys()) if t not in _KEEP_AS_IS]
for tag in unknown_tags:
units = out.get(tag, {})
if isinstance(units, dict):
for unit_key, text in units.items():
new_unit = f"R_from_{tag}_{unit_key}"
if new_unit in out["R"]:
out["R"][new_unit] += " " + (text or "")
else:
out["R"][new_unit] = text
out.pop(tag, None)
return sort_structured(out)
# Wrap only once
try:
parse_folio_structured_original
except NameError:
parse_folio_structured_original = parse_folio_structured
def parse_folio_structured(pages_path: str, folio: str):
out = parse_folio_structured_original(pages_path, folio)
return _remap_unknown_to_R_ast(folio, out)
# ==== END ASTRO REMAP (LOCKED RULE) ====
def effective_folio_ids(pages_path: str) -> list:
    """Return folio ids found in PagesH headers. Respects exclusion toggle for known-empty folios."""
    ids = set()
    for folio, _tag, _idx, _ln, _payload in iter_h_records(pages_path, None):
        if folio:
            ids.add(folio)
    if _EXCLUDE_EMPTY_FOLIOS_ENABLED:
        ids -= _EMPTY_FOLIOS
    return sort_folio_ids(list(ids))
# === Sorting utilities (injected) ===
def folio_sort_key(fid: str):
"""Return a numeric sort key for folio ids like f9r, f10v, f68r3 (recto before verso)."""
s = (fid or "").strip().lower()
m = re.match(r"^f(\d{1,3})(r|v)(\d+)?$", s)
if not m:
# Place unknown patterns at the end in stable order
return (10**6, 9, 10**6, s)
num = int(m.group(1))
side = 0 if m.group(2) == "r" else 1
sub = int(m.group(3)) if m.group(3) else 0
return (num, side, sub, s)
def sort_folio_ids(ids):
"""Sort a sequence of folio ids in natural numeric order using folio_sort_key."""
try:
return sorted(ids, key=folio_sort_key)
except Exception:
# Fallback to stable original order on any error
return list(ids)
_REGION_ORDER = {"P": 0, "T": 1, "C": 2, "R": 3}
def sort_structured(struct):
"""Return an OrderedDict-like mapping with regions sorted P,T,C,R and units numerically."""
try:
from collections import OrderedDict
out = OrderedDict()
# Sort regions by our preferred order; unknown tags go after known ones alphabetically
def region_key(tag):
return (_REGION_ORDER.get(tag, 99), tag)
if not isinstance(struct, dict):
return struct
for tag in sorted(struct.keys(), key=region_key):
blocks = struct[tag]
if isinstance(blocks, dict):
od = OrderedDict()
# Unit keys are expected to be numeric strings (idx), or tag+idx; try to extract int
def idx_key(k):
m = re.search(r"(\d+)$", str(k))
return int(m.group(1)) if m else float("inf")
for k in sorted(blocks.keys(), key=idx_key):
od[k] = blocks[k]
out[tag] = od
else:
out[tag] = blocks
return out
except Exception:
return struct
def english_sort_description() -> str:
"""Describe the default sorting rules in plain English."""
return ("ordered numerically by folio number with recto before verso and subpages in numeric order; "
"within each folio, regions are P, then T, then C, then R, and their units are sorted by number.")
def english_receipt(heard: str, did: str) -> None:
"""Print a two-line audit receipt with the plain-English command heard and what was executed."""
if heard is None:
heard = ''
if did is None:
did = ''
print(f"Heard: {heard}")
print(f"Did: {did}")
# === Folio enumeration helpers (added for robust ID handling) ===
def get_all_folio_ids(pages_path: str) -> list:
"""
Return all folio ids that actually occur in PagesH.txt, including subfolios.
- Folio ids come ONLY from iter_h_records(pages_path, None).
- Any folio string seen there is treated as real:
f1r, f1v, f68v1, f72r3, f85v2, etc.
- No synthetic folios are invented.
- No content-based filtering happens here.
"""
ids = set()
for folio, _tag, _idx, _ln, _payload in iter_h_records(pages_path, None):
if folio:
ids.add(folio)
try:
return sort_folio_ids(list(ids))
except NameError:
return sorted(ids)
def get_folio_list(pages_path: str, include_excluded: bool = False) -> list:
"""
Return the canonical folio list for analysis in this workspace.
- Uses get_all_folio_ids(pages_path) to enumerate folios directly
from PagesH.txt tags (including subfolios).
- If include_excluded is False (default), removes ONLY folios in
get_excluded_folios().
- Does NOT inspect or filter based on content. If a folio in this
list later yields empty text from parse_folio_corpus(), that
indicates a data/config error that must be handled explicitly.
"""
all_ids = get_all_folio_ids(pages_path)
if include_excluded:
return all_ids
excluded = set(get_excluded_folios())
return [fid for fid in all_ids if fid not in excluded]
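# Usage sketch (hedged): the intended public loop for corpus jobs.
#   for fid in get_folio_list("PagesH.txt"):
#       text = parse_folio_corpus("PagesH.txt", fid)
#       if not text:
#           raise RuntimeError(f"unexpectedly empty folio {fid}")  # data/config error, per the docstring above
#       ...  # analysis goes here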
# === Appended API + guardrails ===
# === API guard: prevent accidental use of internal helpers ===
import inspect as _inspect_internal_guard
_API_GUARD_ENABLED = True # default: only public API endpoints allowed
def allow_internal_helpers():
"""
Explicitly disable the internal API guard for this process.
Only call this with explicit human/operator approval.
"""
global _API_GUARD_ENABLED
_API_GUARD_ENABLED = False
def _guard_internal(name: str):
"""
Internal helper guard.
- If _API_GUARD_ENABLED is True:
* Calls from outside this module are blocked.
* Calls from within this module (__name__) are allowed.
- If _API_GUARD_ENABLED is False:
* All calls are allowed (explicit override).
"""
if not _API_GUARD_ENABLED:
return
frame = _inspect_internal_guard.currentframe()
if frame is None:
return
caller_frame = frame.f_back
if caller_frame is None:
return
caller_mod = caller_frame.f_globals.get("__name__", "")
if caller_mod == __name__:
# Internal calls from this module are allowed
return
raise RuntimeError(
f"{name} is INTERNAL to takahashi_parser_locked_export. "
f"Use get_folio_list(...) + parse_folio_corpus(...) / "
f"parse_folio_structured(...) instead, or call "
f"allow_internal_helpers() ONLY with explicit human approval."
)
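# Behaviour sketch (hedged): with the guard enabled (the default), calling a
# guard-wrapped helper such as iter_h_records(...) from another module raises
# RuntimeError, while calls made from inside this module (e.g. by
# parse_folio_corpus) still go through; allow_internal_helpers() lifts the
# guard for the whole process and is meant to require explicit approval.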
def get_folio_lines(pages_path: str, folio_id: str):
    """
    Return logical lines for a given folio_id for structural analysis.
    - Prefer parse_folio_structured(pages_path, folio_id) if available:
      each region unit (P1, R2, ...) is returned as one logical line.
    - Fallback: parse_folio_corpus(...) split into lines.
    - Does NOT fabricate folio ids; caller should obtain folio_id
      from get_folio_list(...).
    """
    # Prefer structured if provided by this parser
    if 'parse_folio_structured' in globals():
        recs = parse_folio_structured(pages_path, folio_id)
        if isinstance(recs, dict):
            lines = []
            for units in recs.values():
                if isinstance(units, dict):
                    lines.extend(units.values())
                else:
                    lines.append(units)
            return lines
        # In case it's already a list-like
        return list(recs)
    text = parse_folio_corpus(pages_path, folio_id)
    if not text:
        return []
    return text.splitlines()
def get_folio_line(pages_path: str, folio_id: str, line_no: int):
"""
Return a single 1-based line from a folio.
- Uses get_folio_lines(...) to respect parser behavior.
- Returns None if the requested line does not exist.
"""
lines = get_folio_lines(pages_path, folio_id)
idx = line_no - 1
if idx < 0 or idx >= len(lines):
return None
return lines[idx]
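# Example: get_folio_line(path, 'f1r', 1) returns the first logical line of
# f1r (1-based), and None if the folio has fewer lines than requested.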
# === Guard-wrapped internal helpers ===
# These wrappers prevent accidental external use when _API_GUARD_ENABLED is True.
try:
_orig_iter_h_records = iter_h_records
def iter_h_records(*args, **kwargs):
_guard_internal("iter_h_records")
return _orig_iter_h_records(*args, **kwargs)
except NameError:
    pass

if __name__ == '__main__':
    main(sys.argv)

Dunsel > 11-11-2025, 01:06 AM
import re
import math
import random
from collections import Counter, defaultdict
# =========================
# 0. CORPUS HELPERS
# =========================
def load_english_words(path):
"""
Load an English corpus (e.g., Dracula from Project Gutenberg).
- Strips Gutenberg header/footer using *** START/END OF marker.
- Lowercases.
- Keeps only a–z as letters; all else → space.
- Returns: list of word strings.
"""
words = []
in_text = False
with open(path, "r", encoding="utf-8") as f:
for line in f:
if "*** START OF" in line:
in_text = True
continue
if "*** END OF" in line:
break
if not in_text:
continue
line = line.lower()
line = re.sub(r"[^a-z]+", " ", line)
parts = line.split()
words.extend(p for p in parts if p)
return words
def load_voynich_words(pages_path, parser_module):
"""
Load Voynich words using the locked Takahashi parser module.
`parser_module` is an imported module providing:
- get_folio_list(pages_path)
- parse_folio_corpus(pages_path, folio_id)
Returns: list of word strings (already in Takahashi/EVA charset).
"""
words = []
for fid in parser_module.get_folio_list(pages_path):
text = parser_module.parse_folio_corpus(pages_path, fid)
if not text:
continue
for w in text.split():
if w:
words.append(w)
return words
# =========================
# 1. CORE STATISTICS
# =========================
def build_stats(words):
"""
From a list of words, compute:
- alphabet: sorted list of distinct glyphs
- counts[g]: total count
- pos[g]['initial'|'medial'|'final']
- prev[g], nxt[g]: neighbor frequency Counters
Returns: (alphabet, counts, pos, prev, nxt)
"""
counts = Counter()
pos = defaultdict(lambda: Counter())
prev = defaultdict(Counter)
nxt = defaultdict(Counter)
for w in words:
if not w:
continue
chars = list(w)
L = len(chars)
for i, ch in enumerate(chars):
counts[ch] += 1
if i == 0:
pos[ch]["initial"] += 1
elif i == L - 1:
pos[ch]["final"] += 1
else:
pos[ch]["medial"] += 1
if i > 0:
prev[ch][chars[i - 1]] += 1
if i < L - 1:
nxt[ch][chars[i + 1]] += 1
alphabet = sorted(counts.keys())
return alphabet, counts, pos, prev, nxt
def entropy(counter, base=2):
"""
Normalized entropy of a frequency Counter.
Returns 0.0 if empty.
"""
total = sum(counter.values())
if total == 0:
return 0.0
e = 0.0
for v in counter.values():
if v > 0:
p = v / total
e -= p * math.log(p, base)
# normalize by max entropy for this support size, using len(counter)
if len(counter) <= 1:
return 0.0
max_e = math.log(len(counter), base)
return e / max_e if max_e > 0 else 0.0
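# Worked checks (follow directly from the definition):
#   entropy(Counter({'a': 1, 'b': 1}))  -> 1.0    (even split over 2 symbols)
#   entropy(Counter({'a': 3, 'b': 1}))  -> ~0.81
#   entropy(Counter({'a': 7}))          -> 0.0    (single symbol: no uncertainty)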
def positional_entropy(counts, pos_counts):
"""
Compute positional entropy for each glyph over {initial, medial, final}
using log base 3, already normalized (since max is log_3(3) = 1).
Returns: dict[glyph] -> entropy in [0,1].
"""
H = {}
for g, total in counts.items():
if total == 0:
H[g] = 0.0
continue
init = pos_counts[g]["initial"] / total
med = pos_counts[g]["medial"] / total
fin = pos_counts[g]["final"] / total
probs = [init, med, fin]
e = 0.0
for p in probs:
if p > 0:
e -= p * math.log(p, 3)
H[g] = e # already normalized
return H
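# Worked check for one glyph: occurring 4 times as 1 initial, 2 medial and
# 1 final gives position probabilities (0.25, 0.5, 0.25) and a base-3 entropy
# of about 0.95; a glyph that only ever appears word-finally scores 0.0.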
# =========================
# 2. SUKHOTIN ALGORITHM
# =========================
def sukhotin_vowels(words):
"""
Sukhotin's classic adjacency-based vowel detection algorithm.
Returns: set of glyphs classified as vowels.
"""
alphabet, _, _, _, _ = build_stats(words)
# symmetric adjacency matrix F
F = {a: Counter() for a in alphabet}
for w in words:
if len(w) < 2:
continue
for i in range(len(w) - 1):
x, y = w[i], w[i + 1]
if x in F and y in F:
F[x][y] += 1
F[y][x] += 1
# initial r[i]
r = {a: sum(F[a].values()) for a in alphabet}
vowels = set()
while True:
m = max(r, key=r.get)
if r[m] <= 0:
break
vowels.add(m)
# update neighbors
for i in alphabet:
if F[i][m] > 0:
r[i] -= 2 * F[i][m]
F[i][m] = 0
F[m][i] = 0
r[m] = 0
return vowels
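# Toy check: sukhotin_vowels(["banana"]) returns {'a'}; 'a' carries the most
# adjacency mass, and once its contacts are removed no remaining glyph has a
# positive score, so the loop stops. Real corpora behave far less cleanly.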
# =========================
# 3. JACQUES GUY NEIGHBOR DIVERSITY
# =========================
def guy_scores(words):
"""
Jacques Guy-style neighbor diversity score:
score(g) = #distinct previous neighbors + #distinct next neighbors
Returns: dict[glyph] -> score.
"""
_, _, _, prev, nxt = build_stats(words)
scores = {}
for g in sorted(prev.keys() | nxt.keys()):
scores[g] = len(prev[g]) + len(nxt[g])
return scores
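# Toy check: guy_scores(["banana"]) gives 'a' the top score (distinct left
# neighbours {'b', 'n'} plus right neighbour {'n'}), which is the intended
# signal: vowel-like glyphs tolerate many different neighbours on both sides.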
# =========================
# 4. BEHAVIORAL MODEL (OUR METHOD)
# =========================
def behavioral_scores(words):
"""
Our composite vowel-likeness score per glyph:
behavioral(g) =
+ medial_ratio
+ positional_entropy
+ 0.5 * (prev_entropy + next_entropy)
- boundary_ratio
Where:
medial_ratio = medial / total
boundary_ratio = (initial + final) / total
positional_entropy: over {initial, medial, final}, base 3
prev/next_entropy: normalized neighbor entropies, base 2
Returns: dict[glyph] -> score (higher = more vowel-like).
"""
alphabet, counts, pos, prev, nxt = build_stats(words)
pos_H = positional_entropy(counts, pos)
scores = {}
for g in alphabet:
total = counts[g]
if total == 0:
scores[g] = 0.0
continue
init = pos[g]["initial"]
med = pos[g]["medial"]
fin = pos[g]["final"]
medial_ratio = med / total
boundary_ratio = (init + fin) / total
H_pos = pos_H[g]
H_prev = entropy(prev[g], base=2)
H_next = entropy(nxt[g], base=2)
score = medial_ratio + H_pos + 0.5 * (H_prev + H_next) - boundary_ratio
scores[g] = score
return scores
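# Reading the score: a glyph that sits mostly word-medially, shows up in all
# three positions and combines with many different neighbours is pushed up by
# medial_ratio and all three entropy terms; a glyph pinned to word boundaries
# is pulled down by boundary_ratio and a positional entropy near 0.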
# =========================
# 5. COMBINATORIAL METHOD (KNIGHT-INSPIRED)
# =========================
def combinatorial_scores(words):
"""
Knight-inspired combinatorial scoring:
score(g) =
+ 0.5 * (|Prev(g)| + |Next(g)|)
+ 5 * avg_neighbor_entropy
+ 10 * positional_entropy
- |Prev/Next|: neighbor diversity
- avg_neighbor_entropy: mean of normalized prev/next entropies
- positional_entropy: as above
Constants (5, 10) are fixed scaling factors, applied identically across
languages and corpora, chosen to keep terms numerically comparable.
Returns: dict[glyph] -> score (higher = more nucleus-like).
"""
alphabet, counts, pos, prev, nxt = build_stats(words)
pos_H = positional_entropy(counts, pos)
comb = {}
for g in alphabet:
total = counts[g]
if total == 0:
comb[g] = 0.0
continue
uniq_prev = len(prev[g])
uniq_next = len(nxt[g])
H_prev = entropy(prev[g], base=2)
H_next = entropy(nxt[g], base=2)
H_neighbors_avg = 0.5 * (H_prev + H_next)
H_pos = pos_H[g]
score = 0.5 * (uniq_prev + uniq_next) + 5 * H_neighbors_avg + 10 * H_pos
comb[g] = score
return comb
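# Scale sketch: a glyph with ~20 distinct neighbours on each side, near-uniform
# neighbour distributions (entropies ~1) and full positional freedom scores
# roughly 0.5*40 + 5*1 + 10*1 = 35; the entropy terms matter most for glyphs
# whose raw neighbour diversity is small.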
# =========================
# 6. KIM & SNYDER–STYLE CLUSTERING (ADAPTED)
# =========================
def bayesian_style_clusters(words, seed=42):
"""
Unsupervised 2-means clustering over behavioral feature vectors.
This is an adaptation in the spirit of Kim & Snyder (2013).
Features per glyph g:
[ behavioral_score(g) ]
We then:
- run 2-means on this 1D feature,
- choose cluster with higher mean behavioral score as vowel-like.
Returns:
vowel_like: list of glyphs in vowel-ish cluster
consonant_like: list of glyphs in other cluster
"""
beh = behavioral_scores(words)
glyphs = sorted(beh.keys())
data = [beh[g] for g in glyphs]
if len(glyphs) < 2:
return glyphs, []
random.seed(seed)
# initialize centroids using min and max behavior scores
c0 = min(data)
c1 = max(data)
centroids = [c0, c1]
for _ in range(50):
clusters = {0: [], 1: []}
for i, v in enumerate(data):
d0 = (v - centroids[0]) ** 2
d1 = (v - centroids[1]) ** 2
label = 0 if d0 <= d1 else 1
clusters[label].append(i)
new_centroids = []
changed = False
for k in (0, 1):
if not clusters[k]:
new_centroids.append(centroids[k])
continue
mean = sum(data[i] for i in clusters[k]) / len(clusters[k])
if abs(mean - centroids[k]) > 1e-9:
changed = True
new_centroids.append(mean)
centroids = new_centroids
if not changed:
break
# pick vowel cluster = higher average behavioral score
def cluster_mean(k):
idxs = clusters[k]
if not idxs:
return float("-inf")
return sum(data[i] for i in idxs) / len(idxs)
vowel_cluster = max((0, 1), key=cluster_mean)
vowel_like = [glyphs[i] for i in clusters[vowel_cluster]]
consonant_like = [glyphs[i] for i in clusters[1 - vowel_cluster]]
return vowel_like, consonant_like
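# Usage sketch (hedged):
#   vowel_like, consonant_like = bayesian_style_clusters(words)
# With a single 1-D feature this reduces to a 2-means split on the behavioural
# score; the cluster with the higher mean score is reported as vowel-like.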
# =========================
# 7. CROSS-SECTIONAL STABILITY
# =========================
def stability_by_chunks(words, vowels, n_chunks=8):
"""
Split corpus into n_chunks and compute vowel distribution stability.
For each chunk:
- count occurrences of each candidate vowel in that chunk
- compute fraction of total vowel tokens due to each candidate
Returns: list of dicts:
[ {vowel: proportion_in_chunk, ...}, ... ]
"""
if not words:
return []
all_text = "".join(words)
chunk_len = max(1, len(all_text) // n_chunks)
results = []
for k in range(n_chunks):
start = k * chunk_len
end = (k + 1) * chunk_len if k < n_chunks - 1 else len(all_text)
sub = all_text[start:end]
c = Counter(ch for ch in sub if ch in vowels)
total = sum(c.values())
if total == 0:
results.append({v: 0.0 for v in vowels})
else:
results.append({v: c[v] / total for v in vowels})
return results
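# Usage sketch (hedged):
#   chunks = stability_by_chunks(words, sukhotin_vowels(words), n_chunks=8)
# Each entry maps candidate vowel -> its share of vowel tokens in that chunk;
# roughly flat profiles across chunks suggest the classification is stable
# through the text rather than driven by one section.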
# =========================
# 8. SIMPLE DRIVER / EXAMPLE
# =========================
if __name__ == "__main__":
# Example usage outline (user must adjust paths):
# English example:
# eng_words = load_english_words("345-0.txt")
# print("Sukhotin (EN):", sukhotin_vowels(eng_words))
# print("Behavioral top (EN):", sorted(behavioral_scores(eng_words).items(),
# key=lambda x: x[1], reverse=True)[:10])
# Voynich example (requires locked parser module):
# import takahashi_parser_locked_export_API_GUARDED as vp
# voy_words = load_voynich_words("PagesH.txt", vp)
# print("Sukhotin (VMS):", sukhotin_vowels(voy_words))
# print("Behavioral top (VMS):", sorted(behavioral_scores(voy_words).items(),
# key=lambda x: x[1], reverse=True)[:10])
    pass

Rafal > 11-11-2025, 11:41 PM
Dunsel > 12-11-2025, 01:02 AM
(11-11-2025, 11:41 PM)Rafal Wrote: You are making a hidden assumption that the letters of the Voynich script represent consonants and vowels.
Actually, most veterans on this site will tell you that is not very probable. Any attempt at reading the Voynich as a simple alphabet has resulted in weird words that don't exist in any language.
Kaybo > 12-11-2025, 02:43 AM
Dunsel > 16-11-2025, 03:39 AM
(12-11-2025, 02:43 AM)Kaybo Wrote: I can confirm that the letter heat maps match the visual impression on the linked page.
However, could you explain the thing to me as if I were 5?
I understood that you force GPT into a math mode by feeding it code, and the code is a parser? A parser for what? Do you always need new parser code for each new analysis you want to make, or is it always the same parser, and you can prompt with text what you want and just upload the same parser code and the EVA transcript each time?
Can you give us a very simple instruction for how we can use it ourselves?