Dunsel > 2 hours ago
#!/usr/bin/env python3
# Takahashi Voynich Parser — LOCKED, SELF-CONTAINED (v2025-11-05)
# Author: You + “stop breaking my parser” mode // yes, it picked that name itself after a lot of yelling at it.
import sys, re, hashlib
from collections import defaultdict, Counter, OrderedDict
TAG_LINE = re.compile(r'^<(?P<folio>f\d+[rv]\d*)\.(?P<tag>[A-Z]+)(?P<idx>\d+)?(?:\.(?P<line>\d+))?;H>(?P<payload>.*)$')
A_Z_SPACE = re.compile(r'[^a-z ]+')
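# TAG_LINE matches Takahashi H-track headers such as <f68r3.P1.1;H> and captures the
# folio, region tag, unit index, line number, and trailing payload.
# A_Z_SPACE strips every character outside a-z and space once the payload is lowercased.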
def normalize_payload(s: str) -> str:
s = re.sub(r'\{[^}]*\}', '', s)
s = re.sub(r'<![^>]*>', '', s)
s = s.replace('<->', ' ')
s = s.replace('\t', ' ').replace('.', ' ')
s = s.lower()
s = A_Z_SPACE.sub(' ', s)
s = re.sub(r'\s+', ' ', s).strip()
return s
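# Stream (folio, tag, idx, line, payload) tuples for H-track records, appending
# continuation lines that do not start with '<' onto the current record's payload.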
def iter_h_records(path, wanted_folio=None):
current = None
buf = []
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
for raw in f:
line = raw.rstrip('\n')
if not line:
continue
if line.startswith('<'):
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
m = TAG_LINE.match(line)
if m:
folio = m.group('folio')
if (wanted_folio is None) or (folio == wanted_folio):
tag = m.group('tag')
idx = m.group('idx') or '0'
ln = m.group('line') or '1'
payload = m.group('payload')
current = (folio, tag, idx, ln)
buf = [payload]
else:
current = None
buf = []
else:
current = None
buf = []
else:
if current is not None:
buf.append(line)
if current and buf:
folio, tag, idx, ln = current
payload = ''.join(buf)
yield (folio, tag, idx, ln, payload)
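# Whole-folio corpus view: every normalized H payload on the folio, joined into one line.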
def parse_folio_corpus(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return ''
pieces = []
    for _folio, _tag, _idx, _ln, payload in iter_h_records(path, fid):
norm = normalize_payload(payload)
if norm:
pieces.append(norm)
return ' '.join(pieces).strip()
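# Structured view: payloads grouped by region tag and unit index,
# e.g. {'P': {'P1': '...', 'P2': '...'}}, then ordered via sort_structured.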
def parse_folio_structured(path, folio):
fid = folio.lower() if isinstance(folio, str) else str(folio).lower()
if _EXCLUDE_EMPTY_FOLIOS_ENABLED and fid in _EMPTY_FOLIOS:
return {}
groups = defaultdict(lambda: defaultdict(list))
    for _folio, tag, idx, _ln, payload in iter_h_records(path, fid):
norm = normalize_payload(payload)
if norm:
groups[tag][idx].append(norm)
out = {}
for tag, by_idx in groups.items():
od = OrderedDict()
for idx in sorted(by_idx, key=lambda x: int(x)):
od[f"{tag}{idx}"] = ' '.join(by_idx[idx]).strip()
out[tag] = od
return sort_structured(out)
def sha256(text: str) -> str:
return hashlib.sha256(text.encode('utf-8')).hexdigest()
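# Locked sentinels: the normalized corpus line for each folio below must reproduce
# both the token count and the SHA-256 digest exactly, or corpus-wide jobs abort.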
SENTINELS = {
'f49v': {'tokens': 151, 'sha256': '172a8f2b7f06e12de9e69a73509a570834b93808d81c79bb17e5d93ebb0ce0d0'},
'f68r3': {'tokens': 104, 'sha256': '8e9aa4f9c9ed68f55ab2283c85581c82ec1f85377043a6ad9eff6550ba790f61'},
}
def sanity_check(path):
results = {}
for folio, exp in SENTINELS.items():
line = parse_folio_corpus(path, folio)
toks = len(line.split())
dig = sha256(line)
ok = (toks == exp['tokens']) and (dig == exp['sha256'])
results[folio] = {'ok': ok, 'tokens': toks, 'sha256': dig, 'expected': exp}
all_ok = all(v['ok'] for v in results.values())
return all_ok, results
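# Corpus-wide statistics below run over every H-track record in the file, so main()
# gates them behind the sanity_check() precheck.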
def most_common_words(path, topn=10):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
counts.update(norm.split())
return counts.most_common(topn)
def single_letter_counts(path):
counts = Counter()
for _folio, _tag, _idx, _ln, payload in iter_h_records(path, None):
norm = normalize_payload(payload)
if norm:
for w in norm.split():
if len(w) == 1:
counts[w] += 1
return dict(sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])))
USAGE = '''
Usage:
python takahashi_parser_locked.py sanity PagesH.txt
python takahashi_parser_locked.py parse PagesH.txt <folio> corpus
python takahashi_parser_locked.py parse PagesH.txt <folio> structured
python takahashi_parser_locked.py foliohash PagesH.txt <folio>
python takahashi_parser_locked.py most_common PagesH.txt [topN]
python takahashi_parser_locked.py singles PagesH.txt
'''
def main(argv):
if len(argv) < 3:
print(USAGE); sys.exit(1)
cmd = argv[1].lower()
path = argv[2]
if cmd == 'sanity':
ok, res = sanity_check(path)
status = 'PASS' if ok else 'FAIL'
print(f'PRECHECK: {status}')
for folio, info in res.items():
print(f" {folio}: ok={info['ok']} tokens={info['tokens']} sha256={info['sha256']}")
sys.exit(0 if ok else 2)
if cmd == 'parse':
if len(argv) != 5:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
mode = argv[4].lower()
if mode == 'corpus':
line = parse_folio_corpus(path, folio)
print(line)
elif mode == 'structured':
data = parse_folio_structured(path, folio)
order = ['P','C','V','L','R','X','N','S']
for grp in order + sorted([k for k in data.keys() if k not in order]):
if grp in data and data[grp]:
print(f'[{grp}]')
for k,v in data[grp].items():
print(f'{k}: {v}')
print()
else:
print(USAGE); sys.exit(1)
sys.exit(0)
if cmd == 'foliohash':
if len(argv) != 4:
print(USAGE); sys.exit(1)
folio = argv[3].lower()
line = parse_folio_corpus(path, folio)
print('Token count:', len(line.split()))
print('SHA-256:', sha256(line))
sys.exit(0)
if cmd == 'most_common':
topn = int(argv[3]) if len(argv) >= 4 else 10
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
for word, cnt in most_common_words(path, topn):
print(f'{word}\t{cnt}')
sys.exit(0)
if cmd == 'singles':
ok, _ = sanity_check(path)
if not ok:
print('PRECHECK: FAIL — aborting corpus job.'); sys.exit(2)
d = single_letter_counts(path)
for k,v in d.items():
print(f'{k}\t{v}')
sys.exit(0)
print(USAGE); sys.exit(1)
# Entry point: see the __main__ guard at the bottom of the file, after the locked blocks,
# so everything main() needs is defined before it runs.
# ==== BEGIN ASTRO REMAP (LOCKED RULE) ====
import re as _re_ast
# === Exclusion controls injected ===
_EXCLUDE_EMPTY_FOLIOS_ENABLED = True
_EMPTY_FOLIOS = {
    'f12r', 'f12v', 'f59r', 'f59v', 'f60r', 'f60v', 'f61r', 'f61v',
    'f62r', 'f62v', 'f63r', 'f63v', 'f64r', 'f64v', 'f74r', 'f74v',
    'f91r', 'f91v', 'f92r', 'f92v', 'f97r', 'f97v', 'f98r', 'f98v',
    'f101r2', 'f109r', 'f109v', 'f110r', 'f110v', 'f116v',
}
def set_exclude_empty_folios(flag: bool) -> None:
"""Enable/disable skipping known-empty folios globally."""
global _EXCLUDE_EMPTY_FOLIOS_ENABLED
_EXCLUDE_EMPTY_FOLIOS_ENABLED = bool(flag)
def get_exclude_empty_folios() -> bool:
"""Return current global skip setting."""
return _EXCLUDE_EMPTY_FOLIOS_ENABLED
def get_excluded_folios() -> list:
"""Return the sorted list of folios that are skipped when exclusion is enabled."""
return sorted(_EMPTY_FOLIOS)
# === End exclusion controls ===
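# Example (illustrative only; assumes PagesH.txt is the Takahashi export named in USAGE):
#   set_exclude_empty_folios(False)               # include the known-empty folios
#   text = parse_folio_corpus('PagesH.txt', 'f68r3')
#   set_exclude_empty_folios(True)                # restore the default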
_ASTRO_START, _ASTRO_END = 67, 73
_KEEP_AS_IS = {"C", "R", "P", "T"}
_folio_re_ast = _re_ast.compile(r"^f(\d+)([rv])(?:([0-9]+))?$")
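# Locked rule: on astro folios f67-f73, any region tag other than C, R, P, or T is
# folded into R under a key that records its origin (e.g. unit S1 becomes R_from_S_S1).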
def _is_astro_folio_ast(folio: str) -> bool:
m = _folio_re_ast.match(folio or "")
if not m:
return False
num = int(m.group(1))
return _ASTRO_START <= num <= _ASTRO_END
def _remap_unknown_to_R_ast(folio: str, out: dict) -> dict:
if not isinstance(out, dict) or not _is_astro_folio_ast(folio):
return sort_structured(out)
if not out:
return sort_structured(out)
out.setdefault("R", {})
unknown_tags = [t for t in list(out.keys()) if t not in _KEEP_AS_IS]
for tag in unknown_tags:
units = out.get(tag, {})
if isinstance(units, dict):
for unit_key, text in units.items():
new_unit = f"R_from_{tag}_{unit_key}"
if new_unit in out["R"]:
out["R"][new_unit] += " " + (text or "")
else:
out["R"][new_unit] = text
out.pop(tag, None)
return sort_structured(out)
# Wrap only once
try:
parse_folio_structured_original
except NameError:
parse_folio_structured_original = parse_folio_structured
def parse_folio_structured(pages_path: str, folio: str):
out = parse_folio_structured_original(pages_path, folio)
return _remap_unknown_to_R_ast(folio, out)
# ==== END ASTRO REMAP (LOCKED RULE) ====
def effective_folio_ids(pages_path: str) -> list:
    """Return folio ids found in PagesH headers, in natural folio order. Respects the exclusion toggle for known-empty folios."""
    ids = {folio.lower() for folio, _tag, _idx, _ln, _payload in iter_h_records(pages_path, None)}
    if _EXCLUDE_EMPTY_FOLIOS_ENABLED:
        ids -= _EMPTY_FOLIOS
    return sort_folio_ids(ids)
# === Sorting utilities (injected) ===
def folio_sort_key(fid: str):
"""Return a numeric sort key for folio ids like f9r, f10v, f68r3 (recto before verso)."""
s = (fid or "").strip().lower()
m = re.match(r"^f(\d{1,3})(r|v)(\d+)?$", s)
if not m:
# Place unknown patterns at the end in stable order
return (10**6, 9, 10**6, s)
num = int(m.group(1))
side = 0 if m.group(2) == "r" else 1
sub = int(m.group(3)) if m.group(3) else 0
return (num, side, sub, s)
def sort_folio_ids(ids):
"""Sort a sequence of folio ids in natural numeric order using folio_sort_key."""
try:
return sorted(ids, key=folio_sort_key)
except Exception:
# Fallback to stable original order on any error
return list(ids)
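# Example: sort_folio_ids(['f10v', 'f9r', 'f68r3', 'f68r1']) -> ['f9r', 'f10v', 'f68r1', 'f68r3']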
_REGION_ORDER = {"P": 0, "T": 1, "C": 2, "R": 3}
def sort_structured(struct):
"""Return an OrderedDict-like mapping with regions sorted P,T,C,R and units numerically."""
try:
from collections import OrderedDict
out = OrderedDict()
# Sort regions by our preferred order; unknown tags go after known ones alphabetically
def region_key(tag):
return (_REGION_ORDER.get(tag, 99), tag)
if not isinstance(struct, dict):
return struct
for tag in sorted(struct.keys(), key=region_key):
blocks = struct[tag]
if isinstance(blocks, dict):
od = OrderedDict()
# Unit keys are expected to be numeric strings (idx), or tag+idx; try to extract int
def idx_key(k):
m = re.search(r"(\d+)$", str(k))
return int(m.group(1)) if m else float("inf")
for k in sorted(blocks.keys(), key=idx_key):
od[k] = blocks[k]
out[tag] = od
else:
out[tag] = blocks
return out
except Exception:
return struct
def english_sort_description() -> str:
"""Describe the default sorting rules in plain English."""
return ("ordered numerically by folio number with recto before verso and subpages in numeric order; "
"within each folio, regions are P, then T, then C, then R, and their units are sorted by number.")
def english_receipt(heard: str, did: str) -> None:
"""Print a two-line audit receipt with the plain-English command heard and what was executed."""
if heard is None:
heard = ''
if did is None:
did = ''
print(f"Heard: {heard}")
print(f"Did: {did}")Dunsel > 2 hours ago