If you feel like experimenting a little with threshold values and prefer to work with a GUI (Python with tkinter), here's your chance:
Code:
#!/usr/bin/env python3
# coding: utf-8
import os
import queue
import subprocess
import sys
import threading
from collections import defaultdict
from tkinter import Tk, Label, Button, Text, Scrollbar, Entry, X, BOTTOM, END, BOTH

from metaphone import doublemetaphone
# One symbol per decoded word is appended to the hit/miss display line.
HIT_SYMBOL = "+"  # the word was mapped to a Latin candidate
MISS_SYMBOL = "-"  # the decoder returned "<no match>" for the word
# ---------------------------------------------------------
def levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between *a* and *b*.

    The distance is the minimum number of single-item insertions,
    deletions and substitutions needed to turn *a* into *b*.  If either
    argument is empty the result is the length of the other one.
    """
    if not a:
        return len(b)
    if not b:
        return len(a)

    # Row-wise dynamic programming: previous[j] holds the distance between
    # the part of `a` consumed so far and b[:j].
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        current = [i]
        for j, cb in enumerate(b, 1):
            if ca == cb:
                # Equal items cost nothing: carry the diagonal value over.
                current.append(previous[j - 1])
            else:
                # Substitution, deletion, insertion -- whichever is cheapest.
                current.append(
                    1 + min(previous[j - 1], previous[j], current[-1])
                )
        # The finished row becomes the reference row for the next item of a.
        previous = current
    return previous[-1]
def load_wordlist(path="wordlist.txt"):
    """Read a word list from *path*, one entry per line.

    Each line is stripped of surrounding whitespace and lower-cased;
    lines that are empty after stripping are skipped.  The file is read
    as UTF-8.

    Returns:
        The words as a list, in file order.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, "r", encoding="utf8") as f:
        cleaned = (line.strip().lower() for line in f)
        return [word for word in cleaned if word]
def normalize_voynich_word(w):
    """Return *w* reduced to its alphabetic characters, lower-cased.

    Every character for which ``str.isalpha()`` is false (digits,
    punctuation, whitespace, ...) is dropped.
    """
    return "".join(c.lower() for c in w if c.isalpha())
def build_metaphone_index(words):
    """Group *words* by their Double Metaphone codes.

    Each word is filed under its first code and, when the second code is
    non-empty and differs from the first, under the second code as well.
    Empty codes are never used as keys.

    Returns:
        A ``defaultdict(list)`` mapping code -> list of words, in input order.
    """
    index = defaultdict(list)
    for w in words:
        m1, m2 = doublemetaphone(w)
        if m1:
            index[m1].append(w)
        if m2 and m2 != m1:
            index[m2].append(w)
    return index
def hybrid_match(v_word, index, MAX_RATIO, top_n=1):
    """Find the word(s) in *index* that best match *v_word*.

    Candidates are taken from the index buckets of the two Double
    Metaphone codes of *v_word*.  If that yields fewer than five
    candidates, every bucket whose key shares the first two characters of
    the first code is added as well.  Each candidate is then truncated to
    ``len(v_word)`` characters and scored by Levenshtein distance against
    *v_word*.

    Args:
        v_word: normalized word to look up.
        index: mapping of metaphone code -> list of words, as produced by
            ``build_metaphone_index``.
        MAX_RATIO: allowed edit distance as a fraction of ``len(v_word)``.
            At least one edit is always tolerated.
        top_n: maximum number of equally good matches to return.

    Returns:
        Up to *top_n* distinct words sharing the lowest distance, or
        ``["<no match>"]`` when *v_word* is empty, no candidate exists, or
        the best distance exceeds the allowed maximum.
    """
    if not v_word:
        return ["<no match>"]

    m1, m2 = doublemetaphone(v_word)
    candidates = []
    if m1 in index:
        candidates.extend(index[m1])
    if m2 in index:
        candidates.extend(index[m2])
    if len(candidates) < 5 and m1:
        prefix = m1[:2]
        for key in index:
            if key.startswith(prefix):
                candidates.extend(index[key])
    if not candidates:
        return ["<no match>"]

    # The prefix fallback revisits the exact-match buckets, so the same word
    # can be collected several times.  Drop repeats (keeping first-seen
    # order) so nothing is scored twice and no word is returned twice.
    candidates = list(dict.fromkeys(candidates))

    # Only the leading len(v_word) characters of a candidate are compared.
    scored = [
        (levenshtein(v_word, cand[:len(v_word)]), cand) for cand in candidates
    ]
    best_score = min(score for score, _ in scored)
    max_allowed = max(1, int(len(v_word) * MAX_RATIO))
    if best_score > max_allowed:
        return ["<no match>"]
    best = [cand for score, cand in scored if score == best_score]
    return best[:top_n]
# ---------------------------------------------------------
class DecoderGUI:
    """Tkinter window that runs the decoder with a user-supplied threshold.

    Decoding runs in a background thread.  That thread never touches a
    widget: it posts ``(kind, payload)`` events to a queue which the Tk main
    loop drains periodically, so every widget call happens on the thread
    that owns the Tk interpreter.
    """

    # Threshold used when the entry field holds no valid value.
    DEFAULT_THRESHOLD = 0.35
    # Delay between two polls of the worker's event queue, in milliseconds.
    POLL_INTERVAL_MS = 50

    def __init__(self, master, infile, outfile):
        """Build all widgets inside *master*.

        Args:
            master: the Tk root (or toplevel) window to populate.
            infile: path of the text file to decode.
            outfile: path the decoded text is written to.
        """
        self.master = master
        self.infile = infile
        self.outfile = outfile
        self.stop_flag = False
        self.MAX_RATIO = self.DEFAULT_THRESHOLD
        self._events = queue.Queue()

        master.title("Voynich to Latin")
        master.geometry("1200x500")
        # Closing via the window manager must stop the worker as well.
        master.protocol("WM_DELETE_WINDOW", self.close_window)

        # Threshold input
        self.threshold_label = Label(master, text="Threshold value (0.0 – 1.0):", font=("Courier", 12))
        self.threshold_label.pack(anchor="w", padx=5, pady=2)
        self.threshold_entry = Entry(master, width=10, font=("Courier", 12))
        self.threshold_entry.insert(0, "0.35")
        self.threshold_entry.pack(anchor="w", padx=5)

        # Info lines
        self.info_label = Label(master, text="Initializing...", font=("Courier", 12), justify="left")
        self.info_label.pack(anchor="w", padx=5, pady=2)

        # Hit/miss display
        self.hit_text = Text(master, width=150, height=4, font=("Courier", 20), wrap="none")
        self.hit_text.pack(fill=BOTH, expand=True, padx=5)
        self.hit_text.tag_config("hit", foreground="green")
        self.hit_text.tag_config("miss", foreground="red")

        # Horizontal scrollbar for the hit/miss display
        self.scrollbar = Scrollbar(master, orient="horizontal", command=self.hit_text.xview)
        self.scrollbar.pack(side=BOTTOM, fill=X)
        self.hit_text.config(xscrollcommand=self.scrollbar.set)

        # Progress
        self.progress_label = Label(master, text="", font=("Courier", 12))
        self.progress_label.pack(anchor="w", padx=5, pady=2)

        # Statistics
        self.stats_label = Label(master, text="", font=("Courier", 12), justify="left")
        self.stats_label.pack(anchor="w", padx=5, pady=2)

        # Buttons
        self.start_button = Button(master, text="Start Decoding", command=self.start_decoding)
        self.start_button.pack(side="left", padx=5, pady=5)
        self.open_button = Button(master, text="Open Output File", command=self.open_output, state="disabled")
        self.open_button.pack(side="left", padx=5, pady=5)
        self.close_button = Button(master, text="Close", command=self.close_window)
        self.close_button.pack(side="left", padx=5, pady=5)

    # -----------------------------------------------------
    @staticmethod
    def _parse_threshold(text):
        """Return *text* as a float within [0, 1], or ``None`` if it is not one."""
        try:
            value = float(text.strip())
        except ValueError:
            return None
        # The chained comparison is also false for NaN.
        if not (0 <= value <= 1):
            return None
        return value

    @staticmethod
    def _format_stats(total_words, total_match, total_no_match, outfile, max_ratio):
        """Build the summary text shown after a finished run."""

        def percent(count):
            # An input without any words must not divide by zero.
            return count / total_words * 100 if total_words else 0.0

        return (
            f"Decoding finished.\n"
            f"Total words: {total_words}\n"
            f"Matches: {total_match} ({percent(total_match):.2f}%)\n"
            f"No matches: {total_no_match} ({percent(total_no_match):.2f}%)\n"
            f"Output file: {outfile}\n"
            f"Threshold value used: {max_ratio}"
        )

    # -----------------------------------------------------
    def start_decoding(self):
        """Reset the display, read the threshold and start the worker thread."""
        # Reset all output widgets
        self.hit_text.delete("1.0", END)
        self.progress_label.config(text="")
        self.stats_label.config(text="")
        self.open_button.config(state="disabled")

        # Read the threshold; fall back to the default on invalid input
        value = self._parse_threshold(self.threshold_entry.get())
        if value is None:
            value = self.DEFAULT_THRESHOLD
            self.threshold_entry.delete(0, END)
            self.threshold_entry.insert(0, "0.35")
        self.MAX_RATIO = value

        # Disable the controls while decoding is running
        self.threshold_entry.config(state="disabled")
        self.start_button.config(state="disabled")

        # Daemon thread: it must not keep the process alive after the window
        # has been closed.
        threading.Thread(target=self.decode, daemon=True).start()
        self.master.after(self.POLL_INTERVAL_MS, self._drain_events)

    # -----------------------------------------------------
    def _drain_events(self):
        """Apply all queued worker events to the widgets (main thread only)."""
        finished = False
        while True:
            try:
                kind, payload = self._events.get_nowait()
            except queue.Empty:
                break
            if kind == "hit":
                self.insert_hit_line(payload)
            elif kind == "progress":
                self.progress_label.config(text=payload)
            elif kind == "info":
                self.info_label.config(text=payload)
            elif kind in ("stats", "error"):
                self.stats_label.config(text=payload)
            elif kind == "done":
                # Re-enable the input field and the start button
                self.start_button.config(state="normal")
                self.threshold_entry.config(state="normal")
                # The output file exists only after a successful run
                if payload:
                    self.open_button.config(state="normal")
                finished = True
        if not finished:
            self.master.after(self.POLL_INTERVAL_MS, self._drain_events)

    # -----------------------------------------------------
    def open_output(self):
        """Open the output file with the platform's default application."""
        if sys.platform.startswith("win"):
            os.startfile(self.outfile)
        elif sys.platform == "darwin":
            subprocess.Popen(["open", self.outfile])
        else:
            subprocess.Popen(["xdg-open", self.outfile])

    # -----------------------------------------------------
    def close_window(self):
        """Ask a running worker to stop and destroy the window."""
        self.stop_flag = True
        self.master.destroy()

    # -----------------------------------------------------
    def insert_hit_line(self, hit_line):
        """Append *hit_line* to the display, hits in green and the rest in red."""
        for c in hit_line:
            tag = "hit" if c == HIT_SYMBOL else "miss"
            self.hit_text.insert(END, c, tag)
        self.hit_text.insert(END, "\n")
        self.hit_text.see(END)

    # -----------------------------------------------------
    def decode(self):
        """Worker-thread entry point: decode the input file and report back.

        Never calls into Tk.  A final ``("done", success)`` event is always
        posted so the controls are re-enabled even after a failure.
        """
        succeeded = False
        try:
            succeeded = self._run_decoding()
        except (OSError, UnicodeError) as exc:
            self._events.put(("error", f"Decoding failed: {exc}"))
        finally:
            self._events.put(("done", succeeded))

    def _run_decoding(self):
        """Decode ``self.infile`` into ``self.outfile``.

        Returns:
            True when the output file was written, False when the run was
            cancelled through ``stop_flag`` (nothing is written then).
        """
        latin_words = load_wordlist("wordlist.txt")
        index = build_metaphone_index(latin_words)
        info_text = (
            f"Loading Latin wordlist ...\n"
            f"Total words: {len(latin_words)}\n"
            f"Metaphone keys: {len(index)}\n"
            f"Decoding ..."
        )
        self._events.put(("info", info_text))

        with open(self.infile, "r", encoding="utf8") as f:
            lines = f.readlines()

        output_lines = []
        total_words = total_match = total_no_match = 0
        for line_no, line in enumerate(lines, 1):
            if self.stop_flag:
                # The window is gone: do not write a partial output file.
                return False
            line = line.strip()
            if not line:
                output_lines.append("")
                continue
            decoded_line = []
            symbols = []
            for w in line.split():
                total_words += 1
                normalized = normalize_voynich_word(w)
                decoded = hybrid_match(normalized, index, self.MAX_RATIO)[0]
                decoded_line.append(decoded)
                if decoded == "<no match>":
                    symbols.append(MISS_SYMBOL)
                    total_no_match += 1
                else:
                    symbols.append(HIT_SYMBOL)
                    total_match += 1
            self._events.put(("hit", "".join(symbols)))
            output_lines.append(" ".join(decoded_line))
            self._events.put(("progress", f"{line_no}/{len(lines)} lines decoded"))

        # Write the output file
        with open(self.outfile, "w", encoding="utf8") as f:
            f.writelines(row + "\n" for row in output_lines)

        stats_text = self._format_stats(
            total_words, total_match, total_no_match, self.outfile, self.MAX_RATIO
        )
        self._events.put(("stats", stats_text))
        return True
# ---------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: expects the input and the output path as arguments.
    if len(sys.argv) < 3:
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit("Usage: python phonetic_levenshtein_max_ratio_tk.py voyn2latin.txt mapped.txt")
    infile = sys.argv[1]
    outfile = sys.argv[2]
    root = Tk()
    gui = DecoderGUI(root, infile, outfile)
    root.mainloop()
After visually comparing the output with the threshold-free version, I considered a threshold of 0.55 to be realistic for the “constructions” of solvers. However, perception is always a very subjective thing that can never be accurately replicated on a computer.