Code:
#!/usr/bin/env python3
import sys
import re
import os
import numpy as np
import matplotlib.pyplot as plt
def tokenize(text):
    """Split text into lowercase word tokens.

    The text is lowercased first, then every maximal run of regex word
    characters is returned, in order of appearance. Anything that is not a
    word character acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings (empty if no word characters occur).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]
def compute_ttr(tokens, step=100):
    """Compute the cumulative type/token ratio (TTR) at regular prefix lengths.

    For every prefix length ``i = step, 2*step, ...`` up to ``len(tokens)``,
    the ratio ``(distinct tokens in tokens[:i]) / i`` is computed. The first
    checkpoint (``i == step``) is then dropped from the result.

    Args:
        tokens: Sequence of hashable tokens supporting ``len`` and slicing.
        step: Distance between consecutive checkpoints, in tokens.

    Returns:
        A pair ``(x, y)`` of NumPy arrays: the prefix lengths and the TTR at
        each of those lengths, both without the first checkpoint.
    """
    x = []
    y = []
    seen = set()
    for i in range(step, len(tokens) + 1, step):
        # Only the tokens added since the previous checkpoint can introduce
        # new types, so extend a running set instead of rebuilding
        # set(tokens[:i]) every time (which is quadratic in the text length).
        seen.update(tokens[i - step:i])
        x.append(i)
        # i never exceeds len(tokens), so the prefix holds exactly i tokens.
        y.append(len(seen) / i)
    # The first checkpoint is discarded. NOTE(review): presumably because a
    # very short prefix has a TTR near 1, which the log-linear fit handles
    # badly -- confirm the intent.
    return np.array(x[1:]), np.array(y[1:])
def fit_bounded(x, y):
    """Fit the bounded model ``y = 1 / (1 + a * x**b)`` by linear regression.

    The model is linearized as::

        log((1/y) - 1) = log(a) + b * log(x)

    and a degree-1 least-squares polynomial is fitted in log-log space.

    Points where the transform is undefined are ignored: ``x <= 0`` (no
    ``log(x)``) and any ``y`` for which ``(1/y) - 1`` is not a finite positive
    number (``y <= 0`` or ``y >= 1``, e.g. a prefix in which every token is
    unique). Without this, a single such point turns the fit into NaN.

    Args:
        x: Array-like of positive sample positions.
        y: Array-like of observed values, same length as ``x``.

    Returns:
        A pair ``(a, b)`` of fitted model parameters.

    Raises:
        ValueError: If fewer than two points remain after filtering.
    """
    # Accept plain sequences as well as arrays; `1 / y` needs an ndarray.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # y == 0 would emit a divide-by-zero warning here; such points are
    # filtered out by the mask below, so the warning is just noise.
    with np.errstate(divide="ignore", invalid="ignore"):
        z = (1 / y) - 1
    usable = (x > 0) & (z > 0) & np.isfinite(z)
    if np.count_nonzero(usable) < 2:
        raise ValueError(
            "need at least two points with x > 0 and 0 < y < 1 to fit the model"
        )
    logx = np.log(x[usable])
    logz = np.log(z[usable])
    slope, intercept = np.polyfit(logx, logz, 1)
    b = slope
    a = np.exp(intercept)
    return a, b
def reference_curve(x, a, b):
    """Evaluate the bounded model ``y = 1 / (1 + a * x**b)`` at ``x``.

    Args:
        x: Scalar or array of positions at which to evaluate the curve.
        a: Scale parameter of the model.
        b: Exponent parameter of the model.

    Returns:
        The model value(s) at ``x``.
    """
    denominator = 1 + a * x**b
    return 1 / denominator
def main():
    """Command-line entry point: analyse the TTR of a text file and plot it.

    Expects exactly one command-line argument, the path of a UTF-8 text file.
    The text is tokenized, the cumulative type/token ratio is computed at a
    step size chosen from the token count, the bounded model
    ``y = 1 / (1 + a * x**b)`` is fitted, the parameters are printed, and the
    observed and fitted curves are shown in a matplotlib window.

    Exits with status 1 (after printing a message) when the arguments are
    wrong, the file cannot be read, the text has fewer than 200 tokens, there
    are fewer than two data points, or the fit does not yield finite
    parameters.
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <textfile.txt>")
        sys.exit(1)
    filepath = sys.argv[1]
    filename = os.path.basename(filepath)
    # A missing/unreadable file or a non-UTF-8 file should give a short
    # message and a non-zero exit status rather than a raw traceback.
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        print(f"Could not read {filepath}: {err}")
        sys.exit(1)
    tokens = tokenize(text)
    if len(tokens) < 200:
        print("Text is too short.")
        sys.exit(1)
    # dynamic step selection: finer sampling for texts under 10000 tokens
    step = 20 if len(tokens) < 10000 else 100
    print(f"Using step size: {step}")
    # compute TTR
    x, y = compute_ttr(tokens, step=step)
    if len(x) < 2:
        print("Not enough data points for fitting.")
        sys.exit(1)
    # fit bounded model
    try:
        a, b = fit_bounded(x, y)
    except (ValueError, np.linalg.LinAlgError) as err:
        print(f"Model fit failed: {err}")
        sys.exit(1)
    # A prefix with TTR == 1 makes the log transform infinite, which can
    # surface as NaN parameters; do not print or plot a meaningless curve.
    if not (np.isfinite(a) and np.isfinite(b)):
        print("Model fit failed: fitted parameters are not finite.")
        sys.exit(1)
    y_fit = reference_curve(x, a, b)
    # output parameters
    print("\nFitted parameters:")
    print(f" a = {a:.6f}")
    print(f" b = {b:.6f}")
    print()
    # plot
    plt.figure(figsize=(10, 6))
    plt.plot(
        x,
        y,
        label=f"Observed TTR ({filename})",
        alpha=0.8
    )
    plt.plot(
        x,
        y_fit,
        label=f"Bounded reference curve (a={a:.3f}, b={b:.3f})",
        linewidth=2
    )
    plt.xlabel("Tokens")
    plt.ylabel("Type/Token Ratio (TTR)")
    plt.title("TTR Analysis: Observed vs Bounded Model")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Edit: I've improved the code.