bi3mw > 5 hours ago
nablator > 4 hours ago
Torsten > 3 hours ago
(5 hours ago) bi3mw Wrote: You are not allowed to view links. Register or Login to view. While experimenting with the type/token ratio in the VMS corpus, I compared the original curve with a smoothed ideal curve. With one exception, the curves are almost identical. The “kink” above the original curve can only mean that there are more new words than “usual” in this section of text. Since this deviation occurs only once, it is quite remarkable, or am I overinterpreting this observation?
Jorge_Stolfi > 3 hours ago
(5 hours ago) bi3mw Wrote: You are not allowed to view links. Register or Login to view. While experimenting with the type/token ratio in the VMS corpus, I compared the original curve with a smoothed ideal curve.
Jorge_Stolfi > 3 hours ago
(5 hours ago) bi3mw Wrote: You are not allowed to view links. Register or Login to view. The “kink” above the original curve can only mean that there are more new words than “usual” in this section of text.
bi3mw > 3 hours ago
(3 hours ago) Jorge_Stolfi Wrote: You are not allowed to view links. Register or Login to view. Is that what your "smoothed curve" is?
nablator > 2 hours ago
(2 hours ago) bi3mw Wrote: You are not allowed to view links. Register or Login to view. (2 hours ago) nablator Wrote: You are not allowed to view links. Register or Login to view. Is b close to 1/2? No, it's much closer to 1/4 than to 1/2. Isn't that okay?

bi3mw > 1 hour ago
#!/usr/bin/env python3
import sys
import re
import os
import numpy as np
import matplotlib.pyplot as plt
def tokenize(text):
    """Split *text* into lowercase word tokens.

    The text is lowercased first; every maximal run of regex "word"
    characters then becomes one token, and everything else acts as a
    separator.

    Args:
        text: The raw input string.

    Returns:
        A list of token strings in order of appearance (empty if the
        text contains no word characters).
    """
    return re.findall(r"\w+", text.lower())
def compute_ttr(tokens, step=100):
    """Compute the cumulative type/token ratio (TTR) along a token list.

    The ratio is sampled after every *step* tokens: at each sample point
    ``i`` the value is ``len(set(tokens[:i])) / i``. Tokens after the last
    full multiple of *step* are not sampled.

    Args:
        tokens: Sequence of hashable tokens.
        step: Number of tokens between consecutive sample points; must be
            a positive integer.

    Returns:
        A pair ``(x, y)`` of NumPy arrays: the sample positions (token
        counts) and the TTR at each position. The very first sample
        (at ``i == step``) is dropped.

    Raises:
        ValueError: If *step* is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    seen = set()
    x = []
    y = []
    for end in range(step, len(tokens) + 1, step):
        # Merge only the newly covered slice into the running vocabulary
        # instead of rebuilding the set from the whole prefix each time.
        seen.update(tokens[end - step:end])
        x.append(end)
        y.append(len(seen) / end)
    # Remove first point to reduce early-start artifact
    return np.array(x[1:]), np.array(y[1:])
def fit_power_law(x, y):
    """
    Fit model:
        y = a * x^(-b)
    by linearizing in log-log space.

    Taking logarithms gives ``log(y) = log(a) - b * log(x)``, so a
    degree-1 least-squares fit of ``log(y)`` against ``log(x)`` yields
    ``-b`` as the slope and ``log(a)`` as the intercept.

    Args:
        x: 1-D sequence of strictly positive values.
        y: 1-D sequence of strictly positive values, same length as *x*.

    Returns:
        The pair ``(a, b)`` of fitted parameters.

    Raises:
        ValueError: If the inputs are not 1-D sequences of equal length,
            contain fewer than two points, or contain values that are
            non-finite or not strictly positive (their logarithm would
            be undefined).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be 1-D sequences of equal length")
    if x.size < 2:
        raise ValueError("at least two data points are required")
    if not (np.all(np.isfinite(x)) and np.all(np.isfinite(y))):
        raise ValueError("x and y must contain only finite values")
    if np.any(x <= 0) or np.any(y <= 0):
        raise ValueError("x and y must be strictly positive")

    slope, intercept = np.polyfit(np.log(x), np.log(y), 1)
    b = -slope
    a = np.exp(intercept)
    return a, b
def reference_curve(x, a, b):
    """Evaluate the power-law curve ``a * x^(-b)``.

    Args:
        x: Scalar or array-like of positions at which to evaluate.
        a: Scale factor.
        b: Decay exponent (the curve uses ``-b`` as the power).

    Returns:
        The curve values, with the same shape as *x*.
    """
    # Work in floating point: NumPy refuses to raise an integer array to a
    # negative integer power, and plain lists do not support ``**`` at all.
    x = np.asarray(x, dtype=float)
    return a * x ** (-b)
def main():
    """Command-line entry point.

    Reads the text file named by the single command-line argument,
    tokenizes it, computes the cumulative type/token ratio every 100
    tokens, fits a power law ``a * x^(-b)`` to that curve, prints the
    fitted parameters and shows a plot of the observed and fitted curves.

    Exits with status 1 on wrong usage, an unreadable file, or a text
    that is too short to fit.
    """
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <textfile.txt>", file=sys.stderr)
        sys.exit(1)

    filepath = sys.argv[1]
    filename = os.path.basename(filepath)

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
    except (OSError, UnicodeDecodeError) as err:
        # Report a missing/unreadable/non-UTF-8 file without a traceback.
        print(f"Cannot read {filepath}: {err}", file=sys.stderr)
        sys.exit(1)

    tokens = tokenize(text)
    if len(tokens) < 200:
        print("Text is too short.", file=sys.stderr)
        sys.exit(1)

    # Compute observed TTR
    x, y = compute_ttr(tokens, step=100)
    # compute_ttr discards its first sample, so a text that passed the
    # length check above can still yield fewer than two points.
    if len(x) < 2:
        print("Not enough data points for fitting.", file=sys.stderr)
        sys.exit(1)

    # Fit reference curve
    a, b = fit_power_law(x, y)
    y_fit = reference_curve(x, a, b)

    # Output fitted parameters
    print("\nFitted parameters:")
    print(f" a = {a:.6f}")
    print(f" b = {b:.6f}")
    print()

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(
        x,
        y,
        label=f"Observed TTR ({filename})",
        alpha=0.8
    )
    plt.plot(
        x,
        y_fit,
        label=f"Fitted reference curve (a={a:.3f}, b={b:.3f})",
        linewidth=2
    )
    plt.xlabel("Tokens")
    plt.ylabel("Type/Token Ratio (TTR)")
    plt.title("TTR Analysis: Observed vs. Reference Curve")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()