I have extended the code in the opening post #1 so that it clearly shows the binomial distribution in the generated text (plot). The distribution seems to be correct.
Code:
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy.special import comb
def calculate_binomial_distribution(n, max_length):
    """Return word lengths 1..max_length with binomial-shaped probabilities.

    Each length k is weighted by comb(n, k) * 0.5 ** n and the weights are
    then rescaled so that they sum to 1. Length 0 is never included, and any
    k greater than n receives probability 0 because comb(n, k) is 0 there.

    Args:
        n: Number of trials of the binomial distribution.
        max_length: Largest word length to include.

    Returns:
        A tuple (k_values, probabilities) of two NumPy arrays of equal length.

    Raises:
        ValueError: If the weights do not add up to a positive finite number
            (for example when max_length < 1 or n < 1), since such weights
            cannot be normalized into a probability distribution.
    """
    k_values = np.arange(1, max_length + 1)
    # comb() broadcasts over the array, so no Python-level loop is needed.
    weights = comb(n, k_values) * (0.5 ** n)
    total = weights.sum()
    # Fail loudly instead of returning NaN/empty probabilities.
    if not np.isfinite(total) or total <= 0:
        raise ValueError(
            f"Cannot normalize binomial weights for n={n}, "
            f"max_length={max_length}: their sum is {total}."
        )
    return k_values, weights / total
def plot_binomial_distribution(n, max_length):
    """Plot the word-length distribution as bars with an overlaid curve.

    Args:
        n: Number of trials of the binomial distribution.
        max_length: Largest word length shown on the x axis.

    The figure is displayed with plt.show(); nothing is returned.
    """
    k_values, probabilities = calculate_binomial_distribution(n, max_length)

    # The overlaid curve uses exactly the same normalized
    # comb(n, k) * 0.5 ** n values as the bars, so the helper's result is
    # reused instead of recomputing the identical formula a second time.
    theoretical_probabilities = probabilities

    plt.figure(figsize=(10, 6))
    # Bars: the calculated probabilities.
    plt.bar(
        k_values,
        probabilities,
        width=0.6,
        edgecolor='black',
        alpha=0.6,
        label='Calculated Distribution',
    )
    # Dashed red line with markers: the theoretical curve.
    plt.plot(
        k_values,
        theoretical_probabilities,
        'r--',
        marker='o',
        label='Theoretical Distribution',
    )
    plt.xlabel('Word Length')
    plt.ylabel('Probability')
    plt.title(f'Binomial Distribution for n={n}')
    plt.legend()
    plt.grid(True)
    plt.show()
def adjust_word_lengths(words, target_distribution):
    """Resize every word to a length drawn from the target distribution.

    Args:
        words: Iterable of strings to adjust.
        target_distribution: Sequence of probabilities; entry i is the
            probability of choosing the length i + 1.

    Returns:
        A list with one string per input word. A word longer than its drawn
        length is truncated; a shorter one is right-padded with 'X'; a word
        that already has the drawn length is kept unchanged.
    """
    words = list(words)
    if not words:
        return []

    length_probs = np.array(target_distribution)
    length_bins = np.arange(1, len(target_distribution) + 1)

    # Draw all target lengths in one call so the probability vector is
    # validated once rather than once per word.
    target_lengths = np.random.choice(
        length_bins, size=len(words), p=length_probs
    ).tolist()

    # Slicing truncates words that are too long and ljust pads the ones that
    # are too short; a word of exactly the target length passes through as is.
    return [
        word[:target_length].ljust(target_length, 'X')
        for word, target_length in zip(words, target_lengths)
    ]
def process_text(file_path, output_path, n=9, max_word_length=15):
    """Rewrite a text file so its word lengths follow a binomial distribution.

    The input is read line by line, each line is split on whitespace, every
    word is resized by adjust_word_lengths(), and the words are re-joined
    with single spaces. The resulting lines are written to output_path
    separated by newlines.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write.
        n: Number of trials for the binomial distribution (default 9).
        max_word_length: Maximum word length to generate (default 15).

    On a read or write failure an error message is printed to stderr and the
    process exits with status 1.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.", file=sys.stderr)
        sys.exit(1)
    except (IOError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an IOError, so it is listed explicitly to
        # report non-UTF-8 input the same way as other read failures.
        print(f"Error: An error occurred while reading the file: {e}", file=sys.stderr)
        sys.exit(1)

    # Target word-length distribution shared by every line.
    _, target_distribution = calculate_binomial_distribution(n, max_word_length)

    adjusted_lines = [
        ' '.join(adjust_word_lengths(line.split(), target_distribution))
        for line in lines
    ]

    # Write the modified text to the output file.
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(adjusted_lines))
        print(f"Modified text has been written to {output_path}.")
    except IOError as e:
        print(f"Error: An error occurred while writing the file: {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """Command-line entry point: show the distribution, then convert a file.

    Expects exactly two command-line arguments, the input and the output
    file name; otherwise a usage message is printed and the process exits
    with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("Usage: python adjust_word_length.py <input_filename> <output_filename>")
        sys.exit(1)
    source_path, destination_path = cli_args

    # Display the distribution first (n=9 trials, word lengths up to 15).
    trials, longest_word = 9, 15
    plot_binomial_distribution(trials, longest_word)

    # Then rewrite the input text into the output file.
    process_text(source_path, destination_path)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
As @RobGea has already noted, the distribution in the VMS seems to fit roughly section by section, but not folio by folio. I don't understand this, because the result for the entire VMS text must come from somewhere. Does anyone have an explanation?