I have modified the code in the first post. The aim was to make the text easier to reconstruct and to find a method that also generates duplicate strings. The binomial distribution should be strictly maintained (this requires the shortening and lengthening of words).
How does this work?
If the program has to shorten a word, it saves the part of the word that was cut off. This saved part is then used the next time a word has to be made longer (and it stays available until another word is shortened and replaces it). So if a word, say "cat", needs to be lengthened to 5 characters, and a part like "ze" was left over from the last shortening, this part is appended to the word, i.e. "cat" becomes "catze".
What happens exactly?
Shortening words: If a word is too long, it is shortened to the desired length. The truncated part of the word is saved because this part is used later to lengthen a word.
Lengthening words: If a word is too short, it is lengthened to the desired length using the saved part of the most recently shortened word. If the saved part is shorter than the number of characters needed, it is repeated; if it is longer, it is cut off. If no part has been saved yet, the word is padded with underscores ("_") instead.
These are the repeating strings (not as many as in the VMS, but at least they are there):
'eliooo'
'elisa'
'felic'
'judi'
'moys'
'obtuli'
'off'
'sae'
'sancti'
Code:
import sys
import numpy as np
from scipy.special import comb
def calculate_binomial_distribution(n, max_length):
    """Return normalised binomial weights for the word lengths 1..max_length.

    The weight of length ``k`` is proportional to ``C(n, k) * 0.5**n``, i.e.
    the Binomial(n, 0.5) probability mass at ``k``. Length 0 is not part of
    the support, so the weights are renormalised to sum to 1 over
    ``k = 1..max_length``. Lengths ``k > n`` receive probability 0.

    Args:
        n: Number of trials of the underlying binomial distribution.
        max_length: Largest word length to include; entry ``i`` of the
            result belongs to word length ``i + 1``.

    Returns:
        A 1-D ``numpy.ndarray`` of ``max_length`` floats that sums to 1.

    Raises:
        ValueError: If no length in ``1..max_length`` has positive, finite
            weight (for example ``n <= 0`` or ``max_length < 1``), because
            the result could not be normalised into a distribution.
    """
    k_values = np.arange(1, max_length + 1)
    # Vectorised evaluation; keeps the result an ndarray from the start
    # instead of relying on ``list /= numpy scalar`` coercion.
    weights = np.asarray(comb(n, k_values), dtype=float) * (0.5 ** n)
    total = weights.sum()
    # Fail early with a clear message rather than returning NaN
    # probabilities that only blow up later inside the sampler.
    if not (total > 0 and np.isfinite(total)):
        raise ValueError(
            f"cannot build a word-length distribution for n={n}, "
            f"max_length={max_length}: total probability mass is {total}"
        )
    return weights / total
def adjust_word_lengths(words, target_distribution, last_truncated_part, rng=None):
    """Resize every word to a length drawn from ``target_distribution``.

    For each word a target length is sampled. Words that are too long are
    cut to the target length and the removed tail is remembered. Words that
    are too short are extended with the most recently remembered tail,
    repeated and/or cut so that exactly the missing number of characters is
    appended. If no tail is available, the word is padded with ``"_"``.
    Words that already have the target length are kept unchanged.

    Args:
        words: Iterable of strings to adjust.
        target_distribution: Sequence of probabilities; entry ``i`` is the
            probability of target length ``i + 1``.
        last_truncated_part: Tail carried over from earlier calls (use
            ``""`` when there is none yet).
        rng: Optional random source exposing ``choice(a, p=...)``, e.g. a
            ``numpy.random.Generator`` or ``numpy.random.RandomState``.
            Defaults to the global ``numpy.random`` state, which matches the
            behaviour when the argument is omitted.

    Returns:
        A tuple ``(adjusted_words, truncated_part)`` where ``adjusted_words``
        is the list of resized words and ``truncated_part`` is the tail to
        pass into the next call.
    """
    length_bins = np.arange(1, len(target_distribution) + 1)
    length_probs = np.asarray(target_distribution)
    sampler = np.random if rng is None else rng

    carry = last_truncated_part
    adjusted_words = []
    for word in words:
        current_length = len(word)
        # Convert to a plain int so no numpy scalar leaks into the string
        # slicing and repetition below.
        target_length = int(sampler.choice(length_bins, p=length_probs))

        if target_length < current_length:
            # Remember the removed tail; it feeds later extensions.
            carry = word[target_length:]
            word = word[:target_length]
        elif target_length > current_length:
            needed_length = target_length - current_length
            if carry:
                # Ceiling division: just enough repetitions to cover the gap.
                repeats = -(-needed_length // len(carry))
                word += (carry * repeats)[:needed_length]
            else:
                # Nothing has been truncated yet: pad with a fallback
                # character so the target length is still met.
                word += "_" * needed_length
        adjusted_words.append(word)
    return adjusted_words, carry
def process_text(file_path, output_path, target_distribution):
    """Read a text file, resize its words and write the result to a new file.

    Every input line is split on whitespace, its words are passed through
    ``adjust_word_lengths`` and the results are re-joined with single
    spaces. The truncated tail is carried from one line to the next. Output
    lines are joined with ``"\\n"`` (no trailing newline is written).

    Any failure to read or write is reported on standard output and ends
    the process with exit status 1.

    Args:
        file_path: Path of the UTF-8 encoded input text.
        output_path: Path the modified text is written to (UTF-8).
        target_distribution: Word-length probabilities handed through to
            ``adjust_word_lengths``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        print(f"Fehler: Die Datei {file_path} wurde nicht gefunden.")
        sys.exit(1)
    except UnicodeDecodeError as e:
        # UnicodeDecodeError derives from ValueError, not OSError, so the
        # IOError handler below would not catch a non-UTF-8 input file.
        print(f"Fehler: Die Datei {file_path} ist keine gültige UTF-8-Datei: {e}")
        sys.exit(1)
    except IOError as e:
        print(f"Fehler: Ein Fehler ist beim Lesen der Datei aufgetreten: {e}")
        sys.exit(1)

    last_truncated_part = ""
    adjusted_lines = []
    for line in lines:
        words = line.split()
        # The tail cut off in one line stays available for the next line.
        adjusted_words, last_truncated_part = adjust_word_lengths(
            words, target_distribution, last_truncated_part
        )
        adjusted_lines.append(' '.join(adjusted_words))

    # Write the modified text to the output file.
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(adjusted_lines))
        print(f"Modifizierter Text wurde in {output_path} geschrieben.")
    except IOError as e:
        print(f"Fehler: Ein Fehler ist beim Schreiben der Datei aufgetreten: {e}")
        sys.exit(1)
def main():
    """Command-line entry point: ``<input_filename> <output_filename>``.

    Expects exactly two command-line arguments. With any other argument
    count it prints a usage message and exits with status 1. Otherwise it
    builds the word-length distribution for n = 10 and a maximum word
    length of 15 and runs the text conversion with it.
    """
    argv = sys.argv
    if len(argv) != 3:
        print("Verwendung: python adjust_word_length.py <input_filename> <output_filename>")
        sys.exit(1)

    source_path, destination_path = argv[1], argv[2]

    # Parameters of the target word-length distribution.
    longest_word = 15
    trials = 10

    # Compute the distribution first, then convert the text with it.
    length_distribution = calculate_binomial_distribution(trials, longest_word)
    process_text(source_path, destination_path, length_distribution)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
[attachment=9073]