(13-08-2025, 06:45 PM) RobGea wrote: I would be interested to know which "language-independent morphological parser" you used. Web-based or software or whatever, thanks.
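First, install the Python packages needed for the heat map script (pandas, seaborn and matplotlib); the parser script itself only uses the standard library: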
pip3 install pandas seaborn matplotlib
I wrote a Python script that does this job.
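Example call (the optional third argument sets how many of the most frequent prefixes and suffixes are used for segmentation and listed in the output; it defaults to 10, and 'all' keeps every candidate):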
python parser.py naibbe_Cleaned_52_01_10_word_lines.txt parsed.txt 25
Code:
#!/usr/bin/env python3
import sys
import re
from collections import Counter
def find_affixes(words, min_prefix_len=2, min_suffix_len=2, top_n=None,
                 max_affix_len=5):
    """Count candidate prefixes and suffixes over a collection of words.

    For every word, each leading and each trailing substring that is
    strictly shorter than the word and whose length lies between the
    respective minimum and ``max_affix_len`` is counted once per occurrence
    of the word.

    Args:
        words: Iterable of word strings.
        min_prefix_len: Shortest prefix length to count. Values below 1 are
            treated as 1.
        min_suffix_len: Shortest suffix length to count. Values below 1 are
            treated as 1, because ``w[-0:]`` would be the whole word rather
            than an empty suffix.
        top_n: How many of the most frequent entries to return per list, or
            ``None`` to return all of them.
        max_affix_len: Longest affix length to count. The default of 5
            matches the limit that used to be hard-coded.

    Returns:
        A ``(prefixes, suffixes)`` tuple; each element is a list of
        ``(affix, count)`` pairs ordered from most to least frequent.
    """
    prefixes = Counter()
    suffixes = Counter()

    # A zero-length affix carries no information, so never start below 1.
    first_prefix_len = max(1, min_prefix_len)
    first_suffix_len = max(1, min_suffix_len)

    for w in words:
        # Exclusive upper bound: affixes stay shorter than the word itself
        # and never exceed max_affix_len characters.
        stop = min(len(w), max_affix_len + 1)
        for i in range(first_prefix_len, stop):
            prefixes[w[:i]] += 1
        for i in range(first_suffix_len, stop):
            suffixes[w[-i:]] += 1

    # most_common(None) returns every entry, so no branching is needed.
    return prefixes.most_common(top_n), suffixes.most_common(top_n)
def segment_word_multi(word, prefix_set, suffix_set):
    """Split a word into leading prefixes, a stem and trailing suffixes.

    Prefixes are stripped repeatedly from the front, then suffixes
    repeatedly from the back. At each step the longest matching affix wins,
    and an affix is only removed when at least two characters of the
    remaining stem are left afterwards.

    Args:
        word: The word to segment.
        prefix_set: Collection of known prefix strings.
        suffix_set: Collection of known suffix strings.

    Returns:
        A list ``[prefix, ..., stem, suffix, ...]`` whose elements
        concatenate back to ``word``, in their original left-to-right order.
    """
    # The longest-first ordering does not change between iterations, so it
    # is computed once per call instead of once per stripped affix.
    prefixes_by_len = sorted(prefix_set, key=len, reverse=True)
    suffixes_by_len = sorted(suffix_set, key=len, reverse=True)

    parts = []
    stem = word

    # Strip as many prefixes as possible.
    while True:
        match = next(
            (p for p in prefixes_by_len
             if stem.startswith(p) and len(stem) > len(p) + 1),
            None,
        )
        if not match:
            break
        parts.append(match)
        stem = stem[len(match):]

    # Strip as many suffixes as possible. They are found from the outside
    # in, so collect them and reverse once to restore reading order.
    suffix_parts = []
    while True:
        match = next(
            (s for s in suffixes_by_len
             if stem.endswith(s) and len(stem) > len(s) + 1),
            None,
        )
        if not match:
            break
        suffix_parts.append(match)
        stem = stem[:-len(match)]
    suffix_parts.reverse()

    parts.append(stem)
    parts.extend(suffix_parts)
    return parts
def find_stem(parts, prefix_set, suffix_set):
    """Return the first segment that is neither a known prefix nor suffix.

    Args:
        parts: Segments of one word, in order.
        prefix_set: Collection of known prefix strings.
        suffix_set: Collection of known suffix strings.

    Returns:
        The first element of ``parts`` found in neither collection, or the
        empty string when every element is a known affix (or ``parts`` is
        empty).
    """
    non_affixes = (
        piece
        for piece in parts
        if not (piece in prefix_set or piece in suffix_set)
    )
    return next(non_affixes, "")
def main():
    """Command-line entry point: segment a text file and write a report.

    Expects ``<input_file> <output_file> [top_n|'all']`` on the command
    line. The optional third argument limits how many of the most frequent
    prefixes and suffixes are used (default 10; ``all`` uses every one).
    Every run of ASCII letters in the input is replaced by its
    ``/``-separated segmentation, and the result is written to the output
    file followed by tab-separated frequency tables of prefixes, stems and
    suffixes. Exits with status 1 on a wrong argument count or an invalid
    third argument.
    """
    argv = sys.argv
    if not 3 <= len(argv) <= 4:
        print(f"Usage: {sys.argv[0]} <input_file> <output_file> [top_n|'all']")
        sys.exit(1)

    input_file, output_file = argv[1], argv[2]

    top_n = 10  # default
    if len(argv) == 4:
        param = argv[3].lower()
        if param == "all":
            top_n = None  # keep every affix
        else:
            try:
                top_n = int(param)
                is_valid = top_n >= 1
            except ValueError:
                is_valid = False
            if not is_valid:
                print("Der dritte Parameter muss eine positive ganze Zahl oder 'all' sein.")
                sys.exit(1)

    with open(input_file, "r", encoding="utf-8") as src:
        text = src.read()

    word_re = re.compile(r"[A-Za-z]+")
    words = [token.lower() for token in word_re.findall(text)]

    top_prefixes, top_suffixes = find_affixes(words, top_n=top_n)
    prefix_set = {affix for affix, _ in top_prefixes}
    suffix_set = {affix for affix, _ in top_suffixes}

    stem_counter = Counter()

    def segment_token(match):
        # Segment the lower-cased token, tally its stem, and restore a
        # leading capital if the original token had one.
        token = match.group(0)
        parts = segment_word_multi(token.lower(), prefix_set, suffix_set)
        stem_counter[find_stem(parts, prefix_set, suffix_set)] += 1
        segmented = "/".join(parts)
        return segmented.capitalize() if token[0].isupper() else segmented

    output_text = word_re.sub(segment_token, text)

    # Assemble the whole report first, then write it in one go.
    report = [output_text, "\n\n=== Präfixe ===\n"]
    report.extend(f"{p}\t{c}\n" for p, c in top_prefixes)
    report.append("\n=== Stämme ===\n")
    report.extend(
        f"{s}\t{c}\n" for s, c in stem_counter.most_common() if s != ""
    )
    report.append("\n=== Suffixe ===\n")
    report.extend(f"{s}\t{c}\n" for s, c in top_suffixes)

    with open(output_file, "w", encoding="utf-8") as dst:
        dst.writelines(report)
# Run the parser only when this file is executed as a script.
if __name__ == "__main__":
    main()
For the heat map, the call is as follows:
python heatmap_prefix_suffix.py parsed.txt 25
Code:
#!/usr/bin/env python3
import sys
from collections import Counter
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def extract_prefix_suffix(word):
    """Split a ``/``-segmented token into prefixes, suffixes and stem.

    The segment at index ``len(parts) // 2`` is taken as the stem;
    everything before it is returned as prefixes and everything after it as
    suffixes. Tokens with fewer than three segments yield ``([], [], None)``.

    NOTE(review): the stem position is assumed, not known. A token whose
    prefix and suffix counts are unbalanced will have an affix or the real
    stem misclassified.

    Args:
        word: A token whose segments are separated by ``/``.

    Returns:
        A ``(prefixes, suffixes, stem)`` tuple: two lists of strings and the
        stem string, or ``([], [], None)`` for tokens with fewer than three
        segments.
    """
    pieces = word.split('/')
    count = len(pieces)
    if count >= 3:
        middle = count // 2
        return pieces[:middle], pieces[middle + 1:], pieces[middle]
    return [], [], None
def main():
    """Command-line entry point: plot a prefix/suffix co-occurrence heat map.

    Expects ``<segmented_file> [N]`` on the command line. Each
    whitespace-separated token of the file is passed to
    ``extract_prefix_suffix``, and every (prefix, suffix) pair within a
    token is counted. The ``N`` prefixes and ``N`` suffixes with the largest
    summed counts (default 20) are shown as an annotated heat map.

    Exits with status 1 when the file argument is missing, when ``N`` is not
    a positive integer, or when no prefix-suffix pair is found.
    """
    if len(sys.argv) < 2:
        print("Usage: python heatmap_prefix_suffix.py output_segmented.txt [N]")
        sys.exit(1)

    filename = sys.argv[1]
    top_n = 20  # default number of top prefixes and suffixes

    if len(sys.argv) >= 3:
        try:
            top_n = int(sys.argv[2])
        except ValueError:
            print("Parameter N must be an integer.")
            sys.exit(1)
        # A zero or negative N would select no rows or columns, leaving
        # nothing to plot, so reject it up front.
        if top_n < 1:
            print("Parameter N must be a positive integer.")
            sys.exit(1)

    prefix_suffix_counts = Counter()

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            for w in line.strip().split():
                prefix, suffix, _stem = extract_prefix_suffix(w)
                # Count every prefix/suffix combination within the token.
                for pre in prefix:
                    for suf in suffix:
                        prefix_suffix_counts[(pre, suf)] += 1

    if not prefix_suffix_counts:
        print("No prefix-suffix combinations found.")
        sys.exit(1)

    df = pd.DataFrame([
        {"prefix": pre, "suffix": suf, "count": count}
        for (pre, suf), count in prefix_suffix_counts.items()
    ])

    # Select top N prefixes and suffixes by total count.
    top_prefixes = df.groupby('prefix')['count'].sum().nlargest(top_n).index
    top_suffixes = df.groupby('suffix')['count'].sum().nlargest(top_n).index

    # Pairs that never co-occur are filled in as 0.
    pivot = df.pivot(index='prefix', columns='suffix', values='count').fillna(0)
    pivot_top = pivot.loc[top_prefixes, top_suffixes]

    plt.figure(figsize=(18, 12))
    sns.heatmap(pivot_top, annot=True, fmt=".0f", cmap="YlGnBu", cbar_kws={"shrink": 0.5})
    plt.title(f"Top {top_n} Prefix-Suffix Combinations")
    plt.xlabel("Suffix")
    plt.ylabel("Prefix")
    plt.xticks(rotation=45, ha='right', fontsize=12)
    plt.yticks(rotation=0, fontsize=12)  # keep prefix labels horizontal
    plt.subplots_adjust(bottom=0.2)  # make space for xlabel
    plt.show()
# Run the heat map tool only when this file is executed as a script.
if __name__ == "__main__":
    main()