quimqu > 14-08-2025, 10:17 AM
bi3mw > 14-08-2025, 04:47 PM
#!/usr/bin/env python3
import sys
from collections import Counter
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
def extract_prefix_suffix(word):
    """Split a '/'-segmented word into prefix segments, suffix segments and stem.

    The word is split on '/'. The segment at index ``len(parts) // 2`` is
    taken as the stem; every segment before it is a prefix and every segment
    after it is a suffix.

    Args:
        word: A token such as ``"pre/stem/suf"``.

    Returns:
        A tuple ``(prefix, suffix, stem)`` where ``prefix`` and ``suffix`` are
        lists of segment strings and ``stem`` is a string. Words with fewer
        than three segments yield ``([], [], None)``.
    """
    parts = word.split('/')
    # At least one prefix, one stem and one suffix segment are required.
    if len(parts) < 3:
        return [], [], None

    # For an even number of segments the stem is the upper of the two middle
    # segments, so the prefix side gets the extra segment.
    stem_index = len(parts) // 2
    prefix = parts[:stem_index]
    suffix = parts[stem_index + 1:]
    return prefix, suffix, parts[stem_index]
def main():
    """Plot a heatmap of prefix/suffix co-occurrence counts.

    Command line: ``heatmap_prefix_suffix.py output_segmented.txt [N]``

    Reads whitespace-separated, '/'-segmented words from the given UTF-8 file,
    counts every (prefix segment, suffix segment) pair occurring within the
    same word, and shows a seaborn heatmap restricted to the N prefixes and
    N suffixes with the highest total counts (N defaults to 20).

    Exits with status 1 when the arguments are invalid, the file cannot be
    read, or no prefix-suffix pair is found.
    """
    # Local import: only needed for the font-file existence check below.
    import os

    if len(sys.argv) < 2:
        print("Usage: python heatmap_prefix_suffix.py output_segmented.txt [N]")
        sys.exit(1)

    filename = sys.argv[1]
    top_n = 20  # default number of top prefixes and suffixes
    if len(sys.argv) >= 3:
        try:
            top_n = int(sys.argv[2])
        except ValueError:
            print("Parameter N must be an integer.")
            sys.exit(1)
    # A non-positive N selects nothing and makes the heatmap call fail on an
    # empty table, so reject it up front.
    if top_n < 1:
        print("Parameter N must be a positive integer.")
        sys.exit(1)

    prefix_suffix_counts = Counter()
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                for w in line.split():
                    prefix, suffix, _ = extract_prefix_suffix(w)
                    # Count every prefix segment against every suffix segment
                    # of the same word.
                    for pre in prefix:
                        for suf in suffix:
                            prefix_suffix_counts[(pre, suf)] += 1
    except OSError as err:
        print(f"Cannot read {filename}: {err}")
        sys.exit(1)

    if not prefix_suffix_counts:
        print("No prefix-suffix combinations found.")
        sys.exit(1)

    data = [
        {"prefix": pre, "suffix": suf, "count": count}
        for (pre, suf), count in prefix_suffix_counts.items()
    ]
    df = pd.DataFrame(data)

    # Select top N prefixes and suffixes by total count
    top_prefixes = df.groupby('prefix')['count'].sum().nlargest(top_n).index
    top_suffixes = df.groupby('suffix')['count'].sum().nlargest(top_n).index

    # Pairs that never occur become 0 so the heatmap has no holes.
    pivot = df.pivot(index='prefix', columns='suffix', values='count').fillna(0)
    pivot_top = pivot.loc[top_prefixes, top_suffixes]

    # Load the local font for the tick labels. If the file is missing,
    # matplotlib would fail while rendering, so fall back to the default font.
    custom_font_path = "/home/me/.local/share/fonts/eva1.ttf"
    tick_font = {}
    if os.path.isfile(custom_font_path):
        tick_font["fontproperties"] = fm.FontProperties(
            fname=custom_font_path, size=12
        )
    else:
        print(
            f"Font file not found: {custom_font_path}; using the default font.",
            file=sys.stderr,
        )

    # Plot
    plt.figure(figsize=(18, 12))
    ax = sns.heatmap(
        pivot_top,
        annot=True,
        fmt=".0f",
        cmap="YlGnBu",
        cbar_kws={"shrink": 0.5},
    )
    plt.title(f"Top {top_n} Prefix-Suffix Combinations")
    plt.xlabel("Suffix")
    plt.ylabel("Prefix")

    # Axis tick labels, drawn with the local font when it is available
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', **tick_font)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, **tick_font)

    # Leave room at the bottom for the rotated x tick labels.
    plt.subplots_adjust(bottom=0.2)
    plt.show()
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()