bollwerk/.github/genome/genome-extract.py
Jens Reinemann 10cb474906 refactor(genome): rewrite extraction in Python for proper UTF-8 support
PowerShell auf Windows hat Encoding-Probleme mit Git-Output (Umlaute).
Python 3 handhabt UTF-8 nativ korrekt.
2026-05-18 09:49:37 +02:00

282 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Genome Engine Phase 1: Extraction
Extrahiert Mutations aus der Git-History für Copilot-Customization-Dateien.
Scannt git log für Änderungen im Genome-Scope (.github/skills, agents, prompts, instructions).
Gruppiert Diffs nach Trait und gibt strukturiertes Markdown aus.
Usage:
python .github/genome/genome-extract.py --since "7 days ago"
python .github/genome/genome-extract.py --since "4 days ago" --repo /path/to/repo
"""
import argparse
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path
# --- Configuration ---
# Repository-relative paths that make up the genome scope. Entries ending in
# "/" are treated as directory prefixes; every other entry has to match a
# file path exactly (see is_in_genome_scope). The list is also passed to
# `git log` as path arguments.
GENOME_SCOPES = [
".github/skills/",
".github/agents/",
".github/prompts/",
".github/copilot-instructions.md",
".github/kotlin-conventions.instructions.md",
]
# Maximum number of diff lines kept per mutation; longer diffs are truncated.
MAX_DIFF_LINES = 80
def run_git(*args: str, cwd: str = ".") -> str:
    """Run a git command and return its standard output decoded as UTF-8.

    Args:
        *args: Arguments handed to ``git`` (subcommand first).
        cwd: Directory in which git is executed.

    Returns:
        The command's stdout as text. Bytes that are not valid UTF-8 are
        replaced instead of raising. When git exits with a non-zero status
        the (usually empty) stdout is still returned and a warning is
        written to stderr.
    """
    result = subprocess.run(
        # core.quotePath=false makes git print non-ASCII path names verbatim
        # instead of as quoted octal escapes, so paths containing umlauts can
        # be compared against the scope list and passed back to git as-is.
        ["git", "-c", "core.quotePath=false", *args],
        cwd=cwd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    if result.returncode != 0:
        # Stay best-effort (callers treat empty output as "nothing found"),
        # but do not hide the failure from the user.
        print(
            f"warning: git {' '.join(args)} exited with status "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout
def is_in_genome_scope(filepath: str) -> bool:
    """Return whether *filepath* belongs to the genome scope.

    A scope entry ending in "/" matches every path that starts with it;
    any other entry matches only the identical path.
    """
    return any(
        filepath.startswith(entry) if entry.endswith("/") else filepath == entry
        for entry in GENOME_SCOPES
    )
def get_trait_key(filepath: str, repo_path: str) -> str | None:
"""Leitet den Trait-Key aus einem Dateipfad ab."""
# Skills: skill/<ordnername>
m = re.match(r"^\.github/skills/([^/]+)/", filepath)
if m:
return f"skill/{m.group(1)}"
# Agents: agent/<dateiname-ohne-extension>
m = re.match(r"^\.github/agents/(.+)\.agent\.md$", filepath)
if m:
return f"agent/{m.group(1)}"
# Prompts: Standalone oder Verbund
m = re.match(r"^\.github/prompts/(.+)\.prompt\.md$", filepath)
if m:
name = m.group(1)
# Verbund-Erkennung: <router>-<sub>.prompt.md → Trait des Routers
parts = name.split("-")
if len(parts) > 1:
# Versuche progressiv kürzere Präfixe als Router-Name
for i in range(len(parts) - 1, 0, -1):
candidate = "-".join(parts[:i])
router_path = Path(repo_path) / f".github/prompts/{candidate}.prompt.md"
if router_path.exists():
return f"prompt/{candidate}"
# Standalone-Prompt
return f"prompt/{name}"
# Instructions (*.instructions.md)
m = re.match(r"^\.github/(.+)\.instructions\.md$", filepath)
if m:
return f"instructions/{m.group(1)}"
# copilot-instructions.md
if filepath == ".github/copilot-instructions.md":
return "instructions/copilot-instructions"
return None
def get_mutation_type(status: str) -> str:
    """Map a git name-status code to a mutation type.

    Codes starting with "A" yield "member-added", codes starting with "D"
    yield "member-removed"; every other code (including an empty one) is
    reported as "content-change".
    """
    by_initial = {"A": "member-added", "D": "member-removed"}
    # Only the first character decides; slicing keeps "" from raising.
    return by_initial.get(status[:1], "content-change")
def extract_mutations(repo_path: str, since: str) -> dict[str, list[dict]]:
    """Collect all genome mutations from the git history.

    Args:
        repo_path: Path of the repository git is run in.
        since: Value for ``git log --since`` (e.g. "7 days ago").

    Returns:
        A mapping from trait key to a list of mutation records. Each record
        is a dict with the keys ``hash`` (first 8 characters of the commit
        hash), ``date``, ``author``, ``message``, ``file``, ``type`` and
        ``diff`` (truncated to MAX_DIFF_LINES lines).
    """
    mutations: dict[str, list[dict]] = defaultdict(list)

    # One line per commit touching the genome scope. The fields are joined
    # with the ASCII unit separator (0x1f) so that a "|" in an author name
    # or subject cannot shift the columns.
    log_output = run_git(
        "log",
        "--format=%H%x1f%aI%x1f%an%x1f%s",
        f"--since={since}",
        "--",
        *GENOME_SCOPES,
        cwd=repo_path,
    )
    if not log_output.strip():
        return mutations

    for line in log_output.strip().split("\n"):
        parts = line.split("\x1f", 3)
        if len(parts) < 4:
            continue
        commit_hash, date, author, message = parts

        # Files changed by this commit. --root makes the initial commit show
        # up as a set of additions; without it diff-tree prints nothing for
        # a commit that has no parent.
        diff_tree_output = run_git(
            "diff-tree", "--root", "--no-commit-id", "-r", "--name-status", commit_hash,
            cwd=repo_path,
        )
        for diff_line in diff_tree_output.strip().split("\n"):
            # Skip anything that does not start with a status letter.
            if not diff_line or not diff_line[0].isalpha():
                continue
            diff_parts = diff_line.split("\t", 2)
            status = diff_parts[0]
            filepath = diff_parts[1] if len(diff_parts) > 1 else ""
            # Rename/copy entries carry "<old>\t<new>"; use the target path.
            if status.startswith(("R", "C")) and len(diff_parts) >= 3:
                filepath = diff_parts[2]
            # Normalize path separators.
            filepath = filepath.replace("\\", "/")

            if not is_in_genome_scope(filepath):
                continue
            trait_key = get_trait_key(filepath, repo_path)
            if not trait_key:
                continue

            # Diff of this single file within the commit.
            diff_output = run_git(
                "show", "--format=", "--no-color", commit_hash, "--", filepath,
                cwd=repo_path,
            )
            # Truncate long diffs and note how many lines were dropped.
            diff_lines = diff_output.strip().split("\n") if diff_output.strip() else []
            if len(diff_lines) > MAX_DIFF_LINES:
                truncated = len(diff_lines) - MAX_DIFF_LINES
                diff_lines = diff_lines[:MAX_DIFF_LINES] + [f"... ({truncated} weitere Zeilen)"]

            mutations[trait_key].append({
                "hash": commit_hash[:8],
                "date": date,
                "author": author,
                "message": message,
                "file": filepath,
                "type": get_mutation_type(status),
                "diff": "\n".join(diff_lines),
            })
    return mutations
def generate_markdown(mutations: dict[str, list[dict]], repo_path: str, since: str) -> str:
    """Render the extracted mutations as a Markdown report.

    Args:
        mutations: Mapping from trait key to mutation records; each record
            provides the keys ``hash``, ``date``, ``author``, ``message``,
            ``file``, ``type`` and ``diff``.
        repo_path: Repository path; only its resolved directory name is
            printed.
        since: The time range string, echoed in the report header.

    Returns:
        The complete Markdown document as one string.
    """
    lines = [
        "# Raw Mutations",
        "",
        f"**Extrahiert:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Zeitraum:** seit {since}",
        f"**Repository:** {Path(repo_path).resolve().name}",
        f"**Traits mit Mutations:** {len(mutations)}",
        "",
        "---",
        "",
    ]
    if not mutations:
        lines.append("*Keine Mutations im angegebenen Zeitraum gefunden.*")
        return "\n".join(lines)

    for trait_key in sorted(mutations):
        trait_mutations = mutations[trait_key]
        unique_files = sorted({m["file"] for m in trait_mutations})
        lines.extend([
            f"## Trait: `{trait_key}`",
            "",
            "| Mutations | Dateien |",
            "|-----------|---------|",
            f"| {len(trait_mutations)} | {', '.join(unique_files)} |",
            "",
        ])

        # Group the records by commit, keeping first-seen order.
        by_commit: dict[str, list[dict]] = {}
        for m in trait_mutations:
            by_commit.setdefault(m["hash"], []).append(m)

        for commit_mutations in by_commit.values():
            first = commit_mutations[0]
            lines.extend([
                f"### [{first['hash']}] {first['message']}",
                "",
                f"- **Datum:** {first['date']}",
                f"- **Autor:** {first['author']}",
                "",
            ])
            for mutation in commit_mutations:
                lines.append(f"#### `{mutation['type']}` {mutation['file']}")
                lines.append("")
                diff = mutation["diff"]
                if diff:
                    # The diffed files are Markdown and may contain code
                    # fences themselves; a context line like " ```" would
                    # close a three-backtick fence early. Use a fence that is
                    # longer than any backtick run inside the diff.
                    longest_run = max((len(run) for run in re.findall(r"`+", diff)), default=0)
                    fence = "`" * max(3, longest_run + 1)
                    lines.extend([f"{fence}diff", diff, fence, ""])
        # NOTE(review): the original nesting was not recoverable from the
        # flattened source; the separator is emitted once per trait - confirm.
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def main() -> None:
    """Parse the command line, extract the mutations and write the report.

    Options: ``--since`` (time range for git log), ``--repo`` (repository
    path) and ``--output`` (report path; defaults to
    ``.github/genome/output/raw-mutations.md`` inside the repository).
    """
    parser = argparse.ArgumentParser(description="Genome Engine Extraction")
    parser.add_argument("--since", default="7 days ago", help='Zeitspanne (z.B. "7 days ago")')
    parser.add_argument("--repo", default=".", help="Pfad zum Repository")
    parser.add_argument("--output", default="", help="Output-Pfad (default: .github/genome/output/raw-mutations.md)")
    args = parser.parse_args()

    repo_path = os.path.abspath(args.repo)
    output_path = args.output or os.path.join(repo_path, ".github/genome/output/raw-mutations.md")

    print(f"Genome Extract: Scanning commits since '{args.since}'...")
    mutations = extract_mutations(repo_path, args.since)
    markdown = generate_markdown(mutations, repo_path, args.since)

    # Write the report. dirname is "" when --output is a bare file name, and
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown)

    total_mutations = sum(len(v) for v in mutations.values())
    print()
    print("Extraction abgeschlossen:")
    print(f" Traits: {len(mutations)}")
    print(f" Mutations: {total_mutations}")
    print(f" Output: {output_path}")


if __name__ == "__main__":
    main()