bollwerk/.github/skills/genome/genome-extract.py
Jens Reinemann ddf9272dda refactor(genome): Terminologie vereinheitlichen – Trait→Capability, Mutation→Improvement, Growth Vector→Insight, Propagation→Transfer
Alle Genome-Engine-Dateien auf lernbasierte Begriffe umgestellt:
- Concept Doc: komplett überarbeitet mit Mermaid-Diagrammen
- genome.prompt.md: neue Dateinamen + Begriffe
- genome-distill.prompt.md: Improvements/Insights statt Mutations/Vectors
- genome-propagate.prompt.md: Transfer statt Propagation, Capability statt Trait
- genome-extract.py: Output-Dateiname + Ausgabetext aktualisiert
- SKILL.md: Beschreibung + Dateitabelle aktualisiert
2026-05-18 12:46:39 +02:00

287 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Genome Engine Phase 1: Extraction

Extracts improvements from the git history of AI tooling files.
Scans git log for changes within the genome scope (.github/skills, agents, prompts, instructions),
groups the diffs by capability and emits structured Markdown.

Usage:
    python .github/skills/genome/genome-extract.py --since "7 days ago"
    python .github/skills/genome/genome-extract.py --since "4 days ago" --repo /path/to/repo
"""
import argparse
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path
# --- Configuration ---
# Paths that make up the genome scope. Entries ending in "/" are treated as
# directory prefixes; every other entry must equal a file path exactly
# (see is_in_genome_scope). The list is also passed to `git log` as pathspecs.
GENOME_SCOPES = [
    ".github/skills/",
    ".github/agents/",
    ".github/prompts/",
    ".github/copilot-instructions.md",
]
# Dynamic pattern for additional instructions files: any
# "<name>.instructions.md" located directly inside .github/ (no sub-folders).
INSTRUCTIONS_PATTERN = re.compile(r"^\.github/[^/]+\.instructions\.md$")
# Maximum number of diff lines kept per file change; longer diffs are cut off
# and a "... (N weitere Zeilen)" marker is appended.
MAX_DIFF_LINES = 80
def run_git(*args: str, cwd: str = ".") -> str:
    """Run a git command and return its standard output as text.

    Args:
        *args: Arguments handed to ``git`` (sub-command first, without the
            leading ``git`` itself).
        cwd: Working directory in which git is executed.

    Returns:
        The command's stdout decoded as UTF-8; undecodable bytes are replaced
        instead of raising. The output is returned even when git exits with a
        non-zero status (it is then usually empty).
    """
    result = subprocess.run(
        ["git", *args],
        cwd=cwd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    if result.returncode != 0:
        # Stay best-effort (callers treat empty output as "nothing found"),
        # but report the failure so that e.g. "not a git repository" is not
        # silently mistaken for "no changes in the given period".
        detail = result.stderr.strip() or "no error output"
        print(
            f"Warning: git {' '.join(args)} exited with status "
            f"{result.returncode}: {detail}",
            file=sys.stderr,
        )
    return result.stdout
def is_in_genome_scope(filepath: str) -> bool:
    """Return True when ``filepath`` belongs to the genome scope.

    Entries of GENOME_SCOPES that end in "/" act as directory prefixes, all
    other entries must equal the path exactly. Paths matching
    INSTRUCTIONS_PATTERN (.github/*.instructions.md) are in scope as well.
    """
    covered_by_static_scope = any(
        filepath.startswith(scope) if scope.endswith("/") else filepath == scope
        for scope in GENOME_SCOPES
    )
    if covered_by_static_scope:
        return True
    # Dynamic part of the scope: .github/*.instructions.md
    return INSTRUCTIONS_PATTERN.match(filepath) is not None
def get_trait_key(filepath: str, repo_path: str) -> str | None:
"""Leitet den Capability-Key aus einem Dateipfad ab."""
# Skills: skill/<ordnername>
m = re.match(r"^\.github/skills/([^/]+)/", filepath)
if m:
return f"skill/{m.group(1)}"
# Agents: agent/<dateiname-ohne-extension>
m = re.match(r"^\.github/agents/(.+)\.agent\.md$", filepath)
if m:
return f"agent/{m.group(1)}"
# Prompts: Standalone oder Verbund
m = re.match(r"^\.github/prompts/(.+)\.prompt\.md$", filepath)
if m:
name = m.group(1)
# Verbund-Erkennung: <router>-<sub>.prompt.md → Trait des Routers
parts = name.split("-")
if len(parts) > 1:
# Versuche progressiv kürzere Präfixe als Router-Name
for i in range(len(parts) - 1, 0, -1):
candidate = "-".join(parts[:i])
router_path = Path(repo_path) / f".github/prompts/{candidate}.prompt.md"
if router_path.exists():
return f"prompt/{candidate}"
# Standalone-Prompt
return f"prompt/{name}"
# Instructions (*.instructions.md)
m = re.match(r"^\.github/(.+)\.instructions\.md$", filepath)
if m:
return f"instructions/{m.group(1)}"
# copilot-instructions.md
if filepath == ".github/copilot-instructions.md":
return "instructions/copilot-instructions"
return None
def get_mutation_type(status: str) -> str:
    """Map a git name-status code to the improvement type.

    Codes starting with "A" yield "member-added", codes starting with "D"
    yield "member-removed"; anything else (including an empty string) yields
    "content-change".
    """
    kinds_by_letter = {"A": "member-added", "D": "member-removed"}
    # Only the first letter matters; status[:1] is "" for an empty status.
    return kinds_by_letter.get(status[:1], "content-change")
def extract_mutations(repo_path: str, since: str) -> dict[str, list[dict]]:
    """Collect all improvements from the git history of ``repo_path``.

    Lists the commits since ``since`` that touch the genome scope, then
    inspects every in-scope file changed by each commit and records its
    (possibly truncated) diff under the file's capability key.

    Args:
        repo_path: Directory in which the git commands are executed.
        since: Value for ``git log --since`` (e.g. "7 days ago").

    Returns:
        Mapping of capability key to a list of records with the keys
        ``hash`` (first 8 characters), ``date``, ``author``, ``message``,
        ``file``, ``type`` and ``diff``. Empty when nothing was found.
    """
    mutations: dict[str, list[dict]] = defaultdict(list)

    # Fields are separated by the ASCII unit separator (%x1f) so that a "|"
    # inside an author name cannot shift the columns.
    field_sep = "\x1f"
    log_output = run_git(
        "log",
        "--format=%H%x1f%aI%x1f%an%x1f%s",
        f"--since={since}",
        "--",
        *GENOME_SCOPES,
        # is_in_genome_scope also accepts .github/*.instructions.md; without
        # this pathspec, commits touching only such files would never be
        # listed. The glob magic keeps "*" from matching across "/".
        ":(glob).github/*.instructions.md",
        cwd=repo_path,
    )
    if not log_output.strip():
        return mutations

    # No str.strip() on the lines: Python treats "\x1f" as whitespace, so
    # stripping would eat the separator in front of an empty subject.
    for line in log_output.split("\n"):
        parts = line.split(field_sep, 3)
        if len(parts) < 4:
            continue
        commit_hash, date, author, message = parts

        # Files changed by this commit. --root makes the initial commit list
        # its files as additions; core.quotepath=false keeps non-ASCII file
        # names (e.g. umlauts) unquoted so they can match the scope.
        diff_tree_output = run_git(
            "-c", "core.quotepath=false",
            "diff-tree", "--root", "--no-commit-id", "-r", "--name-status",
            commit_hash,
            cwd=repo_path,
        )
        for diff_line in diff_tree_output.strip().split("\n"):
            if not diff_line or not diff_line[0].isalpha():
                continue
            diff_parts = diff_line.split("\t", 2)
            status = diff_parts[0]
            filepath = diff_parts[1] if len(diff_parts) > 1 else ""
            # For renames the line carries old and new path: use the target.
            if status.startswith("R") and len(diff_parts) >= 3:
                filepath = diff_parts[2]
            # Normalize path separators.
            filepath = filepath.replace("\\", "/")

            if not is_in_genome_scope(filepath):
                continue
            trait_key = get_trait_key(filepath, repo_path)
            if not trait_key:
                continue

            # Fetch the diff of this single file within the commit.
            diff_output = run_git(
                "show", "--format=", "--no-color", commit_hash, "--", filepath,
                cwd=repo_path,
            )
            # Truncate long diffs and note how many lines were dropped.
            stripped_diff = diff_output.strip()
            diff_lines = stripped_diff.split("\n") if stripped_diff else []
            if len(diff_lines) > MAX_DIFF_LINES:
                truncated = len(diff_lines) - MAX_DIFF_LINES
                diff_lines = diff_lines[:MAX_DIFF_LINES] + [
                    f"... ({truncated} weitere Zeilen)"
                ]

            mutations[trait_key].append({
                "hash": commit_hash[:8],
                "date": date,
                "author": author,
                "message": message,
                "file": filepath,
                "type": get_mutation_type(status),
                "diff": "\n".join(diff_lines),
            })
    return mutations
def generate_markdown(mutations: dict[str, list[dict]], repo_path: str, since: str) -> str:
    """Render the extracted improvements as a Markdown document.

    Args:
        mutations: Mapping of capability key to improvement records; each
            record provides ``hash``, ``date``, ``author``, ``message``,
            ``file``, ``type`` and ``diff``.
        repo_path: Repository path; only the name of its resolved directory
            appears in the header.
        since: Time span text echoed in the header.

    Returns:
        The Markdown text: a header, then one section per capability (sorted
        by key) with a summary table and the diffs grouped by commit.
    """
    lines = [
        "# Raw Improvements",
        "",
        f"**Extrahiert:** {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        f"**Zeitraum:** seit {since}",
        f"**Repository:** {Path(repo_path).resolve().name}",
        f"**Capabilities mit Improvements:** {len(mutations)}",
        "",
        "---",
        "",
    ]

    if not mutations:
        lines.append("*Keine Improvements im angegebenen Zeitraum gefunden.*")
        return "\n".join(lines)

    for trait_key in sorted(mutations):
        trait_mutations = mutations[trait_key]
        unique_files = sorted({m["file"] for m in trait_mutations})
        lines.append(f"## Capability: `{trait_key}`")
        lines.append("")
        lines.append("| Improvements | Dateien |")
        lines.append("|-------------|---------|")
        lines.append(f"| {len(trait_mutations)} | {', '.join(unique_files)} |")
        lines.append("")

        # Group the records by commit, keeping first-seen order.
        by_commit: dict[str, list[dict]] = {}
        for m in trait_mutations:
            by_commit.setdefault(m["hash"], []).append(m)

        for commit_mutations in by_commit.values():
            first = commit_mutations[0]
            lines.append(f"### [{first['hash']}] {first['message']}")
            lines.append("")
            lines.append(f"- **Datum:** {first['date']}")
            lines.append(f"- **Autor:** {first['author']}")
            lines.append("")
            for mutation in commit_mutations:
                lines.append(f"#### `{mutation['type']}` {mutation['file']}")
                lines.append("")
                diff = mutation["diff"]
                if diff:
                    # The diffs come from Markdown files and often contain
                    # ``` lines themselves; a context line like " ```" would
                    # close a three-backtick fence early. A closing fence must
                    # be at least as long as the opening one, so open with one
                    # backtick more than the longest run inside the diff.
                    longest_run = max(
                        (len(run) for run in re.findall(r"`+", diff)), default=0
                    )
                    fence = "`" * max(3, longest_run + 1)
                    lines.append(f"{fence}diff")
                    lines.append(diff)
                    lines.append(fence)
                    lines.append("")
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def main() -> None:
    """Command-line entry point: extract improvements and write the report.

    Parses ``--since``, ``--repo`` and ``--output``, runs the extraction,
    writes the generated Markdown to the output file (UTF-8) and prints a
    short summary to stdout.
    """
    parser = argparse.ArgumentParser(description="Genome Engine Extraction")
    parser.add_argument("--since", default="7 days ago", help='Zeitspanne (z.B. "7 days ago")')
    parser.add_argument("--repo", default=".", help="Pfad zum Repository")
    parser.add_argument("--output", default="", help="Output-Pfad (default: .github/genome/output/raw-improvements.md)")
    args = parser.parse_args()

    repo_path = os.path.abspath(args.repo)
    output_path = args.output or os.path.join(repo_path, ".github/genome/output/raw-improvements.md")

    print(f"Genome Extract: Scanning commits since '{args.since}'...")
    mutations = extract_mutations(repo_path, args.since)
    markdown = generate_markdown(mutations, repo_path, args.since)

    # Write the output. A bare file name such as "--output raw.md" has an
    # empty dirname, and os.makedirs("") raises FileNotFoundError, so the
    # directory is only created when there is one to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown)

    total_mutations = sum(len(v) for v in mutations.values())
    print()
    print("Extraction abgeschlossen:")
    print(f" Capabilities: {len(mutations)}")
    print(f" Improvements: {total_mutations}")
    print(f" Output: {output_path}")


# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()