refactor(genome): rewrite extraction in Python for proper UTF-8 support

PowerShell auf Windows hat Encoding-Probleme mit Git-Output (Umlaute).
Python 3 handhabt UTF-8 nativ korrekt.
This commit is contained in:
Jens Reinemann 2026-05-18 09:49:37 +02:00
parent 24c6fac0f8
commit 10cb474906
2 changed files with 282 additions and 296 deletions

View file

@ -1,296 +0,0 @@
<#
.SYNOPSIS
Genome Engine Phase 1: Extraction
Extrahiert Mutations aus der Git-History für Copilot-Customization-Dateien.
.DESCRIPTION
Scannt git log für Änderungen im Genome-Scope (.github/skills, agents, prompts, instructions).
Gruppiert Diffs nach Trait und gibt strukturiertes Markdown aus.
.PARAMETER Since
Zeitspanne für git log (z.B. "4 days ago", "2 weeks ago"). Default: "7 days ago"
.PARAMETER RepoPath
Pfad zum Repository. Default: aktuelles Verzeichnis.
.PARAMETER OutputPath
Pfad für die Ausgabedatei. Default: .github/genome/output/raw-mutations.md
.EXAMPLE
.\.github\genome\genome-extract.ps1 -Since "4 days ago"
#>
param(
# Time span handed to `git log --since` (e.g. "4 days ago").
[string]$Since = "7 days ago",
# Path of the repository to scan; the script changes into it before running git.
[string]$RepoPath = ".",
# Target file; when empty, a default below the repository's .github/genome/output is used.
[string]$OutputPath = ""
)
# Fail fast: strict variable/property checks, and non-terminating errors become terminating.
Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"
# --- Configuration ---
# Genome scope: entries ending in "/" are matched as path prefixes by
# Test-InGenomeScope, all other entries must equal the file path.
# The list is also passed to `git log` as pathspecs.
$GenomeScopes = @(
".github/skills/"
".github/agents/"
".github/prompts/"
".github/copilot-instructions.md"
".github/kotlin-conventions.instructions.md"
)
# --- Functions ---
function Get-TraitKey {
    <#
    .SYNOPSIS
    Derives the trait key for a repository-relative file path.
    .DESCRIPTION
    Maps a path to one of "skill/<folder>", "agent/<name>", "prompt/<name>" or
    "instructions/<name>". For prompts, a file named <router>-<sub>.prompt.md is
    attributed to the router when <router>.prompt.md exists below $RepoPath;
    the longest matching prefix wins. Returns $null for any other path.
    .PARAMETER FilePath
    Forward-slash separated path relative to the repository root.
    #>
    param([string]$FilePath)

    # Skills are keyed by the folder directly below .github/skills.
    if ($FilePath -match "^\.github/skills/([^/]+)/") {
        return "skill/" + $Matches[1]
    }

    # Agents are keyed by the file name without the .agent.md suffix.
    if ($FilePath -match "^\.github/agents/(.+)\.agent\.md$") {
        return "agent/" + $Matches[1]
    }

    # Prompts: either a sub-prompt of an existing router or a standalone prompt.
    if ($FilePath -match "^\.github/prompts/(.+)\.prompt\.md$") {
        $promptName = $Matches[1]
        $segments = $promptName -split "-"

        # Walk from the longest proper prefix down to the first segment.
        $prefixLength = $segments.Count - 1
        while ($prefixLength -ge 1) {
            $routerName = $segments[0..($prefixLength - 1)] -join "-"
            $routerFile = Join-Path $RepoPath ".github/prompts/$routerName.prompt.md"
            if (Test-Path $routerFile) {
                return "prompt/$routerName"
            }
            $prefixLength--
        }

        # No router file found: the prompt is its own trait.
        return "prompt/$promptName"
    }

    # Any *.instructions.md below .github.
    if ($FilePath -match "^\.github/(.+)\.instructions\.md$") {
        return "instructions/" + $Matches[1]
    }

    # The central copilot-instructions.md file.
    if ($FilePath -match "^\.github/copilot-instructions\.md$") {
        return "instructions/copilot-instructions"
    }

    return $null
}
function Get-MutationType {
    <#
    .SYNOPSIS
    Maps a git name-status code to a mutation type.
    .PARAMETER Status
    Status column of `git diff-tree --name-status` (A, M, D, R..., etc.).
    .OUTPUTS
    "member-added" for statuses starting with A, "member-removed" for statuses
    starting with D, "content-change" for everything else.
    #>
    param(
        [string]$Status
    )

    if ($Status -match "^A") {
        return "member-added"
    }
    if ($Status -match "^D") {
        return "member-removed"
    }
    return "content-change"
}
function Test-InGenomeScope {
    <#
    .SYNOPSIS
    Tests whether a file path belongs to the genome scope.
    .DESCRIPTION
    Checks the path against every entry of $GenomeScopes: entries ending in "/"
    match when the path starts with them, all other entries must equal the path.
    .PARAMETER FilePath
    Forward-slash separated path relative to the repository root.
    #>
    param([string]$FilePath)

    foreach ($entry in $GenomeScopes) {
        $isDirectoryScope = $entry.EndsWith("/")
        if ($isDirectoryScope -and $FilePath.StartsWith($entry)) {
            return $true
        }
        if (-not $isDirectoryScope -and $FilePath -eq $entry) {
            return $true
        }
    }
    return $false
}
# --- Main logic ---
# Resolve the repository path BEFORE changing directory: a relative -RepoPath
# would otherwise be joined a second time (output path, router lookup in
# Get-TraitKey) and "." would be reported as the repository name.
$RepoPath = (Resolve-Path -LiteralPath $RepoPath).ProviderPath
# PowerShell decodes native command output with [Console]::OutputEncoding.
# Git emits UTF-8, so the console code page must be switched or umlauts in
# commit messages, author names and diffs get corrupted.
$previousOutputEncoding = [Console]::OutputEncoding
Push-Location $RepoPath
try {
    try {
        [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false)
    } catch {
        # Hosts without a real console may reject the change; continue best-effort.
        Write-Warning "Could not switch console output encoding to UTF-8: $($_.Exception.Message)"
    }

    # Determine the output path.
    if (-not $OutputPath) {
        $OutputPath = Join-Path $RepoPath ".github/genome/output/raw-mutations.md"
    }
    Write-Host "Genome Extract: Scanning commits since '$Since'..." -ForegroundColor Cyan

    # Fetch the commits that touch files in the genome scope.
    $logFormat = "--format=%H|%aI|%an|%s"
    $commits = git log $logFormat --since="$Since" -- $GenomeScopes 2>&1
    if (-not $commits -or $LASTEXITCODE -ne 0) {
        Write-Host "Keine Commits im Genome-Scope seit '$Since' gefunden." -ForegroundColor Yellow
        $commits = @()
    }

    # Parse the commits. Key: trait -> value: list of mutation records.
    $mutations = @{}
    foreach ($line in $commits) {
        if (-not $line -or $line -notmatch "\|") { continue }
        # Limit to 4 parts so that "|" inside the subject stays intact.
        $parts = $line -split "\|", 4
        if ($parts.Count -lt 4) { continue }
        $hash = $parts[0]
        $date = $parts[1]
        $author = $parts[2]
        $message = $parts[3]

        # Files changed by this commit; --root makes the initial commit list
        # its files as additions instead of producing no output.
        $diffFiles = git diff-tree --root --no-commit-id -r --name-status $hash 2>&1
        foreach ($diffLine in $diffFiles) {
            if (-not $diffLine -or $diffLine -notmatch "^\w") { continue }
            $diffParts = $diffLine -split "\t", 3
            $status = $diffParts[0]
            $filePath = $diffParts[1]
            # Renames carry "status<TAB>old<TAB>new": use the target path.
            if ($status -match "^R" -and $diffParts.Count -ge 3) {
                $filePath = $diffParts[2]
            }
            # Normalize backslashes to forward slashes.
            $filePath = $filePath -replace "\\", "/"

            if (-not (Test-InGenomeScope $filePath)) { continue }
            $traitKey = Get-TraitKey $filePath
            if (-not $traitKey) { continue }
            $mutationType = Get-MutationType $status

            # Fetch the diff of this file for this commit.
            $diff = git show --format="" --no-color $hash -- $filePath 2>&1
            if ($LASTEXITCODE -ne 0) {
                # Fallback: diff-tree
                $diff = git diff-tree --root -p $hash -- $filePath 2>&1
            }
            $diffText = ($diff | Out-String).Trim()

            if (-not $mutations.ContainsKey($traitKey)) {
                $mutations[$traitKey] = @()
            }
            # PSCustomObject (not a hashtable) so that Group-Object -Property
            # and property access below resolve the fields reliably.
            $mutations[$traitKey] += [pscustomobject]@{
                Hash    = $hash.Substring(0, [Math]::Min(8, $hash.Length))
                Date    = $date
                Author  = $author
                Message = $message
                File    = $filePath
                Type    = $mutationType
                Diff    = $diffText
            }
        }
    }

    # --- Generate output ---
    $sb = [System.Text.StringBuilder]::new()
    [void]$sb.AppendLine("# Raw Mutations")
    [void]$sb.AppendLine("")
    [void]$sb.AppendLine("**Extrahiert:** $(Get-Date -Format 'yyyy-MM-dd HH:mm')")
    [void]$sb.AppendLine("**Zeitraum:** seit $Since")
    [void]$sb.AppendLine("**Repository:** $(Split-Path $RepoPath -Leaf)")
    [void]$sb.AppendLine("**Traits mit Mutations:** $($mutations.Count)")
    [void]$sb.AppendLine("")
    [void]$sb.AppendLine("---")
    [void]$sb.AppendLine("")
    if ($mutations.Count -eq 0) {
        [void]$sb.AppendLine("*Keine Mutations im angegebenen Zeitraum gefunden.*")
    } else {
        # Emit traits sorted by key.
        foreach ($traitKey in ($mutations.Keys | Sort-Object)) {
            $traitMutations = $mutations[$traitKey]
            [void]$sb.AppendLine("## Trait: ``$traitKey``")
            [void]$sb.AppendLine("")
            [void]$sb.AppendLine("| Mutations | Dateien |")
            [void]$sb.AppendLine("|-----------|---------|")
            $uniqueFiles = ($traitMutations | ForEach-Object { $_.File } | Sort-Object -Unique) -join ", "
            [void]$sb.AppendLine("| $($traitMutations.Count) | $uniqueFiles |")
            [void]$sb.AppendLine("")

            # Group by commit (abbreviated hash).
            $byCommit = $traitMutations | Group-Object -Property Hash
            foreach ($commitGroup in $byCommit) {
                $first = $commitGroup.Group[0]
                [void]$sb.AppendLine("### [$($first.Hash)] $($first.Message)")
                [void]$sb.AppendLine("")
                [void]$sb.AppendLine("- **Datum:** $($first.Date)")
                [void]$sb.AppendLine("- **Autor:** $($first.Author)")
                [void]$sb.AppendLine("")
                foreach ($mutation in $commitGroup.Group) {
                    $header = "#### " + '`' + $mutation.Type + '`' + " - " + $mutation.File
                    [void]$sb.AppendLine($header)
                    [void]$sb.AppendLine("")
                    if ($mutation.Diff) {
                        # Cap each diff at 80 lines and note how many were dropped.
                        $diffLines = $mutation.Diff -split [Environment]::NewLine
                        if ($diffLines.Count -gt 80) {
                            $truncMsg = "... ($($diffLines.Count - 80) weitere Zeilen)"
                            $diffLines = $diffLines[0..79] + @($truncMsg)
                        }
                        [void]$sb.AppendLine('```diff')
                        [void]$sb.AppendLine(($diffLines -join [Environment]::NewLine))
                        [void]$sb.AppendLine('```')
                    }
                    [void]$sb.AppendLine("")
                }
            }
            [void]$sb.AppendLine("---")
            [void]$sb.AppendLine("")
        }
    }

    # Write the file. Split-Path yields "" for a bare file name, and
    # Test-Path rejects an empty path, so only create a directory when one is named.
    $outputDir = Split-Path $OutputPath -Parent
    if ($outputDir -and -not (Test-Path $outputDir)) {
        New-Item -ItemType Directory -Path $outputDir -Force | Out-Null
    }
    $sb.ToString() | Set-Content -Path $OutputPath -Encoding UTF8
    Write-Host ""
    Write-Host "Extraction abgeschlossen:" -ForegroundColor Green
    Write-Host " Traits: $($mutations.Count)" -ForegroundColor White
    Write-Host " Mutations: $(($mutations.Values | ForEach-Object { $_.Count } | Measure-Object -Sum).Sum)" -ForegroundColor White
    Write-Host " Output: $OutputPath" -ForegroundColor White
} finally {
    try {
        [Console]::OutputEncoding = $previousOutputEncoding
    } catch {
        # Restoring is best-effort for the same reason as switching above.
    }
    Pop-Location
}

282
.github/genome/genome-extract.py vendored Normal file
View file

@ -0,0 +1,282 @@
#!/usr/bin/env python3
"""
Genome Engine Phase 1: Extraction
Extrahiert Mutations aus der Git-History für Copilot-Customization-Dateien.
Scannt git log für Änderungen im Genome-Scope (.github/skills, agents, prompts, instructions).
Gruppiert Diffs nach Trait und gibt strukturiertes Markdown aus.
Usage:
python .github/genome/genome-extract.py --since "7 days ago"
python .github/genome/genome-extract.py --since "4 days ago" --repo /path/to/repo
"""
import argparse
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime
from pathlib import Path
# --- Configuration ---
# Genome scope: entries ending in "/" are matched as path prefixes by
# is_in_genome_scope(), all other entries must equal the file path.
# The list is also passed to `git log` as pathspecs.
GENOME_SCOPES = [
".github/skills/",
".github/agents/",
".github/prompts/",
".github/copilot-instructions.md",
".github/kotlin-conventions.instructions.md",
]
# Diffs longer than this many lines are cut off in extract_mutations().
MAX_DIFF_LINES = 80
def run_git(*args: str, cwd: str = ".") -> str:
    """Run a git command and return its standard output decoded as UTF-8.

    Args:
        *args: Arguments passed to git (sub-command first).
        cwd: Working directory in which git is executed.

    Returns:
        The captured stdout. Undecodable bytes are replaced rather than
        raising. On a non-zero exit status the (possibly empty) stdout is
        still returned and a warning is written to stderr.
    """
    result = subprocess.run(
        # core.quotepath=off: by default git octal-escapes and quotes
        # non-ASCII path names (e.g. umlauts) in name-status/diff output,
        # which would corrupt the paths this script parses.
        ["git", "-c", "core.quotepath=off", *args],
        cwd=cwd,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    if result.returncode != 0:
        # Do not fail hard (callers treat empty output as "nothing found"),
        # but make the failure visible instead of swallowing it.
        print(
            f"Warning: git {' '.join(args)} exited with code "
            f"{result.returncode}: {result.stderr.strip()}",
            file=sys.stderr,
        )
    return result.stdout
def is_in_genome_scope(filepath: str) -> bool:
    """Return True when ``filepath`` belongs to the genome scope.

    Entries of GENOME_SCOPES ending in "/" match every path that starts
    with them; all other entries must equal the path exactly.
    """
    return any(
        filepath.startswith(entry) if entry.endswith("/") else filepath == entry
        for entry in GENOME_SCOPES
    )
def get_trait_key(filepath: str, repo_path: str) -> str | None:
"""Leitet den Trait-Key aus einem Dateipfad ab."""
# Skills: skill/<ordnername>
m = re.match(r"^\.github/skills/([^/]+)/", filepath)
if m:
return f"skill/{m.group(1)}"
# Agents: agent/<dateiname-ohne-extension>
m = re.match(r"^\.github/agents/(.+)\.agent\.md$", filepath)
if m:
return f"agent/{m.group(1)}"
# Prompts: Standalone oder Verbund
m = re.match(r"^\.github/prompts/(.+)\.prompt\.md$", filepath)
if m:
name = m.group(1)
# Verbund-Erkennung: <router>-<sub>.prompt.md → Trait des Routers
parts = name.split("-")
if len(parts) > 1:
# Versuche progressiv kürzere Präfixe als Router-Name
for i in range(len(parts) - 1, 0, -1):
candidate = "-".join(parts[:i])
router_path = Path(repo_path) / f".github/prompts/{candidate}.prompt.md"
if router_path.exists():
return f"prompt/{candidate}"
# Standalone-Prompt
return f"prompt/{name}"
# Instructions (*.instructions.md)
m = re.match(r"^\.github/(.+)\.instructions\.md$", filepath)
if m:
return f"instructions/{m.group(1)}"
# copilot-instructions.md
if filepath == ".github/copilot-instructions.md":
return "instructions/copilot-instructions"
return None
def get_mutation_type(status: str) -> str:
    """Map a git name-status code to a mutation type.

    Statuses beginning with "A" yield "member-added", those beginning with
    "D" yield "member-removed"; everything else is a "content-change".
    """
    leading = status[:1]
    if leading == "A":
        return "member-added"
    if leading == "D":
        return "member-removed"
    return "content-change"
def extract_mutations(repo_path: str, since: str) -> dict[str, list[dict]]:
    """Collect all genome-scope mutations from the git history.

    Args:
        repo_path: Path of the repository in which git is executed.
        since: Time span passed to ``git log --since`` (e.g. "7 days ago").

    Returns:
        Mapping from trait key to a list of mutation records. Each record has
        the keys "hash" (first 8 characters), "date", "author", "message",
        "file", "type" and "diff" (capped at MAX_DIFF_LINES lines plus a
        truncation note).
    """
    mutations: dict[str, list[dict]] = defaultdict(list)

    # Fields are separated by the ASCII unit separator (%x1f) instead of "|",
    # so an author name containing "|" cannot shift the columns.
    field_sep = "\x1f"
    log_output = run_git(
        "log",
        "--format=%H%x1f%aI%x1f%an%x1f%s",
        f"--since={since}",
        "--",
        *GENOME_SCOPES,
        cwd=repo_path,
    )

    # Split on "\n" only and do not strip(): "\x1f" counts as whitespace for
    # str.strip(), which would eat the separator before an empty subject.
    for line in log_output.split("\n"):
        if field_sep not in line:
            continue
        parts = line.split(field_sep, 3)
        if len(parts) < 4:
            continue
        commit_hash, date, author, message = parts

        # Files changed by this commit; --root makes the initial commit list
        # its files as additions instead of producing no output.
        diff_tree_output = run_git(
            "diff-tree", "--root", "--no-commit-id", "-r", "--name-status", commit_hash,
            cwd=repo_path,
        )
        for diff_line in diff_tree_output.split("\n"):
            if not diff_line or not diff_line[0].isalpha():
                continue
            diff_parts = diff_line.split("\t", 2)
            status = diff_parts[0]
            filepath = diff_parts[1] if len(diff_parts) > 1 else ""
            # Renames carry "status<TAB>old<TAB>new": use the target path.
            if status.startswith("R") and len(diff_parts) >= 3:
                filepath = diff_parts[2]
            # Normalize backslashes to forward slashes.
            filepath = filepath.replace("\\", "/")

            if not is_in_genome_scope(filepath):
                continue
            trait_key = get_trait_key(filepath, repo_path)
            if not trait_key:
                continue
            mutation_type = get_mutation_type(status)

            # Fetch the diff of this file for this commit.
            diff_output = run_git(
                "show", "--format=", "--no-color", commit_hash, "--", filepath,
                cwd=repo_path,
            )
            # Cap the diff and note how many lines were dropped.
            diff_lines = diff_output.strip().split("\n") if diff_output.strip() else []
            if len(diff_lines) > MAX_DIFF_LINES:
                truncated = len(diff_lines) - MAX_DIFF_LINES
                diff_lines = diff_lines[:MAX_DIFF_LINES] + [f"... ({truncated} weitere Zeilen)"]

            mutations[trait_key].append({
                "hash": commit_hash[:8],
                "date": date,
                "author": author,
                "message": message,
                "file": filepath,
                "type": mutation_type,
                "diff": "\n".join(diff_lines),
            })
    return mutations
def generate_markdown(mutations: dict[str, list[dict]], repo_path: str, since: str) -> str:
    """Render the extracted mutations as a Markdown report.

    Args:
        mutations: Mapping from trait key to mutation records with the keys
            "hash", "date", "author", "message", "file", "type" and "diff".
        repo_path: Repository path; its resolved directory name is printed.
        since: Time span that was scanned, printed verbatim.

    Returns:
        The report as a single string with "\\n" line separators. Traits are
        sorted by key; within a trait, mutations are grouped by commit hash in
        first-seen order.
    """
    lines = []
    lines.append("# Raw Mutations")
    lines.append("")
    lines.append(f"**Extrahiert:** {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    lines.append(f"**Zeitraum:** seit {since}")
    lines.append(f"**Repository:** {Path(repo_path).resolve().name}")
    lines.append(f"**Traits mit Mutations:** {len(mutations)}")
    lines.append("")
    lines.append("---")
    lines.append("")

    if not mutations:
        lines.append("*Keine Mutations im angegebenen Zeitraum gefunden.*")
        return "\n".join(lines)

    for trait_key in sorted(mutations.keys()):
        trait_mutations = mutations[trait_key]
        lines.append(f"## Trait: `{trait_key}`")
        lines.append("")
        lines.append("| Mutations | Dateien |")
        lines.append("|-----------|---------|")
        unique_files = sorted(set(m["file"] for m in trait_mutations))
        lines.append(f"| {len(trait_mutations)} | {', '.join(unique_files)} |")
        lines.append("")

        # Group by commit; dict insertion order keeps first-seen order.
        commits_seen: dict[str, list[dict]] = {}
        for m in trait_mutations:
            commits_seen.setdefault(m["hash"], []).append(m)

        for commit_mutations in commits_seen.values():
            first = commit_mutations[0]
            lines.append(f"### [{first['hash']}] {first['message']}")
            lines.append("")
            lines.append(f"- **Datum:** {first['date']}")
            lines.append(f"- **Autor:** {first['author']}")
            lines.append("")
            for mutation in commit_mutations:
                lines.append(f"#### `{mutation['type']}` {mutation['file']}")
                lines.append("")
                if mutation["diff"]:
                    # The scoped files are Markdown, so diffs often contain
                    # ``` themselves; a context line (" ```") would close a
                    # three-backtick fence early. Use a fence longer than the
                    # longest backtick run in the diff (minimum three).
                    backtick_runs = re.findall(r"`+", mutation["diff"])
                    longest_run = max(map(len, backtick_runs), default=0)
                    fence = "`" * max(3, longest_run + 1)
                    lines.append(f"{fence}diff")
                    lines.append(mutation["diff"])
                    lines.append(fence)
                lines.append("")
        lines.append("---")
        lines.append("")
    return "\n".join(lines)
def main():
    """Command-line entry point: extract mutations and write the report.

    Options:
        --since   Time span passed to git (default "7 days ago").
        --repo    Path of the repository (default ".").
        --output  Report file; defaults to
                  <repo>/.github/genome/output/raw-mutations.md.
    """
    parser = argparse.ArgumentParser(description="Genome Engine Extraction")
    parser.add_argument("--since", default="7 days ago", help='Zeitspanne (z.B. "7 days ago")')
    parser.add_argument("--repo", default=".", help="Pfad zum Repository")
    parser.add_argument("--output", default="", help="Output-Pfad (default: .github/genome/output/raw-mutations.md)")
    args = parser.parse_args()

    repo_path = os.path.abspath(args.repo)
    output_path = args.output or os.path.join(repo_path, ".github/genome/output/raw-mutations.md")

    print(f"Genome Extract: Scanning commits since '{args.since}'...")
    mutations = extract_mutations(repo_path, args.since)
    markdown = generate_markdown(mutations, repo_path, args.since)

    # Write the report. A bare file name has an empty dirname, and
    # os.makedirs("") raises, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(markdown)

    total_mutations = sum(len(v) for v in mutations.values())
    print()
    print("Extraction abgeschlossen:")
    print(f" Traits: {len(mutations)}")
    print(f" Mutations: {total_mutations}")
    print(f" Output: {output_path}")


if __name__ == "__main__":
    main()