Files
paliad/internal/handlers/paliadin.go
m 3e1f4eee4b fix(t-paliad-155): cold-start timeout headroom + ban DB fallbacks in skill
Shim's run-turn hard timeout: 60s → 120s (PALIADIN_TIMEOUT_S default).
First turn after a fresh tmux session stacks claude boot + skill load
+ MCP discovery + first reasoning, which can blow past 60s before the
response file lands.

Aligned the surrounding timeouts so 120s is actually reachable:
- callShim ctx (paliadin_remote.go): 70s → 130s (shim 120 + 10 SSH).
- runPaliadinTurnAsync handler ctx: 120s → 150s (shim 120 + 10 SSH +
  20 paliad-side overhead).

SKILL.md hard rule #6 added: never fall back to psql / curl PostgREST /
nix-shell — mcp__supabase__execute_sql is the only DB tool. If it's
unavailable, write a short 'DB nicht erreichbar — bitte paliad neu
deployen oder PALIADIN_REMOTE_CWD prüfen' response immediately with
classifier_tag=meta. Saves the 60s-fallback-dance failure mode m hit
on the cwd-misconfig turn.
2026-05-08 13:19:27 +02:00

357 lines
11 KiB
Go

package handlers
// paliadin.go — HTTP/SSE handlers for the Paliadin PoC (t-paliad-146).
//
// Design: docs/design-paliadin-2026-05-07.md §0.5.
//
// Seven routes, all gated to the single Paliadin owner:
// GET /paliadin — chat panel page shell
// POST /api/paliadin/turn — initiate a turn, returns {turn_id, sse_url}
// GET /api/paliadin/stream/{id} — SSE stream of the turn
// POST /api/paliadin/reset — /clear the conversation
// GET /admin/paliadin — monitoring dashboard (owner only, not every global_admin)
// GET /api/admin/paliadin/stats — stats JSON
// GET /api/admin/paliadin/turns — recent turns JSON
//
// Routes register only when the PaliadinService is wired (which only
// happens when PALIADIN_ENABLED=true at boot). On prod, where it's
// false by default, none of these URLs exist — they 404 like any
// unrouted path.
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"sync"
"time"
"github.com/google/uuid"
"mgit.msbls.de/m/paliad/internal/services"
)
// newDetachedContext builds a context that expires after timeout and is
// rooted at context.Background() rather than at an incoming request.
// Work started from a handler can therefore outlive the request that
// triggered it (the turn goroutine keeps running after the POST has
// returned). The caller owns the returned CancelFunc and must call it.
func newDetachedContext(timeout time.Duration) (context.Context, context.CancelFunc) {
	root := context.Background()
	ctx, cancel := context.WithTimeout(root, timeout)
	return ctx, cancel
}
// paliadinSvc is the package-level handle to the Paliadin backend that
// every handler in this file calls into. It is nil until Register()
// assigns it at boot; per the original note it stays nil when
// DATABASE_URL is unset, because the service depends on the audit table.
// While it is nil, requirePaliadinOwner answers every request with 404.
// The concrete type is decided in cmd/server/main.go: local-tmux PoC,
// remote-via-SSH (mRiver), or a disabled stub.
var paliadinSvc services.Paliadin
// requirePaliadinOwner is the access gate shared by every Paliadin
// handler. It reports true only when the service is wired, the request
// is authenticated, and the service confirms the caller is the owner.
//
// When it reports false a response has already been produced, so the
// caller must simply return:
//   - service not wired, or caller is not the owner: 404, so the route
//     looks unrouted rather than forbidden;
//   - ownership lookup failed: 500 with the error text as JSON;
//   - not authenticated: nothing is written here (presumably requireUser
//     has already responded — verify against its definition).
func requirePaliadinOwner(w http.ResponseWriter, r *http.Request) bool {
	if paliadinSvc == nil {
		http.NotFound(w, r)
		return false
	}

	userID, authenticated := requireUser(w, r)
	if !authenticated {
		return false
	}

	isOwner, lookupErr := paliadinSvc.IsOwner(r.Context(), userID)
	switch {
	case lookupErr != nil:
		body := map[string]string{"error": lookupErr.Error()}
		writeJSON(w, http.StatusInternalServerError, body)
	case !isOwner:
		// Deliberately a 404, not a 403: do not confirm the route exists.
		http.NotFound(w, r)
	default:
		return true
	}
	return false
}
// pendingTurns maps a turn ID to the channel that carries that turn's
// events; pendingMu guards every access to the map. handlePaliadinTurn
// registers the channel before it starts the turn goroutine, and
// handlePaliadinStream looks the channel up to relay its events as SSE,
// deleting the entry once it observes the channel closed. PoC scope:
// single user, in-process state. Production v1 would use a
// Postgres-backed queue or pgnotify.
var (
pendingMu sync.Mutex
pendingTurns = map[uuid.UUID]chan turnEvent{}
)
// turnEvent is one event emitted for a turn in flight. handlePaliadinStream
// writes Kind as the SSE "event:" field and the JSON encoding of Data as
// the SSE "data:" field.
type turnEvent struct {
// Kind names the event; this file emits "meta", "content", "end" and "error".
Kind string `json:"kind"` // meta | content | end | error
// Data is the event payload; its keys depend on Kind (see runPaliadinTurnAsync).
Data map[string]any `json:"data"`
}
// turnRequest is the JSON body of POST /api/paliadin/turn, decoded by
// handlePaliadinTurn.
type turnRequest struct {
// UserMessage is the user's text; handlePaliadinTurn rejects an empty value with 400.
UserMessage string `json:"user_message"`
// SessionID is forwarded to the service; when empty, handlePaliadinTurn
// substitutes a freshly generated UUID string.
SessionID string `json:"session_id"`
// PageOrigin is optional and is passed through to the service unchanged.
PageOrigin string `json:"page_origin,omitempty"`
}
// turnResponse is the JSON body handlePaliadinTurn returns on success.
type turnResponse struct {
// TurnID is the string form of the UUID generated for this turn.
TurnID string `json:"turn_id"`
// SSEURL is the stream path for this turn: "/api/paliadin/stream/" followed by TurnID.
SSEURL string `json:"sse_url"`
}
// handlePaliadinPage answers GET /paliadin with the static chat panel
// shell. The owner gate runs first; when it rejects the request it has
// already responded (404 for non-owners), so nothing more is served.
func handlePaliadinPage(w http.ResponseWriter, r *http.Request) {
	if allowed := requirePaliadinOwner(w, r); allowed {
		http.ServeFile(w, r, "dist/paliadin.html")
	}
}
// handleAdminPaliadinPage answers GET /admin/paliadin with the static
// monitoring page. It sits behind the same owner gate as the chat
// panel, so other global_admins are turned away as well.
func handleAdminPaliadinPage(w http.ResponseWriter, r *http.Request) {
	const page = "dist/admin-paliadin.html"

	if allowed := requirePaliadinOwner(w, r); !allowed {
		return
	}
	http.ServeFile(w, r, page)
}
// handlePaliadinTurn starts a turn and replies with the URL of its SSE
// stream; it never waits for the turn itself.
//
// Flow: validate the JSON body (400 on malformed JSON or an empty
// user_message), default the session ID to a fresh UUID, register a
// buffered event channel under a new turn ID, start the turn goroutine,
// and return {turn_id, sse_url}. The client is expected to open an
// EventSource on sse_url right away and read events as they are produced.
func handlePaliadinTurn(w http.ResponseWriter, r *http.Request) {
	if !requirePaliadinOwner(w, r) {
		return
	}
	// The owner gate above has already authenticated this request, so the
	// ok result carries no new information here.
	uid, _ := requireUser(w, r)

	var body turnRequest
	decodeErr := json.NewDecoder(r.Body).Decode(&body)
	switch {
	case decodeErr != nil:
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "invalid JSON"})
		return
	case body.UserMessage == "":
		writeJSON(w, http.StatusBadRequest, map[string]string{"error": "user_message required"})
		return
	}

	sessionID := body.SessionID
	if sessionID == "" {
		sessionID = uuid.New().String()
	}

	// Publish the channel before the goroutine starts so the stream
	// endpoint can find it as soon as the client connects.
	id := uuid.New()
	events := make(chan turnEvent, 16)
	pendingMu.Lock()
	pendingTurns[id] = events
	pendingMu.Unlock()

	// The turn runs on its own context rather than r.Context(): this
	// request completes as soon as the reply below is written, and the
	// turn must not be cancelled along with it.
	job := services.TurnRequest{
		UserID:      uid,
		SessionID:   sessionID,
		UserMessage: body.UserMessage,
		PageOrigin:  body.PageOrigin,
	}
	go runPaliadinTurnAsync(id, job, events)

	idText := id.String()
	writeJSON(w, http.StatusOK, turnResponse{
		TurnID: idText,
		SSEURL: "/api/paliadin/stream/" + idText,
	})
}
// runPaliadinTurnAsync executes one turn and reports its progress on ch.
// It is meant to run in its own goroutine.
//
// Events, in order: a "meta" event (turn ID and start time), then either
// an "error" event (code "tmux_unavailable" or "upstream_error", plus the
// error text) or a "content" event with the full response followed by an
// "end" event with the turn's statistics. ch is always closed on return;
// the SSE handler reads until it sees that close.
//
// The turn runs under a 150 s hard timeout that is independent of the
// originating request, which leaves headroom over the shim's 120 s
// run-turn cap + SSH overhead (t-paliad-155: cold-start safety for
// skill + MCP discovery).
//
// The SSE handler removes the pendingTurns entry only when it observes
// the closed channel. If the client never opens the stream, or drops it
// early, nothing would remove the entry, so a delayed cleanup is
// scheduled here as a backstop.
func runPaliadinTurnAsync(turnID uuid.UUID, req services.TurnRequest, ch chan<- turnEvent) {
	// abandonedTurnTTL is how long a finished turn stays available to a
	// late subscriber before its map entry is dropped.
	const abandonedTurnTTL = 5 * time.Minute

	defer func() {
		// Close first: the SSE handler treats the close as end-of-stream.
		close(ch)
		// Backstop for streams nobody finished reading. Deleting a key
		// the SSE handler has already removed is a harmless no-op.
		time.AfterFunc(abandonedTurnTTL, func() {
			pendingMu.Lock()
			delete(pendingTurns, turnID)
			pendingMu.Unlock()
		})
	}()

	// Send a meta event so the client can show "Paliadin denkt nach …"
	send(ch, turnEvent{
		Kind: "meta",
		Data: map[string]any{
			"turn_id":    turnID.String(),
			"started_at": time.Now().UTC().Format(time.RFC3339),
		},
	})

	ctx, cancel := newDetachedContext(150 * time.Second)
	defer cancel()

	result, err := paliadinSvc.RunTurn(ctx, req)
	if err != nil {
		errCode := "upstream_error"
		if errors.Is(err, services.ErrTmuxUnavailable) {
			errCode = "tmux_unavailable"
		}
		send(ch, turnEvent{
			Kind: "error",
			Data: map[string]any{"code": errCode, "message": err.Error()},
		})
		return
	}

	// One-shot content event with the full body. The frontend simulates
	// streaming with a typewriter effect (cf. design §0.5.5: real
	// chunked streaming would require Claude to write the response file
	// progressively — out of PoC scope).
	send(ch, turnEvent{
		Kind: "content",
		Data: map[string]any{"text": result.Response},
	})
	send(ch, turnEvent{
		Kind: "end",
		Data: map[string]any{
			"turn_id":        turnID.String(),
			"used_tools":     result.UsedTools,
			"rows_seen":      result.RowsSeen,
			"chip_count":     result.ChipCount,
			"classifier_tag": result.ClassifierTag,
			"duration_ms":    result.DurationMS,
		},
	})
}
// handlePaliadinStream is the SSE endpoint the client's EventSource
// subscribes to. It looks up the per-turn channel registered by
// handlePaliadinTurn and relays every event as an SSE frame
// ("event: <kind>" / "data: <JSON of Data>") until the channel is closed
// or the client goes away. A "ping" event is written every 25 s.
//
// Responses before streaming starts: 400 for an unparseable turn ID, 404
// for an unknown one, 500 when the ResponseWriter cannot flush.
func handlePaliadinStream(w http.ResponseWriter, r *http.Request) {
	if !requirePaliadinOwner(w, r) {
		return
	}

	turnID, err := uuid.Parse(r.PathValue("id"))
	if err != nil {
		http.Error(w, "invalid turn_id", http.StatusBadRequest)
		return
	}

	pendingMu.Lock()
	ch, ok := pendingTurns[turnID]
	pendingMu.Unlock()
	if !ok {
		http.Error(w, "unknown turn_id (already finished, or never started)", http.StatusNotFound)
		return
	}

	// Verify streaming support before committing to SSE headers, so the
	// 500 below is not sent with keep-alive / no-buffering headers set.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "streaming unsupported", http.StatusInternalServerError)
		return
	}

	// SSE headers.
	hdr := w.Header()
	hdr.Set("Content-Type", "text/event-stream")
	hdr.Set("Cache-Control", "no-cache, no-transform")
	hdr.Set("Connection", "keep-alive")
	hdr.Set("X-Accel-Buffering", "no") // disable nginx/Traefik buffering

	// Heartbeat ticker keeps reverse proxies from reaping the connection.
	heartbeat := time.NewTicker(25 * time.Second)
	defer heartbeat.Stop()

	clientGone := r.Context().Done()
	for {
		select {
		case <-clientGone:
			// Client closed the connection. The pendingTurns entry is
			// not removed on this path: the turn goroutine may still be
			// running and keeps writing into the buffered channel.
			return

		case <-heartbeat.C:
			fmt.Fprint(w, "event: ping\ndata: {}\n\n")
			flusher.Flush()

		case ev, more := <-ch:
			if !more {
				// Goroutine finished. Tidy up the pending-turns map.
				pendingMu.Lock()
				delete(pendingTurns, turnID)
				pendingMu.Unlock()
				return
			}

			kind := ev.Kind
			payload, encErr := json.Marshal(ev.Data)
			if encErr != nil {
				// An empty data field would be unparseable on the
				// client; surface the failure as an explicit error
				// event instead.
				kind = "error"
				payload = []byte(`{"code":"encode_error","message":"event payload could not be encoded"}`)
			}
			fmt.Fprintf(w, "event: %s\ndata: %s\n\n", kind, payload)
			flusher.Flush()
			// Deliberately no return after "end"/"error": keep reading
			// until the channel closes so the cleanup branch above runs
			// and the client gets a clean EOF.
		}
	}
}
// handlePaliadinReset asks the service to reset the caller's Paliadin
// session. Per the original note this kills the per-user tmux session so
// the next turn boots a fresh claude pane (t-paliad-155) — that happens
// inside ResetSession and is not visible here.
//
// The call runs under a 10 s context that is detached from the request.
// Replies {"ok": true} on success, or 500 with "reset failed: …".
func handlePaliadinReset(w http.ResponseWriter, r *http.Request) {
	if !requirePaliadinOwner(w, r) {
		return
	}
	// Already authenticated by the owner gate; ok is redundant here.
	uid, _ := requireUser(w, r)

	ctx, cancel := newDetachedContext(10 * time.Second)
	defer cancel()

	resetErr := paliadinSvc.ResetSession(ctx, uid)
	if resetErr == nil {
		writeJSON(w, http.StatusOK, map[string]bool{"ok": true})
		return
	}
	failure := map[string]string{"error": "reset failed: " + resetErr.Error()}
	writeJSON(w, http.StatusInternalServerError, failure)
}
// =============================================================================
// /admin/paliadin — monitoring dashboard.
// =============================================================================
// handleAdminPaliadinStats serves the dashboard's aggregate statistics
// as JSON. Owner-gated; a service failure is reported as 500 with the
// error text.
func handleAdminPaliadinStats(w http.ResponseWriter, r *http.Request) {
	if !requirePaliadinOwner(w, r) {
		return
	}
	// Already authenticated by the owner gate; ok is redundant here.
	uid, _ := requireUser(w, r)

	stats, statsErr := paliadinSvc.Stats(r.Context(), uid)
	if statsErr == nil {
		writeJSON(w, http.StatusOK, stats)
		return
	}
	writeJSON(w, http.StatusInternalServerError, map[string]string{"error": statsErr.Error()})
}
// handleAdminPaliadinTurns serves the most recent turn rows as JSON,
// capped at 50 entries. Owner-gated; a service failure is reported as
// 500 with the error text.
func handleAdminPaliadinTurns(w http.ResponseWriter, r *http.Request) {
	// recentTurnLimit is the maximum number of rows requested from the service.
	const recentTurnLimit = 50

	if !requirePaliadinOwner(w, r) {
		return
	}
	// Already authenticated by the owner gate; ok is redundant here.
	uid, _ := requireUser(w, r)

	turns, listErr := paliadinSvc.ListRecentTurns(r.Context(), uid, recentTurnLimit)
	switch {
	case listErr != nil:
		writeJSON(w, http.StatusInternalServerError, map[string]string{"error": listErr.Error()})
	default:
		writeJSON(w, http.StatusOK, turns)
	}
}
// =============================================================================
// helpers.
// =============================================================================
// send offers ev to ch and never blocks: if the channel cannot accept
// the event right now (buffer full, or no buffer and no receiver), the
// event is discarded silently — nothing is logged. PoC scope: with a
// 16-deep buffer and only a handful of events per turn, overflow is very
// unlikely even on the slow Claude-via-tmux path.
func send(ch chan<- turnEvent, ev turnEvent) {
	select {
	default:
		// No room: drop the event rather than stall the turn goroutine.
		// The stream still terminates because the channel is closed
		// when the turn finishes.
		return
	case ch <- ev:
		return
	}
}