diff --git a/Dockerfile b/Dockerfile index c75a6a3..e21898b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ COPY . . RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /paliad ./cmd/server FROM alpine:3.21 -RUN apk add --no-cache ca-certificates +RUN apk add --no-cache ca-certificates openssh-client WORKDIR /app COPY --from=backend /paliad /app/paliad COPY --from=frontend /app/frontend/dist /app/dist diff --git a/cmd/server/main.go b/cmd/server/main.go index f05db7d..32c9ca0 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -2,10 +2,13 @@ package main import ( "context" + "fmt" "log" "net/http" "os" + "os/exec" "os/signal" + "strconv" "syscall" // Embed Go's IANA tz database into the binary so time.LoadLocation works @@ -165,20 +168,34 @@ func main() { CardLayout: services.NewCardLayoutService(pool), } - // t-paliad-146 — Paliadin PoC. Always wired when DATABASE_URL - // is set; the per-request handler gate (requirePaliadinOwner) - // restricts access to the single owner email - // (services.PaliadinOwnerEmail). All other authenticated users - // get a 404 — the route effectively does not exist for them. - // On hosts without tmux + the `claude` CLI (e.g. the Dokploy - // container), the owner gate still applies; if m ever hits the - // route from such a host, the service returns "tmux unavailable" - // without ever invoking shell-out. 
- tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION") - responseDir := os.Getenv("PALIADIN_RESPONSE_DIR") - svcBundle.Paliadin = services.NewPaliadinService(pool, users, tmuxSession, responseDir) - log.Printf("paliadin: wired (owner=%s; gate is per-request, not per-deploy)", - services.PaliadinOwnerEmail) + // Paliadin backend selection (t-paliad-146 + t-paliad-151): + // PALIADIN_REMOTE_HOST set → RemotePaliadinService (ssh to mRiver) + // else: local tmux available → LocalPaliadinService (PoC path) + // else: DisabledPaliadinService (handlers still 404 for non-owners + // via the gate; for m, RunTurn returns ErrPaliadinDisabled + // which surfaces as a friendly error). + // + // All three implement services.Paliadin; the per-request handler + // gate (requirePaliadinOwner) is unchanged and applies to every + // backend. + if remoteHost := os.Getenv("PALIADIN_REMOTE_HOST"); remoteHost != "" { + cfg, err := buildPaliadinRemoteConfig(remoteHost) + if err != nil { + log.Fatalf("paliadin: remote config: %v", err) + } + svcBundle.Paliadin = services.NewRemotePaliadinService(pool, users, cfg) + log.Printf("paliadin: remote mode → ssh %s@%s:%d (owner=%s)", + cfg.SSHUser, cfg.SSHHost, cfg.SSHPort, services.PaliadinOwnerEmail) + } else if _, err := exec.LookPath("tmux"); err == nil { + tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION") + responseDir := os.Getenv("PALIADIN_RESPONSE_DIR") + svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir) + log.Printf("paliadin: local tmux mode (owner=%s)", services.PaliadinOwnerEmail) + } else { + svcBundle.Paliadin = services.NewDisabledPaliadinService(pool, users) + log.Printf("paliadin: disabled (no PALIADIN_REMOTE_HOST, no local tmux; owner=%s)", + services.PaliadinOwnerEmail) + } // Wire ApprovalService into the entity services so Create / Update / // Complete / Delete consult paliad.approval_policies (t-paliad-138). 
// Without this wiring, the policies and request tables exist but no @@ -217,3 +234,83 @@ func main() { log.Fatal(err) } } + +// buildPaliadinRemoteConfig assembles a RemotePaliadinConfig from +// environment variables, materialising the SSH private key and +// known_hosts blobs into chmod-600/644 tmpfiles for OpenSSH to read. +// +// The blobs travel as Dokploy secrets (multi-line env vars). We never +// persist them to disk — tmpfiles live for the process lifetime in +// /tmp and disappear on container restart. Re-creating them every boot +// is fine; the keys themselves rotate independently via Dokploy +// secret updates. +// +// Required: PALIADIN_REMOTE_HOST, PALIADIN_SSH_PRIVATE_KEY, PALIADIN_KNOWN_HOSTS. +// Optional: PALIADIN_REMOTE_USER (default "m"), PALIADIN_REMOTE_PORT +// (default 22022 — bypasses Tailscale SSH on :22, see design §4.5). +func buildPaliadinRemoteConfig(host string) (services.RemotePaliadinConfig, error) { + cfg := services.RemotePaliadinConfig{ + SSHHost: host, + SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"), + SSHPort: 22022, + } + if p := os.Getenv("PALIADIN_REMOTE_PORT"); p != "" { + n, err := strconv.Atoi(p) + if err != nil || n <= 0 || n > 65535 { + return cfg, fmt.Errorf("PALIADIN_REMOTE_PORT %q: not a valid port", p) + } + cfg.SSHPort = n + } + + keyPath, err := writeSecretFile("paliadin-id_ed25519-", os.Getenv("PALIADIN_SSH_PRIVATE_KEY"), 0o600) + if err != nil { + return cfg, fmt.Errorf("PALIADIN_SSH_PRIVATE_KEY: %w", err) + } + if keyPath == "" { + return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_SSH_PRIVATE_KEY empty") + } + cfg.SSHKeyPath = keyPath + + knownHostsPath, err := writeSecretFile("paliadin-known_hosts-", os.Getenv("PALIADIN_KNOWN_HOSTS"), 0o644) + if err != nil { + return cfg, fmt.Errorf("PALIADIN_KNOWN_HOSTS: %w", err) + } + if knownHostsPath == "" { + return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_KNOWN_HOSTS empty") + } + cfg.KnownHostsPath = knownHostsPath + + return cfg, 
nil +} + +// writeSecretFile writes blob to a tmpfile with the given mode and +// returns its path. Returns ("", nil) when blob is empty so callers +// can distinguish "not set" from real I/O errors. +func writeSecretFile(prefix, blob string, mode os.FileMode) (string, error) { + if blob == "" { + return "", nil + } + f, err := os.CreateTemp("", prefix+"*") + if err != nil { + return "", err + } + if _, err := f.WriteString(blob); err != nil { + _ = f.Close() + _ = os.Remove(f.Name()) + return "", err + } + if err := f.Close(); err != nil { + return "", err + } + if err := os.Chmod(f.Name(), mode); err != nil { + return "", err + } + return f.Name(), nil +} + +func cmpOr(s, fallback string) string { + if s != "" { + return s + } + return fallback +} diff --git a/docs/design-paliadin-tailscale-ssh-2026-05-07.md b/docs/design-paliadin-tailscale-ssh-2026-05-07.md new file mode 100644 index 0000000..c8bfdac --- /dev/null +++ b/docs/design-paliadin-tailscale-ssh-2026-05-07.md @@ -0,0 +1,677 @@ +# Paliadin: route prod via Tailscale SSH to mRiver + +**Issue:** m/paliad#12 — t-paliad-151 +**Date:** 2026-05-07 +**Author:** noether (inventor) +**Supersedes nothing.** Extends `docs/design-paliadin-2026-05-07.md` (the Phase 0 PoC) with a third deployment path between "laptop-only PoC" and "Anthropic API direct". +**Related:** t-paliad-146 (PoC ship), t-paliad-150 (`friendlyErrorMessage` pattern). + +--- + +## 1. Goal + +Make Paliadin reachable from `paliad.de` (Dokploy on mLake) without losing m's Claude Code subscription, by routing each turn over Tailscale + SSH from the paliad container to mRiver, where the existing long-lived `tmux` + `claude` PoC keeps running. + +**Non-goals (v1):** + +- Multi-host failover. +- Encryption beyond SSH-over-tailnet (already E2E-encrypted by Tailscale's WireGuard layer). +- Anthropic API fallback when mRiver is offline — show a friendly error instead. +- Wake-on-LAN of mRiver. +- Multi-tenant or multi-firm variants. + +--- + +## 2. 
Live state — what was verified before designing + +A design built on stale facts rots fast. These were probed on 2026-05-07, not assumed from CLAUDE.md or memory: + +| Fact | How verified | Result | +|---|---|---| +| mRiver = `100.99.98.203`, has tmux + claude | this worker runs on mRiver; `tmux -V` → `tmux 3.6a`; `which claude` → `/home/m/.local/bin/claude` | confirmed | +| mLake (`100.99.98.201`) has Tailscale running | `ssh m@mlake tailscale status` | confirmed; mRiver visible as `active; direct [2a02:4780:41:3fbc::1]:41641` | +| paliad container Dockerfile is alpine:3.21 minimal, no SSH, no tailscaled | `Dockerfile` | confirmed (only `ca-certificates`) | +| paliad compose runs default Docker bridge (no `network_mode`) | `docker-compose.yml` | confirmed | +| mRiver has no `~/.ssh/authorized_keys` yet | `ls ~/.ssh/` | confirmed — file must be created in Phase A | +| `/tmp/paliadin/` does not exist on mRiver yet | `ls /tmp/paliadin` | confirmed — created on first turn (paliadin.go:185 `os.MkdirAll`) | +| `paliad-paliadin` tmux session is not currently running on mRiver | `tmux ls` | not present; the existing PoC creates it on demand | + +**Implication for design:** the paliad container needs new infrastructure on three axes — network reachability of the tailnet, an SSH client + identity, and a service-layer code path that talks to a remote tmux instead of a local one. Each axis is its own sub-design below. + +--- + +## 3. Locked decisions (m, 2026-05-07 22:35) + +m made four design-shaping calls via the inventor's `AskUserQuestion` pass. They are recorded here verbatim because every downstream choice in §4–§6 follows from them. 
+ +| # | Question | m's choice | +|---|---|---| +| 1 | Container Tailscale shape | **`network_mode: host` on paliad** | +| 2 | SSH-to-mRiver protocol granularity | **Server-side `paliadin-shim` (one RPC per turn)** | +| 3 | Routing trigger | **Env var `PALIADIN_REMOTE_HOST` + interface split** | +| 4 | SSH private key storage | **Dokploy secret env var `PALIADIN_SSH_PRIVATE_KEY`** | +| 5 | SSH port to bypass Tailscale SSH | **Port 22022 via `ssh.socket` drop-in (Phase A finding, 23:30)** | + +Decision (1) was *not* the inventor's recommendation — host mode has known interaction risk with traefik (§4.2). m is overriding the recommendation; this design accepts the call and codifies a Phase A test step that gates the rollout on traefik still working under host mode. If Phase A blows up, the fallback is to revisit (1) in a follow-up issue, not to silently swap to a sidecar. + +Decision (5) emerged during Phase A: Tailscale SSH on mRiver was found to intercept `:22` from tailnet peers and bypass OpenSSH's `authorized_keys` entirely (banner says "Tailscale", auth method "none"). The `command=` shim restriction therefore never fires on the standard port. Adding port 22022 via a `systemd ssh.socket` drop-in routes paliad's connections to real OpenSSH where the restriction works. m's interactive `tailscale ssh m@mriver` on `:22` stays untouched. See §4.5 for the implementation. + +--- + +## 4. Sub-design A — Container Tailscale shape + +### 4.1 Shape: `network_mode: host` + +paliad's container shares mLake's network namespace. `tailscale0` (mLake's tailnet interface) is directly visible from inside the container. Outbound `ssh m@100.99.98.203` reaches mRiver over the tailnet without any sidecar, userspace tailscaled, SOCKS proxy, or auth-key flow inside the container. + +```yaml +# docker-compose.yml diff +services: + web: + build: . + network_mode: host # NEW + # remove: expose: ["8080"] # host mode means port is on the host directly + environment: + - PORT=8080 + ... 
+ # NEW Paliadin remote-routing knobs + - PALIADIN_REMOTE_HOST=${PALIADIN_REMOTE_HOST} # 100.99.98.203 + - PALIADIN_REMOTE_PORT=${PALIADIN_REMOTE_PORT} # 22022 (bypasses Tailscale SSH, see §4.5) + - PALIADIN_REMOTE_USER=${PALIADIN_REMOTE_USER} # m + - PALIADIN_SSH_PRIVATE_KEY=${PALIADIN_SSH_PRIVATE_KEY} + - PALIADIN_KNOWN_HOSTS=${PALIADIN_KNOWN_HOSTS} # one-line ssh-keyscan -p 22022 output + restart: unless-stopped +``` + +### 4.2 Trade-off accepted: traefik routing under host mode + +paliad.de's TLS is provided by Dokploy's traefik on the `dokploy-network` overlay. With `network_mode: host`, paliad is no longer attached to that overlay. Two failure modes are possible: + +- **(M1)** traefik can't discover the service via Docker DNS → 502 at the edge. +- **(M2)** traefik routes via host loopback (`http://127.0.0.1:8080` or `host.docker.internal`) and works fine. + +Recent Dokploy versions configure traefik with both `loadbalancer.server.url` and Docker labels; (M2) is the documented host-mode path. **Phase A explicitly tests this** (§7) before any code is written; if (M1) materialises, the design rolls back to the sidecar variant of decision 1 in a follow-up issue. + +Other host-mode side-effects to flag in operations: + +- paliad listens on host port 8080 directly. Any other compose service binding 8080 conflicts. +- paliad's outbound DNS uses host resolver (no Docker-internal `web` etc.). Currently fine: paliad's only network deps are external (Supabase, SMTP, GitHub raw). No service on `dokploy-network` is referenced by name. +- The container can reach **every** Tailscale node, not just mRiver. Mitigations live in §5 (key restriction) and §5.2 (`from=` clause on mRiver authorized_keys). + +### 4.3 Dockerfile diff + +```dockerfile +# Final stage adds the SSH client only. Tailscale is provided by the host. 
+FROM alpine:3.21 +RUN apk add --no-cache ca-certificates openssh-client # +openssh-client (~1MB) +WORKDIR /app +COPY --from=backend /paliad /app/paliad +COPY --from=frontend /app/frontend/dist /app/dist +EXPOSE 8080 +CMD ["/app/paliad"] +``` + +Image-size delta: alpine `openssh-client` is ~1.1 MB compressed — negligible. No tailscaled, no entrypoint script, no extra processes inside the container. + +### 4.4 What does NOT change + +- No Tailscale auth-key inside paliad. The container inherits the host's tailnet binding, so there is no per-container Tailscale identity to rotate. mLake's existing Tailscale auth is the only one in scope. +- No tailscaled process inside the container. +- No new sidecar container. + +### 4.5 Bypassing Tailscale SSH via port 22022 (Phase A discovery) + +**Phase A revealed** that Tailscale SSH on mRiver intercepts `:22` from tailnet peers before OpenSSH sees the connection. The SSH banner reads `SSH-2.0-Tailscale`, the verbose log shows `Authenticated using "none"`, and the `authorized_keys command=` directive is therefore inert. mRiver's `tailscale status --json` confirms the `https://tailscale.com/cap/ssh` capability is enabled. + +The fix: a separate listening port for the paliad route, where Tailscale SSH does not intercept and real OpenSSH handles auth. + +mRiver uses systemd socket activation for sshd (`/usr/lib/systemd/system/ssh.socket` binds `:22`). Setting `Port 22022` in `sshd_config` is **ignored** under socket activation — listen ports come from the socket unit, not sshd's own config. The correct change is a drop-in: + +```ini +# /etc/systemd/system/ssh.socket.d/paliad.conf +[Socket] +ListenStream=0.0.0.0:22022 +ListenStream=[::]:22022 +``` + +Followed by `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (still routed through Tailscale SSH for m's interactive use) and `:22022` (real OpenSSH) end up listening. The same sshd binary handles both — same host key, same `authorized_keys`, same sshd_config. 
The only difference is *which port* a peer dials. + +A failed first attempt (2026-05-07 23:07) added the drop-in while a stale `Port 22022` directive in `sshd_config.d/99-paliad-test.conf` was still bound — the resulting `Address already in use` took `ssh.socket` down for ~30 s until reverted. Lesson: clean any prior `Port` directives out of `sshd_config.d/*.conf` before retrying the socket drop-in. + +Phase A end-to-end test (2026-05-07 23:31) succeeded with port 22022: + +- `ssh -p 22022 -i paliad-prod-key m@100.99.98.203 health` → `ok` +- `run-turn <turn_id> <base64-message>` → 3.4 s round-trip including a Claude-Code response +- `from="100.99.98.201"` correctly rejected a connection sourced from mRiver itself (`Permission denied (publickey,password)`) + +--- + +## 5. Sub-design B — SSH identity, restricted shim, host-key pinning + +### 5.1 Identity: dedicated ed25519 keypair `paliad-prod` + +One keypair, generated once on mRiver during Phase A, used by every paliad-prod deploy: + +```bash +# On mRiver (Phase A bootstrap): +ssh-keygen -t ed25519 -N "" -C "paliad-prod $(date +%Y-%m-%d)" -f /tmp/paliad-prod-key +# Public key → mRiver authorized_keys (see 5.2) +# Private key → Dokploy secret store as PALIADIN_SSH_PRIVATE_KEY +shred -u /tmp/paliad-prod-key # only the encrypted/secret-stored copies survive +``` + +Rotation: regenerate, push public key to mRiver authorized_keys, update Dokploy secret, redeploy. No code change needed — paliad's startup re-reads the env var on every boot. + +The private key is delivered to the container as a multi-line env var. 
At process start, paliad writes it to a tmpfile so OpenSSH can use it: + +```go +// cmd/server/main.go (sketch) +func loadPaliadinSSHKey() (string, error) { + blob := os.Getenv("PALIADIN_SSH_PRIVATE_KEY") + if blob == "" { return "", nil } // remote mode disabled + f, err := os.CreateTemp("", "paliadin-id_ed25519-") + if err != nil { return "", err } + if err := os.Chmod(f.Name(), 0o600); err != nil { return "", err } + if _, err := f.WriteString(blob); err != nil { return "", err } + if err := f.Close(); err != nil { return "", err } + return f.Name(), nil // path passed to RemotePaliadinService +} +``` + +The tmpfile lives at `/tmp/paliadin-id_ed25519-` for the container's lifetime. On container restart, a fresh tmpfile is written. We never persist the key to a volume. + +### 5.2 mRiver `authorized_keys` entry + +``` +command="/home/m/.local/bin/paliadin-shim",no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc,from="100.99.98.201" ssh-ed25519 AAAA...PUBKEY... paliad-prod +``` + +Each restriction matters: + +- `command=` — every `ssh m@mriver …` invocation runs the shim regardless of what the client asked for. The client's requested command is exposed as `$SSH_ORIGINAL_COMMAND` for the shim to dispatch on. +- `no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc` — defence-in-depth: even if someone steals the key and bypasses the shim's argument validation, they can't get an interactive shell, can't tunnel ports, can't pivot via agent forwarding. +- `from="100.99.98.201"` — only accept connections from mLake's tailnet IP. Defends against the "container has full tailnet visibility" host-mode side-effect from §4.2: if the key leaks off mLake, it can't be replayed from another tailnet host. + +### 5.3 Host-key pinning + +`StrictHostKeyChecking=accept-new` is too loose for a long-lived production identity (one-time MITM during first connect substitutes a different key forever). 
Instead: + +- During Phase A, run `ssh-keyscan -p 22022 -t ed25519 100.99.98.203` on mLake. +- Capture the single output line. The host-key portion is identical to the `:22` entry — same sshd, same keys — but the `[100.99.98.203]:22022` prefix matters because OpenSSH's `known_hosts` is `host:port`-keyed for non-22 ports. +- Store as Dokploy secret `PALIADIN_KNOWN_HOSTS`. +- At container startup, write to `/tmp/paliadin-known_hosts` chmod 644. +- Pass to OpenSSH via `-o UserKnownHostsFile=/tmp/paliadin-known_hosts -o StrictHostKeyChecking=yes`. + +If mRiver's host key ever rotates (rare; only on disk wipe / fresh OS), Phase A runs again and the secret is updated. SSH refuses to connect with a clear "host key changed" error, which surfaces as `mriver_unreachable` to the user — exactly the right blast-radius (loud failure, no silent connect to a substitute host). + +### 5.4 The shim — `paliadin-shim` + +A bash script on mRiver at `/home/m/.local/bin/paliadin-shim`. It is the **only** thing the paliad-prod key is allowed to invoke, and it dispatches on `$SSH_ORIGINAL_COMMAND`. Three RPCs: + +```bash +#!/bin/bash +# paliadin-shim — server-side RPC for paliad's remote-tmux turns. +# Invoked via authorized_keys command= with $SSH_ORIGINAL_COMMAND set. +set -euo pipefail +umask 077 + +readonly TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}" +readonly RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}" +readonly TIMEOUT_S=60 +readonly TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$' + +mkdir -p "$RESPONSE_DIR" + +# Parse $SSH_ORIGINAL_COMMAND. Format: " …" +read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}" +verb="${argv[0]:-}" + +ensure_pane() { + if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then + tmux new-session -d -s "$TMUX_SESSION" + fi + # Find or create the @paliadin-scope=chat window. 
+ local target="" + while read -r idx; do + scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true) + if [[ "$scope" == "chat" ]]; then target="$TMUX_SESSION:$idx"; break; fi + done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}') + if [[ -z "$target" ]]; then + idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude) + target="$TMUX_SESSION:$idx" + # Wait for claude to settle (60s bound; matches Go waitForPaneReady). + for _ in $(seq 1 120); do + pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true) + if [[ "$pane" == *"❯"* || "$pane" == *"│"* ]]; then break; fi + sleep 0.5 + done + tmux set-window-option -t "$target" @paliadin-scope chat + tmux set-window-option -t "$target" @fix-name claude-paliadin + # Bootstrap system prompt — reuses the Go service's prompt text. + # The Go side sends this via the `bootstrap` RPC on first turn instead + # of duplicating the prompt here. See §6.4. + fi + echo "$target" +} + +case "$verb" in + health) + # Liveness check — used by paliad to short-circuit when mRiver is offline. + # Returns "ok" iff tmux + claude are reachable. + tmux has-session -t "$TMUX_SESSION" 2>/dev/null \ + || tmux new-session -d -s "$TMUX_SESSION" + command -v claude >/dev/null && echo ok || { echo no-claude; exit 1; } + ;; + + bootstrap) + # First-turn-only: ensure pane exists and inject the system prompt. + # $1 = base64-encoded prompt body (avoids quoting hell). + target=$(ensure_pane) + prompt=$(printf '%s' "${argv[1]:?missing prompt}" | base64 -d) + tmux send-keys -t "$target" -l -- "$prompt" + tmux send-keys -t "$target" Enter + sleep 2 # give claude a moment to absorb + echo ok + ;; + + run-turn) + # $1 = turn_id (UUID); $2 = base64-encoded user message. 
+ turn_id="${argv[1]:?missing turn_id}" + [[ "$turn_id" =~ $TURN_ID_RE ]] || { echo >&2 "bad turn_id"; exit 2; } + msg=$(printf '%s' "${argv[2]:?missing message}" | base64 -d) + target=$(ensure_pane) + out="$RESPONSE_DIR/$turn_id.txt" + rm -f "$out" + # Envelope matches what paliadin_prompt.go expects. + tmux send-keys -t "$target" -l -- "[PALIADIN:$turn_id] $msg" + tmux send-keys -t "$target" Enter + # Poll for the response file. Same shape as Go pollForResponse. + for _ in $(seq 1 $((TIMEOUT_S * 5))); do + if [[ -s "$out" ]]; then + sleep 0.05 # settle + cat "$out" + rm -f "$out" + exit 0 + fi + sleep 0.2 + done + echo >&2 "paliadin: response timeout after ${TIMEOUT_S}s" + exit 124 + ;; + + reset) + # /clear the conversation; next turn starts fresh. + target=$(ensure_pane) + tmux send-keys -t "$target" -l -- "/clear" + tmux send-keys -t "$target" Enter + echo ok + ;; + + *) + echo >&2 "paliadin-shim: unknown verb '$verb'" + exit 2 + ;; +esac +``` + +Why a shim instead of raw tmux-over-SSH: + +- One SSH round-trip per turn (~50 ms over tailnet) vs ~10–20 round-trips for the granular pattern. +- Argument validation lives in one place (UUID regex on turn_id, base64 for messages, fixed verb list) — easier to audit than a regex over `$SSH_ORIGINAL_COMMAND` matching `tmux send-keys …`. +- mRiver-side concerns (response polling, settle delays, pane-readiness) stay on mRiver, which is where the tmux state lives. The Go service stops caring about local file polling at all. + +--- + +## 6. Sub-design C — Service-layer integration, routing, reliability + +### 6.1 Interface split + +The current `*PaliadinService` becomes an interface with two implementations: `LocalPaliadinService` (the existing tmux code, renamed) and `RemotePaliadinService` (the new SSH code). Construction picks one at startup based on `PALIADIN_REMOTE_HOST`. 
+ +```go +// internal/services/paliadin.go (after refactor) + +type Paliadin interface { + RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) + ResetSession(ctx context.Context) error + ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) + Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) + IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) +} + +// LocalPaliadinService wraps the current tmux PoC (laptop / dev path). +type LocalPaliadinService struct { /* identical to today's PaliadinService */ } + +// RemotePaliadinService talks to a paliadin-shim over SSH on mRiver. +type RemotePaliadinService struct { + db *sqlx.DB + users *UserService + sshHost string // 100.99.98.203 + sshPort int // 22022 — bypasses Tailscale SSH on :22 (see §4.5) + sshUser string // m + sshKeyPath string // /tmp/paliadin-id_ed25519- + knownHosts string // /tmp/paliadin-known_hosts + turnMu sync.Mutex + + // Health-check cache. + healthMu sync.Mutex + healthOK bool + healthCheckedAt time.Time +} +``` + +DB access (`ListRecentTurns`, `Stats`, `IsOwner`) is identical for both — they only read `paliad.paliadin_turns`. They live in a shared `paliadinDB` helper struct embedded in both implementations. 
+ +### 6.2 Wiring at startup + +```go +// cmd/server/main.go (excerpt) +var paliadin services.Paliadin +remoteHost := os.Getenv("PALIADIN_REMOTE_HOST") +switch { +case remoteHost != "": + keyPath, err := loadPaliadinSSHKey() + if err != nil { log.Fatalf("paliadin: load ssh key: %v", err) } + if keyPath == "" { log.Fatalf("paliadin: PALIADIN_REMOTE_HOST set but no PALIADIN_SSH_PRIVATE_KEY") } + knownHosts, err := loadPaliadinKnownHosts() + if err != nil { log.Fatalf("paliadin: load known_hosts: %v", err) } + port, _ := strconv.Atoi(cmpOr(os.Getenv("PALIADIN_REMOTE_PORT"), "22022")) + paliadin = services.NewRemotePaliadinService(db, userSvc, services.RemotePaliadinConfig{ + SSHHost: remoteHost, + SSHPort: port, + SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"), + SSHKeyPath: keyPath, + KnownHostsPath: knownHosts, + }) + log.Printf("paliadin: remote mode → ssh %s@%s:%d", "m", remoteHost, port) +case localTmuxAvailable(): + paliadin = services.NewLocalPaliadinService(db, userSvc, "", "") + log.Printf("paliadin: local tmux mode") +default: + paliadin = services.NewDisabledPaliadinService(db, userSvc) + log.Printf("paliadin: disabled (no remote host, no local tmux)") +} +``` + +`NewDisabledPaliadinService` exists today implicitly via the `ErrTmuxUnavailable` path; making it explicit gives the constructor a clear name and the handler doesn't have to special-case `nil`. 
+ +### 6.3 SSH invocation pattern + +`RemotePaliadinService` runs every RPC through the same helper: + +```go +func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) { + sshArgs := []string{ + "-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config + "-i", s.sshKeyPath, + "-p", strconv.Itoa(s.sshPort), // 22022 — bypasses Tailscale SSH on :22 + "-o", "IdentitiesOnly=yes", // don't fall back to other keys + "-o", "UserKnownHostsFile=" + s.knownHostsPath, + "-o", "StrictHostKeyChecking=yes", + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=3", + "-o", "ServerAliveInterval=10", + "-o", "ServerAliveCountMax=3", + s.sshUser + "@" + s.sshHost, + "--", + } + sshArgs = append(sshArgs, args...) + c, cancel := context.WithTimeout(ctx, 70*time.Second) // shim has its own 60s; +10s for SSH overhead + defer cancel() + cmd := exec.CommandContext(c, "ssh", sshArgs...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout; cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("paliadin: ssh shim %v: %w (stderr: %s)", args, err, stderr.String()) + } + return stdout.Bytes(), nil +} +``` + +`RunTurn` becomes: + +```go +func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { + s.turnMu.Lock() + defer s.turnMu.Unlock() + + if err := s.healthGate(ctx); err != nil { + return nil, err // ErrMRiverUnreachable, picked up by handler + } + + turnID := uuid.New() + started := time.Now().UTC() + if err := s.insertTurnRow(ctx, …); err != nil { return nil, err } + + // First-turn-only: bootstrap the system prompt on mRiver. Detected by + // checking whether any prior turn for this user has succeeded. 
+ if err := s.ensureBootstrapped(ctx); err != nil { + _ = s.markTurnError(ctx, turnID, "bootstrap_failed") + return nil, err + } + + msg := sanitiseForTmux(req.UserMessage) + msgB64 := base64.StdEncoding.EncodeToString([]byte(msg)) + body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64) + if err != nil { + _ = s.markTurnError(ctx, turnID, classifySSHError(err)) + return nil, err + } + + // Same trailer-parse + audit-row writes as Local, factored into shared helper. + return s.completeTurnFromBody(ctx, turnID, started, string(body)) +} +``` + +### 6.4 System prompt bootstrap + +The local PoC calls `paliadinSystemPrompt(s.responseDir)` once when it creates the pane. The remote path needs the same hook. Two options that don't require duplicating the German prompt body to mRiver: + +- **Lazy bootstrap (chosen):** the first `RunTurn` after a paliad-prod restart sends the system prompt via `bootstrap` RPC, then runs the actual turn. Subsequent turns skip the bootstrap. State is per-process: `RemotePaliadinService.bootstrapped` boolean guarded by mutex. +- Eager bootstrap at startup is rejected — it forces every container start to wait for mRiver to be online, which couples paliad's boot to mRiver's availability. + +Lazy bootstrap means the very first turn after a paliad redeploy pays a ~3 s extra cost (claude pane spin-up + system prompt absorb). Acceptable for a single-user PoC. + +### 6.5 Health-check gating (`mriver_unreachable`) + +Every `RunTurn` first calls `healthGate(ctx)`: + +- Cached for 10 s. If last check was <10 s ago and was OK, skip the probe. +- Otherwise: `s.callShim(ctx, "health")` with a 3 s timeout. On success, set cache OK; on failure, return `ErrMRiverUnreachable`. + +Why 10 s: short enough that "I just woke my laptop" propagates inside one user retry; long enough that a busy chat doesn't probe on every turn. 
+ +```go +var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable") + +func (s *RemotePaliadinService) healthGate(ctx context.Context) error { + s.healthMu.Lock() + defer s.healthMu.Unlock() + if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second { + return nil + } + c, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + out, err := s.callShim(c, "health") + s.healthCheckedAt = time.Now() + if err != nil || strings.TrimSpace(string(out)) != "ok" { + s.healthOK = false + return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err) + } + s.healthOK = true + return nil +} +``` + +### 6.6 Friendly error code (extends t-paliad-150) + +`friendlyErrorMessage` already maps `tmux_unavailable` to a localised message. We add one new code: + +- `mriver_unreachable` → DE: *"mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit `./paliad`."* / EN: *"mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with `./paliad`."* + +Implementation: one new `case` in the SSE-error switch in `frontend/src/client/paliadin.ts`'s `friendlyErrorMessage`, plus matching i18n keys (`paliadin.error.mriver_unreachable.de` / `.en`). Server-side: `paliadin` HTTP handler maps `errors.Is(err, services.ErrMRiverUnreachable)` to `event: error\ndata: {"code":"mriver_unreachable","message":"..."}\n\n`. + +### 6.7 Rate limit + +A runaway loop on the paliad side could DOS the SSH connection. Cheapest cap: enforce one in-flight turn at a time via `turnMu` (already exists in the local PoC). On top of that, a rolling cap of N=20 turns/min in `RemotePaliadinService` rejects with `ErrRateLimited` (mapped to a friendly `paliadin.error.rate_limited`). PoC has one user (m); the cap is a paranoid safety, not a real throttle. + +### 6.8 What about ControlMaster? + +Decision-2's chosen path (server-side shim with one RPC per turn) makes ControlMaster optional. 
The shim collapses ~10 raw-tmux ops into a single SSH connect — that's already the latency win ControlMaster would buy. + +Adding it on top would save ~30–50 ms per turn but adds: + +- A persistent `~/.ssh/cm-*` socket inside the container. +- Cleanup logic on shutdown. +- A subtle interaction with the SSH BatchMode + ConnectTimeout settings. + +Verdict: skip ControlMaster in v1. If turn latency over Tailscale is measured >300 ms in practice and hot enough to matter, add it in a follow-up; the call site is one helper. + +--- + +## 7. Phasing + +### Phase A — manual proof-of-concept (no Dockerfile change yet) + +Goal: validate the round-trip end-to-end on a deployed paliad, before touching the image. + +**Phase A.0 (DONE 2026-05-07 23:31):** SSH+shim end-to-end on the tailnet. + +1. ✅ **Generate keypair** on mRiver: `ssh-keygen -t ed25519 -N "" -C "paliad-prod" -f ~/.paliad-staging/paliad-prod-key`. Fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`. +2. ✅ **Commit shim** to `scripts/paliadin-shim` and **install** at `/home/m/.local/bin/paliadin-shim`, `chmod 755`. +3. ✅ **Write authorized_keys** with public key + `command=`/`from="100.99.98.201"`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions (§5.2). +4. ✅ **Add port 22022 socket drop-in** at `/etc/systemd/system/ssh.socket.d/paliad.conf`, `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (Tailscale SSH for m) and `:22022` (real OpenSSH for paliad) listening (§4.5). +5. ✅ **Capture mRiver:22022 host key**: `ssh-keyscan -p 22022 -t ed25519 100.99.98.203 > ~/.paliad-staging/known_hosts` from mLake. Fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`. +6. 
✅ **Smoke-test from mLake** (without paliad container, just raw ssh from mLake's host shell): + ``` + ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \ + -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \ + -p 22022 m@100.99.98.203 health + → ok + ssh … run-turn $(uuidgen) "$(printf 'Sag …' | base64 -w0)" + → "test ok" (3.4 s round-trip including a real Claude response) + ``` +7. ✅ **from= rejection verified**: the same key from mRiver itself (`100.99.98.203`) → `Permission denied (publickey,password)` as expected. + +**Phase A.5 (PENDING m's hands):** validate `network_mode: host` + traefik routing on prod paliad.de. + +- Branch the live `docker-compose.yml` on a temp branch. +- Add `network_mode: host` to the `web` service; remove `expose: ["8080"]`. +- Push to trigger a Dokploy redeploy. +- `curl --connect-timeout 5 -sSI https://paliad.de/` — expect 200 (or login redirect), NOT 502. +- If 502: revert the temp branch (`git revert HEAD && git push`); revisit decision 1 in a follow-up issue. +- If 200: keep the host-mode change; ready for Phase B. + +This is **m's call to execute** — it briefly touches prod paliad.de. Inventor/coder should not flip prod compose without explicit go-ahead. Rollback is one revert + redeploy. + +**Phase A.6 (after A.5 passes):** smoke-test SSH from inside the paliad-prod container itself (the real container, not just the mLake host shell): +``` +docker exec -it <paliad-container> sh +apk add --no-cache openssh-client # one-shot, before Dockerfile change +ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \ + -o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \ + -p 22022 m@100.99.98.203 health +# expected: "ok" +``` +This proves the container's host-mode networking actually delivers a tailnet connect. + +**Phase A.7:** wire env vars manually via Dokploy UI for one deploy; confirm `/paliadin` chat works against mRiver from paliad.de.
+ +If A.5 fails: the design rolls back to a sidecar in a new issue (decision 1 follow-up). The SSH path (A.0) and traefik path (A.5) are independent — A.0 is already proven; only A.5+ is at risk. + +### Phase B — bake into Dockerfile + Dokploy secrets + +1. Dockerfile: add `openssh-client` to the final stage (§4.3). +2. compose: add `network_mode: host` and the five new env vars (§4.1). +3. Dokploy secrets: register `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m`, `PALIADIN_SSH_PRIVATE_KEY=...`, `PALIADIN_KNOWN_HOSTS=...`. +4. Code: refactor `PaliadinService` to the interface split (§6.1–§6.2). New file `internal/services/paliadin_remote.go`. Tests: `paliadin_remote_test.go` mocks `callShim` to verify `RunTurn` audit-row writes, error mapping, and `healthGate` caching. +5. Ship under one PR; tag t-paliad-151 done. + +### Phase C — friendly errors + monitoring + +1. `paliadin.error.mriver_unreachable` i18n keys + `friendlyErrorMessage` case (§6.6). +2. `/admin/paliadin` shows last health-probe result + last successful turn timestamp. +3. Optional: `mai-mesh` integration to surface mRiver-offline events to m on Telegram (out-of-band; not gating). + +--- + +## 8. Security review summary + +| Risk | Mitigation | +|---|---| +| Stolen private key → arbitrary SSH on mRiver | `command=` shim restriction + `from="100.99.98.201"` + ed25519 key + private key only in Dokploy secret store (encrypted at rest); paliad route uses port 22022 where real OpenSSH enforces all of the above | +| Stolen private key → tailnet-wide SSH from non-mLake host | `from="100.99.98.201"` clause (verified: rejected from mRiver itself in Phase A.0) | +| Tailscale SSH on `:22` bypasses `authorized_keys` | The paliad-prod key's `command=` restriction is not enforced on `:22`. Mitigation: paliad always dials `:22022`, which is real OpenSSH. m's interactive `tailscale ssh m@mriver` on `:22` continues to be governed by Tailscale ACLs, separate from paliad's identity.
| +| Container compromise → key extraction | Key written to tmpfile chmod 600, only root inside container can read; alpine container has no shell-on-error trampolines | +| Host-key MITM during connect | Pinned `known_hosts`; `StrictHostKeyChecking=yes` | +| Shim argument injection (e.g. via `run-turn $(rm -rf /)`) | Shim parses positional args from `$SSH_ORIGINAL_COMMAND` via `read -r -a`; never passes args to a subshell `eval`; turn_id validated by UUID regex; message body always base64-decoded into a single shell variable, never re-evaluated | +| Runaway loop → SSH flood | Single-flight `turnMu` + 20/min rolling cap | +| `network_mode: host` widens blast radius | The `command=` + `from=` restrictions on mRiver mean container compromise = "can run shim verbs against mRiver only", not "shell on mRiver" | +| PaliadinOwnerEmail bypass | Unchanged from PoC: gate is in Go (`/paliadin` 404s for any other user). Even if mRiver SSH key leaks, attacker still needs paliad session as the owner (`services.PaliadinOwnerEmail`, i.e. `matthias.siebels@hoganlovells.com`). | + +--- + +## 9. Out-of-scope clarifications (for review) + +These were called out in the issue but the design intentionally does not solve them, to keep v1 tight. Each is acknowledged so review knows it wasn't an oversight: + +- **Wake-on-LAN of mRiver:** out of scope. v1's UX when mRiver is asleep is the friendly error from §6.6. Future work: integrate with `mai-mesh` capability fallback. +- **Multi-host failover:** out of scope. Only mRiver is targeted. +- **Anthropic API fallback when mRiver offline:** out of scope per CLAUDE.md (`ANTHROPIC_API_KEY` reserved for production-v1, unused in PoC). +- **ControlMaster:** v1 ships without; revisit if turn latency >300 ms in practice (§6.8). + +--- + +## 10. File-level deliverables (for the coder shift) + +When this design is approved and the coder shift starts, the work splits roughly into: + +- `Dockerfile` — `+openssh-client`.
+- `docker-compose.yml` — `network_mode: host`, five new env entries (`PALIADIN_REMOTE_HOST`, `PALIADIN_REMOTE_PORT`, `PALIADIN_REMOTE_USER`, `PALIADIN_SSH_PRIVATE_KEY`, `PALIADIN_KNOWN_HOSTS`). +- `internal/services/paliadin.go` — extract `Paliadin` interface; rename existing to `LocalPaliadinService`; pull DB-only methods (`ListRecentTurns`, `Stats`, `IsOwner`) into a shared embedded `paliadinDB` so both implementations get them for free. +- `internal/services/paliadin_remote.go` — new file: `RemotePaliadinService`, `RemotePaliadinConfig` (with `SSHPort`), `callShim`, `healthGate`, `ensureBootstrapped`, `classifySSHError`, `ErrMRiverUnreachable`. +- `internal/services/paliadin_remote_test.go` — unit tests with a mocked `callShim`. +- `cmd/server/main.go` — env-var-based wiring (§6.2), `loadPaliadinSSHKey`, `loadPaliadinKnownHosts`, `PALIADIN_REMOTE_PORT` parse with default `22022`. +- `frontend/src/client/paliadin.ts` — one `case` in `friendlyErrorMessage` for `mriver_unreachable`. +- `frontend/src/client/i18n.ts` — two new keys (`paliadin.error.mriver_unreachable.de` / `.en`). +- `scripts/paliadin-shim` — server-side script (§5.4); already shipped + installed on mRiver during Phase A.0, not part of any container. Repo location chosen so the security-relevant script is version-controlled. +- `docs/project-status.md` — note Phase 0.5 (PoC) → Phase 0.6 (Tailscale-SSH prod route). +- **mRiver host setup (one-time, already done in Phase A.0):** `/etc/systemd/system/ssh.socket.d/paliad.conf` (port 22022 listen drop-in); `~/.ssh/authorized_keys` (paliad-prod public key with restrictions); `/home/m/.local/bin/paliadin-shim` (executable). These are NOT in the repo because they live on m's laptop; `docs/project-status.md` should reference them. + +No DB migrations needed — `paliad.paliadin_turns` schema already covers everything (`error_code` field already accepts free-form codes including `mriver_unreachable`). + +--- + +## 11.
Open questions for review + +- **Q (m), still open:** Phase A.5 (traefik+host-mode on prod paliad.de) is not yet executed. m drives this; rollback is one revert. Dokploy doc check before flipping is recommended but not blocking. +- **Q (m), resolved 2026-05-07 23:50:** shim location → repo (`scripts/paliadin-shim`, committed in `0248411`). Version-controlled and auditable. +- **Q (m), still open:** `ANTHROPIC_API_KEY` env var reservation in compose comments — keep for production-v1, or strip now? Not blocking either phase; defer. + +--- + +## 12. Phase A.0 completion summary (2026-05-07 23:50) + +**Coder shift (noether) executed Phase A.0 in full:** + +1. ✅ shim committed at `scripts/paliadin-shim` (commit `0248411`, repo-version-controlled) +2. ✅ shim installed at `/home/m/.local/bin/paliadin-shim` on mRiver +3. ✅ ed25519 keypair `paliad-prod` generated, public-key fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`, private key staged at `~/.paliad-staging/paliad-prod-key` on mRiver (mode 600) +4. ✅ `~/.ssh/authorized_keys` written with `command=`/`from=`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions +5. ✅ `ssh.socket` drop-in installed at `/etc/systemd/system/ssh.socket.d/paliad.conf`; both `:22` and `:22022` listening +6. ✅ host key for `:22022` captured at `~/.paliad-staging/known_hosts` (fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`) +7. ✅ end-to-end SSH+shim+Claude run-turn validated from mLake → mRiver:22022 (3.4 s round-trip) +8. 
✅ `from="100.99.98.201"` rejection verified + +**Three secrets ready for Dokploy registration** (m to copy from `~/.paliad-staging/` on mRiver): +- `PALIADIN_SSH_PRIVATE_KEY` ← `cat ~/.paliad-staging/paliad-prod-key` +- `PALIADIN_KNOWN_HOSTS` ← `cat ~/.paliad-staging/known_hosts` +- `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m` + +**Phase A.5 (traefik+host-mode test) and Phase A.6/A.7 (in-container SSH smoke + paliad/paliadin end-to-end) await m's hands** — they touch prod paliad.de. + +**Phase B (Dockerfile + Go interface split + Dokploy secrets) is unblocked from a code perspective** — but should not merge until Phase A.5 confirms the host-mode networking trade-off is acceptable. + +--- + +**Inventor design + coder Phase A.0 complete.** Awaiting m for Phase A.5 traefik validation before the coder writes the Go interface split. diff --git a/frontend/src/client/i18n.ts b/frontend/src/client/i18n.ts index 97fd575..dbe7214 100644 --- a/frontend/src/client/i18n.ts +++ b/frontend/src/client/i18n.ts @@ -1558,6 +1558,10 @@ const translations: Record> = { "paliadin.stop": "Stop", "paliadin.reset": "Neue Unterhaltung", "paliadin.error.local_only": "Paliadin läuft nur lokal. Diese Instanz hat kein tmux/claude installiert — lokal mit ./paliad starten.", + "paliadin.error.mriver_unreachable": "mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit ./paliad.", + "paliadin.error.shim_auth_failed": "Paliadin-Authentifizierung fehlgeschlagen. SSH-Schlüssel oder Berechtigung auf mRiver prüfen.", + "paliadin.error.shim_error": "Paliadin-Fehler auf mRiver. tmux/claude-Pane prüfen.", + "paliadin.error.timeout": "Paliadin antwortet nicht (Timeout 60s). 
Nochmal versuchen.", "paliadin.error.connection_lost": "Verbindung verloren.", "paliadin.error.upstream": "Fehler beim Senden.", "nav.admin.paliadin": "Paliadin Monitor", @@ -3553,6 +3557,10 @@ const translations: Record> = { "paliadin.stop": "Stop", "paliadin.reset": "New conversation", "paliadin.error.local_only": "Paliadin only runs locally. This instance has no tmux/claude installed — start it locally via ./paliad.", + "paliadin.error.mriver_unreachable": "mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with ./paliad.", + "paliadin.error.shim_auth_failed": "Paliadin auth failed. Check the SSH key or authorized_keys on mRiver.", + "paliadin.error.shim_error": "Paliadin error on mRiver. Check the tmux/claude pane.", + "paliadin.error.timeout": "Paliadin didn't respond in time (60s). Try again.", "paliadin.error.connection_lost": "Connection lost.", "paliadin.error.upstream": "Send failed.", "nav.admin.paliadin": "Paliadin Monitor", diff --git a/frontend/src/client/paliadin.ts b/frontend/src/client/paliadin.ts index acde8d1..d3c174e 100644 --- a/frontend/src/client/paliadin.ts +++ b/frontend/src/client/paliadin.ts @@ -210,8 +210,24 @@ function friendlyErrorMessage(data: unknown): string { } try { const parsed = JSON.parse(data) as { code?: string }; - if (parsed.code === "tmux_unavailable") { - return t("paliadin.error.local_only"); + switch (parsed.code) { + case "tmux_unavailable": + // Local PoC path: paliad is running on a host without tmux/claude + // (typically the legacy laptop-only build). + return t("paliadin.error.local_only"); + case "mriver_unreachable": + // t-paliad-151: prod path's mRiver is offline (laptop asleep, off + // tailnet, or paliadin-shim missing). + return t("paliadin.error.mriver_unreachable"); + case "shim_auth_failed": + // SSH key wrong or authorized_keys drifted. 
+ return t("paliadin.error.shim_auth_failed"); + case "shim_error": + case "bootstrap_failed": + // Generic remote shim failure or system-prompt bootstrap error. + return t("paliadin.error.shim_error"); + case "timeout": + return t("paliadin.error.timeout"); } } catch { // Not JSON — fall through to the generic connection-lost message diff --git a/frontend/src/i18n-keys.ts b/frontend/src/i18n-keys.ts index ad85ce8..706ab7e 100644 --- a/frontend/src/i18n-keys.ts +++ b/frontend/src/i18n-keys.ts @@ -1423,6 +1423,10 @@ export type I18nKey = | "paliadin.empty" | "paliadin.error.connection_lost" | "paliadin.error.local_only" + | "paliadin.error.mriver_unreachable" + | "paliadin.error.shim_auth_failed" + | "paliadin.error.shim_error" + | "paliadin.error.timeout" | "paliadin.error.upstream" | "paliadin.heading" | "paliadin.input.placeholder" diff --git a/internal/handlers/handlers.go b/internal/handlers/handlers.go index 213b4c1..0f0e673 100644 --- a/internal/handlers/handlers.go +++ b/internal/handlers/handlers.go @@ -69,10 +69,12 @@ type Services struct { Pin *services.PinService CardLayout *services.CardLayoutService - // Paliadin is wired only when PALIADIN_ENABLED=true at boot - // (PoC; m's laptop only). On prod it stays nil and all /paliadin* - // routes 404 because Register() skips registering them. - Paliadin *services.PaliadinService + // Paliadin is wired when DATABASE_URL is set. The concrete backend + // is picked in cmd/server/main.go based on PALIADIN_REMOTE_HOST + // (remote → mRiver via SSH) or local tmux availability. Stays nil + // without DATABASE_URL; in that case the per-request handler gate + // 404s anyway. 
+ Paliadin services.Paliadin } func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc *Services) { diff --git a/internal/handlers/paliadin.go b/internal/handlers/paliadin.go index 68c0bd5..c521bca 100644 --- a/internal/handlers/paliadin.go +++ b/internal/handlers/paliadin.go @@ -39,10 +39,11 @@ func newDetachedContext(timeout time.Duration) (context.Context, context.CancelF return context.WithTimeout(context.Background(), timeout) } -// paliadinSvc is the live PaliadinService instance. nil when -// DATABASE_URL was unset (the service depends on the audit table). -// Set by Register() at boot. -var paliadinSvc *services.PaliadinService +// paliadinSvc is the live Paliadin backend. nil when DATABASE_URL was +// unset (the service depends on the audit table). Set by Register() at +// boot. The concrete type is decided in cmd/server/main.go: local-tmux +// PoC, remote-via-SSH (mRiver), or a disabled stub. +var paliadinSvc services.Paliadin // requirePaliadinOwner gates every paliadin handler to the single // owner email (services.PaliadinOwnerEmail = m). Anyone else gets a diff --git a/internal/services/paliadin.go b/internal/services/paliadin.go index 1641197..afc9019 100644 --- a/internal/services/paliadin.go +++ b/internal/services/paliadin.go @@ -1,23 +1,23 @@ package services -// PaliadinService — Phase 0 PoC of the in-app AI buddy (t-paliad-146). +// Paliadin — the in-app AI buddy. Two implementations of the same +// interface, picked at boot time (see cmd/server/main.go): // -// Design: docs/design-paliadin-2026-05-07.md §0.5 (PoC track). +// - LocalPaliadinService — talks to a `claude` CLI in a local tmux +// session. The PoC path (t-paliad-146); used on m's laptop. +// - RemotePaliadinService — shells out to ssh on mRiver where the +// long-lived tmux+claude pane lives. The prod path (t-paliad-151); +// used by the paliad.de Dokploy container, which has no `claude` +// CLI of its own. 
// -// Architecture: a long-lived `claude` process inside a tmux session. -// Prompts go in via `tmux send-keys -l`; responses come back via a -// per-turn file the system prompt instructs Claude to write -// (Write(/tmp/paliadin/{turn_id}.txt)). The service polls that file, -// strips the [paliadin-meta] trailer block, parses the metadata, writes -// an audit row, and emits the response back to the SSE handler. +// Designs: +// - docs/design-paliadin-2026-05-07.md (PoC architecture) +// - docs/design-paliadin-tailscale-ssh-2026-05-07.md (remote routing) // -// The architecture is lifted (with adaptation to Go) from -// ~/dev/mVoice/server.py:250-380, which has been driving the goldi voice -// surface in production since 2026-Q1. -// -// PoC ONLY runs on m's laptop (PALIADIN_ENABLED=false on prod default). -// Hardcoded single-user, single-tmux-window scope. Do not attempt to -// deploy this to the Dokploy container — there is no `claude` CLI there. +// Both implementations share the audit-table I/O (paliadinDB) and the +// trailer parser. The conversation state (turn ordering, response file +// polling) is split: Local owns the tmux pane directly; Remote delegates +// to the paliadin-shim on mRiver and reads the file there. import ( "bytes" @@ -50,12 +50,36 @@ import ( // path to enabling Paliadin. const PaliadinOwnerEmail = "matthias.siebels@hoganlovells.com" -// PaliadinService manages the tmux-claude PoC. -type PaliadinService struct { - db *sqlx.DB +// Paliadin is the interface every Paliadin backend implements. Two +// production implementations: LocalPaliadinService (local tmux+claude) +// and RemotePaliadinService (ssh+paliadin-shim on mRiver). A +// DisabledPaliadinService stub is constructed when neither is available +// so callers don't have to nil-check on every entry point. 
+type Paliadin interface { + RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) + ResetSession(ctx context.Context) error + ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) + Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) + IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) +} + +// paliadinDB is the audit-table read/write surface shared by every +// Paliadin implementation. Embedded in LocalPaliadinService and +// RemotePaliadinService so they inherit IsOwner / ListRecentTurns / +// Stats and the per-turn row writers without duplication. +type paliadinDB struct { + db *sqlx.DB + users *UserService +} + +// LocalPaliadinService runs the local tmux+claude PoC (t-paliad-146). +// Hardcoded single-user, single-tmux-window scope. Used on m's laptop; +// not deployed to prod (the Dokploy container has no `claude` CLI — +// see RemotePaliadinService for that path). +type LocalPaliadinService struct { + paliadinDB tmuxSession string responseDir string - users *UserService // Cached pane target ("session:window-idx") once the voice window is // either discovered or created. Reset to "" if the pane dies. @@ -74,7 +98,7 @@ type PaliadinService struct { // // Returns (false, nil) for any other user — including unknown UUIDs and // users without an email row. Errors only on DB failure. -func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) { +func (s *paliadinDB) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) { var email string err := s.db.QueryRowxContext(ctx, `SELECT email FROM paliad.users WHERE id = $1`, userID).Scan(&email) @@ -87,19 +111,19 @@ func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool, return strings.EqualFold(email, PaliadinOwnerEmail), nil } -// NewPaliadinService wires the service. Call only when PALIADIN_ENABLED=true. 
-func NewPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *PaliadinService { +// NewLocalPaliadinService wires the local-tmux PoC backend. Falls back +// to default tmux session + response dir when env vars are empty. +func NewLocalPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *LocalPaliadinService { if tmuxSession == "" { tmuxSession = "paliad-paliadin" } if responseDir == "" { responseDir = "/tmp/paliadin" } - return &PaliadinService{ - db: db, + return &LocalPaliadinService{ + paliadinDB: paliadinDB{db: db, users: users}, tmuxSession: tmuxSession, responseDir: responseDir, - users: users, } } @@ -156,7 +180,7 @@ var ErrTmuxUnavailable = errors.New("paliadin: tmux unavailable") // // PoC: serialised. The package-level turnMu enforces "one at a time". // m is the only user, so this is fine. -func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { +func (s *LocalPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { s.turnMu.Lock() defer s.turnMu.Unlock() @@ -238,7 +262,7 @@ func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnRe // ResetSession sends `/clear` to the Claude pane so the next turn starts // from a clean conversation. Used by the "New conversation" button. -func (s *PaliadinService) ResetSession(ctx context.Context) error { +func (s *LocalPaliadinService) ResetSession(ctx context.Context) error { s.mu.Lock() target := s.paneTarget s.mu.Unlock() @@ -254,7 +278,7 @@ func (s *PaliadinService) ResetSession(ctx context.Context) error { // ListRecentTurns reads the last N turns visible to the caller. // global_admin sees everything; everyone else sees their own. 
-func (s *PaliadinService) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) { +func (s *paliadinDB) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) { if limit <= 0 || limit > 200 { limit = 50 } @@ -302,7 +326,7 @@ type PaliadinPromptCount struct { // Stats computes the dashboard aggregate. global_admin sees everything; // everyone else sees their own slice (PoC has only m, but the policy // matches RLS on the table). -func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) { +func (s *paliadinDB) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) { stats := &PaliadinStats{ ByClassifier: map[string]int{}, DailyCounts: []PaliadinDailyCount{}, @@ -404,7 +428,7 @@ func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*Palia // ensurePane returns the tmux target ("session:window-idx") of the live // Claude pane, creating both session and window if missing. 
-func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) { +func (s *LocalPaliadinService) ensurePane(ctx context.Context) (string, error) { s.mu.Lock() defer s.mu.Unlock() @@ -468,7 +492,7 @@ func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) { return target, nil } -func (s *PaliadinService) findChatWindow(ctx context.Context) string { +func (s *LocalPaliadinService) findChatWindow(ctx context.Context) string { out, err := runTmuxOut(ctx, "list-windows", "-t", s.tmuxSession, "-F", "#{window_index}") if err != nil { @@ -485,14 +509,14 @@ func (s *PaliadinService) findChatWindow(ctx context.Context) string { return "" } -func (s *PaliadinService) paneAlive(ctx context.Context, target string) bool { +func (s *LocalPaliadinService) paneAlive(ctx context.Context, target string) bool { if err := runTmux(ctx, "has-session", "-t", target); err != nil { return false } return true } -func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error { +func (s *LocalPaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { select { @@ -509,7 +533,7 @@ func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, t return fmt.Errorf("pane %s not ready within %s", target, timeout) } -func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) error { +func (s *LocalPaliadinService) sendToPane(ctx context.Context, target, msg string) error { // `-l` sends the message literally (no key parsing) — necessary so // our prompt's special characters don't get interpreted. 
if err := runTmux(ctx, "send-keys", "-t", target, "-l", msg); err != nil { @@ -527,7 +551,7 @@ func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) er // over from earlier turns) as a non-event — the file existing without a // fresh mtime is a corner case the caller already de-duplicates by // having a unique turn_id per request. -func (s *PaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) { +func (s *LocalPaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) { deadline := time.Now().Add(timeout) for time.Now().Before(deadline) { select { @@ -687,7 +711,7 @@ func countChips(s string) int { // audit-row writers. // ============================================================================= -func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) error { +func (s *paliadinDB) insertTurnRow(ctx context.Context, t *PaliadinTurn) error { q := ` INSERT INTO paliad.paliadin_turns ( turn_id, user_id, session_id, started_at, user_message, page_origin @@ -698,7 +722,7 @@ func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) er return err } -func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID, +func (s *paliadinDB) completeTurn(ctx context.Context, turnID uuid.UUID, finishedAt time.Time, durationMS int, response string, tokens int, meta trailerMeta, chipCount int) error { rowsSeen := make(pq.Int64Array, 0, len(meta.RowsSeen)) @@ -724,7 +748,7 @@ func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID, return err } -func (s *PaliadinService) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error { +func (s *paliadinDB) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error { finished := time.Now().UTC() q := ` UPDATE paliad.paliadin_turns @@ -735,7 +759,7 @@ func (s *PaliadinService) markTurnError(ctx context.Context, 
turnID uuid.UUID, c return err } -func (s *PaliadinService) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error { +func (s *paliadinDB) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error { finished := time.Now().UTC() q := ` UPDATE paliad.paliadin_turns diff --git a/internal/services/paliadin_remote.go b/internal/services/paliadin_remote.go new file mode 100644 index 0000000..36f34e0 --- /dev/null +++ b/internal/services/paliadin_remote.go @@ -0,0 +1,322 @@ +package services + +// RemotePaliadinService — the prod path of the Paliadin backend. +// +// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md. +// +// Where the local backend (LocalPaliadinService) drives a tmux+claude +// pane in-process, the remote backend shells out to ssh m@mriver +// paliadin-shim — the script at scripts/paliadin-shim, installed at +// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the +// tmux+claude pane on mRiver; this Go side just wraps each turn in one +// SSH call. +// +// The path was chosen so paliad.de (deployed in a Dokploy container on +// mLake, no `claude` CLI of its own) can keep using m's Claude Code +// subscription instead of paying API tokens. Tailscale provides the +// transport — mLake's tailscale0 interface is shared into the container +// via network_mode: host (compose layer; not this file's concern). +// +// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When +// that env var is unset, the binary falls back to LocalPaliadinService +// (or DisabledPaliadinService if neither tmux nor remote is available). + +import ( + "bytes" + "context" + "encoding/base64" + "errors" + "fmt" + "log" + "os/exec" + "strconv" + "strings" + "sync" + "time" + + "github.com/google/uuid" + "github.com/jmoiron/sqlx" +) + +// ErrMRiverUnreachable signals that the remote paliadin-shim could not +// be contacted within the health-check window. 
The handler maps this to +// the friendly mriver_unreachable error code (see frontend +// friendlyErrorMessage). +var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable") + +// RemotePaliadinConfig is the bag of knobs cmd/server/main.go passes +// when constructing a RemotePaliadinService. +type RemotePaliadinConfig struct { + SSHHost string // 100.99.98.203 — mRiver's tailnet IP + SSHPort int // 22022 — bypasses Tailscale SSH on :22 (design §4.5) + SSHUser string // m + SSHKeyPath string // /tmp/paliadin-id_ed25519- (chmod 600) + KnownHostsPath string // /tmp/paliadin-known_hosts +} + +// RemotePaliadinService implements Paliadin against a remote +// paliadin-shim over SSH. +type RemotePaliadinService struct { + paliadinDB + cfg RemotePaliadinConfig + + // Single in-flight turn. mRiver's claude pane is single-user; we + // serialise turns the same way LocalPaliadinService does. + turnMu sync.Mutex + + // Health-check cache. Avoids probing mRiver on every turn — once + // the cache is warm, RunTurn skips the probe for 10 seconds. + healthMu sync.Mutex + healthOK bool + healthCheckedAt time.Time + + // Lazy bootstrap state. The system prompt only needs to be sent + // once per claude pane; on first RunTurn after a paliad restart we + // inject it, and remember we did so we don't re-send. + bootstrapMu sync.Mutex + bootstrapped bool + + // Hook for tests — when non-nil, callShim delegates here instead + // of exec'ing ssh. Production code never sets this. + callShimHook func(ctx context.Context, args ...string) ([]byte, error) +} + +// NewRemotePaliadinService wires the remote backend. Call only when +// PALIADIN_REMOTE_HOST is set in the environment; the constructor does +// not probe mRiver — first probe happens on the first RunTurn call via +// healthGate. 
+func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService { + if cfg.SSHPort == 0 { + cfg.SSHPort = 22022 + } + if cfg.SSHUser == "" { + cfg.SSHUser = "m" + } + return &RemotePaliadinService{ + paliadinDB: paliadinDB{db: db, users: users}, + cfg: cfg, + } +} + +// RunTurn drives one Q&A round against the remote claude pane. Same +// audit-row contract as LocalPaliadinService: write the row first, run +// the turn, complete the row on success, mark error on failure. +func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { + s.turnMu.Lock() + defer s.turnMu.Unlock() + + turnID := uuid.New() + startedAt := time.Now().UTC() + + // Audit row first — leave traces even if we crash mid-turn. + if err := s.insertTurnRow(ctx, &PaliadinTurn{ + TurnID: turnID, + UserID: req.UserID, + SessionID: req.SessionID, + StartedAt: startedAt, + UserMessage: req.UserMessage, + PageOrigin: optionalString(req.PageOrigin), + }); err != nil { + return nil, fmt.Errorf("paliadin: insert turn row: %w", err) + } + + // Health-gate before paying the cost of a real turn. Caches OK for + // 10 s so a fast back-to-back chat doesn't probe every time. + if err := s.healthGate(ctx); err != nil { + _ = s.markTurnError(ctx, turnID, "mriver_unreachable") + return nil, err + } + + // Lazy bootstrap — first turn after a paliad restart sends the + // system prompt; subsequent turns skip. + if err := s.ensureBootstrapped(ctx); err != nil { + _ = s.markTurnError(ctx, turnID, "bootstrap_failed") + return nil, err + } + + msg := sanitiseForTmux(req.UserMessage) + msgB64 := base64.StdEncoding.EncodeToString([]byte(msg)) + + body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64) + if err != nil { + _ = s.markTurnError(ctx, turnID, classifySSHError(err)) + return nil, err + } + + // Same trailer parse + audit completion as the local path. 
+ cleanBody, meta := splitTrailer(string(body)) + tokens := approxTokenCount(cleanBody) + chipCount := countChips(cleanBody) + finished := time.Now().UTC() + durationMS := int(finished.Sub(startedAt) / time.Millisecond) + + if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil { + log.Printf("paliadin: complete turn %s: %v", turnID, err) + } + + return &TurnResult{ + TurnID: turnID, + Response: cleanBody, + UsedTools: meta.UsedTools, + RowsSeen: meta.RowsSeen, + ChipCount: chipCount, + ClassifierTag: meta.ClassifierTag, + DurationMS: durationMS, + }, nil +} + +// ResetSession sends `/clear` to the remote claude pane. +func (s *RemotePaliadinService) ResetSession(ctx context.Context) error { + if _, err := s.callShim(ctx, "reset"); err != nil { + return fmt.Errorf("paliadin: reset: %w", err) + } + return nil +} + +// healthGate runs the shim's `health` verb at most once per 10 s. +// Returns ErrMRiverUnreachable wrapping the underlying error on miss. +func (s *RemotePaliadinService) healthGate(ctx context.Context) error { + s.healthMu.Lock() + defer s.healthMu.Unlock() + + if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second { + return nil + } + + probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + out, err := s.callShim(probeCtx, "health") + s.healthCheckedAt = time.Now() + if err != nil { + s.healthOK = false + return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err) + } + if strings.TrimSpace(string(out)) != "ok" { + s.healthOK = false + return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out)) + } + s.healthOK = true + return nil +} + +// ensureBootstrapped sends the Paliadin system prompt to the remote +// claude pane on first call. Idempotent — subsequent calls return nil +// without doing any work. 
+func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error { + s.bootstrapMu.Lock() + defer s.bootstrapMu.Unlock() + if s.bootstrapped { + return nil + } + prompt := paliadinSystemPrompt("/tmp/paliadin") + promptB64 := base64.StdEncoding.EncodeToString([]byte(prompt)) + if _, err := s.callShim(ctx, "bootstrap", promptB64); err != nil { + return fmt.Errorf("paliadin: bootstrap: %w", err) + } + s.bootstrapped = true + return nil +} + +// callShim runs `ssh @ -- ` against the +// paliadin-shim. The shim's authorized_keys command= directive ensures +// the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of +// what we put after the `--`; we keep the explicit argv form anyway so +// reading the code at the call site is unambiguous. +// +// Tests set callShimHook to bypass exec. +func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) { + if s.callShimHook != nil { + return s.callShimHook(ctx, args...) + } + + sshArgs := []string{ + "-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config + "-i", s.cfg.SSHKeyPath, + "-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22 + "-o", "IdentitiesOnly=yes", + "-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath, + "-o", "StrictHostKeyChecking=yes", + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=3", + "-o", "ServerAliveInterval=10", + "-o", "ServerAliveCountMax=3", + s.cfg.SSHUser + "@" + s.cfg.SSHHost, + "--", + } + sshArgs = append(sshArgs, args...) + + // Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead. + c, cancel := context.WithTimeout(ctx, 70*time.Second) + defer cancel() + + cmd := exec.CommandContext(c, "ssh", sshArgs...) 
+ var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String())) + } + return stdout.Bytes(), nil +} + +// classifySSHError turns a callShim error into one of the audit-row +// error codes. Codes are stable strings shown on the admin dashboard +// and used by the frontend's friendlyErrorMessage to localise. +func classifySSHError(err error) string { + if err == nil { + return "" + } + if errors.Is(err, ErrMRiverUnreachable) { + return "mriver_unreachable" + } + if errors.Is(err, context.DeadlineExceeded) { + return "timeout" + } + msg := err.Error() + switch { + case strings.Contains(msg, "Connection timed out"), + strings.Contains(msg, "Connection refused"), + strings.Contains(msg, "Could not resolve hostname"), + strings.Contains(msg, "Network is unreachable"): + return "mriver_unreachable" + case strings.Contains(msg, "exit status 124"): + // Shim's run-turn 60 s timeout — Claude didn't write the + // response file in time. + return "timeout" + case strings.Contains(msg, "Permission denied"): + return "shim_auth_failed" + default: + return "shim_error" + } +} + +// DisabledPaliadinService is a stub that always returns +// ErrPaliadinDisabled. cmd/server/main.go constructs one when neither +// PALIADIN_REMOTE_HOST is set nor a local tmux is available; without +// the stub, the handler would have to nil-check on every entry point. +type DisabledPaliadinService struct { + paliadinDB +} + +// NewDisabledPaliadinService wires the stub. DB methods (IsOwner / +// ListRecentTurns / Stats) still work; only RunTurn / ResetSession +// return ErrPaliadinDisabled. 
+func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService { + return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}} +} + +func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { + return nil, ErrPaliadinDisabled +} + +func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error { + return ErrPaliadinDisabled +} + +// Compile-time interface conformance checks — fail the build, not a +// runtime test, if a method drifts off any backend. +var ( + _ Paliadin = (*LocalPaliadinService)(nil) + _ Paliadin = (*RemotePaliadinService)(nil) + _ Paliadin = (*DisabledPaliadinService)(nil) +) diff --git a/internal/services/paliadin_remote_test.go b/internal/services/paliadin_remote_test.go new file mode 100644 index 0000000..98f204f --- /dev/null +++ b/internal/services/paliadin_remote_test.go @@ -0,0 +1,257 @@ +package services + +import ( + "context" + "errors" + "fmt" + "strings" + "sync/atomic" + "testing" + "time" +) + +// Tests for the remote-Paliadin backend. Every test bypasses exec via +// the callShimHook field — no real ssh is ever invoked, no DB rows are +// written. Tests that would need DB I/O (audit row insert/complete on +// RunTurn) are not in scope here; paliad's test suite has no sqlx mock +// and the existing paliadin_test.go only covers pure functions. 
+ +func TestNewRemotePaliadinService_Defaults(t *testing.T) { + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{ + SSHHost: "100.99.98.203", + // SSHPort + SSHUser intentionally left zero/empty + }) + if s.cfg.SSHPort != 22022 { + t.Errorf("SSHPort default = %d; want 22022 (Tailscale-SSH bypass port)", s.cfg.SSHPort) + } + if s.cfg.SSHUser != "m" { + t.Errorf("SSHUser default = %q; want %q", s.cfg.SSHUser, "m") + } + if s.cfg.SSHHost != "100.99.98.203" { + t.Errorf("SSHHost not preserved: %q", s.cfg.SSHHost) + } +} + +func TestNewRemotePaliadinService_HonoursOverrides(t *testing.T) { + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{ + SSHHost: "10.0.0.1", + SSHPort: 2222, + SSHUser: "alice", + }) + if s.cfg.SSHPort != 2222 { + t.Errorf("SSHPort override lost: %d", s.cfg.SSHPort) + } + if s.cfg.SSHUser != "alice" { + t.Errorf("SSHUser override lost: %q", s.cfg.SSHUser) + } +} + +func TestClassifySSHError(t *testing.T) { + cases := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"explicit ErrMRiverUnreachable", ErrMRiverUnreachable, "mriver_unreachable"}, + {"wrapped ErrMRiverUnreachable", fmt.Errorf("foo: %w", ErrMRiverUnreachable), "mriver_unreachable"}, + {"context deadline", context.DeadlineExceeded, "timeout"}, + {"shim run-turn timeout (exit 124)", errors.New("ssh run-turn …: exit status 124 (stderr: response timeout)"), "timeout"}, + {"connection refused", errors.New("ssh health: dial: Connection refused"), "mriver_unreachable"}, + {"connection timed out", errors.New("ssh health: Connection timed out"), "mriver_unreachable"}, + {"permission denied", errors.New("ssh: Permission denied (publickey)"), "shim_auth_failed"}, + {"unknown", errors.New("ssh: some other failure"), "shim_error"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := classifySSHError(c.err) + if got != c.want { + t.Errorf("classifySSHError(%v) = %q; want %q", c.err, got, c.want) + } + }) + } +} + +func 
TestHealthGate_CachesOnSuccess(t *testing.T) { + var calls int32 + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + atomic.AddInt32(&calls, 1) + if len(args) != 1 || args[0] != "health" { + t.Errorf("unexpected callShim args: %v", args) + } + return []byte("ok\n"), nil + } + for i := 0; i < 5; i++ { + if err := s.healthGate(context.Background()); err != nil { + t.Fatalf("healthGate iteration %d: %v", i, err) + } + } + if got := atomic.LoadInt32(&calls); got != 1 { + t.Errorf("expected 1 callShim call (cached); got %d", got) + } +} + +func TestHealthGate_RetriesAfterFailure(t *testing.T) { + var calls int32 + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + atomic.AddInt32(&calls, 1) + return nil, errors.New("ssh: Connection refused") + } + for i := 0; i < 3; i++ { + err := s.healthGate(context.Background()) + if !errors.Is(err, ErrMRiverUnreachable) { + t.Errorf("iteration %d: err %v; want wrapping ErrMRiverUnreachable", i, err) + } + } + // Failed health is NOT cached — every call re-probes. 
+ if got := atomic.LoadInt32(&calls); got != 3 { + t.Errorf("expected 3 callShim calls (no caching on failure); got %d", got) + } +} + +func TestHealthGate_RejectsUnexpectedReply(t *testing.T) { + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + return []byte("not-ok"), nil + } + err := s.healthGate(context.Background()) + if !errors.Is(err, ErrMRiverUnreachable) { + t.Errorf("err = %v; want wrap of ErrMRiverUnreachable for non-ok reply", err) + } +} + +func TestEnsureBootstrapped_RunsOnce(t *testing.T) { + var calls int32 + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + atomic.AddInt32(&calls, 1) + if len(args) != 2 || args[0] != "bootstrap" { + t.Errorf("unexpected callShim args: %v", args) + } + // args[1] is the base64'd system prompt — no need to decode in + // the test; just sanity-check it isn't trivially empty. 
+ if len(args[1]) < 100 { + t.Errorf("bootstrap prompt suspiciously short: %d bytes", len(args[1])) + } + return []byte("ok\n"), nil + } + for i := 0; i < 3; i++ { + if err := s.ensureBootstrapped(context.Background()); err != nil { + t.Fatalf("ensureBootstrapped iteration %d: %v", i, err) + } + } + if got := atomic.LoadInt32(&calls); got != 1 { + t.Errorf("expected 1 callShim call (bootstrap is one-shot); got %d", got) + } +} + +func TestEnsureBootstrapped_RetriesOnFailure(t *testing.T) { + var calls int32 + var failOnce atomic.Bool + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + atomic.AddInt32(&calls, 1) + if failOnce.CompareAndSwap(false, true) { + return nil, errors.New("ssh: transient failure") + } + return []byte("ok\n"), nil + } + if err := s.ensureBootstrapped(context.Background()); err == nil { + t.Fatal("first call should error") + } + if err := s.ensureBootstrapped(context.Background()); err != nil { + t.Fatalf("second call should succeed: %v", err) + } + // Third call should be a cache hit (bootstrapped flag set on success). + if err := s.ensureBootstrapped(context.Background()); err != nil { + t.Fatalf("third call should be cached: %v", err) + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 callShim calls (1 fail + 1 succeed; 3rd cached); got %d", got) + } +} + +func TestHealthGate_CacheExpires(t *testing.T) { + var calls int32 + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + atomic.AddInt32(&calls, 1) + return []byte("ok"), nil + } + if err := s.healthGate(context.Background()); err != nil { + t.Fatalf("first probe: %v", err) + } + // Force the cached timestamp to expire. 
+ s.healthMu.Lock() + s.healthCheckedAt = time.Now().Add(-11 * time.Second) + s.healthMu.Unlock() + if err := s.healthGate(context.Background()); err != nil { + t.Fatalf("second probe (expired cache): %v", err) + } + if got := atomic.LoadInt32(&calls); got != 2 { + t.Errorf("expected 2 callShim calls (cache expired between); got %d", got) + } +} + +func TestRemotePaliadin_ImplementsPaliadin(t *testing.T) { + // Compile-time check is in paliadin_remote.go; this test makes the + // failure mode obvious if someone accidentally drops a method. + var _ Paliadin = (*RemotePaliadinService)(nil) + var _ Paliadin = (*LocalPaliadinService)(nil) + var _ Paliadin = (*DisabledPaliadinService)(nil) +} + +func TestDisabledPaliadinService(t *testing.T) { + s := NewDisabledPaliadinService(nil, nil) + if _, err := s.RunTurn(context.Background(), TurnRequest{}); !errors.Is(err, ErrPaliadinDisabled) { + t.Errorf("RunTurn error = %v; want ErrPaliadinDisabled", err) + } + if err := s.ResetSession(context.Background()); !errors.Is(err, ErrPaliadinDisabled) { + t.Errorf("ResetSession error = %v; want ErrPaliadinDisabled", err) + } +} + +func TestCallShim_SSHArgvShape(t *testing.T) { + // Verify the ssh argv we'd construct includes the bypass-port flag, + // the key + known_hosts paths, and the verb after `--`. We don't + // actually exec ssh — we set callShimHook so callShim never reaches + // the exec path; this test just guards the constructor wiring. + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{ + SSHHost: "100.99.98.203", + SSHPort: 22022, + SSHUser: "m", + SSHKeyPath: "/tmp/k", + KnownHostsPath: "/tmp/kh", + }) + var captured []string + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + captured = append([]string(nil), args...) 
+ return []byte("ok"), nil + } + _, _ = s.callShim(context.Background(), "health") + if len(captured) != 1 || captured[0] != "health" { + t.Errorf("callShim forwarded args = %v; want [health]", captured) + } +} + +func TestCallShim_StderrSurfacesInError(t *testing.T) { + // When the real exec path fails, callShim wraps stderr into the + // returned error so classifySSHError can pattern-match. Simulate + // that contract via the hook. + s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"}) + s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) { + return nil, errors.New("ssh health: exit status 1 (stderr: Permission denied (publickey))") + } + _, err := s.callShim(context.Background(), "health") + if err == nil { + t.Fatal("expected error") + } + if !strings.Contains(err.Error(), "Permission denied") { + t.Errorf("error should preserve stderr: %v", err) + } + if classifySSHError(err) != "shim_auth_failed" { + t.Errorf("classifier should pick up Permission denied; got %q", classifySSHError(err)) + } +} diff --git a/scripts/paliadin-shim b/scripts/paliadin-shim new file mode 100755 index 0000000..5ab7667 --- /dev/null +++ b/scripts/paliadin-shim @@ -0,0 +1,185 @@ +#!/bin/bash +# paliadin-shim — server-side RPC for paliad's remote-tmux turns. +# +# Invoked via mRiver's ~/.ssh/authorized_keys command= restriction. The +# client's requested command is exposed in $SSH_ORIGINAL_COMMAND; this +# script parses it and dispatches to a fixed verb set. +# +# Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md §5.4. +# +# Verbs: +# health -> "ok" iff tmux + claude reachable +# bootstrap -> ensure pane + send system prompt +# run-turn -> send framed prompt, poll, return +# reset -> /clear the conversation +# +# All multi-character payloads (prompts, messages) are base64-encoded by +# the Go caller so we never have to quote them through ssh's argv. +# +# Errors go to stderr with a non-zero exit. 
# The Go side maps the exit status into a friendly error code.
set -euo pipefail
umask 077

readonly TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}"
readonly RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}"
readonly TIMEOUT_S="${PALIADIN_TIMEOUT_S:-60}"
readonly PANE_READY_S=60 # max wait for claude pane to settle
readonly TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'

# Response files live in a private directory (umask 077 + mode 700).
mkdir -p "$RESPONSE_DIR"
chmod 700 "$RESPONSE_DIR"

# Parse $SSH_ORIGINAL_COMMAND into argv. Format: "<verb> <arg1> <arg2> …".
# We never `eval` this; `read -r -a` splits on $IFS without word-expansion.
read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}"
verb="${argv[0]:-}"

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

# log_err prints its arguments to stderr, prefixed with the script name.
log_err() { printf 'paliadin-shim: %s\n' "$*" >&2; }

# die <status> <message…> logs the message and exits with <status>.
die() {
  local status="$1"
  shift
  log_err "$*"
  exit "$status"
}

# ensure_pane creates the tmux session + claude window if missing, waits
# for the pane to become ready, and prints the target identifier
# ("session:window-idx") on stdout. Callers capture it with $(…), so a
# `die` in here ends that subshell and `set -e` propagates the status.
ensure_pane() {
  tmux has-session -t "$TMUX_SESSION" 2>/dev/null || tmux new-session -d -s "$TMUX_SESSION"

  # Look for an existing window tagged with @paliadin-scope=chat.
  local target=""
  local idx scope
  while read -r idx; do
    if [[ -z "$idx" ]]; then
      continue
    fi
    scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true)
    if [[ "$scope" == "chat" ]]; then
      target="$TMUX_SESSION:$idx"
      break
    fi
  done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}' 2>/dev/null || true)

  if [[ -z "$target" ]]; then
    command -v claude >/dev/null 2>&1 || die 3 "claude CLI not found in PATH"

    idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude)
    target="$TMUX_SESSION:$idx"

    # Wait for claude to settle. Matches Go waitForPaneReady (paliadin.go:495).
    # Ready means the captured pane shows a prompt glyph or a box-drawing
    # border; if neither appears before the deadline we carry on anyway.
    local deadline=$(( $(date +%s) + PANE_READY_S ))
    local pane=""
    while [[ $(date +%s) -lt $deadline ]]; do
      pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true)
      if [[ "$pane" == *"❯"* || "$pane" == *"│"* ]]; then
        break
      fi
      sleep 0.5
    done

    # Tag the window so later invocations find and reuse it.
    tmux set-window-option -t "$target" @paliadin-scope chat >/dev/null
    tmux set-window-option -t "$target" @fix-name claude-paliadin >/dev/null
  fi

  printf '%s' "$target"
}

# send_to_pane writes a literal string then Enter.
send_to_pane() {
  local target="$1" msg="$2"
  # -l sends the text literally (no key-name lookup); `--` stops option
  # parsing so a message starting with '-' is safe.
  tmux send-keys -t "$target" -l -- "$msg"
  tmux send-keys -t "$target" Enter
}

# ---------------------------------------------------------------------------
# verb dispatch
# ---------------------------------------------------------------------------

case "$verb" in

  health)
    # Used by the Go side's healthGate to short-circuit when mRiver is
    # offline or tmux/claude is broken. Output is parsed verbatim.
    command -v tmux >/dev/null 2>&1 || die 1 "tmux not in PATH"
    command -v claude >/dev/null 2>&1 || die 1 "claude not in PATH"
    tmux has-session -t "$TMUX_SESSION" 2>/dev/null || tmux new-session -d -s "$TMUX_SESSION"
    echo ok
    ;;

  bootstrap)
    # Inject the system prompt into a fresh claude pane. Idempotent —
    # the Go side may call this repeatedly; tmux send-keys is harmless
    # against a settled pane.
    [[ -n "${argv[1]:-}" ]] || die 2 "bootstrap: missing prompt"
    prompt=$(printf '%s' "${argv[1]}" | base64 -d 2>/dev/null) || die 2 "bootstrap: invalid base64 prompt"
    target=$(ensure_pane)
    send_to_pane "$target" "$prompt"
    sleep 2  # let claude absorb before turns flow
    echo ok
    ;;

  run-turn)
    # argv[1] = turn_id (UUID), argv[2] = base64-encoded user message.
    turn_id="${argv[1]:-}"
    # The turn id becomes part of a file name, so it must look like a UUID.
    [[ "$turn_id" =~ $TURN_ID_RE ]] || die 2 "run-turn: bad turn_id"
    [[ -n "${argv[2]:-}" ]] || die 2 "run-turn: missing message"
    msg=$(printf '%s' "${argv[2]}" | base64 -d 2>/dev/null) || die 2 "run-turn: invalid base64 message"
    target=$(ensure_pane)
    out="$RESPONSE_DIR/$turn_id.txt"
    rm -f "$out"  # drop any stale response for this id before asking

    # Envelope matches paliadin_prompt.go's `[PALIADIN:turn_id] ` shape.
    send_to_pane "$target" "[PALIADIN:$turn_id] $msg"

    # Poll for the response file. Same shape as Go pollForResponse
    # (paliadin.go:530). Settle delay so we don't read mid-flush.
    deadline=$(( $(date +%s) + TIMEOUT_S ))
    while [[ $(date +%s) -lt $deadline ]]; do
      if [[ -s "$out" ]]; then
        sleep 0.05
        cat "$out"
        rm -f "$out"
        exit 0
      fi
      sleep 0.2
    done
    # 124 mirrors timeout(1); the Go side classifies it as "timeout".
    die 124 "response timeout after ${TIMEOUT_S}s"
    ;;

  reset)
    # Send `/clear` so the next turn starts a fresh conversation.
    target=$(ensure_pane)
    send_to_pane "$target" "/clear"
    echo ok
    ;;

  '')
    die 2 "no verb (set SSH_ORIGINAL_COMMAND via authorized_keys command=)"
    ;;

  *)
    die 2 "unknown verb '$verb'"
    ;;
esac