Merge: t-paliad-151 Phase B code (env-var-gated, compose flip held for A.5) — Paliadin remote-routing via Tailscale SSH to mRiver. Includes Phase A.0 design doc + scripts/paliadin-shim from earlier shift. Production behavior unchanged: without PALIADIN_REMOTE_HOST in env, paliad never invokes ssh and uses local-tmux PoC path byte-identically. Refactor: Paliadin interface + LocalPaliadinService + RemotePaliadinService + DisabledPaliadinService stub. main.go env-var switch (remote/local/disabled). Dockerfile +openssh-client. 14 unit tests via callShimHook. Frontend friendlyErrorMessage for mriver_unreachable/shim_auth_failed/shim_error/bootstrap_failed/timeout (DE+EN). NOT included: docker-compose network_mode: host flip — held on branch as da971a7 pending Phase A.5 traefik test by m. NOT cronus.

This commit is contained in:
m
2026-05-08 02:23:38 +02:00
12 changed files with 1657 additions and 64 deletions

View File

@@ -11,7 +11,7 @@ COPY . .
RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /paliad ./cmd/server RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /paliad ./cmd/server
FROM alpine:3.21 FROM alpine:3.21
RUN apk add --no-cache ca-certificates RUN apk add --no-cache ca-certificates openssh-client
WORKDIR /app WORKDIR /app
COPY --from=backend /paliad /app/paliad COPY --from=backend /paliad /app/paliad
COPY --from=frontend /app/frontend/dist /app/dist COPY --from=frontend /app/frontend/dist /app/dist

View File

@@ -2,10 +2,13 @@ package main
import ( import (
"context" "context"
"fmt"
"log" "log"
"net/http" "net/http"
"os" "os"
"os/exec"
"os/signal" "os/signal"
"strconv"
"syscall" "syscall"
// Embed Go's IANA tz database into the binary so time.LoadLocation works // Embed Go's IANA tz database into the binary so time.LoadLocation works
@@ -165,20 +168,34 @@ func main() {
CardLayout: services.NewCardLayoutService(pool), CardLayout: services.NewCardLayoutService(pool),
} }
// t-paliad-146 — Paliadin PoC. Always wired when DATABASE_URL // Paliadin backend selection (t-paliad-146 + t-paliad-151):
// is set; the per-request handler gate (requirePaliadinOwner) // PALIADIN_REMOTE_HOST set → RemotePaliadinService (ssh to mRiver)
// restricts access to the single owner email // else: local tmux available → LocalPaliadinService (PoC path)
// (services.PaliadinOwnerEmail). All other authenticated users // else: DisabledPaliadinService (handlers still 404 for non-owners
// get a 404 — the route effectively does not exist for them. // via the gate; for m, RunTurn returns ErrPaliadinDisabled
// On hosts without tmux + the `claude` CLI (e.g. the Dokploy // which surfaces as a friendly error).
// container), the owner gate still applies; if m ever hits the //
// route from such a host, the service returns "tmux unavailable" // All three implement services.Paliadin; the per-request handler
// without ever invoking shell-out. // gate (requirePaliadinOwner) is unchanged and applies to every
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION") // backend.
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR") if remoteHost := os.Getenv("PALIADIN_REMOTE_HOST"); remoteHost != "" {
svcBundle.Paliadin = services.NewPaliadinService(pool, users, tmuxSession, responseDir) cfg, err := buildPaliadinRemoteConfig(remoteHost)
log.Printf("paliadin: wired (owner=%s; gate is per-request, not per-deploy)", if err != nil {
services.PaliadinOwnerEmail) log.Fatalf("paliadin: remote config: %v", err)
}
svcBundle.Paliadin = services.NewRemotePaliadinService(pool, users, cfg)
log.Printf("paliadin: remote mode → ssh %s@%s:%d (owner=%s)",
cfg.SSHUser, cfg.SSHHost, cfg.SSHPort, services.PaliadinOwnerEmail)
} else if _, err := exec.LookPath("tmux"); err == nil {
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION")
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR")
svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir)
log.Printf("paliadin: local tmux mode (owner=%s)", services.PaliadinOwnerEmail)
} else {
svcBundle.Paliadin = services.NewDisabledPaliadinService(pool, users)
log.Printf("paliadin: disabled (no PALIADIN_REMOTE_HOST, no local tmux; owner=%s)",
services.PaliadinOwnerEmail)
}
// Wire ApprovalService into the entity services so Create / Update / // Wire ApprovalService into the entity services so Create / Update /
// Complete / Delete consult paliad.approval_policies (t-paliad-138). // Complete / Delete consult paliad.approval_policies (t-paliad-138).
// Without this wiring, the policies and request tables exist but no // Without this wiring, the policies and request tables exist but no
@@ -217,3 +234,83 @@ func main() {
log.Fatal(err) log.Fatal(err)
} }
} }
// buildPaliadinRemoteConfig assembles a RemotePaliadinConfig from
// environment variables, materialising the SSH private key and
// known_hosts blobs into tmpfiles (mode 0600 and 0644 respectively)
// for OpenSSH to read.
//
// The blobs arrive as multi-line env vars. They are written with
// os.CreateTemp into the default temp directory on every boot, so a
// rotated secret takes effect on the next process start without any
// code change.
//
// Required: PALIADIN_REMOTE_HOST (passed in as host),
// PALIADIN_SSH_PRIVATE_KEY, PALIADIN_KNOWN_HOSTS.
// Optional: PALIADIN_REMOTE_USER (default "m"), PALIADIN_REMOTE_PORT
// (default 22022).
//
// Every env var is validated before anything touches the filesystem,
// and the key file is removed again if the known_hosts file cannot be
// written, so a failed (and possibly crash-looping) boot does not
// accumulate private-key tmpfiles. On error the returned config is the
// zero value and must not be used.
func buildPaliadinRemoteConfig(host string) (services.RemotePaliadinConfig, error) {
	var zero services.RemotePaliadinConfig

	port := 22022
	if p := os.Getenv("PALIADIN_REMOTE_PORT"); p != "" {
		n, err := strconv.Atoi(p)
		if err != nil || n <= 0 || n > 65535 {
			return zero, fmt.Errorf("PALIADIN_REMOTE_PORT %q: not a valid port", p)
		}
		port = n
	}

	// Check presence of both secrets up front: writing the private key
	// first and only then discovering that known_hosts is missing would
	// leave key material behind for nothing.
	keyBlob := os.Getenv("PALIADIN_SSH_PRIVATE_KEY")
	if keyBlob == "" {
		return zero, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_SSH_PRIVATE_KEY empty")
	}
	knownHostsBlob := os.Getenv("PALIADIN_KNOWN_HOSTS")
	if knownHostsBlob == "" {
		return zero, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_KNOWN_HOSTS empty")
	}

	keyPath, err := writeSecretFile("paliadin-id_ed25519-", keyBlob, 0o600)
	if err != nil {
		return zero, fmt.Errorf("PALIADIN_SSH_PRIVATE_KEY: %w", err)
	}
	knownHostsPath, err := writeSecretFile("paliadin-known_hosts-", knownHostsBlob, 0o644)
	if err != nil {
		// Best effort: the caller is about to abort, so a failed
		// removal has nowhere useful to be reported.
		_ = os.Remove(keyPath)
		return zero, fmt.Errorf("PALIADIN_KNOWN_HOSTS: %w", err)
	}

	return services.RemotePaliadinConfig{
		SSHHost:        host,
		SSHUser:        cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"),
		SSHPort:        port,
		SSHKeyPath:     keyPath,
		KnownHostsPath: knownHostsPath,
	}, nil
}
// writeSecretFile writes blob to a fresh tmpfile (name = prefix plus a
// random suffix, in the default temp directory), sets its permission
// bits to mode, and returns the path.
//
// It returns ("", nil) when blob is empty so callers can distinguish
// "not set" from real I/O errors.
//
// A terminating newline is appended when blob lacks one: secrets that
// travel through env vars commonly lose it, and OpenSSH's private-key
// loader can reject a key whose END line is unterminated. The extra
// newline is harmless for known_hosts content.
//
// On any failure after the file has been created, the file is removed
// again so no partial or wrongly-permissioned secret is left behind.
func writeSecretFile(prefix, blob string, mode os.FileMode) (string, error) {
	if blob == "" {
		return "", nil
	}
	// os.CreateTemp creates the file with mode 0600, so the contents
	// are never readable by others while being written, regardless of
	// the final mode requested.
	f, err := os.CreateTemp("", prefix+"*")
	if err != nil {
		return "", err
	}
	name := f.Name()

	// discard closes and deletes the half-written file, then reports
	// the original cause; cleanup errors are secondary and dropped.
	discard := func(cause error) (string, error) {
		_ = f.Close()
		_ = os.Remove(name)
		return "", cause
	}

	if _, err := f.WriteString(blob); err != nil {
		return discard(err)
	}
	if blob[len(blob)-1] != '\n' {
		if _, err := f.WriteString("\n"); err != nil {
			return discard(err)
		}
	}
	if err := f.Close(); err != nil {
		_ = os.Remove(name)
		return "", err
	}
	if err := os.Chmod(name, mode); err != nil {
		_ = os.Remove(name)
		return "", err
	}
	return name, nil
}
// cmpOr returns s unless it is the empty string, in which case it
// returns fallback. Used to apply defaults to optional env vars.
func cmpOr(s, fallback string) string {
	if s == "" {
		return fallback
	}
	return s
}

View File

@@ -0,0 +1,677 @@
# Paliadin: route prod via Tailscale SSH to mRiver
**Issue:** m/paliad#12 — t-paliad-151
**Date:** 2026-05-07
**Author:** noether (inventor)
**Supersedes nothing.** Extends `docs/design-paliadin-2026-05-07.md` (the Phase 0 PoC) with a third deployment path between "laptop-only PoC" and "Anthropic API direct".
**Related:** t-paliad-146 (PoC ship), t-paliad-150 (`friendlyErrorMessage` pattern).
---
## 1. Goal
Make Paliadin reachable from `paliad.de` (Dokploy on mLake) without losing m's Claude Code subscription, by routing each turn over Tailscale + SSH from the paliad container to mRiver, where the existing long-lived `tmux` + `claude` PoC keeps running.
**Non-goals (v1):**
- Multi-host failover.
- Encryption beyond SSH-over-tailnet (already E2E-encrypted by Tailscale's WireGuard layer).
- Anthropic API fallback when mRiver is offline — show a friendly error instead.
- Wake-on-LAN of mRiver.
- Multi-tenant or multi-firm variants.
---
## 2. Live state — what was verified before designing
A design built on stale facts rots fast. These were probed on 2026-05-07, not assumed from CLAUDE.md or memory:
| Fact | How verified | Result |
|---|---|---|
| mRiver = `100.99.98.203`, has tmux + claude | this worker runs on mRiver; `tmux -V` → `tmux 3.6a`; `which claude` → `/home/m/.local/bin/claude` | confirmed |
| mLake (`100.99.98.201`) has Tailscale running | `ssh m@mlake tailscale status` | confirmed; mRiver visible as `active; direct [2a02:4780:41:3fbc::1]:41641` |
| paliad container Dockerfile is alpine:3.21 minimal, no SSH, no tailscaled | `Dockerfile` | confirmed (only `ca-certificates`) |
| paliad compose runs default Docker bridge (no `network_mode`) | `docker-compose.yml` | confirmed |
| mRiver has no `~/.ssh/authorized_keys` yet | `ls ~/.ssh/` | confirmed — file must be created in Phase A |
| `/tmp/paliadin/` does not exist on mRiver yet | `ls /tmp/paliadin` | confirmed — created on first turn (paliadin.go:185 `os.MkdirAll`) |
| `paliad-paliadin` tmux session is not currently running on mRiver | `tmux ls` | not present; the existing PoC creates it on demand |
**Implication for design:** the paliad container needs new infrastructure on three axes — network reachability of the tailnet, an SSH client + identity, and a service-layer code path that talks to a remote tmux instead of a local one. Each axis is its own sub-design below.
---
## 3. Locked decisions (m, 2026-05-07 22:35)
m made four design-shaping calls via the inventor's `AskUserQuestion` pass; a fifth (row 5) was added after a Phase A finding. They are recorded here verbatim because every downstream choice in §4–§6 follows from them.
| # | Question | m's choice |
|---|---|---|
| 1 | Container Tailscale shape | **`network_mode: host` on paliad** |
| 2 | SSH-to-mRiver protocol granularity | **Server-side `paliadin-shim` (one RPC per turn)** |
| 3 | Routing trigger | **Env var `PALIADIN_REMOTE_HOST` + interface split** |
| 4 | SSH private key storage | **Dokploy secret env var `PALIADIN_SSH_PRIVATE_KEY`** |
| 5 | SSH port to bypass Tailscale SSH | **Port 22022 via `ssh.socket` drop-in (Phase A finding, 23:30)** |
Decision (1) was *not* the inventor's recommendation — host mode has known interaction risk with traefik (§4.2). m is overriding the recommendation; this design accepts the call and codifies a Phase A test step that gates the rollout on traefik still working under host mode. If Phase A blows up, the fallback is to revisit (1) in a follow-up issue, not to silently swap to a sidecar.
Decision (5) emerged during Phase A: Tailscale SSH on mRiver was found to intercept `:22` from tailnet peers and bypass OpenSSH's `authorized_keys` entirely (banner says "Tailscale", auth method "none"). The `command=` shim restriction therefore never fires on the standard port. Adding port 22022 via a `systemd ssh.socket` drop-in routes paliad's connections to real OpenSSH where the restriction works. m's interactive `tailscale ssh m@mriver` on `:22` stays untouched. See §4.5 for the implementation.
---
## 4. Sub-design A — Container Tailscale shape
### 4.1 Shape: `network_mode: host`
paliad's container shares mLake's network namespace. `tailscale0` (mLake's tailnet interface) is directly visible from inside the container. Outbound `ssh m@100.99.98.203` reaches mRiver over the tailnet without any sidecar, userspace tailscaled, SOCKS proxy, or auth-key flow inside the container.
```yaml
# docker-compose.yml diff
services:
web:
build: .
network_mode: host # NEW
# remove: expose: ["8080"] # host mode means port is on the host directly
environment:
- PORT=8080
...
# NEW Paliadin remote-routing knobs
- PALIADIN_REMOTE_HOST=${PALIADIN_REMOTE_HOST} # 100.99.98.203
- PALIADIN_REMOTE_PORT=${PALIADIN_REMOTE_PORT} # 22022 (bypasses Tailscale SSH, see §4.5)
- PALIADIN_REMOTE_USER=${PALIADIN_REMOTE_USER} # m
- PALIADIN_SSH_PRIVATE_KEY=${PALIADIN_SSH_PRIVATE_KEY}
- PALIADIN_KNOWN_HOSTS=${PALIADIN_KNOWN_HOSTS} # one-line ssh-keyscan -p 22022 output
restart: unless-stopped
```
### 4.2 Trade-off accepted: traefik routing under host mode
paliad.de's TLS is provided by Dokploy's traefik on the `dokploy-network` overlay. With `network_mode: host`, paliad is no longer attached to that overlay. Two failure modes are possible:
- **(M1)** traefik can't discover the service via Docker DNS → 502 at the edge.
- **(M2)** traefik routes via host loopback (`http://127.0.0.1:8080` or `host.docker.internal`) and works fine.
Recent Dokploy versions configure traefik with both `loadbalancer.server.url` and Docker labels; (M2) is the documented host-mode path. **Phase A explicitly tests this** (§7) before any code is written; if (M1) materialises, the design rolls back to the sidecar variant of decision 1 in a follow-up issue.
Other host-mode side-effects to flag in operations:
- paliad listens on host port 8080 directly. Any other compose service binding 8080 conflicts.
- paliad's outbound DNS uses host resolver (no Docker-internal `web` etc.). Currently fine: paliad's only network deps are external (Supabase, SMTP, GitHub raw). No service on `dokploy-network` is referenced by name.
- The container can reach **every** Tailscale node, not just mRiver. Mitigations live in §5 (key restriction) and §5.2 (`from=` clause on mRiver authorized_keys).
### 4.3 Dockerfile diff
```dockerfile
# Final stage adds the SSH client only. Tailscale is provided by the host.
FROM alpine:3.21
RUN apk add --no-cache ca-certificates openssh-client # +openssh-client (~1MB)
WORKDIR /app
COPY --from=backend /paliad /app/paliad
COPY --from=frontend /app/frontend/dist /app/dist
EXPOSE 8080
CMD ["/app/paliad"]
```
Image-size delta: alpine `openssh-client` is ~1.1 MB compressed — negligible. No tailscaled, no entrypoint script, no extra processes inside the container.
### 4.4 What does NOT change
- No Tailscale auth-key inside paliad. The container inherits the host's tailnet binding, so there is no per-container Tailscale identity to rotate. mLake's existing Tailscale auth is the only one in scope.
- No tailscaled process inside the container.
- No new sidecar container.
### 4.5 Bypassing Tailscale SSH via port 22022 (Phase A discovery)
**Phase A revealed** that Tailscale SSH on mRiver intercepts `:22` from tailnet peers before OpenSSH sees the connection. The SSH banner reads `SSH-2.0-Tailscale`, the verbose log shows `Authenticated using "none"`, and the `authorized_keys command=` directive is therefore inert. mRiver's `tailscale status --json` confirms the `https://tailscale.com/cap/ssh` capability is enabled.
The fix: a separate listening port for the paliad route, where Tailscale SSH does not intercept and real OpenSSH handles auth.
mRiver uses systemd socket activation for sshd (`/usr/lib/systemd/system/ssh.socket` binds `:22`). Setting `Port 22022` in `sshd_config` is **ignored** under socket activation — listen ports come from the socket unit, not sshd's own config. The correct change is a drop-in:
```ini
# /etc/systemd/system/ssh.socket.d/paliad.conf
[Socket]
ListenStream=0.0.0.0:22022
ListenStream=[::]:22022
```
Followed by `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (still routed through Tailscale SSH for m's interactive use) and `:22022` (real OpenSSH) end up listening. The same sshd binary handles both — same host key, same `authorized_keys`, same sshd_config. The only difference is *which port* a peer dials.
A failed first attempt (2026-05-07 23:07) added the drop-in while a stale `Port 22022` directive in `sshd_config.d/99-paliad-test.conf` was still bound — the resulting `Address already in use` took `ssh.socket` down for ~30 s until reverted. Lesson: clean any prior `Port` directives out of `sshd_config.d/*.conf` before retrying the socket drop-in.
Phase A end-to-end test (2026-05-07 23:31) succeeded with port 22022:
- `ssh -p 22022 -i paliad-prod-key m@100.99.98.203 health` → `ok`
- `run-turn <uuid> <base64-msg>` → 3.4 s round-trip including a Claude-Code response
- `from="100.99.98.201"` correctly rejected a connection sourced from mRiver itself (`Permission denied (publickey,password)`)
---
## 5. Sub-design B — SSH identity, restricted shim, host-key pinning
### 5.1 Identity: dedicated ed25519 keypair `paliad-prod`
One keypair, generated once on mRiver during Phase A, used by every paliad-prod deploy:
```bash
# On mRiver (Phase A bootstrap):
ssh-keygen -t ed25519 -N "" -C "paliad-prod $(date +%Y-%m-%d)" -f /tmp/paliad-prod-key
# Public key → mRiver authorized_keys (see 5.2)
# Private key → Dokploy secret store as PALIADIN_SSH_PRIVATE_KEY
shred -u /tmp/paliad-prod-key # only the encrypted/secret-stored copies survive
```
Rotation: regenerate, push public key to mRiver authorized_keys, update Dokploy secret, redeploy. No code change needed — paliad's startup re-reads the env var on every boot.
The private key is delivered to the container as a multi-line env var. At process start, paliad writes it to a tmpfile so OpenSSH can use it:
```go
// cmd/server/main.go (sketch)
func loadPaliadinSSHKey() (string, error) {
blob := os.Getenv("PALIADIN_SSH_PRIVATE_KEY")
if blob == "" { return "", nil } // remote mode disabled
f, err := os.CreateTemp("", "paliadin-id_ed25519-")
if err != nil { return "", err }
if err := os.Chmod(f.Name(), 0o600); err != nil { return "", err }
if _, err := f.WriteString(blob); err != nil { return "", err }
if err := f.Close(); err != nil { return "", err }
return f.Name(), nil // path passed to RemotePaliadinService
}
```
The tmpfile lives at `/tmp/paliadin-id_ed25519-<rand>` for the container's lifetime. On container restart, a fresh tmpfile is written. We never persist the key to a volume.
### 5.2 mRiver `authorized_keys` entry
```
command="/home/m/.local/bin/paliadin-shim",no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc,from="100.99.98.201" ssh-ed25519 AAAA...PUBKEY... paliad-prod
```
Each restriction matters:
- `command=` — every `ssh m@mriver …` invocation runs the shim regardless of what the client asked for. The client's requested command is exposed as `$SSH_ORIGINAL_COMMAND` for the shim to dispatch on.
- `no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc` — defence-in-depth: even if someone steals the key and bypasses the shim's argument validation, they can't get an interactive shell, can't tunnel ports, can't pivot via agent forwarding.
- `from="100.99.98.201"` — only accept connections from mLake's tailnet IP. Defends against the "container has full tailnet visibility" host-mode side-effect from §4.2: if the key leaks off mLake, it can't be replayed from another tailnet host.
### 5.3 Host-key pinning
`StrictHostKeyChecking=accept-new` is too loose for a long-lived production identity (one-time MITM during first connect substitutes a different key forever). Instead:
- During Phase A, run `ssh-keyscan -p 22022 -t ed25519 100.99.98.203` on mLake.
- Capture the single output line. The host-key portion is identical to the `:22` entry — same sshd, same keys — but the `[100.99.98.203]:22022` prefix matters because OpenSSH's `known_hosts` is `host:port`-keyed for non-22 ports.
- Store as Dokploy secret `PALIADIN_KNOWN_HOSTS`.
- At container startup, write to `/tmp/paliadin-known_hosts` chmod 644.
- Pass to OpenSSH via `-o UserKnownHostsFile=/tmp/paliadin-known_hosts -o StrictHostKeyChecking=yes`.
If mRiver's host key ever rotates (rare; only on disk wipe / fresh OS), Phase A runs again and the secret is updated. SSH refuses to connect with a clear "host key changed" error, which surfaces as `mriver_unreachable` to the user — exactly the right blast-radius (loud failure, no silent connect to a substitute host).
### 5.4 The shim — `paliadin-shim`
A bash script on mRiver at `/home/m/.local/bin/paliadin-shim`. It is the **only** thing the paliad-prod key is allowed to invoke, and it dispatches on `$SSH_ORIGINAL_COMMAND`. Four RPCs (`health`, `bootstrap`, `run-turn`, `reset`):
```bash
#!/bin/bash
# paliadin-shim — server-side RPC for paliad's remote-tmux turns.
# Invoked via authorized_keys command= with $SSH_ORIGINAL_COMMAND set.
set -euo pipefail
umask 077
readonly TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}"
readonly RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}"
readonly TIMEOUT_S=60
readonly TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
mkdir -p "$RESPONSE_DIR"
# Parse $SSH_ORIGINAL_COMMAND. Format: "<verb> <arg1> <arg2> …"
read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}"
verb="${argv[0]:-}"
ensure_pane() {
if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
tmux new-session -d -s "$TMUX_SESSION"
fi
# Find or create the @paliadin-scope=chat window.
local target=""
while read -r idx; do
scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true)
if [[ "$scope" == "chat" ]]; then target="$TMUX_SESSION:$idx"; break; fi
done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}')
if [[ -z "$target" ]]; then
idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude)
target="$TMUX_SESSION:$idx"
# Wait for claude to settle (60s bound; matches Go waitForPaneReady).
for _ in $(seq 1 120); do
pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true)
if [[ "$pane" == *""* || "$pane" == *"│"* ]]; then break; fi
sleep 0.5
done
tmux set-window-option -t "$target" @paliadin-scope chat
tmux set-window-option -t "$target" @fix-name claude-paliadin
# Bootstrap system prompt — reuses the Go service's prompt text.
# The Go side sends this via the `bootstrap` RPC on first turn instead
# of duplicating the prompt here. See §6.4.
fi
echo "$target"
}
case "$verb" in
health)
# Liveness check — used by paliad to short-circuit when mRiver is offline.
# Returns "ok" iff tmux + claude are reachable.
tmux has-session -t "$TMUX_SESSION" 2>/dev/null \
|| tmux new-session -d -s "$TMUX_SESSION"
command -v claude >/dev/null && echo ok || { echo no-claude; exit 1; }
;;
bootstrap)
# First-turn-only: ensure pane exists and inject the system prompt.
# $1 = base64-encoded prompt body (avoids quoting hell).
target=$(ensure_pane)
prompt=$(printf '%s' "${argv[1]:?missing prompt}" | base64 -d)
tmux send-keys -t "$target" -l -- "$prompt"
tmux send-keys -t "$target" Enter
sleep 2 # give claude a moment to absorb
echo ok
;;
run-turn)
# $1 = turn_id (UUID); $2 = base64-encoded user message.
turn_id="${argv[1]:?missing turn_id}"
[[ "$turn_id" =~ $TURN_ID_RE ]] || { echo >&2 "bad turn_id"; exit 2; }
msg=$(printf '%s' "${argv[2]:?missing message}" | base64 -d)
target=$(ensure_pane)
out="$RESPONSE_DIR/$turn_id.txt"
rm -f "$out"
# Envelope matches what paliadin_prompt.go expects.
tmux send-keys -t "$target" -l -- "[PALIADIN:$turn_id] $msg"
tmux send-keys -t "$target" Enter
# Poll for the response file. Same shape as Go pollForResponse.
for _ in $(seq 1 $((TIMEOUT_S * 5))); do
if [[ -s "$out" ]]; then
sleep 0.05 # settle
cat "$out"
rm -f "$out"
exit 0
fi
sleep 0.2
done
echo >&2 "paliadin: response timeout after ${TIMEOUT_S}s"
exit 124
;;
reset)
# /clear the conversation; next turn starts fresh.
target=$(ensure_pane)
tmux send-keys -t "$target" -l -- "/clear"
tmux send-keys -t "$target" Enter
echo ok
;;
*)
echo >&2 "paliadin-shim: unknown verb '$verb'"
exit 2
;;
esac
```
Why a shim instead of raw tmux-over-SSH:
- One SSH round-trip per turn (~50 ms over tailnet) vs ~10–20 round-trips for the granular pattern.
- Argument validation lives in one place (UUID regex on turn_id, base64 for messages, fixed verb list) — easier to audit than a regex over `$SSH_ORIGINAL_COMMAND` matching `tmux send-keys …`.
- mRiver-side concerns (response polling, settle delays, pane-readiness) stay on mRiver, which is where the tmux state lives. The Go service stops caring about local file polling at all.
---
## 6. Sub-design C — Service-layer integration, routing, reliability
### 6.1 Interface split
The current `*PaliadinService` becomes an interface with two implementations: `LocalPaliadinService` (the existing tmux code, renamed) and `RemotePaliadinService` (the new SSH code). Construction picks one at startup based on `PALIADIN_REMOTE_HOST`.
```go
// internal/services/paliadin.go (after refactor)
type Paliadin interface {
RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error)
ResetSession(ctx context.Context) error
ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error)
Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error)
IsOwner(ctx context.Context, userID uuid.UUID) (bool, error)
}
// LocalPaliadinService wraps the current tmux PoC (laptop / dev path).
type LocalPaliadinService struct { /* identical to today's PaliadinService */ }
// RemotePaliadinService talks to a paliadin-shim over SSH on mRiver.
type RemotePaliadinService struct {
db *sqlx.DB
users *UserService
sshHost string // 100.99.98.203
sshPort int // 22022 — bypasses Tailscale SSH on :22 (see §4.5)
sshUser string // m
sshKeyPath string // /tmp/paliadin-id_ed25519-<rand>
knownHosts string // /tmp/paliadin-known_hosts
turnMu sync.Mutex
// Health-check cache.
healthMu sync.Mutex
healthOK bool
healthCheckedAt time.Time
}
```
DB access (`ListRecentTurns`, `Stats`, `IsOwner`) is identical for both — they only read `paliad.paliadin_turns`. They live in a shared `paliadinDB` helper struct embedded in both implementations.
### 6.2 Wiring at startup
```go
// cmd/server/main.go (excerpt)
var paliadin services.Paliadin
remoteHost := os.Getenv("PALIADIN_REMOTE_HOST")
switch {
case remoteHost != "":
keyPath, err := loadPaliadinSSHKey()
if err != nil { log.Fatalf("paliadin: load ssh key: %v", err) }
if keyPath == "" { log.Fatalf("paliadin: PALIADIN_REMOTE_HOST set but no PALIADIN_SSH_PRIVATE_KEY") }
knownHosts, err := loadPaliadinKnownHosts()
if err != nil { log.Fatalf("paliadin: load known_hosts: %v", err) }
port, _ := strconv.Atoi(cmpOr(os.Getenv("PALIADIN_REMOTE_PORT"), "22022"))
paliadin = services.NewRemotePaliadinService(db, userSvc, services.RemotePaliadinConfig{
SSHHost: remoteHost,
SSHPort: port,
SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"),
SSHKeyPath: keyPath,
KnownHostsPath: knownHosts,
})
log.Printf("paliadin: remote mode → ssh %s@%s:%d", "m", remoteHost, port)
case localTmuxAvailable():
paliadin = services.NewLocalPaliadinService(db, userSvc, "", "")
log.Printf("paliadin: local tmux mode")
default:
paliadin = services.NewDisabledPaliadinService(db, userSvc)
log.Printf("paliadin: disabled (no remote host, no local tmux)")
}
```
`NewDisabledPaliadinService` exists today implicitly via the `ErrTmuxUnavailable` path; making it explicit gives the constructor a clear name and the handler doesn't have to special-case `nil`.
### 6.3 SSH invocation pattern
`RemotePaliadinService` runs every RPC through the same helper:
```go
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
sshArgs := []string{
"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
"-i", s.sshKeyPath,
"-p", strconv.Itoa(s.sshPort), // 22022 — bypasses Tailscale SSH on :22
"-o", "IdentitiesOnly=yes", // don't fall back to other keys
"-o", "UserKnownHostsFile=" + s.knownHostsPath,
"-o", "StrictHostKeyChecking=yes",
"-o", "BatchMode=yes",
"-o", "ConnectTimeout=3",
"-o", "ServerAliveInterval=10",
"-o", "ServerAliveCountMax=3",
s.sshUser + "@" + s.sshHost,
"--",
}
sshArgs = append(sshArgs, args...)
c, cancel := context.WithTimeout(ctx, 70*time.Second) // shim has its own 60s; +10s for SSH overhead
defer cancel()
cmd := exec.CommandContext(c, "ssh", sshArgs...)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout; cmd.Stderr = &stderr
if err := cmd.Run(); err != nil {
return nil, fmt.Errorf("paliadin: ssh shim %v: %w (stderr: %s)", args, err, stderr.String())
}
return stdout.Bytes(), nil
}
```
`RunTurn` becomes:
```go
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
s.turnMu.Lock()
defer s.turnMu.Unlock()
if err := s.healthGate(ctx); err != nil {
return nil, err // ErrMRiverUnreachable, picked up by handler
}
turnID := uuid.New()
started := time.Now().UTC()
if err := s.insertTurnRow(ctx, ); err != nil { return nil, err }
// First-turn-only: bootstrap the system prompt on mRiver. Detected by
// checking whether any prior turn for this user has succeeded.
if err := s.ensureBootstrapped(ctx); err != nil {
_ = s.markTurnError(ctx, turnID, "bootstrap_failed")
return nil, err
}
msg := sanitiseForTmux(req.UserMessage)
msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
if err != nil {
_ = s.markTurnError(ctx, turnID, classifySSHError(err))
return nil, err
}
// Same trailer-parse + audit-row writes as Local, factored into shared helper.
return s.completeTurnFromBody(ctx, turnID, started, string(body))
}
```
### 6.4 System prompt bootstrap
The local PoC calls `paliadinSystemPrompt(s.responseDir)` once when it creates the pane. The remote path needs the same hook. Two options that don't require duplicating the German prompt body to mRiver:
- **Lazy bootstrap (chosen):** the first `RunTurn` after a paliad-prod restart sends the system prompt via `bootstrap` RPC, then runs the actual turn. Subsequent turns skip the bootstrap. State is per-process: `RemotePaliadinService.bootstrapped` boolean guarded by mutex.
- Eager bootstrap at startup is rejected — it forces every container start to wait for mRiver to be online, which couples paliad's boot to mRiver's availability.
Lazy bootstrap means the very first turn after a paliad redeploy pays a ~3 s extra cost (claude pane spin-up + system prompt absorb). Acceptable for a single-user PoC.
### 6.5 Health-check gating (`mriver_unreachable`)
Every `RunTurn` first calls `healthGate(ctx)`:
- Cached for 10 s. If last check was <10 s ago and was OK, skip the probe.
- Otherwise: `s.callShim(ctx, "health")` with a 3 s timeout. On success, set cache OK; on failure, return `ErrMRiverUnreachable`.
Why 10 s: short enough that "I just woke my laptop" propagates inside one user retry; long enough that a busy chat doesn't probe on every turn.
```go
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
// healthGate checks that the paliadin-shim on mRiver answers before a turn
// is attempted.
//
// A successful probe is cached: if the last probe succeeded less than 10 s
// ago, healthGate returns nil without touching the network. Failed probes
// are never cached (healthOK is reset), so the next call always re-probes.
//
// healthMu is held for the whole call, including the probe itself, so
// concurrent callers wait for one in-flight probe instead of each starting
// their own.
//
// Returns nil when the shim replied "ok". Otherwise returns an error that
// wraps ErrMRiverUnreachable (match with errors.Is), carrying either the
// transport error or the unexpected reply text.
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
	s.healthMu.Lock()
	defer s.healthMu.Unlock()

	if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second {
		return nil
	}

	// Bound the probe separately from the caller's deadline so an asleep
	// mRiver fails fast instead of consuming the whole turn timeout.
	probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
	defer cancel()

	out, err := s.callShim(probeCtx, "health")
	s.healthCheckedAt = time.Now()
	if err != nil {
		s.healthOK = false
		return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err)
	}
	// The call succeeded but the shim said something other than "ok".
	// Report the reply itself; err is nil here and would otherwise be
	// formatted as "<nil>".
	if reply := strings.TrimSpace(string(out)); reply != "ok" {
		s.healthOK = false
		return fmt.Errorf("%w: unexpected health reply %q", ErrMRiverUnreachable, reply)
	}
	s.healthOK = true
	return nil
}
```
### 6.6 Friendly error code (extends t-paliad-150)
`friendlyErrorMessage` already maps `tmux_unavailable` to a localised message. We add one new code:
- `mriver_unreachable` DE: *"mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit `./paliad`."* / EN: *"mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with `./paliad`."*
Implementation: one new `case` in the SSE-error switch in `frontend/src/client/paliadin.ts`'s `friendlyErrorMessage`, plus matching i18n keys (`paliadin.error.mriver_unreachable.de` / `.en`). Server-side: `paliadin` HTTP handler maps `errors.Is(err, services.ErrMRiverUnreachable)` to `event: error\ndata: {"code":"mriver_unreachable","message":"..."}\n\n`.
### 6.7 Rate limit
A runaway loop on the paliad side could DOS the SSH connection. Cheapest cap: enforce one in-flight turn at a time via `turnMu` (already exists in the local PoC). On top of that, a rolling cap of N=20 turns/min in `RemotePaliadinService` rejects with `ErrRateLimited` (mapped to a friendly `paliadin.error.rate_limited`). PoC has one user (m); the cap is a paranoid safety, not a real throttle.
### 6.8 What about ControlMaster?
Decision-2's chosen path (server-side shim with one RPC per turn) makes ControlMaster optional. The shim collapses ~10 raw-tmux ops into a single SSH connect that's already the latency win ControlMaster would buy.
Adding it on top would save ~30–50 ms per turn but adds:
- A persistent `~/.ssh/cm-*` socket inside the container.
- Cleanup logic on shutdown.
- A subtle interaction with the SSH BatchMode + ConnectTimeout settings.
Verdict: skip ControlMaster in v1. If turn latency over Tailscale is measured >300 ms in practice and hot enough to matter, add it in a follow-up; the call site is one helper.
---
## 7. Phasing
### Phase A — manual proof-of-concept (no Dockerfile change yet)
Goal: validate the round-trip end-to-end on a deployed paliad, before touching the image.
**Phase A.0 (DONE 2026-05-07 23:31):** SSH+shim end-to-end on the tailnet.
1. ✅ **Generate keypair** on mRiver: `ssh-keygen -t ed25519 -N "" -C "paliad-prod" -f ~/.paliad-staging/paliad-prod-key`. Fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`.
2. ✅ **Commit shim** to `scripts/paliadin-shim` and **install** at `/home/m/.local/bin/paliadin-shim`, `chmod 755`.
3. ✅ **Write authorized_keys** with public key + `command=`/`from="100.99.98.201"`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions (§5.2).
4. ✅ **Add port 22022 socket drop-in** at `/etc/systemd/system/ssh.socket.d/paliad.conf`, `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (Tailscale SSH for m) and `:22022` (real OpenSSH for paliad) listening (§4.5).
5. ✅ **Capture mRiver:22022 host key**: `ssh-keyscan -p 22022 -t ed25519 100.99.98.203 > ~/.paliad-staging/known_hosts` from mLake. Fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`.
6. ✅ **Smoke-test from mLake** (without paliad container, just raw ssh from mLake's host shell):
```
ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \
-o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \
-p 22022 m@100.99.98.203 health
→ ok
ssh … run-turn $(uuidgen) "$(printf 'Sag …' | base64 -w0)"
→ "test ok" (3.4 s round-trip including a real Claude response)
```
7. ✅ **from= rejection verified**: the same key from mRiver itself (`100.99.98.203`) → `Permission denied (publickey,password)` as expected.
**Phase A.5 (PENDING m's hands):** validate `network_mode: host` + traefik routing on prod paliad.de.
- Branch the live `docker-compose.yml` on a temp branch.
- Add `network_mode: host` to the `web` service; remove `expose: ["8080"]`.
- Push to trigger a Dokploy redeploy.
- `curl --connect-timeout 5 -sSI https://paliad.de/` — expect 200 (or login redirect), NOT 502.
- If 502: revert the temp branch (`git revert HEAD && git push`); revisit decision 1 in a follow-up issue.
- If 200: keep the host-mode change; ready for Phase B.
This is **m's call to execute** — it briefly touches prod paliad.de. Inventor/coder should not flip prod compose without explicit go-ahead. Rollback is one revert + redeploy.
**Phase A.6 (after A.5 passes):** smoke-test SSH from inside the paliad-prod container itself (the real container, not just the mLake host shell):
```
docker exec -it <paliad-container> sh
apk add --no-cache openssh-client # one-shot, before Dockerfile change
ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \
-o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \
-p 22022 m@100.99.98.203 health
# expected: "ok"
```
This proves the container's host-mode networking actually delivers a tailnet connect.
**Phase A.7:** wire env vars manually via Dokploy UI for one deploy; confirm `/paliadin` chat works against mRiver from paliad.de.
If A.5 fails: the design rolls back to a sidecar in a new issue (decision 1 follow-up). The SSH path (A.0) and traefik path (A.5) are independent — A.0 is already proven; only A.5+ is at risk.
### Phase B — bake into Dockerfile + Dokploy secrets
1. Dockerfile: add `openssh-client` to the final stage (§4.3).
2. compose: add `network_mode: host` and the five new env vars (§4.1).
3. Dokploy secrets: register `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m`, `PALIADIN_SSH_PRIVATE_KEY=...`, `PALIADIN_KNOWN_HOSTS=...`.
4. Code: refactor `PaliadinService` to the interface split (§6.1–§6.2). New file `internal/services/paliadin_remote.go`. Tests: `paliadin_remote_test.go` mocks `callShim` to verify `RunTurn` audit-row writes, error mapping, and `healthGate` caching.
5. Ship under one PR; tag t-paliad-151 done.
### Phase C — friendly errors + monitoring
1. `paliadin.error.mriver_unreachable` i18n keys + `friendlyErrorMessage` case (§6.6).
2. `/admin/paliadin` shows last health-probe result + last successful turn timestamp.
3. Optional: `mai-mesh` integration to surface mRiver-offline events to m on Telegram (out-of-band; not gating).
---
## 8. Security review summary
| Risk | Mitigation |
|---|---|
| Stolen private key → arbitrary SSH on mRiver | `command=` shim restriction + `from="100.99.98.201"` + ed25519 key + private key only in Dokploy secret store (encrypted at rest); paliad route uses port 22022 where real OpenSSH enforces all of the above |
| Stolen private key → tailnet-wide SSH from non-mLake host | `from="100.99.98.201"` clause (verified: rejected from mRiver itself in Phase A.0) |
| Tailscale SSH on `:22` bypasses `authorized_keys` | The paliad-prod key's `command=` restriction is not enforced on `:22`. Mitigation: paliad always dials `:22022`, which is real OpenSSH. m's interactive `tailscale ssh m@mriver` on `:22` continues to be governed by Tailscale ACLs, separate from paliad's identity. |
| Container compromise → key extraction | Key written to tmpfile chmod 600, only root inside container can read; alpine container has no shell-on-error trampolines |
| Host-key MITM during connect | Pinned `known_hosts`; `StrictHostKeyChecking=yes` |
| Shim argument injection (e.g. via `run-turn $(rm -rf /)`) | Shim parses positional args from `$SSH_ORIGINAL_COMMAND` via `read -r -a`; never passes args to a subshell `eval`; turn_id validated by UUID regex; message body always base64-decoded into a single shell variable, never re-evaluated |
| Runaway loop → SSH flood | Single-flight `turnMu` + 20/min rolling cap |
| `network_mode: host` widens blast radius | The `command=` + `from=` restrictions on mRiver mean container compromise = "can run shim verbs against mRiver only", not "shell on mRiver" |
| PaliadinOwnerEmail bypass | Unchanged from PoC: gate is in Go (`/paliadin` 404s for any other user). Even if mRiver SSH key leaks, attacker still needs a paliad session as the owner account (`services.PaliadinOwnerEmail`). |
---
## 9. Out-of-scope clarifications (for review)
These were called out in the issue but the design intentionally does not solve them, to keep v1 tight. Each is acknowledged so review knows it wasn't an oversight:
- **Wake-on-LAN of mRiver:** out of scope. v1's UX when mRiver is asleep is the friendly error from §6.6. Future work: integrate with `mai-mesh` capability fallback.
- **Multi-host failover:** out of scope. Only mRiver is targeted.
- **Anthropic API fallback when mRiver offline:** out of scope per CLAUDE.md (`ANTHROPIC_API_KEY` reserved for production-v1, unused in PoC).
- **ControlMaster:** v1 ships without; revisit if turn latency >300 ms in practice (§6.8).
---
## 10. File-level deliverables (for the coder shift)
When this design is approved and the coder shift starts, the work splits roughly into:
- `Dockerfile` — `+openssh-client`.
- `docker-compose.yml` — `network_mode: host`, five new env entries (`PALIADIN_REMOTE_HOST`, `PALIADIN_REMOTE_PORT`, `PALIADIN_REMOTE_USER`, `PALIADIN_SSH_PRIVATE_KEY`, `PALIADIN_KNOWN_HOSTS`).
- `internal/services/paliadin.go` — extract `Paliadin` interface; rename existing to `LocalPaliadinService`; pull DB-only methods (`ListRecentTurns`, `Stats`, `IsOwner`) into a shared embedded `paliadinDB` so both implementations get them for free.
- `internal/services/paliadin_remote.go` — new file: `RemotePaliadinService`, `RemotePaliadinConfig` (with `SSHPort`), `callShim`, `healthGate`, `ensureBootstrapped`, `classifySSHError`, `ErrMRiverUnreachable`.
- `internal/services/paliadin_remote_test.go` — unit tests with a mocked `callShim`.
- `cmd/server/main.go` — env-var-based wiring (§6.2), `loadPaliadinSSHKey`, `loadPaliadinKnownHosts`, `PALIADIN_REMOTE_PORT` parse with default `22022`.
- `frontend/src/client/paliadin.ts` — one `case` in `friendlyErrorMessage` for `mriver_unreachable`.
- `frontend/src/i18n.ts` — two new keys (`paliadin.error.mriver_unreachable.de` / `.en`).
- `scripts/paliadin-shim` — server-side script (§5.4); already shipped + installed on mRiver during Phase A.0, not part of any container. Repo location chosen so the security-relevant script is version-controlled.
- `docs/project-status.md` — note Phase 0.5 (PoC) → Phase 0.6 (Tailscale-SSH prod route).
- **mRiver host setup (one-time, already done in Phase A.0):** `/etc/systemd/system/ssh.socket.d/paliad.conf` (port 22022 listen drop-in); `~/.ssh/authorized_keys` (paliad-prod public key with restrictions); `/home/m/.local/bin/paliadin-shim` (executable). These are NOT in the repo because they live on m's laptop; `docs/project-status.md` should reference them.
No DB migrations needed — `paliad.paliadin_turns` schema already covers everything (`error_code` field already accepts free-form codes including `mriver_unreachable`).
---
## 11. Open questions for review
- **Q (m), still open:** Phase A.5 (traefik+host-mode on prod paliad.de) is not yet executed. m drives this; rollback is one revert. Dokploy doc check before flipping is recommended but not blocking.
- **Q (m), resolved 2026-05-07 23:50:** shim location → repo (`scripts/paliadin-shim`, committed in `0248411`). Version-controlled and auditable.
- **Q (m), still open:** `ANTHROPIC_API_KEY` env var reservation in compose comments — keep for production-v1, or strip now? Not blocking either phase; defer.
---
## 12. Phase A.0 completion summary (2026-05-07 23:50)
**Coder shift (noether) executed Phase A.0 in full:**
1. ✅ shim committed at `scripts/paliadin-shim` (commit `0248411`, repo-version-controlled)
2. ✅ shim installed at `/home/m/.local/bin/paliadin-shim` on mRiver
3. ✅ ed25519 keypair `paliad-prod` generated, public-key fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`, private key staged at `~/.paliad-staging/paliad-prod-key` on mRiver (mode 600)
4. ✅ `~/.ssh/authorized_keys` written with `command=`/`from=`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions
5. ✅ `ssh.socket` drop-in installed at `/etc/systemd/system/ssh.socket.d/paliad.conf`; both `:22` and `:22022` listening
6. ✅ host key for `:22022` captured at `~/.paliad-staging/known_hosts` (fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`)
7. ✅ end-to-end SSH+shim+Claude run-turn validated from mLake → mRiver:22022 (3.4 s round-trip)
8. ✅ `from="100.99.98.201"` rejection verified
**Three secrets ready for Dokploy registration** (m to copy from `~/.paliad-staging/` on mRiver):
- `PALIADIN_SSH_PRIVATE_KEY` ← `cat ~/.paliad-staging/paliad-prod-key`
- `PALIADIN_KNOWN_HOSTS` ← `cat ~/.paliad-staging/known_hosts`
- `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m`
**Phase A.5 (traefik+host-mode test) and Phase A.6/A.7 (in-container SSH smoke + paliad/paliadin end-to-end) await m's hands** — they touch prod paliad.de.
**Phase B (Dockerfile + Go interface split + Dokploy secrets) is unblocked from a code perspective** — but should not merge until Phase A.5 confirms the host-mode networking trade-off is acceptable.
---
**Inventor design + coder Phase A.0 complete.** Awaiting m for Phase A.5 traefik validation before the coder writes the Go interface split.

View File

@@ -1558,6 +1558,10 @@ const translations: Record<Lang, Record<string, string>> = {
"paliadin.stop": "Stop", "paliadin.stop": "Stop",
"paliadin.reset": "Neue Unterhaltung", "paliadin.reset": "Neue Unterhaltung",
"paliadin.error.local_only": "Paliadin läuft nur lokal. Diese Instanz hat kein tmux/claude installiert — lokal mit ./paliad starten.", "paliadin.error.local_only": "Paliadin läuft nur lokal. Diese Instanz hat kein tmux/claude installiert — lokal mit ./paliad starten.",
"paliadin.error.mriver_unreachable": "mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit ./paliad.",
"paliadin.error.shim_auth_failed": "Paliadin-Authentifizierung fehlgeschlagen. SSH-Schlüssel oder Berechtigung auf mRiver prüfen.",
"paliadin.error.shim_error": "Paliadin-Fehler auf mRiver. tmux/claude-Pane prüfen.",
"paliadin.error.timeout": "Paliadin antwortet nicht (Timeout 60s). Nochmal versuchen.",
"paliadin.error.connection_lost": "Verbindung verloren.", "paliadin.error.connection_lost": "Verbindung verloren.",
"paliadin.error.upstream": "Fehler beim Senden.", "paliadin.error.upstream": "Fehler beim Senden.",
"nav.admin.paliadin": "Paliadin Monitor", "nav.admin.paliadin": "Paliadin Monitor",
@@ -3553,6 +3557,10 @@ const translations: Record<Lang, Record<string, string>> = {
"paliadin.stop": "Stop", "paliadin.stop": "Stop",
"paliadin.reset": "New conversation", "paliadin.reset": "New conversation",
"paliadin.error.local_only": "Paliadin only runs locally. This instance has no tmux/claude installed — start it locally via ./paliad.", "paliadin.error.local_only": "Paliadin only runs locally. This instance has no tmux/claude installed — start it locally via ./paliad.",
"paliadin.error.mriver_unreachable": "mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with ./paliad.",
"paliadin.error.shim_auth_failed": "Paliadin auth failed. Check the SSH key or authorized_keys on mRiver.",
"paliadin.error.shim_error": "Paliadin error on mRiver. Check the tmux/claude pane.",
"paliadin.error.timeout": "Paliadin didn't respond in time (60s). Try again.",
"paliadin.error.connection_lost": "Connection lost.", "paliadin.error.connection_lost": "Connection lost.",
"paliadin.error.upstream": "Send failed.", "paliadin.error.upstream": "Send failed.",
"nav.admin.paliadin": "Paliadin Monitor", "nav.admin.paliadin": "Paliadin Monitor",

View File

@@ -210,8 +210,24 @@ function friendlyErrorMessage(data: unknown): string {
} }
try { try {
const parsed = JSON.parse(data) as { code?: string }; const parsed = JSON.parse(data) as { code?: string };
if (parsed.code === "tmux_unavailable") { switch (parsed.code) {
return t("paliadin.error.local_only"); case "tmux_unavailable":
// Local PoC path: paliad is running on a host without tmux/claude
// (typically the legacy laptop-only build).
return t("paliadin.error.local_only");
case "mriver_unreachable":
// t-paliad-151: prod path's mRiver is offline (laptop asleep, off
// tailnet, or paliadin-shim missing).
return t("paliadin.error.mriver_unreachable");
case "shim_auth_failed":
// SSH key wrong or authorized_keys drifted.
return t("paliadin.error.shim_auth_failed");
case "shim_error":
case "bootstrap_failed":
// Generic remote shim failure or system-prompt bootstrap error.
return t("paliadin.error.shim_error");
case "timeout":
return t("paliadin.error.timeout");
} }
} catch { } catch {
// Not JSON — fall through to the generic connection-lost message // Not JSON — fall through to the generic connection-lost message

View File

@@ -1423,6 +1423,10 @@ export type I18nKey =
| "paliadin.empty" | "paliadin.empty"
| "paliadin.error.connection_lost" | "paliadin.error.connection_lost"
| "paliadin.error.local_only" | "paliadin.error.local_only"
| "paliadin.error.mriver_unreachable"
| "paliadin.error.shim_auth_failed"
| "paliadin.error.shim_error"
| "paliadin.error.timeout"
| "paliadin.error.upstream" | "paliadin.error.upstream"
| "paliadin.heading" | "paliadin.heading"
| "paliadin.input.placeholder" | "paliadin.input.placeholder"

View File

@@ -69,10 +69,12 @@ type Services struct {
Pin *services.PinService Pin *services.PinService
CardLayout *services.CardLayoutService CardLayout *services.CardLayoutService
// Paliadin is wired only when PALIADIN_ENABLED=true at boot // Paliadin is wired when DATABASE_URL is set. The concrete backend
// (PoC; m's laptop only). On prod it stays nil and all /paliadin* // is picked in cmd/server/main.go based on PALIADIN_REMOTE_HOST
// routes 404 because Register() skips registering them. // (remote → mRiver via SSH) or local tmux availability. Stays nil
Paliadin *services.PaliadinService // without DATABASE_URL; in that case the per-request handler gate
// 404s anyway.
Paliadin services.Paliadin
} }
func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc *Services) { func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc *Services) {

View File

@@ -39,10 +39,11 @@ func newDetachedContext(timeout time.Duration) (context.Context, context.CancelF
return context.WithTimeout(context.Background(), timeout) return context.WithTimeout(context.Background(), timeout)
} }
// paliadinSvc is the live PaliadinService instance. nil when // paliadinSvc is the live Paliadin backend. nil when DATABASE_URL was
// DATABASE_URL was unset (the service depends on the audit table). // unset (the service depends on the audit table). Set by Register() at
// Set by Register() at boot. // boot. The concrete type is decided in cmd/server/main.go: local-tmux
var paliadinSvc *services.PaliadinService // PoC, remote-via-SSH (mRiver), or a disabled stub.
var paliadinSvc services.Paliadin
// requirePaliadinOwner gates every paliadin handler to the single // requirePaliadinOwner gates every paliadin handler to the single
// owner email (services.PaliadinOwnerEmail = m). Anyone else gets a // owner email (services.PaliadinOwnerEmail = m). Anyone else gets a

View File

@@ -1,23 +1,23 @@
package services package services
// PaliadinService — Phase 0 PoC of the in-app AI buddy (t-paliad-146). // Paliadin — the in-app AI buddy. Two implementations of the same
// interface, picked at boot time (see cmd/server/main.go):
// //
// Design: docs/design-paliadin-2026-05-07.md §0.5 (PoC track). // - LocalPaliadinService — talks to a `claude` CLI in a local tmux
// session. The PoC path (t-paliad-146); used on m's laptop.
// - RemotePaliadinService — shells out to ssh on mRiver where the
// long-lived tmux+claude pane lives. The prod path (t-paliad-151);
// used by the paliad.de Dokploy container, which has no `claude`
// CLI of its own.
// //
// Architecture: a long-lived `claude` process inside a tmux session. // Designs:
// Prompts go in via `tmux send-keys -l`; responses come back via a // - docs/design-paliadin-2026-05-07.md (PoC architecture)
// per-turn file the system prompt instructs Claude to write // - docs/design-paliadin-tailscale-ssh-2026-05-07.md (remote routing)
// (Write(/tmp/paliadin/{turn_id}.txt)). The service polls that file,
// strips the [paliadin-meta] trailer block, parses the metadata, writes
// an audit row, and emits the response back to the SSE handler.
// //
// The architecture is lifted (with adaptation to Go) from // Both implementations share the audit-table I/O (paliadinDB) and the
// ~/dev/mVoice/server.py:250-380, which has been driving the goldi voice // trailer parser. The conversation state (turn ordering, response file
// surface in production since 2026-Q1. // polling) is split: Local owns the tmux pane directly; Remote delegates
// // to the paliadin-shim on mRiver and reads the file there.
// PoC ONLY runs on m's laptop (PALIADIN_ENABLED=false on prod default).
// Hardcoded single-user, single-tmux-window scope. Do not attempt to
// deploy this to the Dokploy container — there is no `claude` CLI there.
import ( import (
"bytes" "bytes"
@@ -50,12 +50,36 @@ import (
// path to enabling Paliadin. // path to enabling Paliadin.
const PaliadinOwnerEmail = "matthias.siebels@hoganlovells.com" const PaliadinOwnerEmail = "matthias.siebels@hoganlovells.com"
// PaliadinService manages the tmux-claude PoC. // Paliadin is the interface every Paliadin backend implements. Two
type PaliadinService struct { // production implementations: LocalPaliadinService (local tmux+claude)
db *sqlx.DB // and RemotePaliadinService (ssh+paliadin-shim on mRiver). A
// DisabledPaliadinService stub is constructed when neither is available
// so callers don't have to nil-check on every entry point.
type Paliadin interface {
// RunTurn runs a single chat turn described by req and returns its
// result, or an error if the turn could not be completed.
RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error)
// ResetSession makes the next turn start from a clean conversation
// (the local backend does this by sending `/clear` to the Claude pane).
ResetSession(ctx context.Context) error
// ListRecentTurns reads the last `limit` turns visible to callerID.
// The shared paliadinDB implementation falls back to 50 when limit is
// <= 0 or > 200.
ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error)
// Stats computes the dashboard aggregate over the turns visible to
// callerID.
Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error)
// IsOwner reports whether userID's email matches PaliadinOwnerEmail.
// It returns (false, nil) for any other user and errors only on DB
// failure.
IsOwner(ctx context.Context, userID uuid.UUID) (bool, error)
}
// paliadinDB is the audit-table read/write surface shared by every
// Paliadin implementation. Embedded in LocalPaliadinService and
// RemotePaliadinService so they inherit IsOwner / ListRecentTurns /
// Stats and the per-turn row writers without duplication.
type paliadinDB struct {
db *sqlx.DB
users *UserService
}
// LocalPaliadinService runs the local tmux+claude PoC (t-paliad-146).
// Hardcoded single-user, single-tmux-window scope. Used on m's laptop;
// not deployed to prod (the Dokploy container has no `claude` CLI —
// see RemotePaliadinService for that path).
type LocalPaliadinService struct {
paliadinDB
tmuxSession string tmuxSession string
responseDir string responseDir string
users *UserService
// Cached pane target ("session:window-idx") once the voice window is // Cached pane target ("session:window-idx") once the voice window is
// either discovered or created. Reset to "" if the pane dies. // either discovered or created. Reset to "" if the pane dies.
@@ -74,7 +98,7 @@ type PaliadinService struct {
// //
// Returns (false, nil) for any other user — including unknown UUIDs and // Returns (false, nil) for any other user — including unknown UUIDs and
// users without an email row. Errors only on DB failure. // users without an email row. Errors only on DB failure.
func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) { func (s *paliadinDB) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) {
var email string var email string
err := s.db.QueryRowxContext(ctx, err := s.db.QueryRowxContext(ctx,
`SELECT email FROM paliad.users WHERE id = $1`, userID).Scan(&email) `SELECT email FROM paliad.users WHERE id = $1`, userID).Scan(&email)
@@ -87,19 +111,19 @@ func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool,
return strings.EqualFold(email, PaliadinOwnerEmail), nil return strings.EqualFold(email, PaliadinOwnerEmail), nil
} }
// NewPaliadinService wires the service. Call only when PALIADIN_ENABLED=true. // NewLocalPaliadinService wires the local-tmux PoC backend. Falls back
func NewPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *PaliadinService { // to default tmux session + response dir when env vars are empty.
func NewLocalPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *LocalPaliadinService {
if tmuxSession == "" { if tmuxSession == "" {
tmuxSession = "paliad-paliadin" tmuxSession = "paliad-paliadin"
} }
if responseDir == "" { if responseDir == "" {
responseDir = "/tmp/paliadin" responseDir = "/tmp/paliadin"
} }
return &PaliadinService{ return &LocalPaliadinService{
db: db, paliadinDB: paliadinDB{db: db, users: users},
tmuxSession: tmuxSession, tmuxSession: tmuxSession,
responseDir: responseDir, responseDir: responseDir,
users: users,
} }
} }
@@ -156,7 +180,7 @@ var ErrTmuxUnavailable = errors.New("paliadin: tmux unavailable")
// //
// PoC: serialised. The package-level turnMu enforces "one at a time". // PoC: serialised. The package-level turnMu enforces "one at a time".
// m is the only user, so this is fine. // m is the only user, so this is fine.
func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { func (s *LocalPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
s.turnMu.Lock() s.turnMu.Lock()
defer s.turnMu.Unlock() defer s.turnMu.Unlock()
@@ -238,7 +262,7 @@ func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnRe
// ResetSession sends `/clear` to the Claude pane so the next turn starts // ResetSession sends `/clear` to the Claude pane so the next turn starts
// from a clean conversation. Used by the "New conversation" button. // from a clean conversation. Used by the "New conversation" button.
func (s *PaliadinService) ResetSession(ctx context.Context) error { func (s *LocalPaliadinService) ResetSession(ctx context.Context) error {
s.mu.Lock() s.mu.Lock()
target := s.paneTarget target := s.paneTarget
s.mu.Unlock() s.mu.Unlock()
@@ -254,7 +278,7 @@ func (s *PaliadinService) ResetSession(ctx context.Context) error {
// ListRecentTurns reads the last N turns visible to the caller. // ListRecentTurns reads the last N turns visible to the caller.
// global_admin sees everything; everyone else sees their own. // global_admin sees everything; everyone else sees their own.
func (s *PaliadinService) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) { func (s *paliadinDB) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) {
if limit <= 0 || limit > 200 { if limit <= 0 || limit > 200 {
limit = 50 limit = 50
} }
@@ -302,7 +326,7 @@ type PaliadinPromptCount struct {
// Stats computes the dashboard aggregate. global_admin sees everything; // Stats computes the dashboard aggregate. global_admin sees everything;
// everyone else sees their own slice (PoC has only m, but the policy // everyone else sees their own slice (PoC has only m, but the policy
// matches RLS on the table). // matches RLS on the table).
func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) { func (s *paliadinDB) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) {
stats := &PaliadinStats{ stats := &PaliadinStats{
ByClassifier: map[string]int{}, ByClassifier: map[string]int{},
DailyCounts: []PaliadinDailyCount{}, DailyCounts: []PaliadinDailyCount{},
@@ -404,7 +428,7 @@ func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*Palia
// ensurePane returns the tmux target ("session:window-idx") of the live // ensurePane returns the tmux target ("session:window-idx") of the live
// Claude pane, creating both session and window if missing. // Claude pane, creating both session and window if missing.
func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) { func (s *LocalPaliadinService) ensurePane(ctx context.Context) (string, error) {
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
@@ -468,7 +492,7 @@ func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) {
return target, nil return target, nil
} }
func (s *PaliadinService) findChatWindow(ctx context.Context) string { func (s *LocalPaliadinService) findChatWindow(ctx context.Context) string {
out, err := runTmuxOut(ctx, "list-windows", "-t", s.tmuxSession, out, err := runTmuxOut(ctx, "list-windows", "-t", s.tmuxSession,
"-F", "#{window_index}") "-F", "#{window_index}")
if err != nil { if err != nil {
@@ -485,14 +509,14 @@ func (s *PaliadinService) findChatWindow(ctx context.Context) string {
return "" return ""
} }
func (s *PaliadinService) paneAlive(ctx context.Context, target string) bool { func (s *LocalPaliadinService) paneAlive(ctx context.Context, target string) bool {
if err := runTmux(ctx, "has-session", "-t", target); err != nil { if err := runTmux(ctx, "has-session", "-t", target); err != nil {
return false return false
} }
return true return true
} }
func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error { func (s *LocalPaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error {
deadline := time.Now().Add(timeout) deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) { for time.Now().Before(deadline) {
select { select {
@@ -509,7 +533,7 @@ func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, t
return fmt.Errorf("pane %s not ready within %s", target, timeout) return fmt.Errorf("pane %s not ready within %s", target, timeout)
} }
func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) error { func (s *LocalPaliadinService) sendToPane(ctx context.Context, target, msg string) error {
// `-l` sends the message literally (no key parsing) — necessary so // `-l` sends the message literally (no key parsing) — necessary so
// our prompt's special characters don't get interpreted. // our prompt's special characters don't get interpreted.
if err := runTmux(ctx, "send-keys", "-t", target, "-l", msg); err != nil { if err := runTmux(ctx, "send-keys", "-t", target, "-l", msg); err != nil {
@@ -527,7 +551,7 @@ func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) er
// over from earlier turns) as a non-event — the file existing without a // over from earlier turns) as a non-event — the file existing without a
// fresh mtime is a corner case the caller already de-duplicates by // fresh mtime is a corner case the caller already de-duplicates by
// having a unique turn_id per request. // having a unique turn_id per request.
func (s *PaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) { func (s *LocalPaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) {
deadline := time.Now().Add(timeout) deadline := time.Now().Add(timeout)
for time.Now().Before(deadline) { for time.Now().Before(deadline) {
select { select {
@@ -687,7 +711,7 @@ func countChips(s string) int {
// audit-row writers. // audit-row writers.
// ============================================================================= // =============================================================================
func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) error { func (s *paliadinDB) insertTurnRow(ctx context.Context, t *PaliadinTurn) error {
q := ` q := `
INSERT INTO paliad.paliadin_turns ( INSERT INTO paliad.paliadin_turns (
turn_id, user_id, session_id, started_at, user_message, page_origin turn_id, user_id, session_id, started_at, user_message, page_origin
@@ -698,7 +722,7 @@ func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) er
return err return err
} }
func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID, func (s *paliadinDB) completeTurn(ctx context.Context, turnID uuid.UUID,
finishedAt time.Time, durationMS int, response string, tokens int, finishedAt time.Time, durationMS int, response string, tokens int,
meta trailerMeta, chipCount int) error { meta trailerMeta, chipCount int) error {
rowsSeen := make(pq.Int64Array, 0, len(meta.RowsSeen)) rowsSeen := make(pq.Int64Array, 0, len(meta.RowsSeen))
@@ -724,7 +748,7 @@ func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID,
return err return err
} }
func (s *PaliadinService) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error { func (s *paliadinDB) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error {
finished := time.Now().UTC() finished := time.Now().UTC()
q := ` q := `
UPDATE paliad.paliadin_turns UPDATE paliad.paliadin_turns
@@ -735,7 +759,7 @@ func (s *PaliadinService) markTurnError(ctx context.Context, turnID uuid.UUID, c
return err return err
} }
func (s *PaliadinService) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error { func (s *paliadinDB) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error {
finished := time.Now().UTC() finished := time.Now().UTC()
q := ` q := `
UPDATE paliad.paliadin_turns UPDATE paliad.paliadin_turns

View File

@@ -0,0 +1,322 @@
package services
// RemotePaliadinService — the prod path of the Paliadin backend.
//
// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md.
//
// Where the local backend (LocalPaliadinService) drives a tmux+claude
// pane in-process, the remote backend shells out to ssh m@mriver
// paliadin-shim — the script at scripts/paliadin-shim, installed at
// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the
// tmux+claude pane on mRiver; this Go side just wraps each turn in one
// SSH call.
//
// The path was chosen so paliad.de (deployed in a Dokploy container on
// mLake, no `claude` CLI of its own) can keep using m's Claude Code
// subscription instead of paying API tokens. Tailscale provides the
// transport — mLake's tailscale0 interface is shared into the container
// via network_mode: host (compose layer; not this file's concern).
//
// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When
// that env var is unset, the binary falls back to LocalPaliadinService
// (or DisabledPaliadinService if neither tmux nor remote is available).
import (
"bytes"
"context"
"encoding/base64"
"errors"
"fmt"
"log"
"os/exec"
"strconv"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/jmoiron/sqlx"
)
// ErrMRiverUnreachable signals that the remote paliadin-shim could not
// be contacted within the health-check window. The handler maps this to
// the friendly mriver_unreachable error code (see frontend
// friendlyErrorMessage).
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
// RemotePaliadinConfig carries the SSH connection settings that
// cmd/server/main.go hands to NewRemotePaliadinService.
//
// A zero SSHPort or an empty SSHUser is replaced with a default by the
// constructor; the other fields are handed to ssh as given.
type RemotePaliadinConfig struct {
	// SSHHost is the address ssh connects to, e.g. 100.99.98.203
	// (mRiver's tailnet IP).
	SSHHost string
	// SSHPort is the sshd port, e.g. 22022, which bypasses Tailscale
	// SSH on :22 (design §4.5).
	SSHPort int
	// SSHUser is the remote login name, e.g. m.
	SSHUser string
	// SSHKeyPath is the private key passed to ssh -i, e.g.
	// /tmp/paliadin-id_ed25519-<rand> (chmod 600).
	SSHKeyPath string
	// KnownHostsPath is used as UserKnownHostsFile, e.g.
	// /tmp/paliadin-known_hosts.
	KnownHostsPath string
}
// RemotePaliadinService is the Paliadin backend that forwards every
// operation to paliadin-shim on a remote host through ssh.
type RemotePaliadinService struct {
	paliadinDB

	// cfg holds the SSH connection settings with the constructor's
	// defaults already applied.
	cfg RemotePaliadinConfig

	// turnMu admits one turn at a time: the claude pane on mRiver is
	// single-user, so turns are serialised just as
	// LocalPaliadinService serialises them.
	turnMu sync.Mutex

	// healthMu guards the cached result of the last health probe.
	// While the last probe succeeded less than 10 seconds ago, RunTurn
	// does not probe mRiver again.
	healthMu        sync.Mutex
	healthOK        bool
	healthCheckedAt time.Time

	// bootstrapMu guards bootstrapped, which records that the system
	// prompt has already been sent to the claude pane since this
	// process started, so later turns do not re-send it.
	bootstrapMu  sync.Mutex
	bootstrapped bool

	// callShimHook is a test seam: when non-nil, callShim delegates to
	// it instead of exec'ing ssh. Production code never sets it.
	callShimHook func(ctx context.Context, args ...string) ([]byte, error)
}
// NewRemotePaliadinService builds the remote backend from cfg, filling
// in port 22022 when SSHPort is zero and user "m" when SSHUser is empty.
//
// Call it only when PALIADIN_REMOTE_HOST is set in the environment. The
// constructor performs no I/O; mRiver is first probed by healthGate
// during the first RunTurn.
func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService {
	const (
		defaultSSHPort = 22022
		defaultSSHUser = "m"
	)

	svc := &RemotePaliadinService{
		paliadinDB: paliadinDB{db: db, users: users},
		cfg:        cfg,
	}
	if svc.cfg.SSHPort == 0 {
		svc.cfg.SSHPort = defaultSSHPort
	}
	if svc.cfg.SSHUser == "" {
		svc.cfg.SSHUser = defaultSSHUser
	}
	return svc
}
// RunTurn drives one question/answer round against the remote claude
// pane and records it in the turn audit table.
//
// Sequence: insert the audit row, pass the health gate, make sure the
// system prompt has been sent, then hand the sanitised, base64-encoded
// user message to the shim's run-turn verb. The shim's stdout is split
// into the visible answer and its trailer metadata.
//
// Turns are serialised by turnMu. On failure the audit row is marked
// with a stable error code (mriver_unreachable, bootstrap_failed, or
// whatever classifySSHError yields) and the underlying error is
// returned unchanged. Failing to finalise the audit row is logged and
// never changes the outcome reported to the caller.
//
// Audit updates issued after the row has been inserted run on a context
// detached from ctx. A turn usually fails precisely because ctx was
// cancelled or timed out; reusing ctx would make the follow-up UPDATE
// fail as well and leave the row unfinished.
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
	// Upper bound for one audit-row update issued after the turn ended.
	const auditWriteTimeout = 5 * time.Second

	s.turnMu.Lock()
	defer s.turnMu.Unlock()

	turnID := uuid.New()
	startedAt := time.Now().UTC()

	// Audit row first — leave traces even if we crash mid-turn.
	if err := s.insertTurnRow(ctx, &PaliadinTurn{
		TurnID:      turnID,
		UserID:      req.UserID,
		SessionID:   req.SessionID,
		StartedAt:   startedAt,
		UserMessage: req.UserMessage,
		PageOrigin:  optionalString(req.PageOrigin),
	}); err != nil {
		return nil, fmt.Errorf("paliadin: insert turn row: %w", err)
	}

	// fail stamps the audit row with code and hands cause back to the
	// caller untouched.
	fail := func(code string, cause error) (*TurnResult, error) {
		auditCtx, cancel := context.WithTimeout(context.Background(), auditWriteTimeout)
		defer cancel()
		if err := s.markTurnError(auditCtx, turnID, code); err != nil {
			log.Printf("paliadin: mark turn %s as %s: %v", turnID, code, err)
		}
		return nil, cause
	}

	// Health-gate before paying the cost of a real turn. A successful
	// probe is cached for 10 s so a fast back-to-back chat doesn't
	// probe every time.
	if err := s.healthGate(ctx); err != nil {
		return fail("mriver_unreachable", err)
	}

	// Lazy bootstrap — the first turn sends the system prompt;
	// subsequent turns skip.
	if err := s.ensureBootstrapped(ctx); err != nil {
		return fail("bootstrap_failed", err)
	}

	msg := sanitiseForTmux(req.UserMessage)
	msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
	body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
	if err != nil {
		return fail(classifySSHError(err), err)
	}

	// Same trailer parse + audit completion as the local path.
	cleanBody, meta := splitTrailer(string(body))
	tokens := approxTokenCount(cleanBody)
	chipCount := countChips(cleanBody)
	finished := time.Now().UTC()
	durationMS := int(finished.Sub(startedAt) / time.Millisecond)

	auditCtx, cancel := context.WithTimeout(context.Background(), auditWriteTimeout)
	defer cancel()
	if err := s.completeTurn(auditCtx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil {
		log.Printf("paliadin: complete turn %s: %v", turnID, err)
	}

	return &TurnResult{
		TurnID:        turnID,
		Response:      cleanBody,
		UsedTools:     meta.UsedTools,
		RowsSeen:      meta.RowsSeen,
		ChipCount:     chipCount,
		ClassifierTag: meta.ClassifierTag,
		DurationMS:    durationMS,
	}, nil
}
// ResetSession sends `/clear` to the remote claude pane so the next
// turn starts a fresh conversation.
//
// It takes turnMu, so the `/clear` keystrokes cannot be typed into the
// pane while RunTurn is waiting for an answer there; a reset requested
// during a turn waits for that turn to finish.
//
// After a successful reset the bootstrapped flag is dropped. The system
// prompt reaches claude as an ordinary pane message (see
// ensureBootstrapped), so clearing the conversation discards it and the
// next RunTurn has to send it again.
func (s *RemotePaliadinService) ResetSession(ctx context.Context) error {
	s.turnMu.Lock()
	defer s.turnMu.Unlock()

	if _, err := s.callShim(ctx, "reset"); err != nil {
		return fmt.Errorf("paliadin: reset: %w", err)
	}

	// Lock order turnMu -> bootstrapMu matches RunTurn.
	s.bootstrapMu.Lock()
	s.bootstrapped = false
	s.bootstrapMu.Unlock()
	return nil
}
// healthGate reports whether the remote shim is usable, probing it with
// the `health` verb unless a successful probe is less than 10 s old.
//
// Only success is cached: after a failed probe, or a reply other than
// "ok" (surrounding whitespace ignored), the next call probes again.
// Every failure wraps ErrMRiverUnreachable.
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
	const (
		cacheTTL     = 10 * time.Second
		probeTimeout = 3 * time.Second
	)

	s.healthMu.Lock()
	defer s.healthMu.Unlock()

	stillFresh := time.Since(s.healthCheckedAt) < cacheTTL
	if s.healthOK && stillFresh {
		return nil
	}

	probeCtx, cancel := context.WithTimeout(ctx, probeTimeout)
	defer cancel()
	reply, probeErr := s.callShim(probeCtx, "health")
	s.healthCheckedAt = time.Now()

	var gateErr error
	switch {
	case probeErr != nil:
		gateErr = fmt.Errorf("%w: %v", ErrMRiverUnreachable, probeErr)
	case strings.TrimSpace(string(reply)) != "ok":
		gateErr = fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(reply))
	}
	s.healthOK = gateErr == nil
	return gateErr
}
// ensureBootstrapped delivers the Paliadin system prompt to the remote
// claude pane unless this service has already done so.
//
// The prompt is base64-encoded and passed to the shim's `bootstrap`
// verb. The bootstrapped flag is set only after the shim call succeeds,
// so a failed attempt is retried by the next call.
func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error {
	s.bootstrapMu.Lock()
	defer s.bootstrapMu.Unlock()

	if !s.bootstrapped {
		systemPrompt := paliadinSystemPrompt("/tmp/paliadin")
		encoded := base64.StdEncoding.EncodeToString([]byte(systemPrompt))
		_, err := s.callShim(ctx, "bootstrap", encoded)
		if err != nil {
			return fmt.Errorf("paliadin: bootstrap: %w", err)
		}
		s.bootstrapped = true
	}
	return nil
}
// callShim runs `ssh <user>@<host> -- <verb> <args...>` against the
// paliadin-shim and returns the shim's stdout.
//
// The shim's authorized_keys command= directive ensures the verb + args
// are passed via $SSH_ORIGINAL_COMMAND regardless of what we put after
// the `--`; we keep the explicit argv form anyway so reading the code
// at the call site is unambiguous.
//
// Every call is capped at 70 s on top of whatever deadline ctx already
// carries. Errors:
//   - If the context ended (deadline or cancellation) the returned
//     error wraps the context error, so errors.Is(err,
//     context.DeadlineExceeded) works. cmd.Run on its own reports a
//     killed child only as an ExitError ("signal: killed").
//   - Otherwise the error wraps cmd.Run's error.
//
// In both cases the message carries ssh's trimmed stderr, which
// classifySSHError pattern-matches. Only the verb is named in the
// message: the remaining args are base64 payloads (user message, system
// prompt) that do not belong in errors or logs.
//
// Tests set callShimHook to bypass exec.
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
	if s.callShimHook != nil {
		return s.callShimHook(ctx, args...)
	}

	sshArgs := []string{
		"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
		"-i", s.cfg.SSHKeyPath,
		"-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22
		"-o", "IdentitiesOnly=yes",
		"-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath,
		"-o", "StrictHostKeyChecking=yes",
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=3",
		"-o", "ServerAliveInterval=10",
		"-o", "ServerAliveCountMax=3",
		s.cfg.SSHUser + "@" + s.cfg.SSHHost,
		"--",
	}
	sshArgs = append(sshArgs, args...)

	// Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead.
	c, cancel := context.WithTimeout(ctx, 70*time.Second)
	defer cancel()

	cmd := exec.CommandContext(c, "ssh", sshArgs...)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		verb := "(no verb)"
		if len(args) > 0 {
			verb = args[0]
		}
		errText := strings.TrimSpace(stderr.String())
		if ctxErr := c.Err(); ctxErr != nil {
			// The context killed ssh; surface that rather than the
			// opaque "signal: killed" exit error.
			return nil, fmt.Errorf("ssh %s: %w (run: %v; stderr: %s)", verb, ctxErr, err, errText)
		}
		return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", verb, err, errText)
	}
	return stdout.Bytes(), nil
}
// classifySSHError turns a callShim error into one of the audit-row
// error codes. Codes are stable strings shown on the admin dashboard
// and used by the frontend's friendlyErrorMessage to localise.
//
// Returns "" for a nil error. Wrapped sentinels are checked first
// (ErrMRiverUnreachable, context.DeadlineExceeded); after that the
// error text — which callShim fills with ssh's stderr — is matched
// against known fragments. Anything unrecognised is "shim_error".
//
// The connect-failure fragments cover both glibc and musl strerror
// wording, because the production image is Alpine (musl): ETIMEDOUT is
// "Connection timed out" on glibc but "Operation timed out" on musl,
// EHOSTUNREACH is "No route to host" vs "Host is unreachable", and
// ENETUNREACH is "Network is unreachable" vs "Network unreachable".
func classifySSHError(err error) string {
	if err == nil {
		return ""
	}
	if errors.Is(err, ErrMRiverUnreachable) {
		return "mriver_unreachable"
	}
	if errors.Is(err, context.DeadlineExceeded) {
		return "timeout"
	}

	msg := err.Error()
	switch {
	case strings.Contains(msg, "Connection timed out"),
		strings.Contains(msg, "Operation timed out"),
		strings.Contains(msg, "Connection refused"),
		strings.Contains(msg, "Could not resolve hostname"),
		strings.Contains(msg, "Network is unreachable"),
		strings.Contains(msg, "Network unreachable"),
		strings.Contains(msg, "No route to host"),
		strings.Contains(msg, "Host is unreachable"):
		return "mriver_unreachable"
	case strings.Contains(msg, "exit status 124"):
		// Shim's run-turn 60 s timeout — Claude didn't write the
		// response file in time.
		return "timeout"
	case strings.Contains(msg, "Permission denied"):
		return "shim_auth_failed"
	default:
		return "shim_error"
	}
}
// DisabledPaliadinService is the backend used when Paliadin cannot run:
// cmd/server/main.go constructs one when PALIADIN_REMOTE_HOST is unset
// and no local tmux is available. Its RunTurn and ResetSession always
// return ErrPaliadinDisabled, which spares the handler a nil-check on
// every entry point.
type DisabledPaliadinService struct {
	paliadinDB
}
// NewDisabledPaliadinService builds the disabled stub around db and
// users. The DB-backed methods (IsOwner / ListRecentTurns / Stats)
// still work; only RunTurn and ResetSession return ErrPaliadinDisabled.
func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService {
	svc := new(DisabledPaliadinService)
	svc.paliadinDB = paliadinDB{db: db, users: users}
	return svc
}
// RunTurn always fails with ErrPaliadinDisabled; ctx and req are
// ignored and no audit row is written.
func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
	return nil, ErrPaliadinDisabled
}
// ResetSession always fails with ErrPaliadinDisabled; ctx is ignored.
func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error {
	return ErrPaliadinDisabled
}
// Compile-time assertions that every backend satisfies Paliadin. If a
// method drifts off any of them, the build breaks instead of a test
// failing at run time.
var _ Paliadin = (*LocalPaliadinService)(nil)
var _ Paliadin = (*RemotePaliadinService)(nil)
var _ Paliadin = (*DisabledPaliadinService)(nil)

View File

@@ -0,0 +1,257 @@
package services
import (
"context"
"errors"
"fmt"
"strings"
"sync/atomic"
"testing"
"time"
)
// Tests for the remote-Paliadin backend. Every test bypasses exec via
// the callShimHook field — no real ssh is ever invoked, no DB rows are
// written. Tests that would need DB I/O (audit row insert/complete on
// RunTurn) are not in scope here; paliad's test suite has no sqlx mock
// and the existing paliadin_test.go only covers pure functions.
func TestNewRemotePaliadinService_Defaults(t *testing.T) {
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
SSHHost: "100.99.98.203",
// SSHPort + SSHUser intentionally left zero/empty
})
if s.cfg.SSHPort != 22022 {
t.Errorf("SSHPort default = %d; want 22022 (Tailscale-SSH bypass port)", s.cfg.SSHPort)
}
if s.cfg.SSHUser != "m" {
t.Errorf("SSHUser default = %q; want %q", s.cfg.SSHUser, "m")
}
if s.cfg.SSHHost != "100.99.98.203" {
t.Errorf("SSHHost not preserved: %q", s.cfg.SSHHost)
}
}
func TestNewRemotePaliadinService_HonoursOverrides(t *testing.T) {
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
SSHHost: "10.0.0.1",
SSHPort: 2222,
SSHUser: "alice",
})
if s.cfg.SSHPort != 2222 {
t.Errorf("SSHPort override lost: %d", s.cfg.SSHPort)
}
if s.cfg.SSHUser != "alice" {
t.Errorf("SSHUser override lost: %q", s.cfg.SSHUser)
}
}
// TestClassifySSHError pins the mapping from callShim errors to audit
// error codes: wrapped sentinels first, then each stderr fragment the
// classifier recognises, then the shim_error fallback.
func TestClassifySSHError(t *testing.T) {
	cases := []struct {
		name string
		err  error
		want string
	}{
		{"nil", nil, ""},
		{"explicit ErrMRiverUnreachable", ErrMRiverUnreachable, "mriver_unreachable"},
		{"wrapped ErrMRiverUnreachable", fmt.Errorf("foo: %w", ErrMRiverUnreachable), "mriver_unreachable"},
		{"context deadline", context.DeadlineExceeded, "timeout"},
		{"wrapped context deadline", fmt.Errorf("ssh run-turn: %w", context.DeadlineExceeded), "timeout"},
		{"shim run-turn timeout (exit 124)", errors.New("ssh run-turn …: exit status 124 (stderr: response timeout)"), "timeout"},
		{"connection refused", errors.New("ssh health: dial: Connection refused"), "mriver_unreachable"},
		{"connection timed out", errors.New("ssh health: Connection timed out"), "mriver_unreachable"},
		{"could not resolve hostname", errors.New("ssh: Could not resolve hostname mriver: Name or service not known"), "mriver_unreachable"},
		{"network is unreachable", errors.New("ssh: connect to host 100.99.98.203 port 22022: Network is unreachable"), "mriver_unreachable"},
		{"permission denied", errors.New("ssh: Permission denied (publickey)"), "shim_auth_failed"},
		{"unknown", errors.New("ssh: some other failure"), "shim_error"},
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := classifySSHError(c.err)
			if got != c.want {
				t.Errorf("classifySSHError(%v) = %q; want %q", c.err, got, c.want)
			}
		})
	}
}
// TestHealthGate_CachesOnSuccess checks that back-to-back healthGate
// calls after one successful probe are served from the cache.
func TestHealthGate_CachesOnSuccess(t *testing.T) {
	var probes atomic.Int32
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		probes.Add(1)
		if len(args) != 1 || args[0] != "health" {
			t.Errorf("unexpected callShim args: %v", args)
		}
		return []byte("ok\n"), nil
	}

	const attempts = 5
	for i := 0; i < attempts; i++ {
		err := svc.healthGate(context.Background())
		if err != nil {
			t.Fatalf("healthGate iteration %d: %v", i, err)
		}
	}

	if got := probes.Load(); got != 1 {
		t.Errorf("expected 1 callShim call (cached); got %d", got)
	}
}
// TestHealthGate_RetriesAfterFailure checks that a failed probe is not
// cached: each call probes again and wraps ErrMRiverUnreachable.
func TestHealthGate_RetriesAfterFailure(t *testing.T) {
	var probes atomic.Int32
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		probes.Add(1)
		return nil, errors.New("ssh: Connection refused")
	}

	const attempts = 3
	for i := 0; i < attempts; i++ {
		if err := svc.healthGate(context.Background()); !errors.Is(err, ErrMRiverUnreachable) {
			t.Errorf("iteration %d: err %v; want wrapping ErrMRiverUnreachable", i, err)
		}
	}

	// Failed health is NOT cached — every call re-probes.
	if got := probes.Load(); got != 3 {
		t.Errorf("expected 3 callShim calls (no caching on failure); got %d", got)
	}
}
// TestHealthGate_RejectsUnexpectedReply checks that a shim reply other
// than "ok" is reported as ErrMRiverUnreachable even though the call
// itself succeeded.
func TestHealthGate_RejectsUnexpectedReply(t *testing.T) {
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		return []byte("not-ok"), nil
	}

	if err := svc.healthGate(context.Background()); !errors.Is(err, ErrMRiverUnreachable) {
		t.Errorf("err = %v; want wrap of ErrMRiverUnreachable for non-ok reply", err)
	}
}
// TestEnsureBootstrapped_RunsOnce checks that the bootstrap verb is
// sent exactly once across repeated ensureBootstrapped calls.
func TestEnsureBootstrapped_RunsOnce(t *testing.T) {
	var sends atomic.Int32
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		sends.Add(1)
		if len(args) != 2 || args[0] != "bootstrap" {
			t.Errorf("unexpected callShim args: %v", args)
		}
		// args[1] is the base64'd system prompt — no need to decode in
		// the test; just sanity-check it isn't trivially empty.
		if payload := args[1]; len(payload) < 100 {
			t.Errorf("bootstrap prompt suspiciously short: %d bytes", len(payload))
		}
		return []byte("ok\n"), nil
	}

	const attempts = 3
	for i := 0; i < attempts; i++ {
		err := svc.ensureBootstrapped(context.Background())
		if err != nil {
			t.Fatalf("ensureBootstrapped iteration %d: %v", i, err)
		}
	}

	if got := sends.Load(); got != 1 {
		t.Errorf("expected 1 callShim call (bootstrap is one-shot); got %d", got)
	}
}
// TestEnsureBootstrapped_RetriesOnFailure checks that a failed
// bootstrap leaves the service un-bootstrapped, so the next call
// retries, and that a success is remembered afterwards.
func TestEnsureBootstrapped_RetriesOnFailure(t *testing.T) {
	var sends atomic.Int32
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		// Only the very first shim call fails.
		if sends.Add(1) == 1 {
			return nil, errors.New("ssh: transient failure")
		}
		return []byte("ok\n"), nil
	}
	ctx := context.Background()

	if err := svc.ensureBootstrapped(ctx); err == nil {
		t.Fatal("first call should error")
	}
	if err := svc.ensureBootstrapped(ctx); err != nil {
		t.Fatalf("second call should succeed: %v", err)
	}
	// Third call should be a cache hit (bootstrapped flag set on success).
	if err := svc.ensureBootstrapped(ctx); err != nil {
		t.Fatalf("third call should be cached: %v", err)
	}

	if got := sends.Load(); got != 2 {
		t.Errorf("expected 2 callShim calls (1 fail + 1 succeed; 3rd cached); got %d", got)
	}
}
// TestHealthGate_CacheExpires checks that a cached success older than
// the 10 s window triggers a fresh probe.
func TestHealthGate_CacheExpires(t *testing.T) {
	var probes atomic.Int32
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		probes.Add(1)
		return []byte("ok"), nil
	}
	ctx := context.Background()

	if err := svc.healthGate(ctx); err != nil {
		t.Fatalf("first probe: %v", err)
	}

	// Force the cached timestamp to expire.
	stale := time.Now().Add(-11 * time.Second)
	svc.healthMu.Lock()
	svc.healthCheckedAt = stale
	svc.healthMu.Unlock()

	if err := svc.healthGate(ctx); err != nil {
		t.Fatalf("second probe (expired cache): %v", err)
	}

	if got := probes.Load(); got != 2 {
		t.Errorf("expected 2 callShim calls (cache expired between); got %d", got)
	}
}
// TestRemotePaliadin_ImplementsPaliadin repeats the compile-time
// conformance check from paliadin_remote.go so that a dropped method
// shows up as an obvious failure in the test package too.
func TestRemotePaliadin_ImplementsPaliadin(t *testing.T) {
	backends := []Paliadin{
		(*RemotePaliadinService)(nil),
		(*LocalPaliadinService)(nil),
		(*DisabledPaliadinService)(nil),
	}
	_ = backends
}
// TestDisabledPaliadinService checks that both turn-related entry
// points of the disabled stub report ErrPaliadinDisabled.
func TestDisabledPaliadinService(t *testing.T) {
	stub := NewDisabledPaliadinService(nil, nil)
	ctx := context.Background()

	_, runErr := stub.RunTurn(ctx, TurnRequest{})
	if !errors.Is(runErr, ErrPaliadinDisabled) {
		t.Errorf("RunTurn error = %v; want ErrPaliadinDisabled", runErr)
	}

	resetErr := stub.ResetSession(ctx)
	if !errors.Is(resetErr, ErrPaliadinDisabled) {
		t.Errorf("ResetSession error = %v; want ErrPaliadinDisabled", resetErr)
	}
}
// TestCallShim_SSHArgvShape checks that callShim hands the verb to
// callShimHook unchanged.
func TestCallShim_SSHArgvShape(t *testing.T) {
	// Despite the name, this test cannot observe the ssh argv (port
	// flag, key and known_hosts paths, `--`): callShim returns through
	// callShimHook before that argv is assembled. What is verified is
	// only that the verb reaches the hook as given. The config values
	// below are accepted by the constructor but never asserted on.
	s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
		SSHHost:        "100.99.98.203",
		SSHPort:        22022,
		SSHUser:        "m",
		SSHKeyPath:     "/tmp/k",
		KnownHostsPath: "/tmp/kh",
	})
	var captured []string
	s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		// Copy so the assertion does not alias the caller's slice.
		captured = append([]string(nil), args...)
		return []byte("ok"), nil
	}
	_, _ = s.callShim(context.Background(), "health")
	if len(captured) != 1 || captured[0] != "health" {
		t.Errorf("callShim forwarded args = %v; want [health]", captured)
	}
}
// TestCallShim_StderrSurfacesInError simulates, through the hook, the
// error shape the exec path produces (stderr embedded in the message)
// and checks that classifySSHError can pattern-match on it.
func TestCallShim_StderrSurfacesInError(t *testing.T) {
	svc := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
	svc.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
		return nil, errors.New("ssh health: exit status 1 (stderr: Permission denied (publickey))")
	}

	_, err := svc.callShim(context.Background(), "health")
	if err == nil {
		t.Fatal("expected error")
	}
	if text := err.Error(); !strings.Contains(text, "Permission denied") {
		t.Errorf("error should preserve stderr: %v", err)
	}
	if code := classifySSHError(err); code != "shim_auth_failed" {
		t.Errorf("classifier should pick up Permission denied; got %q", classifySSHError(err))
	}
}

185
scripts/paliadin-shim Executable file
View File

@@ -0,0 +1,185 @@
#!/bin/bash
# paliadin-shim — server-side RPC for paliad's remote-tmux turns.
#
# Invoked via mRiver's ~/.ssh/authorized_keys command= restriction. The
# client's requested command is exposed in $SSH_ORIGINAL_COMMAND; this
# script parses it and dispatches to a fixed verb set.
#
# Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md §5.4.
#
# Verbs:
# health -> "ok" iff tmux + claude reachable
# bootstrap <prompt-base64> -> ensure pane + send system prompt
# run-turn <uuid> <msg-base64> -> send framed prompt, poll, return
# reset -> /clear the conversation
#
# All multi-character payloads (prompts, messages) are base64-encoded by
# the Go caller so we never have to quote them through ssh's argv.
#
# Errors go to stderr with a non-zero exit. The Go side maps the exit
# status into a friendly error code.
# Fail fast: abort on any command error, on use of an unset variable,
# and when any stage of a pipeline fails.
set -euo pipefail
# Everything created below (response dir, response files) is private to
# the invoking user. Must run before mkdir.
umask 077
# Tunables. The first three can be overridden through the environment;
# PANE_READY_S is fixed.
readonly TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}"
readonly RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}"
readonly TIMEOUT_S="${PALIADIN_TIMEOUT_S:-60}"
readonly PANE_READY_S=60 # max wait for claude pane to settle
# Canonical 8-4-4-4-12 hex UUID. run-turn rejects any turn_id that does
# not match, which keeps the response-file path inside RESPONSE_DIR.
readonly TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
mkdir -p "$RESPONSE_DIR"
chmod 700 "$RESPONSE_DIR"
# Parse $SSH_ORIGINAL_COMMAND into argv. Format: "<verb> <arg1> <arg2> …".
# We never `eval` this; `read -r -a` splits on $IFS without word-expansion.
# An unset/empty command yields an empty verb, handled by the dispatch.
read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}"
verb="${argv[0]:-}"
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
# log_err writes its arguments, joined into one line and prefixed with
# the script name, to stderr.
log_err() {
  local text="$*"
  printf 'paliadin-shim: %s\n' "$text" 1>&2
}
# ensure_pane creates the tmux session + claude window if missing, waits
# for the pane to become ready, and prints the target identifier
# ("session:window-idx") on stdout.
#
# Callers capture the output with $(ensure_pane), so the `exit 3` below
# ends only that subshell; under `set -e` the failed assignment then
# aborts the script with the same status.
ensure_pane() {
  if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
    tmux new-session -d -s "$TMUX_SESSION"
  fi
  # Look for an existing window tagged with @paliadin-scope=chat.
  local target=""
  local idx scope
  while read -r idx; do
    [[ -z "$idx" ]] && continue
    # `|| true`: windows without the option make tmux fail; treat as untagged.
    scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true)
    if [[ "$scope" == "chat" ]]; then
      target="$TMUX_SESSION:$idx"
      break
    fi
  done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}' 2>/dev/null || true)
  if [[ -z "$target" ]]; then
    # No tagged window yet: start claude in a new one.
    if ! command -v claude >/dev/null 2>&1; then
      log_err "claude CLI not found in PATH"
      exit 3
    fi
    # -P -F prints the new window's index so it can be addressed below.
    idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude)
    target="$TMUX_SESSION:$idx"
    # Wait for claude to settle. Matches Go waitForPaneReady (paliadin.go:495).
    # If the deadline passes without a match the loop simply ends and the
    # window is tagged anyway.
    local deadline=$(( $(date +%s) + PANE_READY_S ))
    local pane=""
    while [[ $(date +%s) -lt $deadline ]]; do
      pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true)
      # NOTE(review): the first pattern is *""*, which matches ANY string
      # (even empty capture output), so as written this loop breaks on
      # its first iteration. A prompt glyph between the quotes was
      # presumably lost — confirm against waitForPaneReady and restore it.
      if [[ "$pane" == *""* || "$pane" == *"│"* ]]; then
        break
      fi
      sleep 0.5
    done
    # Tag the window so later invocations find and reuse it.
    tmux set-window-option -t "$target" @paliadin-scope chat >/dev/null
    tmux set-window-option -t "$target" @fix-name claude-paliadin >/dev/null
  fi
  printf '%s' "$target"
}
# send_to_pane types text into the given tmux target verbatim (-l: no
# key-name parsing; --: text may start with a dash) and then presses
# Enter as a separate key event.
send_to_pane() {
  local pane_target="$1"
  local text="$2"
  tmux send-keys -t "$pane_target" -l -- "$text"
  tmux send-keys -t "$pane_target" Enter
}
# ---------------------------------------------------------------------------
# verb dispatch
# ---------------------------------------------------------------------------
# Exit statuses used below: 1 = health prerequisite missing, 2 = bad
# request (verb or arguments), 124 = run-turn response timeout.
case "$verb" in
  health)
    # Used by the Go side's healthGate to short-circuit when mRiver is
    # offline or tmux/claude is broken. Output is parsed verbatim.
    if ! command -v tmux >/dev/null 2>&1; then
      log_err "tmux not in PATH"; exit 1
    fi
    if ! command -v claude >/dev/null 2>&1; then
      log_err "claude not in PATH"; exit 1
    fi
    # Side effect: creates the (empty) tmux session if it is missing.
    if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
      tmux new-session -d -s "$TMUX_SESSION"
    fi
    echo ok
    ;;
  bootstrap)
    # Inject the system prompt into the claude pane. argv[1] is the
    # base64-encoded prompt. Every call re-sends the prompt; the Go
    # side is what limits this to once per bootstrapped service.
    if [[ -z "${argv[1]:-}" ]]; then
      log_err "bootstrap: missing prompt"; exit 2
    fi
    if ! prompt=$(printf '%s' "${argv[1]}" | base64 -d 2>/dev/null); then
      log_err "bootstrap: invalid base64 prompt"; exit 2
    fi
    target=$(ensure_pane)
    send_to_pane "$target" "$prompt"
    sleep 2 # let claude absorb before turns flow
    echo ok
    ;;
  run-turn)
    # argv[1] = turn_id (UUID), argv[2] = base64-encoded user message.
    turn_id="${argv[1]:-}"
    if [[ ! "$turn_id" =~ $TURN_ID_RE ]]; then
      log_err "run-turn: bad turn_id"; exit 2
    fi
    if [[ -z "${argv[2]:-}" ]]; then
      log_err "run-turn: missing message"; exit 2
    fi
    if ! msg=$(printf '%s' "${argv[2]}" | base64 -d 2>/dev/null); then
      log_err "run-turn: invalid base64 message"; exit 2
    fi
    target=$(ensure_pane)
    out="$RESPONSE_DIR/$turn_id.txt"
    # Drop any stale file so only a response to this turn can match.
    rm -f "$out"
    # Envelope matches paliadin_prompt.go's `[PALIADIN:turn_id] <msg>` shape.
    send_to_pane "$target" "[PALIADIN:$turn_id] $msg"
    # Poll for the response file. Same shape as Go pollForResponse
    # (paliadin.go:530). Settle delay so we don't read mid-flush.
    deadline=$(( $(date +%s) + TIMEOUT_S ))
    while [[ $(date +%s) -lt $deadline ]]; do
      # -s: the file exists and is non-empty.
      if [[ -s "$out" ]]; then
        sleep 0.05
        cat "$out"
        rm -f "$out"
        exit 0
      fi
      sleep 0.2
    done
    log_err "response timeout after ${TIMEOUT_S}s"
    # 124 is what the Go classifier maps to the "timeout" error code.
    exit 124
    ;;
  reset)
    # Send `/clear` so the next turn starts a fresh conversation.
    target=$(ensure_pane)
    send_to_pane "$target" "/clear"
    echo ok
    ;;
  '')
    log_err "no verb (set SSH_ORIGINAL_COMMAND via authorized_keys command=)"
    exit 2
    ;;
  *)
    log_err "unknown verb '$verb'"
    exit 2
    ;;
esac