Merge: t-paliad-151 Phase B code (env-var-gated, compose flip held for A.5) — Paliadin remote-routing via Tailscale SSH to mRiver. Includes Phase A.0 design doc + scripts/paliadin-shim from earlier shift. Production behavior unchanged: without PALIADIN_REMOTE_HOST in env, paliad never invokes ssh and uses local-tmux PoC path byte-identically. Refactor: Paliadin interface + LocalPaliadinService + RemotePaliadinService + DisabledPaliadinService stub. main.go env-var switch (remote/local/disabled). Dockerfile +openssh-client. 14 unit tests via callShimHook. Frontend friendlyErrorMessage for mriver_unreachable/shim_auth_failed/shim_error/bootstrap_failed/timeout (DE+EN). NOT included: docker-compose network_mode: host flip — held on branch as da971a7 pending Phase A.5 traefik test by m. NOT cronus.
This commit is contained in:
@@ -11,7 +11,7 @@ COPY . .
|
|||||||
RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /paliad ./cmd/server
|
RUN CGO_ENABLED=0 go build -ldflags="-s -w" -o /paliad ./cmd/server
|
||||||
|
|
||||||
FROM alpine:3.21
|
FROM alpine:3.21
|
||||||
RUN apk add --no-cache ca-certificates
|
RUN apk add --no-cache ca-certificates openssh-client
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY --from=backend /paliad /app/paliad
|
COPY --from=backend /paliad /app/paliad
|
||||||
COPY --from=frontend /app/frontend/dist /app/dist
|
COPY --from=frontend /app/frontend/dist /app/dist
|
||||||
|
|||||||
@@ -2,10 +2,13 @@ package main
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"os/signal"
|
"os/signal"
|
||||||
|
"strconv"
|
||||||
"syscall"
|
"syscall"
|
||||||
|
|
||||||
// Embed Go's IANA tz database into the binary so time.LoadLocation works
|
// Embed Go's IANA tz database into the binary so time.LoadLocation works
|
||||||
@@ -165,20 +168,34 @@ func main() {
|
|||||||
CardLayout: services.NewCardLayoutService(pool),
|
CardLayout: services.NewCardLayoutService(pool),
|
||||||
}
|
}
|
||||||
|
|
||||||
// t-paliad-146 — Paliadin PoC. Always wired when DATABASE_URL
|
// Paliadin backend selection (t-paliad-146 + t-paliad-151):
|
||||||
// is set; the per-request handler gate (requirePaliadinOwner)
|
// PALIADIN_REMOTE_HOST set → RemotePaliadinService (ssh to mRiver)
|
||||||
// restricts access to the single owner email
|
// else: local tmux available → LocalPaliadinService (PoC path)
|
||||||
// (services.PaliadinOwnerEmail). All other authenticated users
|
// else: DisabledPaliadinService (handlers still 404 for non-owners
|
||||||
// get a 404 — the route effectively does not exist for them.
|
// via the gate; for m, RunTurn returns ErrPaliadinDisabled
|
||||||
// On hosts without tmux + the `claude` CLI (e.g. the Dokploy
|
// which surfaces as a friendly error).
|
||||||
// container), the owner gate still applies; if m ever hits the
|
//
|
||||||
// route from such a host, the service returns "tmux unavailable"
|
// All three implement services.Paliadin; the per-request handler
|
||||||
// without ever invoking shell-out.
|
// gate (requirePaliadinOwner) is unchanged and applies to every
|
||||||
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION")
|
// backend.
|
||||||
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR")
|
if remoteHost := os.Getenv("PALIADIN_REMOTE_HOST"); remoteHost != "" {
|
||||||
svcBundle.Paliadin = services.NewPaliadinService(pool, users, tmuxSession, responseDir)
|
cfg, err := buildPaliadinRemoteConfig(remoteHost)
|
||||||
log.Printf("paliadin: wired (owner=%s; gate is per-request, not per-deploy)",
|
if err != nil {
|
||||||
services.PaliadinOwnerEmail)
|
log.Fatalf("paliadin: remote config: %v", err)
|
||||||
|
}
|
||||||
|
svcBundle.Paliadin = services.NewRemotePaliadinService(pool, users, cfg)
|
||||||
|
log.Printf("paliadin: remote mode → ssh %s@%s:%d (owner=%s)",
|
||||||
|
cfg.SSHUser, cfg.SSHHost, cfg.SSHPort, services.PaliadinOwnerEmail)
|
||||||
|
} else if _, err := exec.LookPath("tmux"); err == nil {
|
||||||
|
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION")
|
||||||
|
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR")
|
||||||
|
svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir)
|
||||||
|
log.Printf("paliadin: local tmux mode (owner=%s)", services.PaliadinOwnerEmail)
|
||||||
|
} else {
|
||||||
|
svcBundle.Paliadin = services.NewDisabledPaliadinService(pool, users)
|
||||||
|
log.Printf("paliadin: disabled (no PALIADIN_REMOTE_HOST, no local tmux; owner=%s)",
|
||||||
|
services.PaliadinOwnerEmail)
|
||||||
|
}
|
||||||
// Wire ApprovalService into the entity services so Create / Update /
|
// Wire ApprovalService into the entity services so Create / Update /
|
||||||
// Complete / Delete consult paliad.approval_policies (t-paliad-138).
|
// Complete / Delete consult paliad.approval_policies (t-paliad-138).
|
||||||
// Without this wiring, the policies and request tables exist but no
|
// Without this wiring, the policies and request tables exist but no
|
||||||
@@ -217,3 +234,83 @@ func main() {
|
|||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// buildPaliadinRemoteConfig assembles a RemotePaliadinConfig from
|
||||||
|
// environment variables, materialising the SSH private key and
|
||||||
|
// known_hosts blobs into chmod-600/644 tmpfiles for OpenSSH to read.
|
||||||
|
//
|
||||||
|
// The blobs travel as Dokploy secrets (multi-line env vars). We never
|
||||||
|
// persist them to disk — tmpfiles live for the process lifetime in
|
||||||
|
// /tmp and disappear on container restart. Re-creating them every boot
|
||||||
|
// is fine; the keys themselves rotate independently via Dokploy
|
||||||
|
// secret updates.
|
||||||
|
//
|
||||||
|
// Required: PALIADIN_REMOTE_HOST, PALIADIN_SSH_PRIVATE_KEY, PALIADIN_KNOWN_HOSTS.
|
||||||
|
// Optional: PALIADIN_REMOTE_USER (default "m"), PALIADIN_REMOTE_PORT
|
||||||
|
// (default 22022 — bypasses Tailscale SSH on :22, see design §4.5).
|
||||||
|
func buildPaliadinRemoteConfig(host string) (services.RemotePaliadinConfig, error) {
|
||||||
|
cfg := services.RemotePaliadinConfig{
|
||||||
|
SSHHost: host,
|
||||||
|
SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"),
|
||||||
|
SSHPort: 22022,
|
||||||
|
}
|
||||||
|
if p := os.Getenv("PALIADIN_REMOTE_PORT"); p != "" {
|
||||||
|
n, err := strconv.Atoi(p)
|
||||||
|
if err != nil || n <= 0 || n > 65535 {
|
||||||
|
return cfg, fmt.Errorf("PALIADIN_REMOTE_PORT %q: not a valid port", p)
|
||||||
|
}
|
||||||
|
cfg.SSHPort = n
|
||||||
|
}
|
||||||
|
|
||||||
|
keyPath, err := writeSecretFile("paliadin-id_ed25519-", os.Getenv("PALIADIN_SSH_PRIVATE_KEY"), 0o600)
|
||||||
|
if err != nil {
|
||||||
|
return cfg, fmt.Errorf("PALIADIN_SSH_PRIVATE_KEY: %w", err)
|
||||||
|
}
|
||||||
|
if keyPath == "" {
|
||||||
|
return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_SSH_PRIVATE_KEY empty")
|
||||||
|
}
|
||||||
|
cfg.SSHKeyPath = keyPath
|
||||||
|
|
||||||
|
knownHostsPath, err := writeSecretFile("paliadin-known_hosts-", os.Getenv("PALIADIN_KNOWN_HOSTS"), 0o644)
|
||||||
|
if err != nil {
|
||||||
|
return cfg, fmt.Errorf("PALIADIN_KNOWN_HOSTS: %w", err)
|
||||||
|
}
|
||||||
|
if knownHostsPath == "" {
|
||||||
|
return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_KNOWN_HOSTS empty")
|
||||||
|
}
|
||||||
|
cfg.KnownHostsPath = knownHostsPath
|
||||||
|
|
||||||
|
return cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeSecretFile writes blob to a freshly created tmpfile in the
// default temp directory, sets its permissions to mode, and returns the
// file's path. The file name is prefix followed by a random suffix.
//
// It returns ("", nil) when blob is empty so callers can distinguish
// "not set" from real I/O errors. On any I/O error the partially
// written file is removed, so secret material is never left behind by
// a failed call.
func writeSecretFile(prefix, blob string, mode os.FileMode) (string, error) {
	if blob == "" {
		return "", nil
	}

	f, err := os.CreateTemp("", prefix+"*")
	if err != nil {
		return "", err
	}
	path := f.Name()

	// discard closes and deletes the tmpfile, then reports cause.
	// Both cleanup steps are best-effort: cause is the error worth
	// surfacing, and Close may already have been attempted.
	discard := func(cause error) (string, error) {
		_ = f.Close()
		_ = os.Remove(path)
		return "", cause
	}

	if _, err := f.WriteString(blob); err != nil {
		return discard(err)
	}
	if err := f.Close(); err != nil {
		return discard(err)
	}
	// os.CreateTemp creates the file with mode 0600; adjust afterwards
	// for callers that want something else (e.g. 0644 for known_hosts).
	if err := os.Chmod(path, mode); err != nil {
		return discard(err)
	}
	return path, nil
}
|
||||||
|
|
||||||
|
// cmpOr returns fallback when s is the empty string and s otherwise.
// It is used to apply defaults to optional environment variables.
func cmpOr(s, fallback string) string {
	if s == "" {
		return fallback
	}
	return s
}
|
||||||
|
|||||||
677
docs/design-paliadin-tailscale-ssh-2026-05-07.md
Normal file
677
docs/design-paliadin-tailscale-ssh-2026-05-07.md
Normal file
@@ -0,0 +1,677 @@
|
|||||||
|
# Paliadin: route prod via Tailscale SSH to mRiver
|
||||||
|
|
||||||
|
**Issue:** m/paliad#12 — t-paliad-151
|
||||||
|
**Date:** 2026-05-07
|
||||||
|
**Author:** noether (inventor)
|
||||||
|
**Supersedes nothing.** Extends `docs/design-paliadin-2026-05-07.md` (the Phase 0 PoC) with a third deployment path between "laptop-only PoC" and "Anthropic API direct".
|
||||||
|
**Related:** t-paliad-146 (PoC ship), t-paliad-150 (`friendlyErrorMessage` pattern).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Goal
|
||||||
|
|
||||||
|
Make Paliadin reachable from `paliad.de` (Dokploy on mLake) without losing m's Claude Code subscription, by routing each turn over Tailscale + SSH from the paliad container to mRiver, where the existing long-lived `tmux` + `claude` PoC keeps running.
|
||||||
|
|
||||||
|
**Non-goals (v1):**
|
||||||
|
|
||||||
|
- Multi-host failover.
|
||||||
|
- Encryption beyond SSH-over-tailnet (already E2E-encrypted by Tailscale's WireGuard layer).
|
||||||
|
- Anthropic API fallback when mRiver is offline — show a friendly error instead.
|
||||||
|
- Wake-on-LAN of mRiver.
|
||||||
|
- Multi-tenant or multi-firm variants.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Live state — what was verified before designing
|
||||||
|
|
||||||
|
A design built on stale facts rots fast. These were probed on 2026-05-07, not assumed from CLAUDE.md or memory:
|
||||||
|
|
||||||
|
| Fact | How verified | Result |
|
||||||
|
|---|---|---|
|
||||||
|
| mRiver = `100.99.98.203`, has tmux + claude | this worker runs on mRiver; `tmux -V` → `tmux 3.6a`; `which claude` → `/home/m/.local/bin/claude` | confirmed |
|
||||||
|
| mLake (`100.99.98.201`) has Tailscale running | `ssh m@mlake tailscale status` | confirmed; mRiver visible as `active; direct [2a02:4780:41:3fbc::1]:41641` |
|
||||||
|
| paliad container Dockerfile is alpine:3.21 minimal, no SSH, no tailscaled | `Dockerfile` | confirmed (only `ca-certificates`) |
|
||||||
|
| paliad compose runs default Docker bridge (no `network_mode`) | `docker-compose.yml` | confirmed |
|
||||||
|
| mRiver has no `~/.ssh/authorized_keys` yet | `ls ~/.ssh/` | confirmed — file must be created in Phase A |
|
||||||
|
| `/tmp/paliadin/` does not exist on mRiver yet | `ls /tmp/paliadin` | confirmed — created on first turn (paliadin.go:185 `os.MkdirAll`) |
|
||||||
|
| `paliad-paliadin` tmux session is not currently running on mRiver | `tmux ls` | not present; the existing PoC creates it on demand |
|
||||||
|
|
||||||
|
**Implication for design:** the paliad container needs new infrastructure on three axes — network reachability of the tailnet, an SSH client + identity, and a service-layer code path that talks to a remote tmux instead of a local one. Each axis is its own sub-design below.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Locked decisions (m, 2026-05-07 22:35)
|
||||||
|
|
||||||
|
m made four design-shaping calls via the inventor's `AskUserQuestion` pass; a fifth (the SSH port) was added after a Phase A finding. They are recorded here verbatim because every downstream choice in §4–§6 follows from them.
|
||||||
|
|
||||||
|
| # | Question | m's choice |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | Container Tailscale shape | **`network_mode: host` on paliad** |
|
||||||
|
| 2 | SSH-to-mRiver protocol granularity | **Server-side `paliadin-shim` (one RPC per turn)** |
|
||||||
|
| 3 | Routing trigger | **Env var `PALIADIN_REMOTE_HOST` + interface split** |
|
||||||
|
| 4 | SSH private key storage | **Dokploy secret env var `PALIADIN_SSH_PRIVATE_KEY`** |
|
||||||
|
| 5 | SSH port to bypass Tailscale SSH | **Port 22022 via `ssh.socket` drop-in (Phase A finding, 23:30)** |
|
||||||
|
|
||||||
|
Decision (1) was *not* the inventor's recommendation — host mode has known interaction risk with traefik (§4.2). m is overriding the recommendation; this design accepts the call and codifies a Phase A test step that gates the rollout on traefik still working under host mode. If Phase A blows up, the fallback is to revisit (1) in a follow-up issue, not to silently swap to a sidecar.
|
||||||
|
|
||||||
|
Decision (5) emerged during Phase A: Tailscale SSH on mRiver was found to intercept `:22` from tailnet peers and bypass OpenSSH's `authorized_keys` entirely (banner says "Tailscale", auth method "none"). The `command=` shim restriction therefore never fires on the standard port. Adding port 22022 via a `systemd ssh.socket` drop-in routes paliad's connections to real OpenSSH where the restriction works. m's interactive `tailscale ssh m@mriver` on `:22` stays untouched. See §4.5 for the implementation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Sub-design A — Container Tailscale shape
|
||||||
|
|
||||||
|
### 4.1 Shape: `network_mode: host`
|
||||||
|
|
||||||
|
paliad's container shares mLake's network namespace. `tailscale0` (mLake's tailnet interface) is directly visible from inside the container. Outbound `ssh m@100.99.98.203` reaches mRiver over the tailnet without any sidecar, userspace tailscaled, SOCKS proxy, or auth-key flow inside the container.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# docker-compose.yml diff
|
||||||
|
services:
|
||||||
|
web:
|
||||||
|
build: .
|
||||||
|
network_mode: host # NEW
|
||||||
|
# remove: expose: ["8080"] # host mode means port is on the host directly
|
||||||
|
environment:
|
||||||
|
- PORT=8080
|
||||||
|
...
|
||||||
|
# NEW Paliadin remote-routing knobs
|
||||||
|
- PALIADIN_REMOTE_HOST=${PALIADIN_REMOTE_HOST} # 100.99.98.203
|
||||||
|
- PALIADIN_REMOTE_PORT=${PALIADIN_REMOTE_PORT} # 22022 (bypasses Tailscale SSH, see §4.5)
|
||||||
|
- PALIADIN_REMOTE_USER=${PALIADIN_REMOTE_USER} # m
|
||||||
|
- PALIADIN_SSH_PRIVATE_KEY=${PALIADIN_SSH_PRIVATE_KEY}
|
||||||
|
- PALIADIN_KNOWN_HOSTS=${PALIADIN_KNOWN_HOSTS} # one-line ssh-keyscan -p 22022 output
|
||||||
|
restart: unless-stopped
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.2 Trade-off accepted: traefik routing under host mode
|
||||||
|
|
||||||
|
paliad.de's TLS is provided by Dokploy's traefik on the `dokploy-network` overlay. With `network_mode: host`, paliad is no longer attached to that overlay. Two failure modes are possible:
|
||||||
|
|
||||||
|
- **(M1)** traefik can't discover the service via Docker DNS → 502 at the edge.
|
||||||
|
- **(M2)** traefik routes via host loopback (`http://127.0.0.1:8080` or `host.docker.internal`) and works fine.
|
||||||
|
|
||||||
|
Recent Dokploy versions configure traefik with both `loadbalancer.server.url` and Docker labels; (M2) is the documented host-mode path. **Phase A explicitly tests this** (§7) before any code is written; if (M1) materialises, the design rolls back to the sidecar variant of decision 1 in a follow-up issue.
|
||||||
|
|
||||||
|
Other host-mode side-effects to flag in operations:
|
||||||
|
|
||||||
|
- paliad listens on host port 8080 directly. Any other compose service binding 8080 conflicts.
|
||||||
|
- paliad's outbound DNS uses host resolver (no Docker-internal `web` etc.). Currently fine: paliad's only network deps are external (Supabase, SMTP, GitHub raw). No service on `dokploy-network` is referenced by name.
|
||||||
|
- The container can reach **every** Tailscale node, not just mRiver. Mitigations live in §5 (key restriction) and §5.2 (`from=` clause on mRiver authorized_keys).
|
||||||
|
|
||||||
|
### 4.3 Dockerfile diff
|
||||||
|
|
||||||
|
```dockerfile
|
||||||
|
# Final stage adds the SSH client only. Tailscale is provided by the host.
|
||||||
|
FROM alpine:3.21
|
||||||
|
RUN apk add --no-cache ca-certificates openssh-client # +openssh-client (~1MB)
|
||||||
|
WORKDIR /app
|
||||||
|
COPY --from=backend /paliad /app/paliad
|
||||||
|
COPY --from=frontend /app/frontend/dist /app/dist
|
||||||
|
EXPOSE 8080
|
||||||
|
CMD ["/app/paliad"]
|
||||||
|
```
|
||||||
|
|
||||||
|
Image-size delta: alpine `openssh-client` is ~1.1 MB compressed — negligible. No tailscaled, no entrypoint script, no extra processes inside the container.
|
||||||
|
|
||||||
|
### 4.4 What does NOT change
|
||||||
|
|
||||||
|
- No Tailscale auth-key inside paliad. The container inherits the host's tailnet binding, so there is no per-container Tailscale identity to rotate. mLake's existing Tailscale auth is the only one in scope.
|
||||||
|
- No tailscaled process inside the container.
|
||||||
|
- No new sidecar container.
|
||||||
|
|
||||||
|
### 4.5 Bypassing Tailscale SSH via port 22022 (Phase A discovery)
|
||||||
|
|
||||||
|
**Phase A revealed** that Tailscale SSH on mRiver intercepts `:22` from tailnet peers before OpenSSH sees the connection. The SSH banner reads `SSH-2.0-Tailscale`, the verbose log shows `Authenticated using "none"`, and the `authorized_keys command=` directive is therefore inert. mRiver's `tailscale status --json` confirms the `https://tailscale.com/cap/ssh` capability is enabled.
|
||||||
|
|
||||||
|
The fix: a separate listening port for the paliad route, where Tailscale SSH does not intercept and real OpenSSH handles auth.
|
||||||
|
|
||||||
|
mRiver uses systemd socket activation for sshd (`/usr/lib/systemd/system/ssh.socket` binds `:22`). Setting `Port 22022` in `sshd_config` is **ignored** under socket activation — listen ports come from the socket unit, not sshd's own config. The correct change is a drop-in:
|
||||||
|
|
||||||
|
```ini
|
||||||
|
# /etc/systemd/system/ssh.socket.d/paliad.conf
|
||||||
|
[Socket]
|
||||||
|
ListenStream=0.0.0.0:22022
|
||||||
|
ListenStream=[::]:22022
|
||||||
|
```
|
||||||
|
|
||||||
|
Followed by `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (still routed through Tailscale SSH for m's interactive use) and `:22022` (real OpenSSH) end up listening. The same sshd binary handles both — same host key, same `authorized_keys`, same sshd_config. The only difference is *which port* a peer dials.
|
||||||
|
|
||||||
|
A failed first attempt (2026-05-07 23:07) added the drop-in while a stale `Port 22022` directive in `sshd_config.d/99-paliad-test.conf` was still bound — the resulting `Address already in use` took `ssh.socket` down for ~30 s until reverted. Lesson: clean any prior `Port` directives out of `sshd_config.d/*.conf` before retrying the socket drop-in.
|
||||||
|
|
||||||
|
Phase A end-to-end test (2026-05-07 23:31) succeeded with port 22022:
|
||||||
|
|
||||||
|
- `ssh -p 22022 -i paliad-prod-key m@100.99.98.203 health` → `ok`
|
||||||
|
- `run-turn <uuid> <base64-msg>` → 3.4 s round-trip including a Claude-Code response
|
||||||
|
- `from="100.99.98.201"` correctly rejected a connection sourced from mRiver itself (`Permission denied (publickey,password)`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Sub-design B — SSH identity, restricted shim, host-key pinning
|
||||||
|
|
||||||
|
### 5.1 Identity: dedicated ed25519 keypair `paliad-prod`
|
||||||
|
|
||||||
|
One keypair, generated once on mRiver during Phase A, used by every paliad-prod deploy:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On mRiver (Phase A bootstrap):
|
||||||
|
ssh-keygen -t ed25519 -N "" -C "paliad-prod $(date +%Y-%m-%d)" -f /tmp/paliad-prod-key
|
||||||
|
# Public key → mRiver authorized_keys (see 5.2)
|
||||||
|
# Private key → Dokploy secret store as PALIADIN_SSH_PRIVATE_KEY
|
||||||
|
shred -u /tmp/paliad-prod-key # only the encrypted/secret-stored copies survive
|
||||||
|
```
|
||||||
|
|
||||||
|
Rotation: regenerate, push public key to mRiver authorized_keys, update Dokploy secret, redeploy. No code change needed — paliad's startup re-reads the env var on every boot.
|
||||||
|
|
||||||
|
The private key is delivered to the container as a multi-line env var. At process start, paliad writes it to a tmpfile so OpenSSH can use it:
|
||||||
|
|
||||||
|
```go
|
||||||
|
// cmd/server/main.go (sketch)
|
||||||
|
func loadPaliadinSSHKey() (string, error) {
|
||||||
|
blob := os.Getenv("PALIADIN_SSH_PRIVATE_KEY")
|
||||||
|
if blob == "" { return "", nil } // remote mode disabled
|
||||||
|
f, err := os.CreateTemp("", "paliadin-id_ed25519-")
|
||||||
|
if err != nil { return "", err }
|
||||||
|
if err := os.Chmod(f.Name(), 0o600); err != nil { return "", err }
|
||||||
|
if _, err := f.WriteString(blob); err != nil { return "", err }
|
||||||
|
if err := f.Close(); err != nil { return "", err }
|
||||||
|
return f.Name(), nil // path passed to RemotePaliadinService
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The tmpfile lives at `/tmp/paliadin-id_ed25519-<rand>` for the container's lifetime. On container restart, a fresh tmpfile is written. We never persist the key to a volume.
|
||||||
|
|
||||||
|
### 5.2 mRiver `authorized_keys` entry
|
||||||
|
|
||||||
|
```
|
||||||
|
command="/home/m/.local/bin/paliadin-shim",no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc,from="100.99.98.201" ssh-ed25519 AAAA...PUBKEY... paliad-prod
|
||||||
|
```
|
||||||
|
|
||||||
|
Each restriction matters:
|
||||||
|
|
||||||
|
- `command=` — every `ssh m@mriver …` invocation runs the shim regardless of what the client asked for. The client's requested command is exposed as `$SSH_ORIGINAL_COMMAND` for the shim to dispatch on.
|
||||||
|
- `no-pty,no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-user-rc` — defence-in-depth: even if someone steals the key and bypasses the shim's argument validation, they can't get an interactive shell, can't tunnel ports, can't pivot via agent forwarding.
|
||||||
|
- `from="100.99.98.201"` — only accept connections from mLake's tailnet IP. Defends against the "container has full tailnet visibility" host-mode side-effect from §4.2: if the key leaks off mLake, it can't be replayed from another tailnet host.
|
||||||
|
|
||||||
|
### 5.3 Host-key pinning
|
||||||
|
|
||||||
|
`StrictHostKeyChecking=accept-new` is too loose for a long-lived production identity (one-time MITM during first connect substitutes a different key forever). Instead:
|
||||||
|
|
||||||
|
- During Phase A, run `ssh-keyscan -p 22022 -t ed25519 100.99.98.203` on mLake.
|
||||||
|
- Capture the single output line. The host-key portion is identical to the `:22` entry — same sshd, same keys — but the `[100.99.98.203]:22022` prefix matters because OpenSSH's `known_hosts` is `host:port`-keyed for non-22 ports.
|
||||||
|
- Store as Dokploy secret `PALIADIN_KNOWN_HOSTS`.
|
||||||
|
- At container startup, write to `/tmp/paliadin-known_hosts` chmod 644.
|
||||||
|
- Pass to OpenSSH via `-o UserKnownHostsFile=/tmp/paliadin-known_hosts -o StrictHostKeyChecking=yes`.
|
||||||
|
|
||||||
|
If mRiver's host key ever rotates (rare; only on disk wipe / fresh OS), Phase A runs again and the secret is updated. SSH refuses to connect with a clear "host key changed" error, which surfaces as `mriver_unreachable` to the user — exactly the right blast-radius (loud failure, no silent connect to a substitute host).
|
||||||
|
|
||||||
|
### 5.4 The shim — `paliadin-shim`
|
||||||
|
|
||||||
|
A bash script on mRiver at `/home/m/.local/bin/paliadin-shim`. It is the **only** thing the paliad-prod key is allowed to invoke, and it dispatches on `$SSH_ORIGINAL_COMMAND`. Four RPCs (`health`, `bootstrap`, `run-turn`, `reset`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# paliadin-shim — server-side RPC for paliad's remote-tmux turns.
|
||||||
|
# Invoked via authorized_keys command= with $SSH_ORIGINAL_COMMAND set.
|
||||||
|
set -euo pipefail
|
||||||
|
umask 077
|
||||||
|
|
||||||
|
readonly TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}"
|
||||||
|
readonly RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}"
|
||||||
|
readonly TIMEOUT_S=60
|
||||||
|
readonly TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
|
||||||
|
|
||||||
|
mkdir -p "$RESPONSE_DIR"
|
||||||
|
|
||||||
|
# Parse $SSH_ORIGINAL_COMMAND. Format: "<verb> <arg1> <arg2> …"
|
||||||
|
read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}"
|
||||||
|
verb="${argv[0]:-}"
|
||||||
|
|
||||||
|
ensure_pane() {
|
||||||
|
if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
|
||||||
|
tmux new-session -d -s "$TMUX_SESSION"
|
||||||
|
fi
|
||||||
|
# Find or create the @paliadin-scope=chat window.
|
||||||
|
local target=""
|
||||||
|
while read -r idx; do
|
||||||
|
scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true)
|
||||||
|
if [[ "$scope" == "chat" ]]; then target="$TMUX_SESSION:$idx"; break; fi
|
||||||
|
done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}')
|
||||||
|
if [[ -z "$target" ]]; then
|
||||||
|
idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude)
|
||||||
|
target="$TMUX_SESSION:$idx"
|
||||||
|
# Wait for claude to settle (60s bound; matches Go waitForPaneReady).
|
||||||
|
for _ in $(seq 1 120); do
|
||||||
|
pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true)
|
||||||
|
if [[ "$pane" == *"❯"* || "$pane" == *"│"* ]]; then break; fi
|
||||||
|
sleep 0.5
|
||||||
|
done
|
||||||
|
tmux set-window-option -t "$target" @paliadin-scope chat
|
||||||
|
tmux set-window-option -t "$target" @fix-name claude-paliadin
|
||||||
|
# Bootstrap system prompt — reuses the Go service's prompt text.
|
||||||
|
# The Go side sends this via the `bootstrap` RPC on first turn instead
|
||||||
|
# of duplicating the prompt here. See §6.4.
|
||||||
|
fi
|
||||||
|
echo "$target"
|
||||||
|
}
|
||||||
|
|
||||||
|
case "$verb" in
|
||||||
|
health)
|
||||||
|
# Liveness check — used by paliad to short-circuit when mRiver is offline.
|
||||||
|
# Returns "ok" iff tmux + claude are reachable.
|
||||||
|
tmux has-session -t "$TMUX_SESSION" 2>/dev/null \
|
||||||
|
|| tmux new-session -d -s "$TMUX_SESSION"
|
||||||
|
command -v claude >/dev/null && echo ok || { echo no-claude; exit 1; }
|
||||||
|
;;
|
||||||
|
|
||||||
|
bootstrap)
|
||||||
|
# First-turn-only: ensure pane exists and inject the system prompt.
|
||||||
|
# $1 = base64-encoded prompt body (avoids quoting hell).
|
||||||
|
target=$(ensure_pane)
|
||||||
|
prompt=$(printf '%s' "${argv[1]:?missing prompt}" | base64 -d)
|
||||||
|
tmux send-keys -t "$target" -l -- "$prompt"
|
||||||
|
tmux send-keys -t "$target" Enter
|
||||||
|
sleep 2 # give claude a moment to absorb
|
||||||
|
echo ok
|
||||||
|
;;
|
||||||
|
|
||||||
|
run-turn)
|
||||||
|
# $1 = turn_id (UUID); $2 = base64-encoded user message.
|
||||||
|
turn_id="${argv[1]:?missing turn_id}"
|
||||||
|
[[ "$turn_id" =~ $TURN_ID_RE ]] || { echo >&2 "bad turn_id"; exit 2; }
|
||||||
|
msg=$(printf '%s' "${argv[2]:?missing message}" | base64 -d)
|
||||||
|
target=$(ensure_pane)
|
||||||
|
out="$RESPONSE_DIR/$turn_id.txt"
|
||||||
|
rm -f "$out"
|
||||||
|
# Envelope matches what paliadin_prompt.go expects.
|
||||||
|
tmux send-keys -t "$target" -l -- "[PALIADIN:$turn_id] $msg"
|
||||||
|
tmux send-keys -t "$target" Enter
|
||||||
|
# Poll for the response file. Same shape as Go pollForResponse.
|
||||||
|
for _ in $(seq 1 $((TIMEOUT_S * 5))); do
|
||||||
|
if [[ -s "$out" ]]; then
|
||||||
|
sleep 0.05 # settle
|
||||||
|
cat "$out"
|
||||||
|
rm -f "$out"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
sleep 0.2
|
||||||
|
done
|
||||||
|
echo >&2 "paliadin: response timeout after ${TIMEOUT_S}s"
|
||||||
|
exit 124
|
||||||
|
;;
|
||||||
|
|
||||||
|
reset)
|
||||||
|
# /clear the conversation; next turn starts fresh.
|
||||||
|
target=$(ensure_pane)
|
||||||
|
tmux send-keys -t "$target" -l -- "/clear"
|
||||||
|
tmux send-keys -t "$target" Enter
|
||||||
|
echo ok
|
||||||
|
;;
|
||||||
|
|
||||||
|
*)
|
||||||
|
echo >&2 "paliadin-shim: unknown verb '$verb'"
|
||||||
|
exit 2
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
```
|
||||||
|
|
||||||
|
Why a shim instead of raw tmux-over-SSH:
|
||||||
|
|
||||||
|
- One SSH round-trip per turn (~50 ms over tailnet) vs ~10–20 round-trips for the granular pattern.
|
||||||
|
- Argument validation lives in one place (UUID regex on turn_id, base64 for messages, fixed verb list) — easier to audit than a regex over `$SSH_ORIGINAL_COMMAND` matching `tmux send-keys …`.
|
||||||
|
- mRiver-side concerns (response polling, settle delays, pane-readiness) stay on mRiver, which is where the tmux state lives. The Go service stops caring about local file polling at all.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Sub-design C — Service-layer integration, routing, reliability
|
||||||
|
|
||||||
|
### 6.1 Interface split
|
||||||
|
|
||||||
|
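The current `*PaliadinService` becomes an interface with two working implementations: `LocalPaliadinService` (the existing tmux code, renamed) and `RemotePaliadinService` (the new SSH code), plus a `DisabledPaliadinService` stub for hosts with neither (§6.2). Construction picks one at startup: `PALIADIN_REMOTE_HOST` selects the remote path, otherwise local tmux availability decides between local and disabled.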
|
||||||
|
|
||||||
|
```go
|
||||||
|
// internal/services/paliadin.go (after refactor)
|
||||||
|
|
||||||
|
type Paliadin interface {
|
||||||
|
RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error)
|
||||||
|
ResetSession(ctx context.Context) error
|
||||||
|
ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error)
|
||||||
|
Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error)
|
||||||
|
IsOwner(ctx context.Context, userID uuid.UUID) (bool, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// LocalPaliadinService wraps the current tmux PoC (laptop / dev path).
|
||||||
|
type LocalPaliadinService struct { /* identical to today's PaliadinService */ }
|
||||||
|
|
||||||
|
// RemotePaliadinService talks to a paliadin-shim over SSH on mRiver.
|
||||||
|
type RemotePaliadinService struct {
|
||||||
|
db *sqlx.DB
|
||||||
|
users *UserService
|
||||||
|
sshHost string // 100.99.98.203
|
||||||
|
sshPort int // 22022 — bypasses Tailscale SSH on :22 (see §4.5)
|
||||||
|
sshUser string // m
|
||||||
|
sshKeyPath string // /tmp/paliadin-id_ed25519-<rand>
|
||||||
|
knownHosts string // /tmp/paliadin-known_hosts
|
||||||
|
turnMu sync.Mutex
|
||||||
|
|
||||||
|
// Health-check cache.
|
||||||
|
healthMu sync.Mutex
|
||||||
|
healthOK bool
|
||||||
|
healthCheckedAt time.Time
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
DB access (`ListRecentTurns`, `Stats`, `IsOwner`) is identical for both — they only read `paliad.paliadin_turns`. They live in a shared `paliadinDB` helper struct embedded in both implementations.
|
||||||
|
|
||||||
|
### 6.2 Wiring at startup
|
||||||
|
|
||||||
|
```go
|
||||||
|
// cmd/server/main.go (excerpt)
|
||||||
|
var paliadin services.Paliadin
|
||||||
|
remoteHost := os.Getenv("PALIADIN_REMOTE_HOST")
|
||||||
|
switch {
|
||||||
|
case remoteHost != "":
|
||||||
|
keyPath, err := loadPaliadinSSHKey()
|
||||||
|
if err != nil { log.Fatalf("paliadin: load ssh key: %v", err) }
|
||||||
|
if keyPath == "" { log.Fatalf("paliadin: PALIADIN_REMOTE_HOST set but no PALIADIN_SSH_PRIVATE_KEY") }
|
||||||
|
knownHosts, err := loadPaliadinKnownHosts()
|
||||||
|
if err != nil { log.Fatalf("paliadin: load known_hosts: %v", err) }
|
||||||
|
port, _ := strconv.Atoi(cmpOr(os.Getenv("PALIADIN_REMOTE_PORT"), "22022"))
|
||||||
|
paliadin = services.NewRemotePaliadinService(db, userSvc, services.RemotePaliadinConfig{
|
||||||
|
SSHHost: remoteHost,
|
||||||
|
SSHPort: port,
|
||||||
|
SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"),
|
||||||
|
SSHKeyPath: keyPath,
|
||||||
|
KnownHostsPath: knownHosts,
|
||||||
|
})
|
||||||
|
log.Printf("paliadin: remote mode → ssh %s@%s:%d", cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"), remoteHost, port)
|
||||||
|
case localTmuxAvailable():
|
||||||
|
paliadin = services.NewLocalPaliadinService(db, userSvc, "", "")
|
||||||
|
log.Printf("paliadin: local tmux mode")
|
||||||
|
default:
|
||||||
|
paliadin = services.NewDisabledPaliadinService(db, userSvc)
|
||||||
|
log.Printf("paliadin: disabled (no remote host, no local tmux)")
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The disabled state exists today only implicitly, via the `ErrTmuxUnavailable` path; an explicit `NewDisabledPaliadinService` constructor gives it a clear name and means the handler doesn't have to special-case `nil`.
|
||||||
|
|
||||||
|
### 6.3 SSH invocation pattern
|
||||||
|
|
||||||
|
`RemotePaliadinService` runs every RPC through the same helper:
|
||||||
|
|
||||||
|
```go
|
||||||
|
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
sshArgs := []string{
|
||||||
|
"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
|
||||||
|
"-i", s.sshKeyPath,
|
||||||
|
"-p", strconv.Itoa(s.sshPort), // 22022 — bypasses Tailscale SSH on :22
|
||||||
|
"-o", "IdentitiesOnly=yes", // don't fall back to other keys
|
||||||
|
"-o", "UserKnownHostsFile=" + s.knownHostsPath,
|
||||||
|
"-o", "StrictHostKeyChecking=yes",
|
||||||
|
"-o", "BatchMode=yes",
|
||||||
|
"-o", "ConnectTimeout=3",
|
||||||
|
"-o", "ServerAliveInterval=10",
|
||||||
|
"-o", "ServerAliveCountMax=3",
|
||||||
|
s.sshUser + "@" + s.sshHost,
|
||||||
|
"--",
|
||||||
|
}
|
||||||
|
sshArgs = append(sshArgs, args...)
|
||||||
|
c, cancel := context.WithTimeout(ctx, 70*time.Second) // shim has its own 60s; +10s for SSH overhead
|
||||||
|
defer cancel()
|
||||||
|
cmd := exec.CommandContext(c, "ssh", sshArgs...)
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
cmd.Stdout = &stdout; cmd.Stderr = &stderr
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
return nil, fmt.Errorf("paliadin: ssh shim %v: %w (stderr: %s)", args, err, stderr.String())
|
||||||
|
}
|
||||||
|
return stdout.Bytes(), nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
`RunTurn` becomes:
|
||||||
|
|
||||||
|
```go
|
||||||
|
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
||||||
|
s.turnMu.Lock()
|
||||||
|
defer s.turnMu.Unlock()
|
||||||
|
|
||||||
|
if err := s.healthGate(ctx); err != nil {
|
||||||
|
return nil, err // ErrMRiverUnreachable, picked up by handler
|
||||||
|
}
|
||||||
|
|
||||||
|
turnID := uuid.New()
|
||||||
|
started := time.Now().UTC()
|
||||||
|
if err := s.insertTurnRow(ctx, …); err != nil { return nil, err }
|
||||||
|
|
||||||
|
// First-turn-only: bootstrap the system prompt on mRiver. Detected by
|
||||||
|
// the per-process `bootstrapped` flag (§6.4), so it re-runs once after every paliad restart.
|
||||||
|
if err := s.ensureBootstrapped(ctx); err != nil {
|
||||||
|
_ = s.markTurnError(ctx, turnID, "bootstrap_failed")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
msg := sanitiseForTmux(req.UserMessage)
|
||||||
|
msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
|
||||||
|
body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
|
||||||
|
if err != nil {
|
||||||
|
_ = s.markTurnError(ctx, turnID, classifySSHError(err))
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same trailer-parse + audit-row writes as Local, factored into shared helper.
|
||||||
|
return s.completeTurnFromBody(ctx, turnID, started, string(body))
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.4 System prompt bootstrap
|
||||||
|
|
||||||
|
The local PoC calls `paliadinSystemPrompt(s.responseDir)` once when it creates the pane. The remote path needs the same hook. Two options that don't require duplicating the German prompt body to mRiver:
|
||||||
|
|
||||||
|
- **Lazy bootstrap (chosen):** the first `RunTurn` after a paliad-prod restart sends the system prompt via `bootstrap` RPC, then runs the actual turn. Subsequent turns skip the bootstrap. State is per-process: `RemotePaliadinService.bootstrapped` boolean guarded by mutex.
|
||||||
|
- Eager bootstrap at startup is rejected — it forces every container start to wait for mRiver to be online, which couples paliad's boot to mRiver's availability.
|
||||||
|
|
||||||
|
Lazy bootstrap means the very first turn after a paliad redeploy pays a ~3 s extra cost (claude pane spin-up + system prompt absorb). Acceptable for a single-user PoC.
|
||||||
|
|
||||||
|
### 6.5 Health-check gating (`mriver_unreachable`)
|
||||||
|
|
||||||
|
Every `RunTurn` first calls `healthGate(ctx)`:
|
||||||
|
|
||||||
|
- Cached for 10 s. If last check was <10 s ago and was OK, skip the probe.
|
||||||
|
- Otherwise: `s.callShim(ctx, "health")` with a 3 s timeout. On success, set cache OK; on failure, return `ErrMRiverUnreachable`.
|
||||||
|
|
||||||
|
Why 10 s: short enough that "I just woke my laptop" propagates inside one user retry; long enough that a busy chat doesn't probe on every turn.
|
||||||
|
|
||||||
|
```go
|
||||||
|
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
|
||||||
|
|
||||||
|
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
|
||||||
|
s.healthMu.Lock()
|
||||||
|
defer s.healthMu.Unlock()
|
||||||
|
if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
c, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
out, err := s.callShim(c, "health")
|
||||||
|
s.healthCheckedAt = time.Now()
|
||||||
|
if err != nil || strings.TrimSpace(string(out)) != "ok" {
|
||||||
|
s.healthOK = false
|
||||||
|
return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err)
|
||||||
|
}
|
||||||
|
s.healthOK = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.6 Friendly error code (extends t-paliad-150)
|
||||||
|
|
||||||
|
`friendlyErrorMessage` already maps `tmux_unavailable` to a localised message. We add one new code:
|
||||||
|
|
||||||
|
- `mriver_unreachable` → DE: *"mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit `./paliad`."* / EN: *"mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with `./paliad`."*
|
||||||
|
|
||||||
|
Implementation: one new `case` in the SSE-error switch in `frontend/src/client/paliadin.ts`'s `friendlyErrorMessage`, plus matching i18n keys (`paliadin.error.mriver_unreachable.de` / `.en`). Server-side: `paliadin` HTTP handler maps `errors.Is(err, services.ErrMRiverUnreachable)` to `event: error\ndata: {"code":"mriver_unreachable","message":"..."}\n\n`.
|
||||||
|
|
||||||
|
### 6.7 Rate limit
|
||||||
|
|
||||||
|
A runaway loop on the paliad side could DOS the SSH connection. Cheapest cap: enforce one in-flight turn at a time via `turnMu` (already exists in the local PoC). On top of that, a rolling cap of N=20 turns/min in `RemotePaliadinService` rejects with `ErrRateLimited` (mapped to a friendly `paliadin.error.rate_limited`). PoC has one user (m); the cap is a paranoid safety, not a real throttle.
|
||||||
|
|
||||||
|
### 6.8 What about ControlMaster?
|
||||||
|
|
||||||
|
Decision-2's chosen path (server-side shim with one RPC per turn) makes ControlMaster optional. The shim collapses ~10 raw-tmux ops into a single SSH connect — that's already the latency win ControlMaster would buy.
|
||||||
|
|
||||||
|
Adding it on top would save ~30–50 ms per turn but adds:
|
||||||
|
|
||||||
|
- A persistent `~/.ssh/cm-*` socket inside the container.
|
||||||
|
- Cleanup logic on shutdown.
|
||||||
|
- A subtle interaction with the SSH BatchMode + ConnectTimeout settings.
|
||||||
|
|
||||||
|
Verdict: skip ControlMaster in v1. If turn latency over Tailscale is measured >300 ms in practice and hot enough to matter, add it in a follow-up; the call site is one helper.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Phasing
|
||||||
|
|
||||||
|
### Phase A — manual proof-of-concept (no Dockerfile change yet)
|
||||||
|
|
||||||
|
Goal: validate the round-trip end-to-end on a deployed paliad, before touching the image.
|
||||||
|
|
||||||
|
**Phase A.0 (DONE 2026-05-07 23:31):** SSH+shim end-to-end on the tailnet.
|
||||||
|
|
||||||
|
1. ✅ **Generate keypair** on mRiver: `ssh-keygen -t ed25519 -N "" -C "paliad-prod" -f ~/.paliad-staging/paliad-prod-key`. Fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`.
|
||||||
|
2. ✅ **Commit shim** to `scripts/paliadin-shim` and **install** at `/home/m/.local/bin/paliadin-shim`, `chmod 755`.
|
||||||
|
3. ✅ **Write authorized_keys** with public key + `command=`/`from="100.99.98.201"`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions (§5.2).
|
||||||
|
4. ✅ **Add port 22022 socket drop-in** at `/etc/systemd/system/ssh.socket.d/paliad.conf`, `systemctl daemon-reload && systemctl restart ssh.socket`. Both `:22` (Tailscale SSH for m) and `:22022` (real OpenSSH for paliad) listening (§4.5).
|
||||||
|
5. ✅ **Capture mRiver:22022 host key**: `ssh-keyscan -p 22022 -t ed25519 100.99.98.203 > ~/.paliad-staging/known_hosts` from mLake. Fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`.
|
||||||
|
6. ✅ **Smoke-test from mLake** (without paliad container, just raw ssh from mLake's host shell):
|
||||||
|
```
|
||||||
|
ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \
|
||||||
|
-o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \
|
||||||
|
-p 22022 m@100.99.98.203 health
|
||||||
|
→ ok
|
||||||
|
ssh … run-turn $(uuidgen) "$(printf 'Sag …' | base64 -w0)"
|
||||||
|
→ "test ok" (3.4 s round-trip including a real Claude response)
|
||||||
|
```
|
||||||
|
7. ✅ **from= rejection verified**: the same key from mRiver itself (`100.99.98.203`) → `Permission denied (publickey,password)` as expected.
|
||||||
|
|
||||||
|
**Phase A.5 (PENDING m's hands):** validate `network_mode: host` + traefik routing on prod paliad.de.
|
||||||
|
|
||||||
|
- Branch the live `docker-compose.yml` on a temp branch.
|
||||||
|
- Add `network_mode: host` to the `web` service; remove `expose: ["8080"]`.
|
||||||
|
- Push to trigger a Dokploy redeploy.
|
||||||
|
- `curl --connect-timeout 5 -sSI https://paliad.de/` — expect 200 (or login redirect), NOT 502.
|
||||||
|
- If 502: revert the temp branch (`git revert HEAD && git push`); revisit decision 1 in a follow-up issue.
|
||||||
|
- If 200: keep the host-mode change; ready for Phase B.
|
||||||
|
|
||||||
|
This is **m's call to execute** — it briefly touches prod paliad.de. Inventor/coder should not flip prod compose without explicit go-ahead. Rollback is one revert + redeploy.
|
||||||
|
|
||||||
|
**Phase A.6 (after A.5 passes):** smoke-test SSH from inside the paliad-prod container itself (the real container, not just the mLake host shell):
|
||||||
|
```
|
||||||
|
docker exec -it <paliad-container> sh
|
||||||
|
apk add --no-cache openssh-client # one-shot, before Dockerfile change
|
||||||
|
ssh -F /dev/null -i /tmp/paliad-prod-key -o UserKnownHostsFile=/tmp/paliad-known_hosts \
|
||||||
|
-o StrictHostKeyChecking=yes -o IdentitiesOnly=yes -o BatchMode=yes \
|
||||||
|
-p 22022 m@100.99.98.203 health
|
||||||
|
# expected: "ok"
|
||||||
|
```
|
||||||
|
This proves the container's host-mode networking actually delivers a tailnet connect.
|
||||||
|
|
||||||
|
**Phase A.7:** wire env vars manually via Dokploy UI for one deploy; confirm `/paliadin` chat works against mRiver from paliad.de.
|
||||||
|
|
||||||
|
If A.5 fails: the design rolls back to a sidecar in a new issue (decision 1 follow-up). The SSH path (A.0) and traefik path (A.5) are independent — A.0 is already proven; only A.5+ is at risk.
|
||||||
|
|
||||||
|
### Phase B — bake into Dockerfile + Dokploy secrets
|
||||||
|
|
||||||
|
1. Dockerfile: add `openssh-client` to the final stage (§4.3).
|
||||||
|
2. compose: add `network_mode: host` and the five new env vars (§4.1; listed in §10).
|
||||||
|
3. Dokploy secrets: register `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m`, `PALIADIN_SSH_PRIVATE_KEY=...`, `PALIADIN_KNOWN_HOSTS=...`.
|
||||||
|
4. Code: refactor `PaliadinService` to the interface split (§6.1–§6.2). New file `internal/services/paliadin_remote.go`. Tests: `paliadin_remote_test.go` mocks `callShim` to verify `RunTurn` audit-row writes, error mapping, and `healthGate` caching.
|
||||||
|
5. Ship under one PR; tag t-paliad-151 done.
|
||||||
|
|
||||||
|
### Phase C — friendly errors + monitoring
|
||||||
|
|
||||||
|
1. `paliadin.error.mriver_unreachable` i18n keys + `friendlyErrorMessage` case (§6.6).
|
||||||
|
2. `/admin/paliadin` shows last health-probe result + last successful turn timestamp.
|
||||||
|
3. Optional: `mai-mesh` integration to surface mRiver-offline events to m on Telegram (out-of-band; not gating).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Security review summary
|
||||||
|
|
||||||
|
| Risk | Mitigation |
|
||||||
|
|---|---|
|
||||||
|
| Stolen private key → arbitrary SSH on mRiver | `command=` shim restriction + `from="100.99.98.201"` + ed25519 key + private key only in Dokploy secret store (encrypted at rest); paliad route uses port 22022 where real OpenSSH enforces all of the above |
|
||||||
|
| Stolen private key → tailnet-wide SSH from non-mLake host | `from="100.99.98.201"` clause (verified: rejected from mRiver itself in Phase A.0) |
|
||||||
|
| Tailscale SSH on `:22` bypasses `authorized_keys` | The paliad-prod key's `command=` restriction is not enforced on `:22`. Mitigation: paliad always dials `:22022`, which is real OpenSSH. m's interactive `tailscale ssh m@mriver` on `:22` continues to be governed by Tailscale ACLs, separate from paliad's identity. |
|
||||||
|
| Container compromise → key extraction | Key written to tmpfile chmod 600, only root inside container can read; alpine container has no shell-on-error trampolines |
|
||||||
|
| Host-key MITM during connect | Pinned `known_hosts`; `StrictHostKeyChecking=yes` |
|
||||||
|
| Shim argument injection (e.g. via `run-turn $(rm -rf /)`) | Shim parses positional args from `$SSH_ORIGINAL_COMMAND` via `read -r -a`; never passes args to a subshell `eval`; turn_id validated by UUID regex; message body always base64-decoded into a single shell variable, never re-evaluated |
|
||||||
|
| Runaway loop → SSH flood | Single-flight `turnMu` + 20/min rolling cap |
|
||||||
|
| `network_mode: host` widens blast radius | The `command=` + `from=` restrictions on mRiver mean container compromise = "can run shim verbs against mRiver only", not "shell on mRiver" |
|
||||||
|
| PaliadinOwnerEmail bypass | Unchanged from PoC: gate is in Go (`/paliadin` 404s for any other user). Even if mRiver SSH key leaks, attacker still needs paliad session as `m@hoganlovells.com`. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Out-of-scope clarifications (for review)
|
||||||
|
|
||||||
|
These were called out in the issue but the design intentionally does not solve them, to keep v1 tight. Each is acknowledged so review knows it wasn't an oversight:
|
||||||
|
|
||||||
|
- **Wake-on-LAN of mRiver:** out of scope. v1's UX when mRiver is asleep is the friendly error from §6.6. Future work: integrate with `mai-mesh` capability fallback.
|
||||||
|
- **Multi-host failover:** out of scope. Only mRiver is targeted.
|
||||||
|
- **Anthropic API fallback when mRiver offline:** out of scope per CLAUDE.md (`ANTHROPIC_API_KEY` reserved for production-v1, unused in PoC).
|
||||||
|
- **ControlMaster:** v1 ships without; revisit if turn latency >300 ms in practice (§6.8).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. File-level deliverables (for the coder shift)
|
||||||
|
|
||||||
|
When this design is approved and the coder shift starts, the work splits roughly into:
|
||||||
|
|
||||||
|
- `Dockerfile` — `+openssh-client`.
|
||||||
|
- `docker-compose.yml` — `network_mode: host`, five new env entries (`PALIADIN_REMOTE_HOST`, `PALIADIN_REMOTE_PORT`, `PALIADIN_REMOTE_USER`, `PALIADIN_SSH_PRIVATE_KEY`, `PALIADIN_KNOWN_HOSTS`).
|
||||||
|
- `internal/services/paliadin.go` — extract `Paliadin` interface; rename existing to `LocalPaliadinService`; pull DB-only methods (`ListRecentTurns`, `Stats`, `IsOwner`) into a shared embedded `paliadinDB` so both implementations get them for free.
|
||||||
|
- `internal/services/paliadin_remote.go` — new file: `RemotePaliadinService`, `RemotePaliadinConfig` (with `SSHPort`), `callShim`, `healthGate`, `ensureBootstrapped`, `classifySSHError`, `ErrMRiverUnreachable`.
|
||||||
|
- `internal/services/paliadin_remote_test.go` — unit tests with a mocked `callShim`.
|
||||||
|
- `cmd/server/main.go` — env-var-based wiring (§6.2), `loadPaliadinSSHKey`, `loadPaliadinKnownHosts`, `PALIADIN_REMOTE_PORT` parse with default `22022`.
|
||||||
|
- `frontend/src/client/paliadin.ts` — one `case` in `friendlyErrorMessage` for `mriver_unreachable`.
|
||||||
|
- `frontend/src/i18n.ts` — two new keys (`paliadin.error.mriver_unreachable.de` / `.en`).
|
||||||
|
- `scripts/paliadin-shim` — server-side script (§5.4); already shipped + installed on mRiver during Phase A.0, not part of any container. Repo location chosen so the security-relevant script is version-controlled.
|
||||||
|
- `docs/project-status.md` — note Phase 0.5 (PoC) → Phase 0.6 (Tailscale-SSH prod route).
|
||||||
|
- **mRiver host setup (one-time, already done in Phase A.0):** `/etc/systemd/system/ssh.socket.d/paliad.conf` (port 22022 listen drop-in); `~/.ssh/authorized_keys` (paliad-prod public key with restrictions); `/home/m/.local/bin/paliadin-shim` (executable). These are NOT in the repo because they live on m's laptop; `docs/project-status.md` should reference them.
|
||||||
|
|
||||||
|
No DB migrations needed — `paliad.paliadin_turns` schema already covers everything (`error_code` field already accepts free-form codes including `mriver_unreachable`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Open questions for review
|
||||||
|
|
||||||
|
- **Q (m), still open:** Phase A.5 (traefik+host-mode on prod paliad.de) is not yet executed. m drives this; rollback is one revert. Dokploy doc check before flipping is recommended but not blocking.
|
||||||
|
- **Q (m), resolved 2026-05-07 23:50:** shim location → repo (`scripts/paliadin-shim`, committed in `0248411`). Version-controlled and auditable.
|
||||||
|
- **Q (m), still open:** `ANTHROPIC_API_KEY` env var reservation in compose comments — keep for production-v1, or strip now? Not blocking either phase; defer.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Phase A.0 completion summary (2026-05-07 23:50)
|
||||||
|
|
||||||
|
**Coder shift (noether) executed Phase A.0 in full:**
|
||||||
|
|
||||||
|
1. ✅ shim committed at `scripts/paliadin-shim` (commit `0248411`, repo-version-controlled)
|
||||||
|
2. ✅ shim installed at `/home/m/.local/bin/paliadin-shim` on mRiver
|
||||||
|
3. ✅ ed25519 keypair `paliad-prod` generated, public-key fingerprint `SHA256:5uV8v872F/IhJycjjq0crFue/emAYfw71N9bxTvkl9c`, private key staged at `~/.paliad-staging/paliad-prod-key` on mRiver (mode 600)
|
||||||
|
4. ✅ `~/.ssh/authorized_keys` written with `command=`/`from=`/no-pty/no-port-forwarding/no-agent-forwarding/no-X11-forwarding/no-user-rc restrictions
|
||||||
|
5. ✅ `ssh.socket` drop-in installed at `/etc/systemd/system/ssh.socket.d/paliad.conf`; both `:22` and `:22022` listening
|
||||||
|
6. ✅ host key for `:22022` captured at `~/.paliad-staging/known_hosts` (fingerprint `SHA256:HPoUzy60Cb8yLERIBQcB2mHihNST3NaTODx5Ypd1XpA`)
|
||||||
|
7. ✅ end-to-end SSH+shim+Claude run-turn validated from mLake → mRiver:22022 (3.4 s round-trip)
|
||||||
|
8. ✅ `from="100.99.98.201"` rejection verified
|
||||||
|
|
||||||
|
**Five values ready for Dokploy registration** — two secrets plus three plain settings (m to copy the secrets from `~/.paliad-staging/` on mRiver):
|
||||||
|
- `PALIADIN_SSH_PRIVATE_KEY` ← `cat ~/.paliad-staging/paliad-prod-key`
|
||||||
|
- `PALIADIN_KNOWN_HOSTS` ← `cat ~/.paliad-staging/known_hosts`
|
||||||
|
- `PALIADIN_REMOTE_HOST=100.99.98.203`, `PALIADIN_REMOTE_PORT=22022`, `PALIADIN_REMOTE_USER=m`
|
||||||
|
|
||||||
|
**Phase A.5 (traefik+host-mode test) and Phase A.6/A.7 (in-container SSH smoke + paliad/paliadin end-to-end) await m's hands** — they touch prod paliad.de.
|
||||||
|
|
||||||
|
**Phase B (Dockerfile + Go interface split + Dokploy secrets) is unblocked from a code perspective** — but should not merge until Phase A.5 confirms the host-mode networking trade-off is acceptable.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Inventor design + coder Phase A.0 complete.** Awaiting m for Phase A.5 traefik validation before the coder writes the Go interface split.
|
||||||
@@ -1558,6 +1558,10 @@ const translations: Record<Lang, Record<string, string>> = {
|
|||||||
"paliadin.stop": "Stop",
|
"paliadin.stop": "Stop",
|
||||||
"paliadin.reset": "Neue Unterhaltung",
|
"paliadin.reset": "Neue Unterhaltung",
|
||||||
"paliadin.error.local_only": "Paliadin läuft nur lokal. Diese Instanz hat kein tmux/claude installiert — lokal mit ./paliad starten.",
|
"paliadin.error.local_only": "Paliadin läuft nur lokal. Diese Instanz hat kein tmux/claude installiert — lokal mit ./paliad starten.",
|
||||||
|
"paliadin.error.mriver_unreachable": "mRiver ist offline — Paliadin nicht erreichbar. Mach mRiver an, oder nutze Paliadin lokal mit ./paliad.",
|
||||||
|
"paliadin.error.shim_auth_failed": "Paliadin-Authentifizierung fehlgeschlagen. SSH-Schlüssel oder Berechtigung auf mRiver prüfen.",
|
||||||
|
"paliadin.error.shim_error": "Paliadin-Fehler auf mRiver. tmux/claude-Pane prüfen.",
|
||||||
|
"paliadin.error.timeout": "Paliadin antwortet nicht (Timeout 60s). Nochmal versuchen.",
|
||||||
"paliadin.error.connection_lost": "Verbindung verloren.",
|
"paliadin.error.connection_lost": "Verbindung verloren.",
|
||||||
"paliadin.error.upstream": "Fehler beim Senden.",
|
"paliadin.error.upstream": "Fehler beim Senden.",
|
||||||
"nav.admin.paliadin": "Paliadin Monitor",
|
"nav.admin.paliadin": "Paliadin Monitor",
|
||||||
@@ -3553,6 +3557,10 @@ const translations: Record<Lang, Record<string, string>> = {
|
|||||||
"paliadin.stop": "Stop",
|
"paliadin.stop": "Stop",
|
||||||
"paliadin.reset": "New conversation",
|
"paliadin.reset": "New conversation",
|
||||||
"paliadin.error.local_only": "Paliadin only runs locally. This instance has no tmux/claude installed — start it locally via ./paliad.",
|
"paliadin.error.local_only": "Paliadin only runs locally. This instance has no tmux/claude installed — start it locally via ./paliad.",
|
||||||
|
"paliadin.error.mriver_unreachable": "mRiver is offline — Paliadin can't reach it. Wake mRiver, or run Paliadin locally with ./paliad.",
|
||||||
|
"paliadin.error.shim_auth_failed": "Paliadin auth failed. Check the SSH key or authorized_keys on mRiver.",
|
||||||
|
"paliadin.error.shim_error": "Paliadin error on mRiver. Check the tmux/claude pane.",
|
||||||
|
"paliadin.error.timeout": "Paliadin didn't respond in time (60s). Try again.",
|
||||||
"paliadin.error.connection_lost": "Connection lost.",
|
"paliadin.error.connection_lost": "Connection lost.",
|
||||||
"paliadin.error.upstream": "Send failed.",
|
"paliadin.error.upstream": "Send failed.",
|
||||||
"nav.admin.paliadin": "Paliadin Monitor",
|
"nav.admin.paliadin": "Paliadin Monitor",
|
||||||
|
|||||||
@@ -210,8 +210,24 @@ function friendlyErrorMessage(data: unknown): string {
|
|||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const parsed = JSON.parse(data) as { code?: string };
|
const parsed = JSON.parse(data) as { code?: string };
|
||||||
if (parsed.code === "tmux_unavailable") {
|
switch (parsed.code) {
|
||||||
return t("paliadin.error.local_only");
|
case "tmux_unavailable":
|
||||||
|
// Local PoC path: paliad is running on a host without tmux/claude
|
||||||
|
// (typically the legacy laptop-only build).
|
||||||
|
return t("paliadin.error.local_only");
|
||||||
|
case "mriver_unreachable":
|
||||||
|
// t-paliad-151: prod path's mRiver is offline (laptop asleep, off
|
||||||
|
// tailnet, or paliadin-shim missing).
|
||||||
|
return t("paliadin.error.mriver_unreachable");
|
||||||
|
case "shim_auth_failed":
|
||||||
|
// SSH key wrong or authorized_keys drifted.
|
||||||
|
return t("paliadin.error.shim_auth_failed");
|
||||||
|
case "shim_error":
|
||||||
|
case "bootstrap_failed":
|
||||||
|
// Generic remote shim failure or system-prompt bootstrap error.
|
||||||
|
return t("paliadin.error.shim_error");
|
||||||
|
case "timeout":
|
||||||
|
return t("paliadin.error.timeout");
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
// Not JSON — fall through to the generic connection-lost message
|
// Not JSON — fall through to the generic connection-lost message
|
||||||
|
|||||||
@@ -1423,6 +1423,10 @@ export type I18nKey =
|
|||||||
| "paliadin.empty"
|
| "paliadin.empty"
|
||||||
| "paliadin.error.connection_lost"
|
| "paliadin.error.connection_lost"
|
||||||
| "paliadin.error.local_only"
|
| "paliadin.error.local_only"
|
||||||
|
| "paliadin.error.mriver_unreachable"
|
||||||
|
| "paliadin.error.shim_auth_failed"
|
||||||
|
| "paliadin.error.shim_error"
|
||||||
|
| "paliadin.error.timeout"
|
||||||
| "paliadin.error.upstream"
|
| "paliadin.error.upstream"
|
||||||
| "paliadin.heading"
|
| "paliadin.heading"
|
||||||
| "paliadin.input.placeholder"
|
| "paliadin.input.placeholder"
|
||||||
|
|||||||
@@ -69,10 +69,12 @@ type Services struct {
|
|||||||
Pin *services.PinService
|
Pin *services.PinService
|
||||||
CardLayout *services.CardLayoutService
|
CardLayout *services.CardLayoutService
|
||||||
|
|
||||||
// Paliadin is wired only when PALIADIN_ENABLED=true at boot
|
// Paliadin is wired when DATABASE_URL is set. The concrete backend
|
||||||
// (PoC; m's laptop only). On prod it stays nil and all /paliadin*
|
// is picked in cmd/server/main.go based on PALIADIN_REMOTE_HOST
|
||||||
// routes 404 because Register() skips registering them.
|
// (remote → mRiver via SSH) or local tmux availability. Stays nil
|
||||||
Paliadin *services.PaliadinService
|
// without DATABASE_URL; in that case the per-request handler gate
|
||||||
|
// 404s anyway.
|
||||||
|
Paliadin services.Paliadin
|
||||||
}
|
}
|
||||||
|
|
||||||
func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc *Services) {
|
func Register(mux *http.ServeMux, client *auth.Client, giteaAPIToken string, svc *Services) {
|
||||||
|
|||||||
@@ -39,10 +39,11 @@ func newDetachedContext(timeout time.Duration) (context.Context, context.CancelF
|
|||||||
return context.WithTimeout(context.Background(), timeout)
|
return context.WithTimeout(context.Background(), timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
// paliadinSvc is the live PaliadinService instance. nil when
|
// paliadinSvc is the live Paliadin backend. nil when DATABASE_URL was
|
||||||
// DATABASE_URL was unset (the service depends on the audit table).
|
// unset (the service depends on the audit table). Set by Register() at
|
||||||
// Set by Register() at boot.
|
// boot. The concrete type is decided in cmd/server/main.go: local-tmux
|
||||||
var paliadinSvc *services.PaliadinService
|
// PoC, remote-via-SSH (mRiver), or a disabled stub.
|
||||||
|
var paliadinSvc services.Paliadin
|
||||||
|
|
||||||
// requirePaliadinOwner gates every paliadin handler to the single
|
// requirePaliadinOwner gates every paliadin handler to the single
|
||||||
// owner email (services.PaliadinOwnerEmail = m). Anyone else gets a
|
// owner email (services.PaliadinOwnerEmail = m). Anyone else gets a
|
||||||
|
|||||||
@@ -1,23 +1,23 @@
|
|||||||
package services
|
package services
|
||||||
|
|
||||||
// PaliadinService — Phase 0 PoC of the in-app AI buddy (t-paliad-146).
|
// Paliadin — the in-app AI buddy. Two implementations of the same
|
||||||
|
// interface, picked at boot time (see cmd/server/main.go):
|
||||||
//
|
//
|
||||||
// Design: docs/design-paliadin-2026-05-07.md §0.5 (PoC track).
|
// - LocalPaliadinService — talks to a `claude` CLI in a local tmux
|
||||||
|
// session. The PoC path (t-paliad-146); used on m's laptop.
|
||||||
|
// - RemotePaliadinService — shells out to ssh on mRiver where the
|
||||||
|
// long-lived tmux+claude pane lives. The prod path (t-paliad-151);
|
||||||
|
// used by the paliad.de Dokploy container, which has no `claude`
|
||||||
|
// CLI of its own.
|
||||||
//
|
//
|
||||||
// Architecture: a long-lived `claude` process inside a tmux session.
|
// Designs:
|
||||||
// Prompts go in via `tmux send-keys -l`; responses come back via a
|
// - docs/design-paliadin-2026-05-07.md (PoC architecture)
|
||||||
// per-turn file the system prompt instructs Claude to write
|
// - docs/design-paliadin-tailscale-ssh-2026-05-07.md (remote routing)
|
||||||
// (Write(/tmp/paliadin/{turn_id}.txt)). The service polls that file,
|
|
||||||
// strips the [paliadin-meta] trailer block, parses the metadata, writes
|
|
||||||
// an audit row, and emits the response back to the SSE handler.
|
|
||||||
//
|
//
|
||||||
// The architecture is lifted (with adaptation to Go) from
|
// Both implementations share the audit-table I/O (paliadinDB) and the
|
||||||
// ~/dev/mVoice/server.py:250-380, which has been driving the goldi voice
|
// trailer parser. The conversation state (turn ordering, response file
|
||||||
// surface in production since 2026-Q1.
|
// polling) is split: Local owns the tmux pane directly; Remote delegates
|
||||||
//
|
// to the paliadin-shim on mRiver and reads the file there.
|
||||||
// PoC ONLY runs on m's laptop (PALIADIN_ENABLED=false on prod default).
|
|
||||||
// Hardcoded single-user, single-tmux-window scope. Do not attempt to
|
|
||||||
// deploy this to the Dokploy container — there is no `claude` CLI there.
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
@@ -50,12 +50,36 @@ import (
|
|||||||
// path to enabling Paliadin.
|
// path to enabling Paliadin.
|
||||||
const PaliadinOwnerEmail = "matthias.siebels@hoganlovells.com"
|
const PaliadinOwnerEmail = "matthias.siebels@hoganlovells.com"
|
||||||
|
|
||||||
// PaliadinService manages the tmux-claude PoC.
|
// Paliadin is the interface every Paliadin backend implements. Two
|
||||||
type PaliadinService struct {
|
// production implementations: LocalPaliadinService (local tmux+claude)
|
||||||
db *sqlx.DB
|
// and RemotePaliadinService (ssh+paliadin-shim on mRiver). A
|
||||||
|
// DisabledPaliadinService stub is constructed when neither is available
|
||||||
|
// so callers don't have to nil-check on every entry point.
|
||||||
|
type Paliadin interface {
|
||||||
|
RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error)
|
||||||
|
ResetSession(ctx context.Context) error
|
||||||
|
ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error)
|
||||||
|
Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error)
|
||||||
|
IsOwner(ctx context.Context, userID uuid.UUID) (bool, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// paliadinDB is the audit-table read/write surface shared by every
|
||||||
|
// Paliadin implementation. Embedded in LocalPaliadinService and
|
||||||
|
// RemotePaliadinService so they inherit IsOwner / ListRecentTurns /
|
||||||
|
// Stats and the per-turn row writers without duplication.
|
||||||
|
type paliadinDB struct {
|
||||||
|
db *sqlx.DB
|
||||||
|
users *UserService
|
||||||
|
}
|
||||||
|
|
||||||
|
// LocalPaliadinService runs the local tmux+claude PoC (t-paliad-146).
|
||||||
|
// Hardcoded single-user, single-tmux-window scope. Used on m's laptop;
|
||||||
|
// not deployed to prod (the Dokploy container has no `claude` CLI —
|
||||||
|
// see RemotePaliadinService for that path).
|
||||||
|
type LocalPaliadinService struct {
|
||||||
|
paliadinDB
|
||||||
tmuxSession string
|
tmuxSession string
|
||||||
responseDir string
|
responseDir string
|
||||||
users *UserService
|
|
||||||
|
|
||||||
// Cached pane target ("session:window-idx") once the voice window is
|
// Cached pane target ("session:window-idx") once the voice window is
|
||||||
// either discovered or created. Reset to "" if the pane dies.
|
// either discovered or created. Reset to "" if the pane dies.
|
||||||
@@ -74,7 +98,7 @@ type PaliadinService struct {
|
|||||||
//
|
//
|
||||||
// Returns (false, nil) for any other user — including unknown UUIDs and
|
// Returns (false, nil) for any other user — including unknown UUIDs and
|
||||||
// users without an email row. Errors only on DB failure.
|
// users without an email row. Errors only on DB failure.
|
||||||
func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) {
|
func (s *paliadinDB) IsOwner(ctx context.Context, userID uuid.UUID) (bool, error) {
|
||||||
var email string
|
var email string
|
||||||
err := s.db.QueryRowxContext(ctx,
|
err := s.db.QueryRowxContext(ctx,
|
||||||
`SELECT email FROM paliad.users WHERE id = $1`, userID).Scan(&email)
|
`SELECT email FROM paliad.users WHERE id = $1`, userID).Scan(&email)
|
||||||
@@ -87,19 +111,19 @@ func (s *PaliadinService) IsOwner(ctx context.Context, userID uuid.UUID) (bool,
|
|||||||
return strings.EqualFold(email, PaliadinOwnerEmail), nil
|
return strings.EqualFold(email, PaliadinOwnerEmail), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewPaliadinService wires the service. Call only when PALIADIN_ENABLED=true.
|
// NewLocalPaliadinService wires the local-tmux PoC backend. Falls back
|
||||||
func NewPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *PaliadinService {
|
// to default tmux session + response dir when env vars are empty.
|
||||||
|
func NewLocalPaliadinService(db *sqlx.DB, users *UserService, tmuxSession, responseDir string) *LocalPaliadinService {
|
||||||
if tmuxSession == "" {
|
if tmuxSession == "" {
|
||||||
tmuxSession = "paliad-paliadin"
|
tmuxSession = "paliad-paliadin"
|
||||||
}
|
}
|
||||||
if responseDir == "" {
|
if responseDir == "" {
|
||||||
responseDir = "/tmp/paliadin"
|
responseDir = "/tmp/paliadin"
|
||||||
}
|
}
|
||||||
return &PaliadinService{
|
return &LocalPaliadinService{
|
||||||
db: db,
|
paliadinDB: paliadinDB{db: db, users: users},
|
||||||
tmuxSession: tmuxSession,
|
tmuxSession: tmuxSession,
|
||||||
responseDir: responseDir,
|
responseDir: responseDir,
|
||||||
users: users,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -156,7 +180,7 @@ var ErrTmuxUnavailable = errors.New("paliadin: tmux unavailable")
|
|||||||
//
|
//
|
||||||
// PoC: serialised. The package-level turnMu enforces "one at a time".
|
// PoC: serialised. The package-level turnMu enforces "one at a time".
|
||||||
// m is the only user, so this is fine.
|
// m is the only user, so this is fine.
|
||||||
func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
func (s *LocalPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
||||||
s.turnMu.Lock()
|
s.turnMu.Lock()
|
||||||
defer s.turnMu.Unlock()
|
defer s.turnMu.Unlock()
|
||||||
|
|
||||||
@@ -238,7 +262,7 @@ func (s *PaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnRe
|
|||||||
|
|
||||||
// ResetSession sends `/clear` to the Claude pane so the next turn starts
|
// ResetSession sends `/clear` to the Claude pane so the next turn starts
|
||||||
// from a clean conversation. Used by the "New conversation" button.
|
// from a clean conversation. Used by the "New conversation" button.
|
||||||
func (s *PaliadinService) ResetSession(ctx context.Context) error {
|
func (s *LocalPaliadinService) ResetSession(ctx context.Context) error {
|
||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
target := s.paneTarget
|
target := s.paneTarget
|
||||||
s.mu.Unlock()
|
s.mu.Unlock()
|
||||||
@@ -254,7 +278,7 @@ func (s *PaliadinService) ResetSession(ctx context.Context) error {
|
|||||||
|
|
||||||
// ListRecentTurns reads the last N turns visible to the caller.
|
// ListRecentTurns reads the last N turns visible to the caller.
|
||||||
// global_admin sees everything; everyone else sees their own.
|
// global_admin sees everything; everyone else sees their own.
|
||||||
func (s *PaliadinService) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) {
|
func (s *paliadinDB) ListRecentTurns(ctx context.Context, callerID uuid.UUID, limit int) ([]PaliadinTurn, error) {
|
||||||
if limit <= 0 || limit > 200 {
|
if limit <= 0 || limit > 200 {
|
||||||
limit = 50
|
limit = 50
|
||||||
}
|
}
|
||||||
@@ -302,7 +326,7 @@ type PaliadinPromptCount struct {
|
|||||||
// Stats computes the dashboard aggregate. global_admin sees everything;
|
// Stats computes the dashboard aggregate. global_admin sees everything;
|
||||||
// everyone else sees their own slice (PoC has only m, but the policy
|
// everyone else sees their own slice (PoC has only m, but the policy
|
||||||
// matches RLS on the table).
|
// matches RLS on the table).
|
||||||
func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) {
|
func (s *paliadinDB) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinStats, error) {
|
||||||
stats := &PaliadinStats{
|
stats := &PaliadinStats{
|
||||||
ByClassifier: map[string]int{},
|
ByClassifier: map[string]int{},
|
||||||
DailyCounts: []PaliadinDailyCount{},
|
DailyCounts: []PaliadinDailyCount{},
|
||||||
@@ -404,7 +428,7 @@ func (s *PaliadinService) Stats(ctx context.Context, callerID uuid.UUID) (*Palia
|
|||||||
|
|
||||||
// ensurePane returns the tmux target ("session:window-idx") of the live
|
// ensurePane returns the tmux target ("session:window-idx") of the live
|
||||||
// Claude pane, creating both session and window if missing.
|
// Claude pane, creating both session and window if missing.
|
||||||
func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) {
|
func (s *LocalPaliadinService) ensurePane(ctx context.Context) (string, error) {
|
||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
|
|
||||||
@@ -468,7 +492,7 @@ func (s *PaliadinService) ensurePane(ctx context.Context) (string, error) {
|
|||||||
return target, nil
|
return target, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) findChatWindow(ctx context.Context) string {
|
func (s *LocalPaliadinService) findChatWindow(ctx context.Context) string {
|
||||||
out, err := runTmuxOut(ctx, "list-windows", "-t", s.tmuxSession,
|
out, err := runTmuxOut(ctx, "list-windows", "-t", s.tmuxSession,
|
||||||
"-F", "#{window_index}")
|
"-F", "#{window_index}")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -485,14 +509,14 @@ func (s *PaliadinService) findChatWindow(ctx context.Context) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) paneAlive(ctx context.Context, target string) bool {
|
func (s *LocalPaliadinService) paneAlive(ctx context.Context, target string) bool {
|
||||||
if err := runTmux(ctx, "has-session", "-t", target); err != nil {
|
if err := runTmux(ctx, "has-session", "-t", target); err != nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error {
|
func (s *LocalPaliadinService) waitForPaneReady(ctx context.Context, target string, timeout time.Duration) error {
|
||||||
deadline := time.Now().Add(timeout)
|
deadline := time.Now().Add(timeout)
|
||||||
for time.Now().Before(deadline) {
|
for time.Now().Before(deadline) {
|
||||||
select {
|
select {
|
||||||
@@ -509,7 +533,7 @@ func (s *PaliadinService) waitForPaneReady(ctx context.Context, target string, t
|
|||||||
return fmt.Errorf("pane %s not ready within %s", target, timeout)
|
return fmt.Errorf("pane %s not ready within %s", target, timeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) error {
|
func (s *LocalPaliadinService) sendToPane(ctx context.Context, target, msg string) error {
|
||||||
// `-l` sends the message literally (no key parsing) — necessary so
|
// `-l` sends the message literally (no key parsing) — necessary so
|
||||||
// our prompt's special characters don't get interpreted.
|
// our prompt's special characters don't get interpreted.
|
||||||
if err := runTmux(ctx, "send-keys", "-t", target, "-l", msg); err != nil {
|
if err := runTmux(ctx, "send-keys", "-t", target, "-l", msg); err != nil {
|
||||||
@@ -527,7 +551,7 @@ func (s *PaliadinService) sendToPane(ctx context.Context, target, msg string) er
|
|||||||
// over from earlier turns) as a non-event — the file existing without a
|
// over from earlier turns) as a non-event — the file existing without a
|
||||||
// fresh mtime is a corner case the caller already de-duplicates by
|
// fresh mtime is a corner case the caller already de-duplicates by
|
||||||
// having a unique turn_id per request.
|
// having a unique turn_id per request.
|
||||||
func (s *PaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) {
|
func (s *LocalPaliadinService) pollForResponse(ctx context.Context, path string, timeout time.Duration) (string, error) {
|
||||||
deadline := time.Now().Add(timeout)
|
deadline := time.Now().Add(timeout)
|
||||||
for time.Now().Before(deadline) {
|
for time.Now().Before(deadline) {
|
||||||
select {
|
select {
|
||||||
@@ -687,7 +711,7 @@ func countChips(s string) int {
|
|||||||
// audit-row writers.
|
// audit-row writers.
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) error {
|
func (s *paliadinDB) insertTurnRow(ctx context.Context, t *PaliadinTurn) error {
|
||||||
q := `
|
q := `
|
||||||
INSERT INTO paliad.paliadin_turns (
|
INSERT INTO paliad.paliadin_turns (
|
||||||
turn_id, user_id, session_id, started_at, user_message, page_origin
|
turn_id, user_id, session_id, started_at, user_message, page_origin
|
||||||
@@ -698,7 +722,7 @@ func (s *PaliadinService) insertTurnRow(ctx context.Context, t *PaliadinTurn) er
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID,
|
func (s *paliadinDB) completeTurn(ctx context.Context, turnID uuid.UUID,
|
||||||
finishedAt time.Time, durationMS int, response string, tokens int,
|
finishedAt time.Time, durationMS int, response string, tokens int,
|
||||||
meta trailerMeta, chipCount int) error {
|
meta trailerMeta, chipCount int) error {
|
||||||
rowsSeen := make(pq.Int64Array, 0, len(meta.RowsSeen))
|
rowsSeen := make(pq.Int64Array, 0, len(meta.RowsSeen))
|
||||||
@@ -724,7 +748,7 @@ func (s *PaliadinService) completeTurn(ctx context.Context, turnID uuid.UUID,
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error {
|
func (s *paliadinDB) markTurnError(ctx context.Context, turnID uuid.UUID, code string) error {
|
||||||
finished := time.Now().UTC()
|
finished := time.Now().UTC()
|
||||||
q := `
|
q := `
|
||||||
UPDATE paliad.paliadin_turns
|
UPDATE paliad.paliadin_turns
|
||||||
@@ -735,7 +759,7 @@ func (s *PaliadinService) markTurnError(ctx context.Context, turnID uuid.UUID, c
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *PaliadinService) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error {
|
func (s *paliadinDB) markTurnAbandonedOrError(ctx context.Context, turnID uuid.UUID, code string, abandoned bool) error {
|
||||||
finished := time.Now().UTC()
|
finished := time.Now().UTC()
|
||||||
q := `
|
q := `
|
||||||
UPDATE paliad.paliadin_turns
|
UPDATE paliad.paliadin_turns
|
||||||
|
|||||||
322
internal/services/paliadin_remote.go
Normal file
322
internal/services/paliadin_remote.go
Normal file
@@ -0,0 +1,322 @@
|
|||||||
|
package services
|
||||||
|
|
||||||
|
// RemotePaliadinService — the prod path of the Paliadin backend.
|
||||||
|
//
|
||||||
|
// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md.
|
||||||
|
//
|
||||||
|
// Where the local backend (LocalPaliadinService) drives a tmux+claude
|
||||||
|
// pane in-process, the remote backend shells out to ssh m@mriver
|
||||||
|
// paliadin-shim — the script at scripts/paliadin-shim, installed at
|
||||||
|
// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the
|
||||||
|
// tmux+claude pane on mRiver; this Go side just wraps each turn in one
|
||||||
|
// SSH call.
|
||||||
|
//
|
||||||
|
// The path was chosen so paliad.de (deployed in a Dokploy container on
|
||||||
|
// mLake, no `claude` CLI of its own) can keep using m's Claude Code
|
||||||
|
// subscription instead of paying API tokens. Tailscale provides the
|
||||||
|
// transport — mLake's tailscale0 interface is shared into the container
|
||||||
|
// via network_mode: host (compose layer; not this file's concern).
|
||||||
|
//
|
||||||
|
// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When
|
||||||
|
// that env var is unset, the binary falls back to LocalPaliadinService
|
||||||
|
// (or DisabledPaliadinService if neither tmux nor remote is available).
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"encoding/base64"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"os/exec"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/jmoiron/sqlx"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ErrMRiverUnreachable signals that the remote paliadin-shim could not
|
||||||
|
// be contacted within the health-check window. The handler maps this to
|
||||||
|
// the friendly mriver_unreachable error code (see frontend
|
||||||
|
// friendlyErrorMessage).
|
||||||
|
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
|
||||||
|
|
||||||
|
// RemotePaliadinConfig carries the SSH connection settings that
// cmd/server/main.go supplies when it builds a RemotePaliadinService.
type RemotePaliadinConfig struct {
	// SSHHost is the address ssh dials, e.g. 100.99.98.203 (mRiver's
	// tailnet IP).
	SSHHost string
	// SSHPort is the sshd port, e.g. 22022, which bypasses Tailscale SSH
	// on :22 (design §4.5).
	SSHPort int
	// SSHUser is the remote login, e.g. m.
	SSHUser string
	// SSHKeyPath is the private key file, e.g.
	// /tmp/paliadin-id_ed25519-<rand> (chmod 600).
	SSHKeyPath string
	// KnownHostsPath is the pinned host-key file, e.g.
	// /tmp/paliadin-known_hosts.
	KnownHostsPath string
}
|
||||||
|
|
||||||
|
// RemotePaliadinService implements Paliadin against a remote
|
||||||
|
// paliadin-shim over SSH.
|
||||||
|
type RemotePaliadinService struct {
|
||||||
|
paliadinDB
|
||||||
|
cfg RemotePaliadinConfig
|
||||||
|
|
||||||
|
// Single in-flight turn. mRiver's claude pane is single-user; we
|
||||||
|
// serialise turns the same way LocalPaliadinService does.
|
||||||
|
turnMu sync.Mutex
|
||||||
|
|
||||||
|
// Health-check cache. Avoids probing mRiver on every turn — once
|
||||||
|
// the cache is warm, RunTurn skips the probe for 10 seconds.
|
||||||
|
healthMu sync.Mutex
|
||||||
|
healthOK bool
|
||||||
|
healthCheckedAt time.Time
|
||||||
|
|
||||||
|
// Lazy bootstrap state. The system prompt only needs to be sent
|
||||||
|
// once per claude pane; on first RunTurn after a paliad restart we
|
||||||
|
// inject it, and remember we did so we don't re-send.
|
||||||
|
bootstrapMu sync.Mutex
|
||||||
|
bootstrapped bool
|
||||||
|
|
||||||
|
// Hook for tests — when non-nil, callShim delegates here instead
|
||||||
|
// of exec'ing ssh. Production code never sets this.
|
||||||
|
callShimHook func(ctx context.Context, args ...string) ([]byte, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewRemotePaliadinService wires the remote backend. Call only when
|
||||||
|
// PALIADIN_REMOTE_HOST is set in the environment; the constructor does
|
||||||
|
// not probe mRiver — first probe happens on the first RunTurn call via
|
||||||
|
// healthGate.
|
||||||
|
func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService {
|
||||||
|
if cfg.SSHPort == 0 {
|
||||||
|
cfg.SSHPort = 22022
|
||||||
|
}
|
||||||
|
if cfg.SSHUser == "" {
|
||||||
|
cfg.SSHUser = "m"
|
||||||
|
}
|
||||||
|
return &RemotePaliadinService{
|
||||||
|
paliadinDB: paliadinDB{db: db, users: users},
|
||||||
|
cfg: cfg,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunTurn drives one Q&A round against the remote claude pane. Same
|
||||||
|
// audit-row contract as LocalPaliadinService: write the row first, run
|
||||||
|
// the turn, complete the row on success, mark error on failure.
|
||||||
|
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
||||||
|
s.turnMu.Lock()
|
||||||
|
defer s.turnMu.Unlock()
|
||||||
|
|
||||||
|
turnID := uuid.New()
|
||||||
|
startedAt := time.Now().UTC()
|
||||||
|
|
||||||
|
// Audit row first — leave traces even if we crash mid-turn.
|
||||||
|
if err := s.insertTurnRow(ctx, &PaliadinTurn{
|
||||||
|
TurnID: turnID,
|
||||||
|
UserID: req.UserID,
|
||||||
|
SessionID: req.SessionID,
|
||||||
|
StartedAt: startedAt,
|
||||||
|
UserMessage: req.UserMessage,
|
||||||
|
PageOrigin: optionalString(req.PageOrigin),
|
||||||
|
}); err != nil {
|
||||||
|
return nil, fmt.Errorf("paliadin: insert turn row: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Health-gate before paying the cost of a real turn. Caches OK for
|
||||||
|
// 10 s so a fast back-to-back chat doesn't probe every time.
|
||||||
|
if err := s.healthGate(ctx); err != nil {
|
||||||
|
_ = s.markTurnError(ctx, turnID, "mriver_unreachable")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lazy bootstrap — first turn after a paliad restart sends the
|
||||||
|
// system prompt; subsequent turns skip.
|
||||||
|
if err := s.ensureBootstrapped(ctx); err != nil {
|
||||||
|
_ = s.markTurnError(ctx, turnID, "bootstrap_failed")
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
msg := sanitiseForTmux(req.UserMessage)
|
||||||
|
msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
|
||||||
|
|
||||||
|
body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
|
||||||
|
if err != nil {
|
||||||
|
_ = s.markTurnError(ctx, turnID, classifySSHError(err))
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Same trailer parse + audit completion as the local path.
|
||||||
|
cleanBody, meta := splitTrailer(string(body))
|
||||||
|
tokens := approxTokenCount(cleanBody)
|
||||||
|
chipCount := countChips(cleanBody)
|
||||||
|
finished := time.Now().UTC()
|
||||||
|
durationMS := int(finished.Sub(startedAt) / time.Millisecond)
|
||||||
|
|
||||||
|
if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil {
|
||||||
|
log.Printf("paliadin: complete turn %s: %v", turnID, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &TurnResult{
|
||||||
|
TurnID: turnID,
|
||||||
|
Response: cleanBody,
|
||||||
|
UsedTools: meta.UsedTools,
|
||||||
|
RowsSeen: meta.RowsSeen,
|
||||||
|
ChipCount: chipCount,
|
||||||
|
ClassifierTag: meta.ClassifierTag,
|
||||||
|
DurationMS: durationMS,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResetSession sends `/clear` to the remote claude pane.
|
||||||
|
func (s *RemotePaliadinService) ResetSession(ctx context.Context) error {
|
||||||
|
if _, err := s.callShim(ctx, "reset"); err != nil {
|
||||||
|
return fmt.Errorf("paliadin: reset: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// healthGate runs the shim's `health` verb at most once per 10 s.
|
||||||
|
// Returns ErrMRiverUnreachable wrapping the underlying error on miss.
|
||||||
|
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
|
||||||
|
s.healthMu.Lock()
|
||||||
|
defer s.healthMu.Unlock()
|
||||||
|
|
||||||
|
if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
out, err := s.callShim(probeCtx, "health")
|
||||||
|
s.healthCheckedAt = time.Now()
|
||||||
|
if err != nil {
|
||||||
|
s.healthOK = false
|
||||||
|
return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err)
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(string(out)) != "ok" {
|
||||||
|
s.healthOK = false
|
||||||
|
return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out))
|
||||||
|
}
|
||||||
|
s.healthOK = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ensureBootstrapped sends the Paliadin system prompt to the remote
|
||||||
|
// claude pane on first call. Idempotent — subsequent calls return nil
|
||||||
|
// without doing any work.
|
||||||
|
func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error {
|
||||||
|
s.bootstrapMu.Lock()
|
||||||
|
defer s.bootstrapMu.Unlock()
|
||||||
|
if s.bootstrapped {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
prompt := paliadinSystemPrompt("/tmp/paliadin")
|
||||||
|
promptB64 := base64.StdEncoding.EncodeToString([]byte(prompt))
|
||||||
|
if _, err := s.callShim(ctx, "bootstrap", promptB64); err != nil {
|
||||||
|
return fmt.Errorf("paliadin: bootstrap: %w", err)
|
||||||
|
}
|
||||||
|
s.bootstrapped = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// callShim runs `ssh <user>@<host> -- <verb> <args...>` against the
|
||||||
|
// paliadin-shim. The shim's authorized_keys command= directive ensures
|
||||||
|
// the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of
|
||||||
|
// what we put after the `--`; we keep the explicit argv form anyway so
|
||||||
|
// reading the code at the call site is unambiguous.
|
||||||
|
//
|
||||||
|
// Tests set callShimHook to bypass exec.
|
||||||
|
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
if s.callShimHook != nil {
|
||||||
|
return s.callShimHook(ctx, args...)
|
||||||
|
}
|
||||||
|
|
||||||
|
sshArgs := []string{
|
||||||
|
"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
|
||||||
|
"-i", s.cfg.SSHKeyPath,
|
||||||
|
"-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22
|
||||||
|
"-o", "IdentitiesOnly=yes",
|
||||||
|
"-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath,
|
||||||
|
"-o", "StrictHostKeyChecking=yes",
|
||||||
|
"-o", "BatchMode=yes",
|
||||||
|
"-o", "ConnectTimeout=3",
|
||||||
|
"-o", "ServerAliveInterval=10",
|
||||||
|
"-o", "ServerAliveCountMax=3",
|
||||||
|
s.cfg.SSHUser + "@" + s.cfg.SSHHost,
|
||||||
|
"--",
|
||||||
|
}
|
||||||
|
sshArgs = append(sshArgs, args...)
|
||||||
|
|
||||||
|
// Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead.
|
||||||
|
c, cancel := context.WithTimeout(ctx, 70*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(c, "ssh", sshArgs...)
|
||||||
|
var stdout, stderr bytes.Buffer
|
||||||
|
cmd.Stdout = &stdout
|
||||||
|
cmd.Stderr = &stderr
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String()))
|
||||||
|
}
|
||||||
|
return stdout.Bytes(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// classifySSHError turns a callShim error into one of the audit-row
|
||||||
|
// error codes. Codes are stable strings shown on the admin dashboard
|
||||||
|
// and used by the frontend's friendlyErrorMessage to localise.
|
||||||
|
func classifySSHError(err error) string {
|
||||||
|
if err == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
if errors.Is(err, ErrMRiverUnreachable) {
|
||||||
|
return "mriver_unreachable"
|
||||||
|
}
|
||||||
|
if errors.Is(err, context.DeadlineExceeded) {
|
||||||
|
return "timeout"
|
||||||
|
}
|
||||||
|
msg := err.Error()
|
||||||
|
switch {
|
||||||
|
case strings.Contains(msg, "Connection timed out"),
|
||||||
|
strings.Contains(msg, "Connection refused"),
|
||||||
|
strings.Contains(msg, "Could not resolve hostname"),
|
||||||
|
strings.Contains(msg, "Network is unreachable"):
|
||||||
|
return "mriver_unreachable"
|
||||||
|
case strings.Contains(msg, "exit status 124"):
|
||||||
|
// Shim's run-turn 60 s timeout — Claude didn't write the
|
||||||
|
// response file in time.
|
||||||
|
return "timeout"
|
||||||
|
case strings.Contains(msg, "Permission denied"):
|
||||||
|
return "shim_auth_failed"
|
||||||
|
default:
|
||||||
|
return "shim_error"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DisabledPaliadinService is a stub that always returns
|
||||||
|
// ErrPaliadinDisabled. cmd/server/main.go constructs one when neither
|
||||||
|
// PALIADIN_REMOTE_HOST is set nor a local tmux is available; without
|
||||||
|
// the stub, the handler would have to nil-check on every entry point.
|
||||||
|
type DisabledPaliadinService struct {
|
||||||
|
paliadinDB
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDisabledPaliadinService wires the stub. DB methods (IsOwner /
|
||||||
|
// ListRecentTurns / Stats) still work; only RunTurn / ResetSession
|
||||||
|
// return ErrPaliadinDisabled.
|
||||||
|
func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService {
|
||||||
|
return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
||||||
|
return nil, ErrPaliadinDisabled
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error {
|
||||||
|
return ErrPaliadinDisabled
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compile-time interface conformance checks — fail the build, not a
|
||||||
|
// runtime test, if a method drifts off any backend.
|
||||||
|
var (
|
||||||
|
_ Paliadin = (*LocalPaliadinService)(nil)
|
||||||
|
_ Paliadin = (*RemotePaliadinService)(nil)
|
||||||
|
_ Paliadin = (*DisabledPaliadinService)(nil)
|
||||||
|
)
|
||||||
257
internal/services/paliadin_remote_test.go
Normal file
257
internal/services/paliadin_remote_test.go
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
package services
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"sync/atomic"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tests for the remote-Paliadin backend. Every test bypasses exec via
|
||||||
|
// the callShimHook field — no real ssh is ever invoked, no DB rows are
|
||||||
|
// written. Tests that would need DB I/O (audit row insert/complete on
|
||||||
|
// RunTurn) are not in scope here; paliad's test suite has no sqlx mock
|
||||||
|
// and the existing paliadin_test.go only covers pure functions.
|
||||||
|
|
||||||
|
func TestNewRemotePaliadinService_Defaults(t *testing.T) {
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
|
||||||
|
SSHHost: "100.99.98.203",
|
||||||
|
// SSHPort + SSHUser intentionally left zero/empty
|
||||||
|
})
|
||||||
|
if s.cfg.SSHPort != 22022 {
|
||||||
|
t.Errorf("SSHPort default = %d; want 22022 (Tailscale-SSH bypass port)", s.cfg.SSHPort)
|
||||||
|
}
|
||||||
|
if s.cfg.SSHUser != "m" {
|
||||||
|
t.Errorf("SSHUser default = %q; want %q", s.cfg.SSHUser, "m")
|
||||||
|
}
|
||||||
|
if s.cfg.SSHHost != "100.99.98.203" {
|
||||||
|
t.Errorf("SSHHost not preserved: %q", s.cfg.SSHHost)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNewRemotePaliadinService_HonoursOverrides(t *testing.T) {
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
|
||||||
|
SSHHost: "10.0.0.1",
|
||||||
|
SSHPort: 2222,
|
||||||
|
SSHUser: "alice",
|
||||||
|
})
|
||||||
|
if s.cfg.SSHPort != 2222 {
|
||||||
|
t.Errorf("SSHPort override lost: %d", s.cfg.SSHPort)
|
||||||
|
}
|
||||||
|
if s.cfg.SSHUser != "alice" {
|
||||||
|
t.Errorf("SSHUser override lost: %q", s.cfg.SSHUser)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClassifySSHError(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
err error
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"nil", nil, ""},
|
||||||
|
{"explicit ErrMRiverUnreachable", ErrMRiverUnreachable, "mriver_unreachable"},
|
||||||
|
{"wrapped ErrMRiverUnreachable", fmt.Errorf("foo: %w", ErrMRiverUnreachable), "mriver_unreachable"},
|
||||||
|
{"context deadline", context.DeadlineExceeded, "timeout"},
|
||||||
|
{"shim run-turn timeout (exit 124)", errors.New("ssh run-turn …: exit status 124 (stderr: response timeout)"), "timeout"},
|
||||||
|
{"connection refused", errors.New("ssh health: dial: Connection refused"), "mriver_unreachable"},
|
||||||
|
{"connection timed out", errors.New("ssh health: Connection timed out"), "mriver_unreachable"},
|
||||||
|
{"permission denied", errors.New("ssh: Permission denied (publickey)"), "shim_auth_failed"},
|
||||||
|
{"unknown", errors.New("ssh: some other failure"), "shim_error"},
|
||||||
|
}
|
||||||
|
for _, c := range cases {
|
||||||
|
t.Run(c.name, func(t *testing.T) {
|
||||||
|
got := classifySSHError(c.err)
|
||||||
|
if got != c.want {
|
||||||
|
t.Errorf("classifySSHError(%v) = %q; want %q", c.err, got, c.want)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHealthGate_CachesOnSuccess(t *testing.T) {
|
||||||
|
var calls int32
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
atomic.AddInt32(&calls, 1)
|
||||||
|
if len(args) != 1 || args[0] != "health" {
|
||||||
|
t.Errorf("unexpected callShim args: %v", args)
|
||||||
|
}
|
||||||
|
return []byte("ok\n"), nil
|
||||||
|
}
|
||||||
|
for i := 0; i < 5; i++ {
|
||||||
|
if err := s.healthGate(context.Background()); err != nil {
|
||||||
|
t.Fatalf("healthGate iteration %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := atomic.LoadInt32(&calls); got != 1 {
|
||||||
|
t.Errorf("expected 1 callShim call (cached); got %d", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHealthGate_RetriesAfterFailure(t *testing.T) {
|
||||||
|
var calls int32
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
atomic.AddInt32(&calls, 1)
|
||||||
|
return nil, errors.New("ssh: Connection refused")
|
||||||
|
}
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
err := s.healthGate(context.Background())
|
||||||
|
if !errors.Is(err, ErrMRiverUnreachable) {
|
||||||
|
t.Errorf("iteration %d: err %v; want wrapping ErrMRiverUnreachable", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Failed health is NOT cached — every call re-probes.
|
||||||
|
if got := atomic.LoadInt32(&calls); got != 3 {
|
||||||
|
t.Errorf("expected 3 callShim calls (no caching on failure); got %d", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHealthGate_RejectsUnexpectedReply(t *testing.T) {
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
return []byte("not-ok"), nil
|
||||||
|
}
|
||||||
|
err := s.healthGate(context.Background())
|
||||||
|
if !errors.Is(err, ErrMRiverUnreachable) {
|
||||||
|
t.Errorf("err = %v; want wrap of ErrMRiverUnreachable for non-ok reply", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnsureBootstrapped_RunsOnce(t *testing.T) {
|
||||||
|
var calls int32
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
atomic.AddInt32(&calls, 1)
|
||||||
|
if len(args) != 2 || args[0] != "bootstrap" {
|
||||||
|
t.Errorf("unexpected callShim args: %v", args)
|
||||||
|
}
|
||||||
|
// args[1] is the base64'd system prompt — no need to decode in
|
||||||
|
// the test; just sanity-check it isn't trivially empty.
|
||||||
|
if len(args[1]) < 100 {
|
||||||
|
t.Errorf("bootstrap prompt suspiciously short: %d bytes", len(args[1]))
|
||||||
|
}
|
||||||
|
return []byte("ok\n"), nil
|
||||||
|
}
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
if err := s.ensureBootstrapped(context.Background()); err != nil {
|
||||||
|
t.Fatalf("ensureBootstrapped iteration %d: %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := atomic.LoadInt32(&calls); got != 1 {
|
||||||
|
t.Errorf("expected 1 callShim call (bootstrap is one-shot); got %d", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnsureBootstrapped_RetriesOnFailure(t *testing.T) {
|
||||||
|
var calls int32
|
||||||
|
var failOnce atomic.Bool
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
atomic.AddInt32(&calls, 1)
|
||||||
|
if failOnce.CompareAndSwap(false, true) {
|
||||||
|
return nil, errors.New("ssh: transient failure")
|
||||||
|
}
|
||||||
|
return []byte("ok\n"), nil
|
||||||
|
}
|
||||||
|
if err := s.ensureBootstrapped(context.Background()); err == nil {
|
||||||
|
t.Fatal("first call should error")
|
||||||
|
}
|
||||||
|
if err := s.ensureBootstrapped(context.Background()); err != nil {
|
||||||
|
t.Fatalf("second call should succeed: %v", err)
|
||||||
|
}
|
||||||
|
// Third call should be a cache hit (bootstrapped flag set on success).
|
||||||
|
if err := s.ensureBootstrapped(context.Background()); err != nil {
|
||||||
|
t.Fatalf("third call should be cached: %v", err)
|
||||||
|
}
|
||||||
|
if got := atomic.LoadInt32(&calls); got != 2 {
|
||||||
|
t.Errorf("expected 2 callShim calls (1 fail + 1 succeed; 3rd cached); got %d", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHealthGate_CacheExpires(t *testing.T) {
|
||||||
|
var calls int32
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
atomic.AddInt32(&calls, 1)
|
||||||
|
return []byte("ok"), nil
|
||||||
|
}
|
||||||
|
if err := s.healthGate(context.Background()); err != nil {
|
||||||
|
t.Fatalf("first probe: %v", err)
|
||||||
|
}
|
||||||
|
// Force the cached timestamp to expire.
|
||||||
|
s.healthMu.Lock()
|
||||||
|
s.healthCheckedAt = time.Now().Add(-11 * time.Second)
|
||||||
|
s.healthMu.Unlock()
|
||||||
|
if err := s.healthGate(context.Background()); err != nil {
|
||||||
|
t.Fatalf("second probe (expired cache): %v", err)
|
||||||
|
}
|
||||||
|
if got := atomic.LoadInt32(&calls); got != 2 {
|
||||||
|
t.Errorf("expected 2 callShim calls (cache expired between); got %d", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRemotePaliadin_ImplementsPaliadin(t *testing.T) {
|
||||||
|
// Compile-time check is in paliadin_remote.go; this test makes the
|
||||||
|
// failure mode obvious if someone accidentally drops a method.
|
||||||
|
var _ Paliadin = (*RemotePaliadinService)(nil)
|
||||||
|
var _ Paliadin = (*LocalPaliadinService)(nil)
|
||||||
|
var _ Paliadin = (*DisabledPaliadinService)(nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDisabledPaliadinService(t *testing.T) {
|
||||||
|
s := NewDisabledPaliadinService(nil, nil)
|
||||||
|
if _, err := s.RunTurn(context.Background(), TurnRequest{}); !errors.Is(err, ErrPaliadinDisabled) {
|
||||||
|
t.Errorf("RunTurn error = %v; want ErrPaliadinDisabled", err)
|
||||||
|
}
|
||||||
|
if err := s.ResetSession(context.Background()); !errors.Is(err, ErrPaliadinDisabled) {
|
||||||
|
t.Errorf("ResetSession error = %v; want ErrPaliadinDisabled", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCallShim_SSHArgvShape(t *testing.T) {
|
||||||
|
// Verify the ssh argv we'd construct includes the bypass-port flag,
|
||||||
|
// the key + known_hosts paths, and the verb after `--`. We don't
|
||||||
|
// actually exec ssh — we set callShimHook so callShim never reaches
|
||||||
|
// the exec path; this test just guards the constructor wiring.
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{
|
||||||
|
SSHHost: "100.99.98.203",
|
||||||
|
SSHPort: 22022,
|
||||||
|
SSHUser: "m",
|
||||||
|
SSHKeyPath: "/tmp/k",
|
||||||
|
KnownHostsPath: "/tmp/kh",
|
||||||
|
})
|
||||||
|
var captured []string
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
captured = append([]string(nil), args...)
|
||||||
|
return []byte("ok"), nil
|
||||||
|
}
|
||||||
|
_, _ = s.callShim(context.Background(), "health")
|
||||||
|
if len(captured) != 1 || captured[0] != "health" {
|
||||||
|
t.Errorf("callShim forwarded args = %v; want [health]", captured)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCallShim_StderrSurfacesInError(t *testing.T) {
|
||||||
|
// When the real exec path fails, callShim wraps stderr into the
|
||||||
|
// returned error so classifySSHError can pattern-match. Simulate
|
||||||
|
// that contract via the hook.
|
||||||
|
s := NewRemotePaliadinService(nil, nil, RemotePaliadinConfig{SSHHost: "x"})
|
||||||
|
s.callShimHook = func(ctx context.Context, args ...string) ([]byte, error) {
|
||||||
|
return nil, errors.New("ssh health: exit status 1 (stderr: Permission denied (publickey))")
|
||||||
|
}
|
||||||
|
_, err := s.callShim(context.Background(), "health")
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "Permission denied") {
|
||||||
|
t.Errorf("error should preserve stderr: %v", err)
|
||||||
|
}
|
||||||
|
if classifySSHError(err) != "shim_auth_failed" {
|
||||||
|
t.Errorf("classifier should pick up Permission denied; got %q", classifySSHError(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
185
scripts/paliadin-shim
Executable file
185
scripts/paliadin-shim
Executable file
@@ -0,0 +1,185 @@
|
|||||||
|
#!/bin/bash
# paliadin-shim — server-side RPC endpoint for paliad's remote-tmux turns.
#
# mRiver's ~/.ssh/authorized_keys pins this script via a command=
# restriction, so the command the client asked for arrives in
# $SSH_ORIGINAL_COMMAND. The script splits that string into words and
# dispatches on the first one; only the fixed verb set below is accepted.
#
# Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md §5.4.
#
# Verbs:
#   health                          -> "ok" iff tmux + claude reachable
#   bootstrap <prompt-base64>       -> ensure pane + send system prompt
#   run-turn  <uuid> <msg-base64>   -> send framed prompt, poll, return
#   reset                           -> /clear the conversation
#
# Every multi-character payload (prompt, message) is base64-encoded by the
# Go caller, so nothing ever needs quoting on its way through ssh's argv.
#
# Failures are reported on stderr together with a non-zero exit status; the
# Go side maps the exit status into a friendly error code.

set -euo pipefail
umask 077

# Tunables (overridable through the environment) and fixed limits.
TMUX_SESSION="${PALIADIN_TMUX_SESSION:-paliad-paliadin}"
RESPONSE_DIR="${PALIADIN_RESPONSE_DIR:-/tmp/paliadin}"
TIMEOUT_S="${PALIADIN_TIMEOUT_S:-60}"
PANE_READY_S=60   # max wait for claude pane to settle
TURN_ID_RE='^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'
readonly TMUX_SESSION RESPONSE_DIR TIMEOUT_S PANE_READY_S TURN_ID_RE

# Response files live in a directory only this user can enter.
mkdir -p "$RESPONSE_DIR"
chmod 700 "$RESPONSE_DIR"

# Split $SSH_ORIGINAL_COMMAND ("<verb> <arg1> <arg2> …") into argv.
# No `eval` is involved: `read -r -a` only splits on $IFS and performs no
# word expansion on the client-supplied text.
read -r -a argv <<< "${SSH_ORIGINAL_COMMAND:-}"
verb="${argv[0]:-}"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# log_err writes all of its arguments, joined into one line and prefixed
# with the script name, to stderr.
log_err() {
    printf 'paliadin-shim: %s\n' "$*" >&2
}
|
||||||
|
|
||||||
|
# ensure_pane makes sure the tmux session contains a claude window tagged
# @paliadin-scope=chat and prints that window's target identifier
# ("session:window-idx") on stdout, without a trailing newline.
#
# A window that already carries the tag is reused as-is. Otherwise a new
# window running `claude` is created, its pane is polled for up to
# PANE_READY_S seconds until the captured text contains "❯" or "│", and the
# window is then tagged. Exits with status 3 when the claude CLI is not in
# PATH. Callers capture stdout, so all diagnostics go to stderr.
ensure_pane() {
    if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
        tmux new-session -d -s "$TMUX_SESSION"
    fi

    # Look for an existing window tagged with @paliadin-scope=chat.
    local target=""
    local idx scope
    while read -r idx; do
        if [[ -z "$idx" ]]; then
            continue
        fi
        scope=$(tmux show-window-option -t "$TMUX_SESSION:$idx" -v @paliadin-scope 2>/dev/null || true)
        if [[ "$scope" == "chat" ]]; then
            target="$TMUX_SESSION:$idx"
            break
        fi
    done < <(tmux list-windows -t "$TMUX_SESSION" -F '#{window_index}' 2>/dev/null || true)

    if [[ -z "$target" ]]; then
        if ! command -v claude >/dev/null 2>&1; then
            log_err "claude CLI not found in PATH"
            exit 3
        fi
        idx=$(tmux new-window -t "$TMUX_SESSION" -n claude-paliadin -P -F '#{window_index}' claude)
        target="$TMUX_SESSION:$idx"

        # Wait for claude to settle. Matches Go waitForPaneReady (paliadin.go:495).
        local deadline=$(( $(date +%s) + PANE_READY_S ))
        local pane=""
        local ready=0
        while [[ $(date +%s) -lt $deadline ]]; do
            pane=$(tmux capture-pane -t "$target" -p 2>/dev/null || true)
            if [[ "$pane" == *"❯"* || "$pane" == *"│"* ]]; then
                ready=1
                break
            fi
            sleep 0.5
        done

        # The wait used to expire silently. Keep going (same stdout and exit
        # status as before) but leave a trace on stderr, because whatever is
        # typed into an unready pane may be lost.
        if [[ "$ready" -eq 0 ]]; then
            log_err "claude pane not ready after ${PANE_READY_S}s; continuing anyway"
        fi

        tmux set-window-option -t "$target" @paliadin-scope chat >/dev/null
        tmux set-window-option -t "$target" @fix-name claude-paliadin >/dev/null
    fi

    printf '%s' "$target"
}
|
||||||
|
|
||||||
|
# send_to_pane types text into a tmux pane and submits it.
#   $1  tmux target of the pane
#   $2  text to type; sent with `send-keys -l` so tmux does not interpret
#       it as key names, and after `--` so a leading dash is not taken as
#       an option
# The Enter key is sent as a separate, non-literal keystroke.
send_to_pane() {
    local pane_target="$1"
    local text="$2"

    tmux send-keys -t "$pane_target" -l -- "$text"
    tmux send-keys -t "$pane_target" Enter
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# verb dispatch
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# verb_health backs the Go side's healthGate, which uses it to short-circuit
# when mRiver is offline or tmux/claude is broken. It prints "ok" when both
# tools are in PATH, creating the tmux session first if it does not exist.
# The output is parsed verbatim by the caller.
verb_health() {
    local tool
    for tool in tmux claude; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            log_err "$tool not in PATH"
            exit 1
        fi
    done
    if ! tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then
        tmux new-session -d -s "$TMUX_SESSION"
    fi
    echo ok
}

# verb_bootstrap decodes the base64 system prompt in argv[1] and types it
# into the claude pane, creating the pane when needed. The Go side may call
# this more than once; each call sends the prompt again.
verb_bootstrap() {
    local encoded="${argv[1]:-}"
    local prompt target

    if [[ -z "$encoded" ]]; then
        log_err "bootstrap: missing prompt"
        exit 2
    fi
    if ! prompt=$(printf '%s' "$encoded" | base64 -d 2>/dev/null); then
        log_err "bootstrap: invalid base64 prompt"
        exit 2
    fi

    target=$(ensure_pane)
    send_to_pane "$target" "$prompt"
    sleep 2   # let claude absorb before turns flow
    echo ok
}

# verb_run_turn sends one user message and returns the reply.
#   argv[1]  turn id, must match TURN_ID_RE (UUID shape)
#   argv[2]  base64-encoded user message
# The reply is expected to appear as $RESPONSE_DIR/<turn_id>.txt; the file
# is printed and removed on success. Exits 124 when nothing shows up within
# TIMEOUT_S seconds.
verb_run_turn() {
    local turn_id="${argv[1]:-}"
    local encoded="${argv[2]:-}"
    local msg target out deadline

    if [[ ! "$turn_id" =~ $TURN_ID_RE ]]; then
        log_err "run-turn: bad turn_id"
        exit 2
    fi
    if [[ -z "$encoded" ]]; then
        log_err "run-turn: missing message"
        exit 2
    fi
    if ! msg=$(printf '%s' "$encoded" | base64 -d 2>/dev/null); then
        log_err "run-turn: invalid base64 message"
        exit 2
    fi

    target=$(ensure_pane)
    out="$RESPONSE_DIR/$turn_id.txt"
    rm -f "$out"   # drop any stale reply left under the same id

    # Envelope matches paliadin_prompt.go's `[PALIADIN:turn_id] <msg>` shape.
    send_to_pane "$target" "[PALIADIN:$turn_id] $msg"

    # Poll for the response file. Same shape as Go pollForResponse
    # (paliadin.go:530). The short settle delay avoids reading mid-flush.
    deadline=$(( $(date +%s) + TIMEOUT_S ))
    while [[ $(date +%s) -lt $deadline ]]; do
        if [[ -s "$out" ]]; then
            sleep 0.05
            cat "$out"
            rm -f "$out"
            exit 0
        fi
        sleep 0.2
    done

    log_err "response timeout after ${TIMEOUT_S}s"
    exit 124
}

# verb_reset sends `/clear` so the next turn starts a fresh conversation.
verb_reset() {
    local target
    target=$(ensure_pane)
    send_to_pane "$target" "/clear"
    echo ok
}

case "$verb" in
    health)    verb_health ;;
    bootstrap) verb_bootstrap ;;
    run-turn)  verb_run_turn ;;
    reset)     verb_reset ;;
    '')
        log_err "no verb (set SSH_ORIGINAL_COMMAND via authorized_keys command=)"
        exit 2
        ;;
    *)
        log_err "unknown verb '$verb'"
        exit 2
        ;;
esac
|
||||||
Reference in New Issue
Block a user