feat(t-paliad-151) RemotePaliadinService + main.go env-var routing

Phase B step 2: lands the Paliadin backend that talks to mRiver via
ssh + paliadin-shim. Local backend untouched — selection happens in
cmd/server/main.go based on PALIADIN_REMOTE_HOST.

Files:
- internal/services/paliadin_remote.go (new) — RemotePaliadinService
  + RemotePaliadinConfig, with five SSH knobs (Host/Port/User/KeyPath/
  KnownHostsPath). RunTurn does insertTurnRow → healthGate → bootstrap
  → callShim run-turn → splitTrailer → completeTurn, mirroring the
  local path's audit-row contract. ResetSession sends shim 'reset'.
  callShim runs `ssh -F /dev/null -i <key> -p <port> -o … host -- verb
  args`; ControlMaster intentionally not enabled (design §6.8).
- internal/services/paliadin_remote.go also adds DisabledPaliadinService
  (returns ErrPaliadinDisabled from RunTurn/ResetSession; DB methods
  inherited from paliadinDB still work) so cmd/server/main.go can wire
  a non-nil Paliadin even when neither local tmux nor remote SSH is
  available.
- ErrMRiverUnreachable sentinel for the friendly error code.
- classifySSHError translates ssh exit 124 / Permission denied /
  network errors into the audit-row error_code field.
- Compile-time conformance: var _ Paliadin = (*Local|*Remote|*Disabled)
  PaliadinService(nil).

cmd/server/main.go switch:
  PALIADIN_REMOTE_HOST set → NewRemotePaliadinService
  else: tmux on PATH → NewLocalPaliadinService
  else: NewDisabledPaliadinService

buildPaliadinRemoteConfig materialises PALIADIN_SSH_PRIVATE_KEY +
PALIADIN_KNOWN_HOSTS (multi-line Dokploy secrets) into chmod-600/644
tmpfiles at boot. Defaults: SSHUser=m, SSHPort=22022 (bypasses
Tailscale SSH on :22, see design §4.5). Fails fast on a configured
remote-host without the matching key/known_hosts secrets.

Local-tmux mode now requires `tmux` to actually be on PATH at boot
(exec.LookPath gate); previously the constructor unconditionally
returned a service whose RunTurn would fail at runtime with
ErrTmuxUnavailable. The handler-level "friendly error" UX is
unchanged: DisabledPaliadinService surfaces ErrPaliadinDisabled, which
the frontend renders the same way.

Build green; existing paliadin_test.go still passes (it tests
package-level helpers, untouched). Remote-specific tests land in B4.

Refs m/paliad#12
This commit is contained in:
m
2026-05-08 02:16:50 +02:00
parent 56a3dc961e
commit 0c8a2f1a95
2 changed files with 433 additions and 14 deletions

View File

@@ -2,10 +2,13 @@ package main
import (
"context"
"fmt"
"log"
"net/http"
"os"
"os/exec"
"os/signal"
"strconv"
"syscall"
// Embed Go's IANA tz database into the binary so time.LoadLocation works
@@ -165,20 +168,34 @@ func main() {
CardLayout: services.NewCardLayoutService(pool),
}
// t-paliad-146 — Paliadin PoC. Always wired when DATABASE_URL
// is set; the per-request handler gate (requirePaliadinOwner)
// restricts access to the single owner email
// (services.PaliadinOwnerEmail). All other authenticated users
// get a 404 — the route effectively does not exist for them.
// On hosts without tmux + the `claude` CLI (e.g. the Dokploy
// container), the owner gate still applies; if m ever hits the
// route from such a host, the service returns "tmux unavailable"
// without ever invoking shell-out.
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION")
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR")
svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir)
log.Printf("paliadin: wired (owner=%s; gate is per-request, not per-deploy)",
services.PaliadinOwnerEmail)
// Paliadin backend selection (t-paliad-146 + t-paliad-151):
// PALIADIN_REMOTE_HOST set → RemotePaliadinService (ssh to mRiver)
// else: local tmux available → LocalPaliadinService (PoC path)
// else: DisabledPaliadinService (handlers still 404 for non-owners
// via the gate; for m, RunTurn returns ErrPaliadinDisabled
// which surfaces as a friendly error).
//
// All three implement services.Paliadin; the per-request handler
// gate (requirePaliadinOwner) is unchanged and applies to every
// backend.
if remoteHost := os.Getenv("PALIADIN_REMOTE_HOST"); remoteHost != "" {
cfg, err := buildPaliadinRemoteConfig(remoteHost)
if err != nil {
log.Fatalf("paliadin: remote config: %v", err)
}
svcBundle.Paliadin = services.NewRemotePaliadinService(pool, users, cfg)
log.Printf("paliadin: remote mode → ssh %s@%s:%d (owner=%s)",
cfg.SSHUser, cfg.SSHHost, cfg.SSHPort, services.PaliadinOwnerEmail)
} else if _, err := exec.LookPath("tmux"); err == nil {
tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION")
responseDir := os.Getenv("PALIADIN_RESPONSE_DIR")
svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir)
log.Printf("paliadin: local tmux mode (owner=%s)", services.PaliadinOwnerEmail)
} else {
svcBundle.Paliadin = services.NewDisabledPaliadinService(pool, users)
log.Printf("paliadin: disabled (no PALIADIN_REMOTE_HOST, no local tmux; owner=%s)",
services.PaliadinOwnerEmail)
}
// Wire ApprovalService into the entity services so Create / Update /
// Complete / Delete consult paliad.approval_policies (t-paliad-138).
// Without this wiring, the policies and request tables exist but no
@@ -217,3 +234,83 @@ func main() {
log.Fatal(err)
}
}
// buildPaliadinRemoteConfig assembles a RemotePaliadinConfig from
// environment variables, materialising the SSH private key and
// known_hosts blobs into tmpfiles (mode 0600 and 0644 respectively) for
// OpenSSH to read.
//
// The blobs travel as Dokploy secrets (multi-line env vars). They are
// written only to the process temp directory (os.TempDir, normally
// /tmp), never to a configured, durable location; fresh files are
// created on every boot. NOTE(review): this relies on the container's
// temp directory being ephemeral — nothing here removes the files on a
// clean shutdown.
//
// Required: PALIADIN_REMOTE_HOST (passed in as host),
// PALIADIN_SSH_PRIVATE_KEY, PALIADIN_KNOWN_HOSTS.
// Optional: PALIADIN_REMOTE_USER (default "m"), PALIADIN_REMOTE_PORT
// (default 22022 — bypasses Tailscale SSH on :22, see design §4.5).
//
// On any error the zero RemotePaliadinConfig is returned and no secret
// tmpfile created by this call is left behind.
func buildPaliadinRemoteConfig(host string) (services.RemotePaliadinConfig, error) {
	var none services.RemotePaliadinConfig

	port := 22022
	if raw := os.Getenv("PALIADIN_REMOTE_PORT"); raw != "" {
		n, err := strconv.Atoi(raw)
		if err != nil || n <= 0 || n > 65535 {
			return none, fmt.Errorf("PALIADIN_REMOTE_PORT %q: not a valid port", raw)
		}
		port = n
	}

	keyPath, err := writeSecretFile("paliadin-id_ed25519-", os.Getenv("PALIADIN_SSH_PRIVATE_KEY"), 0o600)
	if err != nil {
		return none, fmt.Errorf("PALIADIN_SSH_PRIVATE_KEY: %w", err)
	}
	if keyPath == "" {
		return none, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_SSH_PRIVATE_KEY empty")
	}

	knownHostsPath, err := writeSecretFile("paliadin-known_hosts-", os.Getenv("PALIADIN_KNOWN_HOSTS"), 0o644)
	switch {
	case err != nil:
		err = fmt.Errorf("PALIADIN_KNOWN_HOSTS: %w", err)
	case knownHostsPath == "":
		err = fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_KNOWN_HOSTS empty")
	}
	if err != nil {
		// The private key is already on disk at this point; do not
		// leave it lying around when the config as a whole is unusable.
		// Best effort: the caller is about to abort anyway.
		_ = os.Remove(keyPath)
		return none, err
	}

	return services.RemotePaliadinConfig{
		SSHHost:        host,
		SSHUser:        cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"),
		SSHPort:        port,
		SSHKeyPath:     keyPath,
		KnownHostsPath: knownHostsPath,
	}, nil
}
// writeSecretFile writes blob to a newly created tmpfile whose name
// starts with prefix, gives it the requested permission bits, and
// returns its path.
//
// It returns ("", nil) when blob is empty so callers can distinguish
// "not set" from real I/O errors. On any I/O error the partially
// created file is removed again, so a failed call never leaves secret
// material behind in the temp directory.
func writeSecretFile(prefix, blob string, mode os.FileMode) (string, error) {
	if blob == "" {
		return "", nil
	}

	f, err := os.CreateTemp("", prefix+"*")
	if err != nil {
		return "", err
	}
	path := f.Name()

	// discard undoes the tmpfile and reports cause. Close may run a
	// second time on an already-closed file; that error is irrelevant
	// here, as is a failing Remove — cause is what the caller needs.
	discard := func(cause error) (string, error) {
		_ = f.Close()
		_ = os.Remove(path)
		return "", cause
	}

	// Set the final mode through the open handle before any secret
	// bytes are written, rather than by path after the fact.
	if err := f.Chmod(mode); err != nil {
		return discard(err)
	}
	if _, err := f.WriteString(blob); err != nil {
		return discard(err)
	}
	if err := f.Close(); err != nil {
		return discard(err)
	}
	return path, nil
}
// cmpOr returns s, or fallback when s is the empty string.
func cmpOr(s, fallback string) string {
	if s == "" {
		return fallback
	}
	return s
}

View File

@@ -0,0 +1,322 @@
package services
// RemotePaliadinService — the prod path of the Paliadin backend.
//
// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md.
//
// Where the local backend (LocalPaliadinService) drives a tmux+claude
// pane in-process, the remote backend shells out to ssh m@mriver
// paliadin-shim — the script at scripts/paliadin-shim, installed at
// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the
// tmux+claude pane on mRiver; this Go side just wraps each turn in one
// SSH call.
//
// The path was chosen so paliad.de (deployed in a Dokploy container on
// mLake, no `claude` CLI of its own) can keep using m's Claude Code
// subscription instead of paying for API tokens. Tailscale provides the
// transport — mLake's tailscale0 interface is shared into the container
// via network_mode: host (compose layer; not this file's concern).
//
// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When
// that env var is unset, the binary falls back to LocalPaliadinService
// (or DisabledPaliadinService if neither tmux nor remote is available).
import (
"bytes"
"context"
"encoding/base64"
"errors"
"fmt"
"log"
"os/exec"
"strconv"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/jmoiron/sqlx"
)
// ErrMRiverUnreachable signals that the remote paliadin-shim could not
// be contacted within the health-check window. healthGate wraps it
// both when the `health` probe fails and when the shim answers with
// anything other than "ok"; match it with errors.Is. The handler maps
// this to the friendly mriver_unreachable error code (see frontend
// friendlyErrorMessage).
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
// RemotePaliadinConfig is the bag of knobs cmd/server/main.go passes
// when constructing a RemotePaliadinService. NewRemotePaliadinService
// fills in SSHPort and SSHUser when they are left at their zero value;
// the other fields are used as given.
type RemotePaliadinConfig struct {
	SSHHost        string // 100.99.98.203 — mRiver's tailnet IP
	SSHPort        int    // 22022 — bypasses Tailscale SSH on :22 (design §4.5)
	SSHUser        string // m
	SSHKeyPath     string // /tmp/paliadin-id_ed25519-<rand> (chmod 600)
	KnownHostsPath string // /tmp/paliadin-known_hosts-<rand> (chmod 644)
}
// RemotePaliadinService implements Paliadin against a remote
// paliadin-shim over SSH. It embeds paliadinDB for the audit-row
// helpers and carries three independent mutexes, so it must be shared
// by pointer (the constructor returns *RemotePaliadinService).
type RemotePaliadinService struct {
	paliadinDB

	// cfg holds the SSH connection parameters used by callShim.
	cfg RemotePaliadinConfig

	// Single in-flight turn. mRiver's claude pane is single-user; we
	// serialise turns the same way LocalPaliadinService does. Held for
	// the whole of RunTurn.
	turnMu sync.Mutex

	// Health-check cache. Avoids probing mRiver on every turn — once
	// the cache is warm, RunTurn skips the probe for 10 seconds. Only
	// a successful probe is cached; healthMu guards both fields.
	healthMu        sync.Mutex
	healthOK        bool
	healthCheckedAt time.Time

	// Lazy bootstrap state. The system prompt only needs to be sent
	// once per claude pane; on first RunTurn after a paliad restart we
	// inject it, and remember we did so we don't re-send. bootstrapMu
	// guards bootstrapped.
	bootstrapMu  sync.Mutex
	bootstrapped bool

	// Hook for tests — when non-nil, callShim delegates here instead
	// of exec'ing ssh. Production code never sets this.
	callShimHook func(ctx context.Context, args ...string) ([]byte, error)
}
// NewRemotePaliadinService builds the SSH-backed Paliadin backend.
// Intended for use only when PALIADIN_REMOTE_HOST is configured. A zero
// SSHPort becomes 22022 and an empty SSHUser becomes "m"; every other
// field of cfg is taken verbatim. No network traffic happens here — the
// first contact with mRiver is the health probe of the first RunTurn.
func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService {
	svc := &RemotePaliadinService{
		paliadinDB: paliadinDB{db: db, users: users},
		cfg:        cfg,
	}
	if svc.cfg.SSHUser == "" {
		svc.cfg.SSHUser = "m"
	}
	if svc.cfg.SSHPort == 0 {
		svc.cfg.SSHPort = 22022
	}
	return svc
}
// RunTurn drives one Q&A round against the remote claude pane. Same
// audit-row contract as LocalPaliadinService: write the row first, run
// the turn, complete the row on success, mark error on failure.
//
// Turns are serialised through turnMu. The steps are: insert the audit
// row, health-gate, lazy bootstrap, then the shim's `run-turn` verb
// with the sanitised user message passed base64-encoded. Any failure
// after the row insert stamps the row with an error code and returns
// the causing error unchanged; a failure to write that stamp (or to
// complete the row on success) is logged rather than returned, so
// bookkeeping problems never mask the turn's own outcome.
//
// NOTE(review): the bookkeeping writes reuse the request ctx, so a
// cancelled request presumably cancels them as well and leaves the row
// unfinished — confirm against markTurnError/completeTurn.
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
	s.turnMu.Lock()
	defer s.turnMu.Unlock()

	turnID := uuid.New()
	startedAt := time.Now().UTC()

	// Audit row first — leave traces even if we crash mid-turn.
	if err := s.insertTurnRow(ctx, &PaliadinTurn{
		TurnID:      turnID,
		UserID:      req.UserID,
		SessionID:   req.SessionID,
		StartedAt:   startedAt,
		UserMessage: req.UserMessage,
		PageOrigin:  optionalString(req.PageOrigin),
	}); err != nil {
		return nil, fmt.Errorf("paliadin: insert turn row: %w", err)
	}

	// fail records code on the audit row and hands cause back to the
	// caller untouched. A failed audit write is logged, mirroring how
	// completeTurn failures are treated below.
	fail := func(code string, cause error) (*TurnResult, error) {
		if markErr := s.markTurnError(ctx, turnID, code); markErr != nil {
			log.Printf("paliadin: mark turn %s as %s: %v", turnID, code, markErr)
		}
		return nil, cause
	}

	// Health-gate before paying the cost of a real turn. Caches OK for
	// 10 s so a fast back-to-back chat doesn't probe every time.
	if err := s.healthGate(ctx); err != nil {
		return fail("mriver_unreachable", err)
	}

	// Lazy bootstrap — first turn after a paliad restart sends the
	// system prompt; subsequent turns skip.
	if err := s.ensureBootstrapped(ctx); err != nil {
		return fail("bootstrap_failed", err)
	}

	// The message travels as a single base64 argument so its content
	// cannot be re-split or reinterpreted on the way to the shim.
	msg := sanitiseForTmux(req.UserMessage)
	msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
	body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
	if err != nil {
		return fail(classifySSHError(err), err)
	}

	// Same trailer parse + audit completion as the local path.
	cleanBody, meta := splitTrailer(string(body))
	tokens := approxTokenCount(cleanBody)
	chipCount := countChips(cleanBody)
	finished := time.Now().UTC()
	durationMS := int(finished.Sub(startedAt).Milliseconds())

	// The answer is already in hand; a failed audit update is logged
	// but does not fail the turn.
	if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil {
		log.Printf("paliadin: complete turn %s: %v", turnID, err)
	}

	return &TurnResult{
		TurnID:        turnID,
		Response:      cleanBody,
		UsedTools:     meta.UsedTools,
		RowsSeen:      meta.RowsSeen,
		ChipCount:     chipCount,
		ClassifierTag: meta.ClassifierTag,
		DurationMS:    durationMS,
	}, nil
}
// ResetSession asks the remote shim to reset the claude pane by issuing
// its `reset` verb (which, per the original note, sends `/clear` to the
// pane). Any shim failure is returned wrapped with a "paliadin: reset"
// prefix.
//
// NOTE(review): this neither takes turnMu nor clears the bootstrapped
// flag — confirm that a reset racing a turn is harmless and that the
// system prompt survives the shim's reset.
func (s *RemotePaliadinService) ResetSession(ctx context.Context) error {
	_, err := s.callShim(ctx, "reset")
	if err != nil {
		return fmt.Errorf("paliadin: reset: %w", err)
	}
	return nil
}
// healthGate decides whether mRiver is currently usable. A successful
// probe is trusted for 10 s; within that window the call returns nil
// without touching the network. Otherwise — including after any failed
// probe, which is never cached — it runs the shim's `health` verb with
// a 3 s budget and expects the trimmed output to be exactly "ok".
//
// On a miss the returned error wraps ErrMRiverUnreachable together
// with either the underlying call error or the unexpected reply.
// healthMu is held for the duration, so concurrent callers wait for
// one probe instead of issuing their own.
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
	const (
		cacheTTL     = 10 * time.Second
		probeTimeout = 3 * time.Second
	)

	s.healthMu.Lock()
	defer s.healthMu.Unlock()

	if stillFresh := s.healthOK && time.Since(s.healthCheckedAt) < cacheTTL; stillFresh {
		return nil
	}

	probeCtx, stop := context.WithTimeout(ctx, probeTimeout)
	defer stop()

	reply, probeErr := s.callShim(probeCtx, "health")
	s.healthCheckedAt = time.Now()
	s.healthOK = probeErr == nil && strings.TrimSpace(string(reply)) == "ok"

	switch {
	case probeErr != nil:
		return fmt.Errorf("%w: %v", ErrMRiverUnreachable, probeErr)
	case !s.healthOK:
		return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(reply))
	}
	return nil
}
// ensureBootstrapped makes sure the Paliadin system prompt has been
// delivered to the remote claude pane. The first successful call sends
// the prompt (base64-encoded) through the shim's `bootstrap` verb and
// records that fact; every later call is a no-op returning nil. A
// failed attempt leaves the flag unset, so the next call retries.
// bootstrapMu serialises callers.
func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error {
	s.bootstrapMu.Lock()
	defer s.bootstrapMu.Unlock()

	if !s.bootstrapped {
		encoded := base64.StdEncoding.EncodeToString([]byte(paliadinSystemPrompt("/tmp/paliadin")))
		_, err := s.callShim(ctx, "bootstrap", encoded)
		if err != nil {
			return fmt.Errorf("paliadin: bootstrap: %w", err)
		}
		s.bootstrapped = true
	}
	return nil
}
// callShim runs `ssh <options> <user>@<host> -- <verb> <args...>`
// against the paliadin-shim and returns the command's stdout.
//
// Per the original design note, the shim's authorized_keys command=
// directive ensures the verb + args are passed via
// $SSH_ORIGINAL_COMMAND regardless of what we put after the `--`; we
// keep the explicit argv form anyway so reading the code at the call
// site is unambiguous.
//
// Every call is capped at 70 s on top of whatever deadline ctx already
// carries. When the command fails because that context ended (deadline
// or cancellation), the returned error wraps the context's error so
// callers can match it with errors.Is(err, context.DeadlineExceeded);
// otherwise it wraps the error from running ssh. Error text names only
// the verb — the remaining arguments are base64 payloads (user message,
// system prompt) that have no business in error strings or logs.
//
// Tests set callShimHook to bypass exec.
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
	if s.callShimHook != nil {
		return s.callShimHook(ctx, args...)
	}

	sshArgs := []string{
		"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
		"-i", s.cfg.SSHKeyPath,
		"-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22
		"-o", "IdentitiesOnly=yes",
		"-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath,
		"-o", "StrictHostKeyChecking=yes",
		"-o", "BatchMode=yes",
		"-o", "ConnectTimeout=3",
		"-o", "ServerAliveInterval=10",
		"-o", "ServerAliveCountMax=3",
		s.cfg.SSHUser + "@" + s.cfg.SSHHost,
		"--", // end of ssh options; everything after is the remote command
	}
	sshArgs = append(sshArgs, args...)

	// Label for error messages; guards against a call without a verb.
	verb := "(no verb)"
	if len(args) > 0 {
		verb = args[0]
	}

	// Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead.
	c, cancel := context.WithTimeout(ctx, 70*time.Second)
	defer cancel()

	cmd := exec.CommandContext(c, "ssh", sshArgs...)
	var stdout, stderr bytes.Buffer
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		detail := strings.TrimSpace(stderr.String())
		// When the context ends, CommandContext kills ssh and Run
		// reports "signal: killed" — the context error itself is not
		// in that chain. Wrap it explicitly so timeouts stay
		// recognisable to errors.Is.
		if ctxErr := c.Err(); ctxErr != nil {
			return nil, fmt.Errorf("ssh %s: %w: %v (stderr: %s)", verb, ctxErr, err, detail)
		}
		return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", verb, err, detail)
	}
	return stdout.Bytes(), nil
}
// classifySSHError turns a callShim error into one of the audit-row
// error codes. Codes are stable strings shown on the admin dashboard
// and used by the frontend's friendlyErrorMessage to localise.
//
// Mapping, checked in this order:
//   - nil                                  → ""
//   - wraps ErrMRiverUnreachable           → "mriver_unreachable"
//   - wraps context.DeadlineExceeded       → "timeout"
//   - message mentions a connect failure
//     (timed out / refused / unresolvable
//     host / network or host unreachable)  → "mriver_unreachable"
//   - message contains "exit status 124"   → "timeout"
//   - message contains "Permission denied" → "shim_auth_failed"
//   - anything else                        → "shim_error"
//
// The text matching relies on ssh's stderr being part of the error
// message, which is how callShim formats its errors.
func classifySSHError(err error) string {
	switch {
	case err == nil:
		return ""
	case errors.Is(err, ErrMRiverUnreachable):
		return "mriver_unreachable"
	case errors.Is(err, context.DeadlineExceeded):
		return "timeout"
	}

	msg := err.Error()
	switch {
	case strings.Contains(msg, "Connection timed out"),
		strings.Contains(msg, "Connection refused"),
		strings.Contains(msg, "Could not resolve hostname"),
		strings.Contains(msg, "Network is unreachable"),
		strings.Contains(msg, "No route to host"):
		return "mriver_unreachable"
	case strings.Contains(msg, "exit status 124"):
		// Shim's run-turn 60 s timeout — Claude didn't write the
		// response file in time.
		return "timeout"
	case strings.Contains(msg, "Permission denied"):
		return "shim_auth_failed"
	default:
		return "shim_error"
	}
}
// DisabledPaliadinService is a stub that always returns
// ErrPaliadinDisabled. cmd/server/main.go constructs one when neither
// PALIADIN_REMOTE_HOST is set nor a local tmux is available; without
// the stub, the handler would have to nil-check on every entry point.
// It embeds paliadinDB, so the methods promoted from that type remain
// available on the stub.
type DisabledPaliadinService struct {
	paliadinDB
}
// NewDisabledPaliadinService builds the stub backend around db and
// users. Methods promoted from the embedded paliadinDB keep working;
// RunTurn and ResetSession always answer with ErrPaliadinDisabled.
func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService {
	svc := new(DisabledPaliadinService)
	svc.paliadinDB = paliadinDB{db: db, users: users}
	return svc
}
// RunTurn never runs a turn: it ignores ctx and req and always returns
// a nil result together with ErrPaliadinDisabled.
func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
	return nil, ErrPaliadinDisabled
}
// ResetSession has no session to reset: it ignores ctx and always
// returns ErrPaliadinDisabled.
func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error {
	return ErrPaliadinDisabled
}
// Compile-time interface conformance checks — fail the build, not a
// runtime test, if a method drifts off any backend. Each line assigns
// a typed nil pointer to a discarded Paliadin variable, which compiles
// only while the pointer type still satisfies the interface.
var (
	_ Paliadin = (*LocalPaliadinService)(nil)
	_ Paliadin = (*RemotePaliadinService)(nil)
	_ Paliadin = (*DisabledPaliadinService)(nil)
)