Phase B step 2: lands the Paliadin backend that talks to mRiver via ssh + paliadin-shim. Local backend untouched — selection happens in cmd/server/main.go based on PALIADIN_REMOTE_HOST.

Files:

- internal/services/paliadin_remote.go (new) — RemotePaliadinService + RemotePaliadinConfig, with five SSH knobs (Host/Port/User/KeyPath/KnownHostsPath). RunTurn does insertTurnRow → healthGate → bootstrap → callShim run-turn → splitTrailer → completeTurn, mirroring the local path's audit-row contract. ResetSession sends shim 'reset'. callShim runs `ssh -F /dev/null -i <key> -p <port> -o … host -- verb args`; ControlMaster intentionally not enabled (design §6.8).
- internal/services/paliadin_remote.go also adds DisabledPaliadinService (returns ErrPaliadinDisabled from RunTurn/ResetSession; DB methods inherited from paliadinDB still work) so cmd/server/main.go can wire a non-nil Paliadin even when neither local tmux nor remote SSH is available.
- ErrMRiverUnreachable sentinel for the friendly error code.
- classifySSHError translates ssh exit 124 / Permission denied / network errors into the audit-row error_code field.
- Compile-time conformance: `var _ Paliadin = (*XPaliadinService)(nil)` for X in Local, Remote, Disabled.

cmd/server/main.go switch:

    PALIADIN_REMOTE_HOST set → NewRemotePaliadinService
    else: tmux on PATH       → NewLocalPaliadinService
    else:                      NewDisabledPaliadinService

buildPaliadinRemoteConfig materialises PALIADIN_SSH_PRIVATE_KEY + PALIADIN_KNOWN_HOSTS (multi-line Dokploy secrets) into chmod-600/644 tmpfiles at boot. Defaults: SSHUser=m, SSHPort=22022 (bypasses Tailscale SSH on :22, see design §4.5). Fails fast on a configured remote-host without the matching key/known_hosts secrets.

Local-tmux mode now requires `tmux` to actually be on PATH at boot (exec.LookPath gate); previously the constructor unconditionally returned a service whose RunTurn would fail at runtime with ErrTmuxUnavailable.
The handler-level "friendly error" UX is unchanged: DisabledPaliadinService surfaces ErrPaliadinDisabled which the frontend renders the same way. Build green; existing paliadin_test.go still passes (it tests package-level helpers, untouched). Remote-specific tests land in B4. Refs m/paliad#12
323 lines
11 KiB
Go
323 lines
11 KiB
Go
package services
|
|
|
|
// RemotePaliadinService — the prod path of the Paliadin backend.
|
|
//
|
|
// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md.
|
|
//
|
|
// Where the local backend (LocalPaliadinService) drives a tmux+claude
|
|
// pane in-process, the remote backend shells out to ssh m@mriver
|
|
// paliadin-shim — the script at scripts/paliadin-shim, installed at
|
|
// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the
|
|
// tmux+claude pane on mRiver; this Go side just wraps each turn in one
|
|
// SSH call.
|
|
//
|
|
// The path was chosen so paliad.de (deployed in a Dokploy container on
|
|
// mLake, no `claude` CLI of its own) can keep using m's Claude Code
|
|
// subscription instead of paying API tokens. Tailscale provides the
|
|
// transport — mLake's tailscale0 interface is shared into the container
|
|
// via network_mode: host (compose layer; not this file's concern).
|
|
//
|
|
// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When
|
|
// that env var is unset, the binary falls back to LocalPaliadinService
|
|
// (or DisabledPaliadinService if neither tmux nor remote is available).
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/base64"
|
|
"errors"
|
|
"fmt"
|
|
"log"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/google/uuid"
|
|
"github.com/jmoiron/sqlx"
|
|
)
|
|
|
|
// ErrMRiverUnreachable is the sentinel wrapped by healthGate when the
// remote paliadin-shim's health probe fails or answers with anything
// other than "ok". classifySSHError maps it to the mriver_unreachable
// audit code, which the frontend's friendlyErrorMessage is expected to
// render as a friendly message.
var (
	ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")
)
|
|
|
|
// RemotePaliadinConfig collects the SSH connection settings that
// cmd/server/main.go hands to NewRemotePaliadinService. Every field is
// forwarded to the ssh command line built by callShim.
type RemotePaliadinConfig struct {
	// SSHHost is the machine to connect to, e.g. 100.99.98.203
	// (mRiver's tailnet IP).
	SSHHost string
	// SSHPort is the sshd port. Zero is replaced by 22022 in the
	// constructor; that port bypasses Tailscale SSH on :22 (design §4.5).
	SSHPort int
	// SSHUser is the login name. Empty is replaced by "m" in the
	// constructor.
	SSHUser string
	// SSHKeyPath points at the private key file passed to ssh -i,
	// e.g. /tmp/paliadin-id_ed25519-<rand> (chmod 600).
	SSHKeyPath string
	// KnownHostsPath is passed as UserKnownHostsFile,
	// e.g. /tmp/paliadin-known_hosts.
	KnownHostsPath string
}
|
|
|
|
// RemotePaliadinService implements Paliadin against a remote
|
|
// paliadin-shim over SSH.
|
|
type RemotePaliadinService struct {
|
|
paliadinDB
|
|
cfg RemotePaliadinConfig
|
|
|
|
// Single in-flight turn. mRiver's claude pane is single-user; we
|
|
// serialise turns the same way LocalPaliadinService does.
|
|
turnMu sync.Mutex
|
|
|
|
// Health-check cache. Avoids probing mRiver on every turn — once
|
|
// the cache is warm, RunTurn skips the probe for 10 seconds.
|
|
healthMu sync.Mutex
|
|
healthOK bool
|
|
healthCheckedAt time.Time
|
|
|
|
// Lazy bootstrap state. The system prompt only needs to be sent
|
|
// once per claude pane; on first RunTurn after a paliad restart we
|
|
// inject it, and remember we did so we don't re-send.
|
|
bootstrapMu sync.Mutex
|
|
bootstrapped bool
|
|
|
|
// Hook for tests — when non-nil, callShim delegates here instead
|
|
// of exec'ing ssh. Production code never sets this.
|
|
callShimHook func(ctx context.Context, args ...string) ([]byte, error)
|
|
}
|
|
|
|
// NewRemotePaliadinService wires the remote backend. Call only when
|
|
// PALIADIN_REMOTE_HOST is set in the environment; the constructor does
|
|
// not probe mRiver — first probe happens on the first RunTurn call via
|
|
// healthGate.
|
|
func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService {
|
|
if cfg.SSHPort == 0 {
|
|
cfg.SSHPort = 22022
|
|
}
|
|
if cfg.SSHUser == "" {
|
|
cfg.SSHUser = "m"
|
|
}
|
|
return &RemotePaliadinService{
|
|
paliadinDB: paliadinDB{db: db, users: users},
|
|
cfg: cfg,
|
|
}
|
|
}
|
|
|
|
// RunTurn drives one Q&A round against the remote claude pane. Same
|
|
// audit-row contract as LocalPaliadinService: write the row first, run
|
|
// the turn, complete the row on success, mark error on failure.
|
|
func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
|
s.turnMu.Lock()
|
|
defer s.turnMu.Unlock()
|
|
|
|
turnID := uuid.New()
|
|
startedAt := time.Now().UTC()
|
|
|
|
// Audit row first — leave traces even if we crash mid-turn.
|
|
if err := s.insertTurnRow(ctx, &PaliadinTurn{
|
|
TurnID: turnID,
|
|
UserID: req.UserID,
|
|
SessionID: req.SessionID,
|
|
StartedAt: startedAt,
|
|
UserMessage: req.UserMessage,
|
|
PageOrigin: optionalString(req.PageOrigin),
|
|
}); err != nil {
|
|
return nil, fmt.Errorf("paliadin: insert turn row: %w", err)
|
|
}
|
|
|
|
// Health-gate before paying the cost of a real turn. Caches OK for
|
|
// 10 s so a fast back-to-back chat doesn't probe every time.
|
|
if err := s.healthGate(ctx); err != nil {
|
|
_ = s.markTurnError(ctx, turnID, "mriver_unreachable")
|
|
return nil, err
|
|
}
|
|
|
|
// Lazy bootstrap — first turn after a paliad restart sends the
|
|
// system prompt; subsequent turns skip.
|
|
if err := s.ensureBootstrapped(ctx); err != nil {
|
|
_ = s.markTurnError(ctx, turnID, "bootstrap_failed")
|
|
return nil, err
|
|
}
|
|
|
|
msg := sanitiseForTmux(req.UserMessage)
|
|
msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
|
|
|
|
body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64)
|
|
if err != nil {
|
|
_ = s.markTurnError(ctx, turnID, classifySSHError(err))
|
|
return nil, err
|
|
}
|
|
|
|
// Same trailer parse + audit completion as the local path.
|
|
cleanBody, meta := splitTrailer(string(body))
|
|
tokens := approxTokenCount(cleanBody)
|
|
chipCount := countChips(cleanBody)
|
|
finished := time.Now().UTC()
|
|
durationMS := int(finished.Sub(startedAt) / time.Millisecond)
|
|
|
|
if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil {
|
|
log.Printf("paliadin: complete turn %s: %v", turnID, err)
|
|
}
|
|
|
|
return &TurnResult{
|
|
TurnID: turnID,
|
|
Response: cleanBody,
|
|
UsedTools: meta.UsedTools,
|
|
RowsSeen: meta.RowsSeen,
|
|
ChipCount: chipCount,
|
|
ClassifierTag: meta.ClassifierTag,
|
|
DurationMS: durationMS,
|
|
}, nil
|
|
}
|
|
|
|
// ResetSession sends `/clear` to the remote claude pane.
|
|
func (s *RemotePaliadinService) ResetSession(ctx context.Context) error {
|
|
if _, err := s.callShim(ctx, "reset"); err != nil {
|
|
return fmt.Errorf("paliadin: reset: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// healthGate runs the shim's `health` verb at most once per 10 s.
|
|
// Returns ErrMRiverUnreachable wrapping the underlying error on miss.
|
|
func (s *RemotePaliadinService) healthGate(ctx context.Context) error {
|
|
s.healthMu.Lock()
|
|
defer s.healthMu.Unlock()
|
|
|
|
if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second {
|
|
return nil
|
|
}
|
|
|
|
probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
|
defer cancel()
|
|
out, err := s.callShim(probeCtx, "health")
|
|
s.healthCheckedAt = time.Now()
|
|
if err != nil {
|
|
s.healthOK = false
|
|
return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err)
|
|
}
|
|
if strings.TrimSpace(string(out)) != "ok" {
|
|
s.healthOK = false
|
|
return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out))
|
|
}
|
|
s.healthOK = true
|
|
return nil
|
|
}
|
|
|
|
// ensureBootstrapped sends the Paliadin system prompt to the remote
|
|
// claude pane on first call. Idempotent — subsequent calls return nil
|
|
// without doing any work.
|
|
func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error {
|
|
s.bootstrapMu.Lock()
|
|
defer s.bootstrapMu.Unlock()
|
|
if s.bootstrapped {
|
|
return nil
|
|
}
|
|
prompt := paliadinSystemPrompt("/tmp/paliadin")
|
|
promptB64 := base64.StdEncoding.EncodeToString([]byte(prompt))
|
|
if _, err := s.callShim(ctx, "bootstrap", promptB64); err != nil {
|
|
return fmt.Errorf("paliadin: bootstrap: %w", err)
|
|
}
|
|
s.bootstrapped = true
|
|
return nil
|
|
}
|
|
|
|
// callShim runs `ssh <user>@<host> -- <verb> <args...>` against the
|
|
// paliadin-shim. The shim's authorized_keys command= directive ensures
|
|
// the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of
|
|
// what we put after the `--`; we keep the explicit argv form anyway so
|
|
// reading the code at the call site is unambiguous.
|
|
//
|
|
// Tests set callShimHook to bypass exec.
|
|
func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) {
|
|
if s.callShimHook != nil {
|
|
return s.callShimHook(ctx, args...)
|
|
}
|
|
|
|
sshArgs := []string{
|
|
"-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config
|
|
"-i", s.cfg.SSHKeyPath,
|
|
"-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22
|
|
"-o", "IdentitiesOnly=yes",
|
|
"-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath,
|
|
"-o", "StrictHostKeyChecking=yes",
|
|
"-o", "BatchMode=yes",
|
|
"-o", "ConnectTimeout=3",
|
|
"-o", "ServerAliveInterval=10",
|
|
"-o", "ServerAliveCountMax=3",
|
|
s.cfg.SSHUser + "@" + s.cfg.SSHHost,
|
|
"--",
|
|
}
|
|
sshArgs = append(sshArgs, args...)
|
|
|
|
// Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead.
|
|
c, cancel := context.WithTimeout(ctx, 70*time.Second)
|
|
defer cancel()
|
|
|
|
cmd := exec.CommandContext(c, "ssh", sshArgs...)
|
|
var stdout, stderr bytes.Buffer
|
|
cmd.Stdout = &stdout
|
|
cmd.Stderr = &stderr
|
|
if err := cmd.Run(); err != nil {
|
|
return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String()))
|
|
}
|
|
return stdout.Bytes(), nil
|
|
}
|
|
|
|
// classifySSHError turns a callShim error into one of the audit-row
|
|
// error codes. Codes are stable strings shown on the admin dashboard
|
|
// and used by the frontend's friendlyErrorMessage to localise.
|
|
func classifySSHError(err error) string {
|
|
if err == nil {
|
|
return ""
|
|
}
|
|
if errors.Is(err, ErrMRiverUnreachable) {
|
|
return "mriver_unreachable"
|
|
}
|
|
if errors.Is(err, context.DeadlineExceeded) {
|
|
return "timeout"
|
|
}
|
|
msg := err.Error()
|
|
switch {
|
|
case strings.Contains(msg, "Connection timed out"),
|
|
strings.Contains(msg, "Connection refused"),
|
|
strings.Contains(msg, "Could not resolve hostname"),
|
|
strings.Contains(msg, "Network is unreachable"):
|
|
return "mriver_unreachable"
|
|
case strings.Contains(msg, "exit status 124"):
|
|
// Shim's run-turn 60 s timeout — Claude didn't write the
|
|
// response file in time.
|
|
return "timeout"
|
|
case strings.Contains(msg, "Permission denied"):
|
|
return "shim_auth_failed"
|
|
default:
|
|
return "shim_error"
|
|
}
|
|
}
|
|
|
|
// DisabledPaliadinService is a stub that always returns
|
|
// ErrPaliadinDisabled. cmd/server/main.go constructs one when neither
|
|
// PALIADIN_REMOTE_HOST is set nor a local tmux is available; without
|
|
// the stub, the handler would have to nil-check on every entry point.
|
|
type DisabledPaliadinService struct {
|
|
paliadinDB
|
|
}
|
|
|
|
// NewDisabledPaliadinService wires the stub. DB methods (IsOwner /
|
|
// ListRecentTurns / Stats) still work; only RunTurn / ResetSession
|
|
// return ErrPaliadinDisabled.
|
|
func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService {
|
|
return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}}
|
|
}
|
|
|
|
func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) {
|
|
return nil, ErrPaliadinDisabled
|
|
}
|
|
|
|
func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error {
|
|
return ErrPaliadinDisabled
|
|
}
|
|
|
|
// Compile-time interface conformance checks — fail the build, not a
|
|
// runtime test, if a method drifts off any backend.
|
|
var (
|
|
_ Paliadin = (*LocalPaliadinService)(nil)
|
|
_ Paliadin = (*RemotePaliadinService)(nil)
|
|
_ Paliadin = (*DisabledPaliadinService)(nil)
|
|
)
|