package services // RemotePaliadinService — the prod path of the Paliadin backend. // // Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md. // // Where the local backend (LocalPaliadinService) drives a tmux+claude // pane in-process, the remote backend shells out to ssh m@mriver // paliadin-shim — the script at scripts/paliadin-shim, installed at // /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the // tmux+claude pane on mRiver; this Go side just wraps each turn in one // SSH call. // // The path was chosen so paliad.de (deployed in a Dokploy container on // mLake, no `claude` CLI of its own) can keep using m's Claude Code // subscription instead of paying API tokens. Tailscale provides the // transport — mLake's tailscale0 interface is shared into the container // via network_mode: host (compose layer; not this file's concern). // // Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When // that env var is unset, the binary falls back to LocalPaliadinService // (or DisabledPaliadinService if neither tmux nor remote is available). import ( "bytes" "context" "encoding/base64" "errors" "fmt" "log" "os/exec" "strconv" "strings" "sync" "time" "github.com/google/uuid" "github.com/jmoiron/sqlx" ) // ErrMRiverUnreachable signals that the remote paliadin-shim could not // be contacted within the health-check window. The handler maps this to // the friendly mriver_unreachable error code (see frontend // friendlyErrorMessage). var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable") // RemotePaliadinConfig is the bag of knobs cmd/server/main.go passes // when constructing a RemotePaliadinService. type RemotePaliadinConfig struct { SSHHost string // 100.99.98.203 — mRiver's tailnet IP SSHPort int // 22022 — bypasses Tailscale SSH on :22 (design §4.5) SSHUser string // m SSHKeyPath string // /tmp/paliadin-id_ed25519- (chmod 600) KnownHostsPath string // /tmp/paliadin-known_hosts } // RemotePaliadinService implements Paliadin against a remote // paliadin-shim over SSH. type RemotePaliadinService struct { paliadinDB cfg RemotePaliadinConfig // Single in-flight turn. mRiver's claude pane is single-user; we // serialise turns the same way LocalPaliadinService does. turnMu sync.Mutex // Health-check cache. Avoids probing mRiver on every turn — once // the cache is warm, RunTurn skips the probe for 10 seconds. healthMu sync.Mutex healthOK bool healthCheckedAt time.Time // Lazy bootstrap state. The system prompt only needs to be sent // once per claude pane; on first RunTurn after a paliad restart we // inject it, and remember we did so we don't re-send. bootstrapMu sync.Mutex bootstrapped bool // Hook for tests — when non-nil, callShim delegates here instead // of exec'ing ssh. Production code never sets this. callShimHook func(ctx context.Context, args ...string) ([]byte, error) } // NewRemotePaliadinService wires the remote backend. Call only when // PALIADIN_REMOTE_HOST is set in the environment; the constructor does // not probe mRiver — first probe happens on the first RunTurn call via // healthGate. func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService { if cfg.SSHPort == 0 { cfg.SSHPort = 22022 } if cfg.SSHUser == "" { cfg.SSHUser = "m" } return &RemotePaliadinService{ paliadinDB: paliadinDB{db: db, users: users}, cfg: cfg, } } // RunTurn drives one Q&A round against the remote claude pane. Same // audit-row contract as LocalPaliadinService: write the row first, run // the turn, complete the row on success, mark error on failure. func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { s.turnMu.Lock() defer s.turnMu.Unlock() turnID := uuid.New() startedAt := time.Now().UTC() // Audit row first — leave traces even if we crash mid-turn. if err := s.insertTurnRow(ctx, &PaliadinTurn{ TurnID: turnID, UserID: req.UserID, SessionID: req.SessionID, StartedAt: startedAt, UserMessage: req.UserMessage, PageOrigin: optionalString(req.PageOrigin), }); err != nil { return nil, fmt.Errorf("paliadin: insert turn row: %w", err) } // Health-gate before paying the cost of a real turn. Caches OK for // 10 s so a fast back-to-back chat doesn't probe every time. if err := s.healthGate(ctx); err != nil { _ = s.markTurnError(ctx, turnID, "mriver_unreachable") return nil, err } // Lazy bootstrap — first turn after a paliad restart sends the // system prompt; subsequent turns skip. if err := s.ensureBootstrapped(ctx); err != nil { _ = s.markTurnError(ctx, turnID, "bootstrap_failed") return nil, err } msg := sanitiseForTmux(req.UserMessage) msgB64 := base64.StdEncoding.EncodeToString([]byte(msg)) body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64) if err != nil { _ = s.markTurnError(ctx, turnID, classifySSHError(err)) return nil, err } // Same trailer parse + audit completion as the local path. cleanBody, meta := splitTrailer(string(body)) tokens := approxTokenCount(cleanBody) chipCount := countChips(cleanBody) finished := time.Now().UTC() durationMS := int(finished.Sub(startedAt) / time.Millisecond) if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil { log.Printf("paliadin: complete turn %s: %v", turnID, err) } return &TurnResult{ TurnID: turnID, Response: cleanBody, UsedTools: meta.UsedTools, RowsSeen: meta.RowsSeen, ChipCount: chipCount, ClassifierTag: meta.ClassifierTag, DurationMS: durationMS, }, nil } // ResetSession sends `/clear` to the remote claude pane. func (s *RemotePaliadinService) ResetSession(ctx context.Context) error { if _, err := s.callShim(ctx, "reset"); err != nil { return fmt.Errorf("paliadin: reset: %w", err) } return nil } // healthGate runs the shim's `health` verb at most once per 10 s. // Returns ErrMRiverUnreachable wrapping the underlying error on miss. func (s *RemotePaliadinService) healthGate(ctx context.Context) error { s.healthMu.Lock() defer s.healthMu.Unlock() if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second { return nil } probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() out, err := s.callShim(probeCtx, "health") s.healthCheckedAt = time.Now() if err != nil { s.healthOK = false return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err) } if strings.TrimSpace(string(out)) != "ok" { s.healthOK = false return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out)) } s.healthOK = true return nil } // ensureBootstrapped sends the Paliadin system prompt to the remote // claude pane on first call. Idempotent — subsequent calls return nil // without doing any work. func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error { s.bootstrapMu.Lock() defer s.bootstrapMu.Unlock() if s.bootstrapped { return nil } prompt := paliadinSystemPrompt("/tmp/paliadin") promptB64 := base64.StdEncoding.EncodeToString([]byte(prompt)) if _, err := s.callShim(ctx, "bootstrap", promptB64); err != nil { return fmt.Errorf("paliadin: bootstrap: %w", err) } s.bootstrapped = true return nil } // callShim runs `ssh @ -- ` against the // paliadin-shim. The shim's authorized_keys command= directive ensures // the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of // what we put after the `--`; we keep the explicit argv form anyway so // reading the code at the call site is unambiguous. // // Tests set callShimHook to bypass exec. func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) { if s.callShimHook != nil { return s.callShimHook(ctx, args...) } sshArgs := []string{ "-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config "-i", s.cfg.SSHKeyPath, "-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22 "-o", "IdentitiesOnly=yes", "-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath, "-o", "StrictHostKeyChecking=yes", "-o", "BatchMode=yes", "-o", "ConnectTimeout=3", "-o", "ServerAliveInterval=10", "-o", "ServerAliveCountMax=3", s.cfg.SSHUser + "@" + s.cfg.SSHHost, "--", } sshArgs = append(sshArgs, args...) // Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead. c, cancel := context.WithTimeout(ctx, 70*time.Second) defer cancel() cmd := exec.CommandContext(c, "ssh", sshArgs...) var stdout, stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String())) } return stdout.Bytes(), nil } // classifySSHError turns a callShim error into one of the audit-row // error codes. Codes are stable strings shown on the admin dashboard // and used by the frontend's friendlyErrorMessage to localise. func classifySSHError(err error) string { if err == nil { return "" } if errors.Is(err, ErrMRiverUnreachable) { return "mriver_unreachable" } if errors.Is(err, context.DeadlineExceeded) { return "timeout" } msg := err.Error() switch { case strings.Contains(msg, "Connection timed out"), strings.Contains(msg, "Connection refused"), strings.Contains(msg, "Could not resolve hostname"), strings.Contains(msg, "Network is unreachable"): return "mriver_unreachable" case strings.Contains(msg, "exit status 124"): // Shim's run-turn 60 s timeout — Claude didn't write the // response file in time. return "timeout" case strings.Contains(msg, "Permission denied"): return "shim_auth_failed" default: return "shim_error" } } // DisabledPaliadinService is a stub that always returns // ErrPaliadinDisabled. cmd/server/main.go constructs one when neither // PALIADIN_REMOTE_HOST is set nor a local tmux is available; without // the stub, the handler would have to nil-check on every entry point. type DisabledPaliadinService struct { paliadinDB } // NewDisabledPaliadinService wires the stub. DB methods (IsOwner / // ListRecentTurns / Stats) still work; only RunTurn / ResetSession // return ErrPaliadinDisabled. func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService { return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}} } func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { return nil, ErrPaliadinDisabled } func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error { return ErrPaliadinDisabled } // Compile-time interface conformance checks — fail the build, not a // runtime test, if a method drifts off any backend. var ( _ Paliadin = (*LocalPaliadinService)(nil) _ Paliadin = (*RemotePaliadinService)(nil) _ Paliadin = (*DisabledPaliadinService)(nil) )