package services

// RemotePaliadinService — the prod path of the Paliadin backend.
//
// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md.
//
// The local backend (LocalPaliadinService) drives a tmux+claude pane
// in-process. The remote backend instead shells out to
// `ssh m@mriver paliadin-shim` — the script at scripts/paliadin-shim,
// installed at /home/m/.local/bin/paliadin-shim on m's laptop. The shim
// owns the tmux+claude pane on mRiver; the Go side only wraps each turn
// in a single SSH call.
//
// This path exists so paliad.de (deployed in a Dokploy container on
// mLake, with no `claude` CLI of its own) can keep using m's Claude Code
// subscription instead of paying for API tokens. Tailscale is the
// transport — mLake's tailscale0 interface is shared into the container
// via network_mode: host (compose layer; not this file's concern).
//
// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When
// that env var is unset, the binary falls back to LocalPaliadinService
// (or DisabledPaliadinService if neither tmux nor remote is available).

import (
	"bytes"
	"context"
	"encoding/base64"
	"errors"
	"fmt"
	"log"
	"os/exec"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/jmoiron/sqlx"
)

// ErrMRiverUnreachable reports that the remote paliadin-shim could not
// be contacted within the health-check window. The handler maps it to
// the friendly mriver_unreachable error code (see the frontend's
// friendlyErrorMessage).
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")

// RemotePaliadinConfig is the bag of knobs cmd/server/main.go passes
// when constructing a RemotePaliadinService.
type RemotePaliadinConfig struct { SSHHost string // 100.99.98.203 — mRiver's tailnet IP SSHPort int // 22022 — bypasses Tailscale SSH on :22 (design §4.5) SSHUser string // m SSHKeyPath string // /tmp/paliadin-id_ed25519- (chmod 600) KnownHostsPath string // /tmp/paliadin-known_hosts SessionPrefix string // tmux session prefix; per-user session is "-" } // RemotePaliadinService implements Paliadin against a remote // paliadin-shim over SSH. type RemotePaliadinService struct { paliadinDB cfg RemotePaliadinConfig // Serialise turns across all users. mRiver's host has finite tmux // concurrency anyway, and Paliadin turns are short. Per-user // fan-out can ship in v2 if it ever bottlenecks. turnMu sync.Mutex // Health-check cache, keyed by per-user session name. Avoids // probing mRiver on every turn — once a session's cache is warm, // RunTurn skips the probe for 10 seconds. healthMu sync.Mutex health map[string]healthCacheEntry // Hook for tests — when non-nil, callShim delegates here instead // of exec'ing ssh. Production code never sets this. callShimHook func(ctx context.Context, args ...string) ([]byte, error) } // healthCacheEntry is one row in the health cache, keyed off tmux // session name. We cache success only — failures re-probe so a flap // surfaces immediately when paliad reboots into a healthy mRiver. type healthCacheEntry struct { ok bool checkedAt time.Time } // NewRemotePaliadinService wires the remote backend. Call only when // PALIADIN_REMOTE_HOST is set in the environment; the constructor does // not probe mRiver — first probe happens on the first RunTurn call via // healthGate. 
func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService { if cfg.SSHPort == 0 { cfg.SSHPort = 22022 } if cfg.SSHUser == "" { cfg.SSHUser = "m" } if cfg.SessionPrefix == "" { cfg.SessionPrefix = "paliad-paliadin" } return &RemotePaliadinService{ paliadinDB: paliadinDB{db: db, users: users}, cfg: cfg, health: make(map[string]healthCacheEntry), } } // sessionNameFor returns the per-user tmux session name. Per-user // keying (t-paliad-155): one persistent session per Paliad user keyed // on the first 8 hex chars of their UUID. Conversation history piles // up across visits; ResetSession is the user-driven escape hatch. func (s *RemotePaliadinService) sessionNameFor(userID uuid.UUID) string { short := userID.String() if len(short) >= 8 { short = short[:8] } return s.cfg.SessionPrefix + "-" + short } // RunTurn drives one Q&A round against the remote claude pane. Same // audit-row contract as LocalPaliadinService: write the row first, run // the turn, complete the row on success, mark error on failure. func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { s.turnMu.Lock() defer s.turnMu.Unlock() turnID := uuid.New() startedAt := time.Now().UTC() // Audit row first — leave traces even if we crash mid-turn. if err := s.insertTurnRow(ctx, &PaliadinTurn{ TurnID: turnID, UserID: req.UserID, SessionID: req.SessionID, StartedAt: startedAt, UserMessage: req.UserMessage, PageOrigin: optionalString(req.PageOrigin), }); err != nil { return nil, fmt.Errorf("paliadin: insert turn row: %w", err) } session := s.sessionNameFor(req.UserID) // Health-gate before paying the cost of a real turn. Caches OK for // 10 s per session so a fast back-to-back chat doesn't probe every // time. 
if err := s.healthGate(ctx, session); err != nil { _ = s.markTurnError(ctx, turnID, "mriver_unreachable") return nil, err } // Persona + response protocol live in the Paliadin skill at // ~/.claude/skills/paliadin/SKILL.md on mRiver. Claude's skill // router auto-matches the [PALIADIN: envelope so no in-process // bootstrap (system-prompt-via-tmux-keystroke) is needed any more. msg := sanitiseForTmux(req.UserMessage) msgB64 := base64.StdEncoding.EncodeToString([]byte(msg)) body, err := s.callShim(ctx, "run-turn", session, turnID.String(), msgB64) if err != nil { _ = s.markTurnError(ctx, turnID, classifySSHError(err)) return nil, err } // Same trailer parse + audit completion as the local path. cleanBody, meta := splitTrailer(string(body)) tokens := approxTokenCount(cleanBody) chipCount := countChips(cleanBody) finished := time.Now().UTC() durationMS := int(finished.Sub(startedAt) / time.Millisecond) if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil { log.Printf("paliadin: complete turn %s: %v", turnID, err) } return &TurnResult{ TurnID: turnID, Response: cleanBody, UsedTools: meta.UsedTools, RowsSeen: meta.RowsSeen, ChipCount: chipCount, ClassifierTag: meta.ClassifierTag, DurationMS: durationMS, }, nil } // ResetSession kills the user's tmux session on mRiver entirely so the // next RunTurn boots a fresh claude pane. Skill-based persona load // means the new pane re-acquires the Paliadin protocol contract on // first turn — no system-prompt re-send needed. func (s *RemotePaliadinService) ResetSession(ctx context.Context, userID uuid.UUID) error { session := s.sessionNameFor(userID) // Drop the cached health entry so the next turn re-probes against // the fresh session. 
s.healthMu.Lock() delete(s.health, session) s.healthMu.Unlock() if _, err := s.callShim(ctx, "reset", session); err != nil { return fmt.Errorf("paliadin: reset %s: %w", session, err) } return nil } // healthGate runs the shim's `health ` verb at most once per // 10 s per session. Returns ErrMRiverUnreachable wrapping the // underlying error on miss. func (s *RemotePaliadinService) healthGate(ctx context.Context, session string) error { s.healthMu.Lock() defer s.healthMu.Unlock() if entry, ok := s.health[session]; ok && entry.ok && time.Since(entry.checkedAt) < 10*time.Second { return nil } probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) defer cancel() out, err := s.callShim(probeCtx, "health", session) if err != nil { // Don't cache failures — re-probe on every miss so a recovery // surfaces immediately. delete(s.health, session) return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err) } if strings.TrimSpace(string(out)) != "ok" { delete(s.health, session) return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out)) } s.health[session] = healthCacheEntry{ok: true, checkedAt: time.Now()} return nil } // callShim runs `ssh @ -- ` against the // paliadin-shim. The shim's authorized_keys command= directive ensures // the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of // what we put after the `--`; we keep the explicit argv form anyway so // reading the code at the call site is unambiguous. // // Tests set callShimHook to bypass exec. func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) { if s.callShimHook != nil { return s.callShimHook(ctx, args...) 
} sshArgs := []string{ "-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config "-i", s.cfg.SSHKeyPath, "-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22 "-o", "IdentitiesOnly=yes", "-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath, "-o", "StrictHostKeyChecking=yes", "-o", "BatchMode=yes", "-o", "ConnectTimeout=3", "-o", "ServerAliveInterval=10", "-o", "ServerAliveCountMax=3", s.cfg.SSHUser + "@" + s.cfg.SSHHost, "--", } sshArgs = append(sshArgs, args...) // Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead. c, cancel := context.WithTimeout(ctx, 70*time.Second) defer cancel() cmd := exec.CommandContext(c, "ssh", sshArgs...) var stdout, stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr if err := cmd.Run(); err != nil { return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String())) } return stdout.Bytes(), nil } // classifySSHError turns a callShim error into one of the audit-row // error codes. Codes are stable strings shown on the admin dashboard // and used by the frontend's friendlyErrorMessage to localise. func classifySSHError(err error) string { if err == nil { return "" } if errors.Is(err, ErrMRiverUnreachable) { return "mriver_unreachable" } if errors.Is(err, context.DeadlineExceeded) { return "timeout" } msg := err.Error() switch { case strings.Contains(msg, "Connection timed out"), strings.Contains(msg, "Connection refused"), strings.Contains(msg, "Could not resolve hostname"), strings.Contains(msg, "Network is unreachable"): return "mriver_unreachable" case strings.Contains(msg, "exit status 124"): // Shim's run-turn 60 s timeout — Claude didn't write the // response file in time. return "timeout" case strings.Contains(msg, "Permission denied"): return "shim_auth_failed" default: return "shim_error" } } // DisabledPaliadinService is a stub that always returns // ErrPaliadinDisabled. 
cmd/server/main.go constructs one when neither // PALIADIN_REMOTE_HOST is set nor a local tmux is available; without // the stub, the handler would have to nil-check on every entry point. type DisabledPaliadinService struct { paliadinDB } // NewDisabledPaliadinService wires the stub. DB methods (IsOwner / // ListRecentTurns / Stats) still work; only RunTurn / ResetSession // return ErrPaliadinDisabled. func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService { return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}} } func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { return nil, ErrPaliadinDisabled } func (s *DisabledPaliadinService) ResetSession(ctx context.Context, userID uuid.UUID) error { return ErrPaliadinDisabled } // Compile-time interface conformance checks — fail the build, not a // runtime test, if a method drifts off any backend. var ( _ Paliadin = (*LocalPaliadinService)(nil) _ Paliadin = (*RemotePaliadinService)(nil) _ Paliadin = (*DisabledPaliadinService)(nil) )