diff --git a/cmd/server/main.go b/cmd/server/main.go index 3b94ec6..32c9ca0 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -2,10 +2,13 @@ package main import ( "context" + "fmt" "log" "net/http" "os" + "os/exec" "os/signal" + "strconv" "syscall" // Embed Go's IANA tz database into the binary so time.LoadLocation works @@ -165,20 +168,34 @@ func main() { CardLayout: services.NewCardLayoutService(pool), } - // t-paliad-146 — Paliadin PoC. Always wired when DATABASE_URL - // is set; the per-request handler gate (requirePaliadinOwner) - // restricts access to the single owner email - // (services.PaliadinOwnerEmail). All other authenticated users - // get a 404 — the route effectively does not exist for them. - // On hosts without tmux + the `claude` CLI (e.g. the Dokploy - // container), the owner gate still applies; if m ever hits the - // route from such a host, the service returns "tmux unavailable" - // without ever invoking shell-out. - tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION") - responseDir := os.Getenv("PALIADIN_RESPONSE_DIR") - svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir) - log.Printf("paliadin: wired (owner=%s; gate is per-request, not per-deploy)", - services.PaliadinOwnerEmail) + // Paliadin backend selection (t-paliad-146 + t-paliad-151): + // PALIADIN_REMOTE_HOST set → RemotePaliadinService (ssh to mRiver) + // else: local tmux available → LocalPaliadinService (PoC path) + // else: DisabledPaliadinService (handlers still 404 for non-owners + // via the gate; for m, RunTurn returns ErrPaliadinDisabled + // which surfaces as a friendly error). + // + // All three implement services.Paliadin; the per-request handler + // gate (requirePaliadinOwner) is unchanged and applies to every + // backend. 
+ if remoteHost := os.Getenv("PALIADIN_REMOTE_HOST"); remoteHost != "" { + cfg, err := buildPaliadinRemoteConfig(remoteHost) + if err != nil { + log.Fatalf("paliadin: remote config: %v", err) + } + svcBundle.Paliadin = services.NewRemotePaliadinService(pool, users, cfg) + log.Printf("paliadin: remote mode → ssh %s@%s:%d (owner=%s)", + cfg.SSHUser, cfg.SSHHost, cfg.SSHPort, services.PaliadinOwnerEmail) + } else if _, err := exec.LookPath("tmux"); err == nil { + tmuxSession := os.Getenv("PALIADIN_TMUX_SESSION") + responseDir := os.Getenv("PALIADIN_RESPONSE_DIR") + svcBundle.Paliadin = services.NewLocalPaliadinService(pool, users, tmuxSession, responseDir) + log.Printf("paliadin: local tmux mode (owner=%s)", services.PaliadinOwnerEmail) + } else { + svcBundle.Paliadin = services.NewDisabledPaliadinService(pool, users) + log.Printf("paliadin: disabled (no PALIADIN_REMOTE_HOST, no local tmux; owner=%s)", + services.PaliadinOwnerEmail) + } // Wire ApprovalService into the entity services so Create / Update / // Complete / Delete consult paliad.approval_policies (t-paliad-138). // Without this wiring, the policies and request tables exist but no @@ -217,3 +234,83 @@ func main() { log.Fatal(err) } } + +// buildPaliadinRemoteConfig assembles a RemotePaliadinConfig from +// environment variables, materialising the SSH private key and +// known_hosts blobs into chmod-600/644 tmpfiles for OpenSSH to read. +// +// The blobs travel as Dokploy secrets (multi-line env vars). We never +// persist them to disk — tmpfiles live for the process lifetime in +// /tmp and disappear on container restart. Re-creating them every boot +// is fine; the keys themselves rotate independently via Dokploy +// secret updates. +// +// Required: PALIADIN_REMOTE_HOST, PALIADIN_SSH_PRIVATE_KEY, PALIADIN_KNOWN_HOSTS. +// Optional: PALIADIN_REMOTE_USER (default "m"), PALIADIN_REMOTE_PORT +// (default 22022 — bypasses Tailscale SSH on :22, see design §4.5). 
+func buildPaliadinRemoteConfig(host string) (services.RemotePaliadinConfig, error) { + cfg := services.RemotePaliadinConfig{ + SSHHost: host, + SSHUser: cmpOr(os.Getenv("PALIADIN_REMOTE_USER"), "m"), + SSHPort: 22022, + } + if p := os.Getenv("PALIADIN_REMOTE_PORT"); p != "" { + n, err := strconv.Atoi(p) + if err != nil || n <= 0 || n > 65535 { + return cfg, fmt.Errorf("PALIADIN_REMOTE_PORT %q: not a valid port", p) + } + cfg.SSHPort = n + } + + keyPath, err := writeSecretFile("paliadin-id_ed25519-", os.Getenv("PALIADIN_SSH_PRIVATE_KEY"), 0o600) + if err != nil { + return cfg, fmt.Errorf("PALIADIN_SSH_PRIVATE_KEY: %w", err) + } + if keyPath == "" { + return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_SSH_PRIVATE_KEY empty") + } + cfg.SSHKeyPath = keyPath + + knownHostsPath, err := writeSecretFile("paliadin-known_hosts-", os.Getenv("PALIADIN_KNOWN_HOSTS"), 0o644) + if err != nil { + return cfg, fmt.Errorf("PALIADIN_KNOWN_HOSTS: %w", err) + } + if knownHostsPath == "" { + return cfg, fmt.Errorf("PALIADIN_REMOTE_HOST set but PALIADIN_KNOWN_HOSTS empty") + } + cfg.KnownHostsPath = knownHostsPath + + return cfg, nil +} + +// writeSecretFile writes blob to a tmpfile with the given mode and +// returns its path. Returns ("", nil) when blob is empty so callers +// can distinguish "not set" from real I/O errors. 
+func writeSecretFile(prefix, blob string, mode os.FileMode) (string, error) { + if blob == "" { + return "", nil + } + f, err := os.CreateTemp("", prefix+"*") + if err != nil { + return "", err + } + if _, err := f.WriteString(blob); err != nil { + _ = f.Close() + _ = os.Remove(f.Name()) + return "", err + } + if err := f.Close(); err != nil { + return "", err + } + if err := os.Chmod(f.Name(), mode); err != nil { + return "", err + } + return f.Name(), nil +} + +func cmpOr(s, fallback string) string { + if s != "" { + return s + } + return fallback +} diff --git a/internal/services/paliadin_remote.go b/internal/services/paliadin_remote.go new file mode 100644 index 0000000..36f34e0 --- /dev/null +++ b/internal/services/paliadin_remote.go @@ -0,0 +1,322 @@ +package services + +// RemotePaliadinService — the prod path of the Paliadin backend. +// +// Design: docs/design-paliadin-tailscale-ssh-2026-05-07.md. +// +// Where the local backend (LocalPaliadinService) drives a tmux+claude +// pane in-process, the remote backend shells out to ssh m@mriver +// paliadin-shim — the script at scripts/paliadin-shim, installed at +// /home/m/.local/bin/paliadin-shim on m's laptop. The shim owns the +// tmux+claude pane on mRiver; this Go side just wraps each turn in one +// SSH call. +// +// The path was chosen so paliad.de (deployed in a Dokploy container on +// mLake, no `claude` CLI of its own) can keep using m's Claude Code +// subscription instead of paying API tokens. Tailscale provides the +// transport — mLake's tailscale0 interface is shared into the container +// via network_mode: host (compose layer; not this file's concern). +// +// Wiring is gated on PALIADIN_REMOTE_HOST in cmd/server/main.go. When +// that env var is unset, the binary falls back to LocalPaliadinService +// (or DisabledPaliadinService if neither tmux nor remote is available). 
import (
	"bytes"
	"context"
	"encoding/base64"
	"errors"
	"fmt"
	"log"
	"os/exec"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/jmoiron/sqlx"
)

// ErrMRiverUnreachable signals that the remote paliadin-shim could not
// be contacted within the health-check window. The handler maps this to
// the friendly mriver_unreachable error code (see frontend
// friendlyErrorMessage).
var ErrMRiverUnreachable = errors.New("paliadin: mriver unreachable")

// RemotePaliadinConfig is the bag of knobs cmd/server/main.go passes
// when constructing a RemotePaliadinService.
type RemotePaliadinConfig struct {
	SSHHost        string // ssh destination host, e.g. 100.99.98.203 — mRiver's tailnet IP
	SSHPort        int    // ssh -p value; 22022 bypasses Tailscale SSH on :22 (design §4.5)
	SSHUser        string // ssh login user, e.g. m
	SSHKeyPath     string // private key file handed to ssh -i, e.g. /tmp/paliadin-id_ed25519-* (chmod 600)
	KnownHostsPath string // file handed to ssh as UserKnownHostsFile, e.g. /tmp/paliadin-known_hosts-*
}

// RemotePaliadinService implements Paliadin against a remote
// paliadin-shim over SSH.
type RemotePaliadinService struct {
	paliadinDB
	cfg RemotePaliadinConfig

	// Single in-flight turn. mRiver's claude pane is single-user; we
	// serialise turns the same way LocalPaliadinService does.
	turnMu sync.Mutex

	// Health-check cache. Avoids probing mRiver on every turn — after a
	// successful probe, RunTurn skips the probe for 10 seconds.
	healthMu        sync.Mutex
	healthOK        bool
	healthCheckedAt time.Time

	// Lazy bootstrap state. The system prompt only needs to be sent
	// once per claude pane; on first RunTurn after a paliad restart we
	// inject it, and remember we did so we don't re-send.
	bootstrapMu  sync.Mutex
	bootstrapped bool

	// Hook for tests — when non-nil, callShim delegates here instead
	// of exec'ing ssh. Production code never sets this.
	callShimHook func(ctx context.Context, args ...string) ([]byte, error)
}

// NewRemotePaliadinService wires the remote backend.
Call only when +// PALIADIN_REMOTE_HOST is set in the environment; the constructor does +// not probe mRiver — first probe happens on the first RunTurn call via +// healthGate. +func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadinConfig) *RemotePaliadinService { + if cfg.SSHPort == 0 { + cfg.SSHPort = 22022 + } + if cfg.SSHUser == "" { + cfg.SSHUser = "m" + } + return &RemotePaliadinService{ + paliadinDB: paliadinDB{db: db, users: users}, + cfg: cfg, + } +} + +// RunTurn drives one Q&A round against the remote claude pane. Same +// audit-row contract as LocalPaliadinService: write the row first, run +// the turn, complete the row on success, mark error on failure. +func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { + s.turnMu.Lock() + defer s.turnMu.Unlock() + + turnID := uuid.New() + startedAt := time.Now().UTC() + + // Audit row first — leave traces even if we crash mid-turn. + if err := s.insertTurnRow(ctx, &PaliadinTurn{ + TurnID: turnID, + UserID: req.UserID, + SessionID: req.SessionID, + StartedAt: startedAt, + UserMessage: req.UserMessage, + PageOrigin: optionalString(req.PageOrigin), + }); err != nil { + return nil, fmt.Errorf("paliadin: insert turn row: %w", err) + } + + // Health-gate before paying the cost of a real turn. Caches OK for + // 10 s so a fast back-to-back chat doesn't probe every time. + if err := s.healthGate(ctx); err != nil { + _ = s.markTurnError(ctx, turnID, "mriver_unreachable") + return nil, err + } + + // Lazy bootstrap — first turn after a paliad restart sends the + // system prompt; subsequent turns skip. 
+ if err := s.ensureBootstrapped(ctx); err != nil { + _ = s.markTurnError(ctx, turnID, "bootstrap_failed") + return nil, err + } + + msg := sanitiseForTmux(req.UserMessage) + msgB64 := base64.StdEncoding.EncodeToString([]byte(msg)) + + body, err := s.callShim(ctx, "run-turn", turnID.String(), msgB64) + if err != nil { + _ = s.markTurnError(ctx, turnID, classifySSHError(err)) + return nil, err + } + + // Same trailer parse + audit completion as the local path. + cleanBody, meta := splitTrailer(string(body)) + tokens := approxTokenCount(cleanBody) + chipCount := countChips(cleanBody) + finished := time.Now().UTC() + durationMS := int(finished.Sub(startedAt) / time.Millisecond) + + if err := s.completeTurn(ctx, turnID, finished, durationMS, cleanBody, tokens, meta, chipCount); err != nil { + log.Printf("paliadin: complete turn %s: %v", turnID, err) + } + + return &TurnResult{ + TurnID: turnID, + Response: cleanBody, + UsedTools: meta.UsedTools, + RowsSeen: meta.RowsSeen, + ChipCount: chipCount, + ClassifierTag: meta.ClassifierTag, + DurationMS: durationMS, + }, nil +} + +// ResetSession sends `/clear` to the remote claude pane. +func (s *RemotePaliadinService) ResetSession(ctx context.Context) error { + if _, err := s.callShim(ctx, "reset"); err != nil { + return fmt.Errorf("paliadin: reset: %w", err) + } + return nil +} + +// healthGate runs the shim's `health` verb at most once per 10 s. +// Returns ErrMRiverUnreachable wrapping the underlying error on miss. 
+func (s *RemotePaliadinService) healthGate(ctx context.Context) error { + s.healthMu.Lock() + defer s.healthMu.Unlock() + + if s.healthOK && time.Since(s.healthCheckedAt) < 10*time.Second { + return nil + } + + probeCtx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + out, err := s.callShim(probeCtx, "health") + s.healthCheckedAt = time.Now() + if err != nil { + s.healthOK = false + return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err) + } + if strings.TrimSpace(string(out)) != "ok" { + s.healthOK = false + return fmt.Errorf("%w: shim returned %q", ErrMRiverUnreachable, string(out)) + } + s.healthOK = true + return nil +} + +// ensureBootstrapped sends the Paliadin system prompt to the remote +// claude pane on first call. Idempotent — subsequent calls return nil +// without doing any work. +func (s *RemotePaliadinService) ensureBootstrapped(ctx context.Context) error { + s.bootstrapMu.Lock() + defer s.bootstrapMu.Unlock() + if s.bootstrapped { + return nil + } + prompt := paliadinSystemPrompt("/tmp/paliadin") + promptB64 := base64.StdEncoding.EncodeToString([]byte(prompt)) + if _, err := s.callShim(ctx, "bootstrap", promptB64); err != nil { + return fmt.Errorf("paliadin: bootstrap: %w", err) + } + s.bootstrapped = true + return nil +} + +// callShim runs `ssh @ -- ` against the +// paliadin-shim. The shim's authorized_keys command= directive ensures +// the verb + args are passed via $SSH_ORIGINAL_COMMAND regardless of +// what we put after the `--`; we keep the explicit argv form anyway so +// reading the code at the call site is unambiguous. +// +// Tests set callShimHook to bypass exec. +func (s *RemotePaliadinService) callShim(ctx context.Context, args ...string) ([]byte, error) { + if s.callShimHook != nil { + return s.callShimHook(ctx, args...) 
+ } + + sshArgs := []string{ + "-F", "/dev/null", // ignore /etc/ssh/ssh_config + ~/.ssh/config + "-i", s.cfg.SSHKeyPath, + "-p", strconv.Itoa(s.cfg.SSHPort), // 22022 — bypasses Tailscale SSH on :22 + "-o", "IdentitiesOnly=yes", + "-o", "UserKnownHostsFile=" + s.cfg.KnownHostsPath, + "-o", "StrictHostKeyChecking=yes", + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=3", + "-o", "ServerAliveInterval=10", + "-o", "ServerAliveCountMax=3", + s.cfg.SSHUser + "@" + s.cfg.SSHHost, + "--", + } + sshArgs = append(sshArgs, args...) + + // Shim's run-turn timeout is 60 s; +10 s gives SSH some overhead. + c, cancel := context.WithTimeout(ctx, 70*time.Second) + defer cancel() + + cmd := exec.CommandContext(c, "ssh", sshArgs...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return nil, fmt.Errorf("ssh %s: %w (stderr: %s)", strings.Join(args, " "), err, strings.TrimSpace(stderr.String())) + } + return stdout.Bytes(), nil +} + +// classifySSHError turns a callShim error into one of the audit-row +// error codes. Codes are stable strings shown on the admin dashboard +// and used by the frontend's friendlyErrorMessage to localise. +func classifySSHError(err error) string { + if err == nil { + return "" + } + if errors.Is(err, ErrMRiverUnreachable) { + return "mriver_unreachable" + } + if errors.Is(err, context.DeadlineExceeded) { + return "timeout" + } + msg := err.Error() + switch { + case strings.Contains(msg, "Connection timed out"), + strings.Contains(msg, "Connection refused"), + strings.Contains(msg, "Could not resolve hostname"), + strings.Contains(msg, "Network is unreachable"): + return "mriver_unreachable" + case strings.Contains(msg, "exit status 124"): + // Shim's run-turn 60 s timeout — Claude didn't write the + // response file in time. 
+ return "timeout" + case strings.Contains(msg, "Permission denied"): + return "shim_auth_failed" + default: + return "shim_error" + } +} + +// DisabledPaliadinService is a stub that always returns +// ErrPaliadinDisabled. cmd/server/main.go constructs one when neither +// PALIADIN_REMOTE_HOST is set nor a local tmux is available; without +// the stub, the handler would have to nil-check on every entry point. +type DisabledPaliadinService struct { + paliadinDB +} + +// NewDisabledPaliadinService wires the stub. DB methods (IsOwner / +// ListRecentTurns / Stats) still work; only RunTurn / ResetSession +// return ErrPaliadinDisabled. +func NewDisabledPaliadinService(db *sqlx.DB, users *UserService) *DisabledPaliadinService { + return &DisabledPaliadinService{paliadinDB: paliadinDB{db: db, users: users}} +} + +func (s *DisabledPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*TurnResult, error) { + return nil, ErrPaliadinDisabled +} + +func (s *DisabledPaliadinService) ResetSession(ctx context.Context) error { + return ErrPaliadinDisabled +} + +// Compile-time interface conformance checks — fail the build, not a +// runtime test, if a method drifts off any backend. +var ( + _ Paliadin = (*LocalPaliadinService)(nil) + _ Paliadin = (*RemotePaliadinService)(nil) + _ Paliadin = (*DisabledPaliadinService)(nil) +)