feat(paliadin/primer): t-paliad-161 Slice G — tmux crash-recovery primer
When a user's tmux session dies (mRiver reboot, OOM, manual kill,
container restart) the next turn used to wake claude with NO prior
context — the persona had to derive everything from the new turn
alone. Now: when the Go side detects a fresh pane, it pulls the last
N exchanges from paliad.paliadin_turns and prepends them as a
[primer …][/primer] block to the next user envelope.
Format SKILL.md parses (single-line, control-chars stripped):
[PALIADIN:<turn_id>] [primer last=N] U: … \n A: … \n … [/primer] [ctx …] <Frage>
Detection paths:
- Local (LocalPaliadinService): ensurePane now returns
(target, isFresh, err). isFresh is true when no prior
@paliadin-scope=chat window existed and we created one. RunTurn
passes that into buildPrimerIfFresh.
- Remote (RemotePaliadinService): can't see across the SSH boundary
to know the pane's true freshness, so we approximate with a
per-(session, Go-process) "primed" cache. First turn after
process-start, ResetSession, or healthGate failure rebuilds the
primer; subsequent turns skip it. ResetSession + healthGate failure
both call clearPrimed(session) explicitly.
paliadinDB.buildPrimerIfFresh assembles the block:
- Reads the last MaxPrimerTurns=5 exchanges from
ListHistoryForSession (Slice F).
- truncateForPrimer normalises each side (drops \r\n, collapses
whitespace, caps at MaxPrimerCharsPerSide=600 with …).
- Returns "" silently when isFresh=false, no SessionID, no prior
history, or DB error — the user's actual question still lands; we
only lose the recap.
SKILL.md (~/.claude/skills/paliadin/SKILL.md, refreshed via
scripts/install-paliadin-skill) gets a new "Crash-recovery primer"
section above the context-envelope block. Five behaviour rules:
1. Don't re-execute prior tool calls (audit log already has them).
2. Use the primer for thread continuity, not as a data source.
Re-call tools for fresh facts.
3. Truncated lines (ending in …) are partial — paraphrase rather
than quote.
4. No primer at all = normal case (existing pane, history is in
tmux memory). Behave as before.
5. Acknowledge sparingly — usually just answer the actual question
with the recap as silent context.
New test TestTruncateForPrimer pins the per-side truncation contract
(no \r\n leaks, repeated spaces collapsed, ellipsis on oversized
input, short input untouched). go test green.
Refs: docs/design-paliadin-inline-2026-05-08.md §6
(deferred Anthropic API cutover prereq).
This commit is contained in:
@@ -342,7 +342,11 @@ func (s *LocalPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*T
|
||||
}
|
||||
|
||||
// Ensure tmux session + Claude pane (per-user — keyed off UserID).
|
||||
target, err := s.ensurePane(ctx, req.UserID)
|
||||
// isFresh signals that we just created the pane (no prior chat
|
||||
// window existed) — when true AND we have prior turns for this user
|
||||
// session, we splice a primer into the envelope so Claude wakes
|
||||
// with conversation context instead of cold.
|
||||
target, isFresh, err := s.ensurePane(ctx, req.UserID)
|
||||
if err != nil {
|
||||
_ = s.markTurnError(ctx, turnID, "tmux_unresponsive")
|
||||
return nil, fmt.Errorf("%w: %v", ErrTmuxUnavailable, err)
|
||||
@@ -359,8 +363,9 @@ func (s *LocalPaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*T
|
||||
// envelope and writes the response to the per-turn file. The optional
|
||||
// [ctx …] prefix carries structured page context from the inline
|
||||
// widget (t-paliad-161); SKILL.md branches on it before answering.
|
||||
envelope := fmt.Sprintf("[PALIADIN:%s] %s%s",
|
||||
turnID, req.Context.EnvelopePrefix(), sanitiseForTmux(req.UserMessage))
|
||||
primer := s.buildPrimerIfFresh(ctx, isFresh, req)
|
||||
envelope := fmt.Sprintf("[PALIADIN:%s] %s%s%s",
|
||||
turnID, primer, req.Context.EnvelopePrefix(), sanitiseForTmux(req.UserMessage))
|
||||
if err := s.sendToPane(ctx, target, envelope); err != nil {
|
||||
_ = s.markTurnError(ctx, turnID, "tmux_unresponsive")
|
||||
return nil, fmt.Errorf("%w: send prompt: %v", ErrTmuxUnavailable, err)
|
||||
@@ -513,6 +518,100 @@ func (s *paliadinDB) ListHistoryForSession(ctx context.Context, callerID uuid.UU
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// MaxPrimerTurns caps how many prior exchanges the crash-recovery
|
||||
// primer replays into a fresh tmux pane. Each exchange is a (user,
|
||||
// assistant) pair, so the prompt grows by ~2× this many lines plus the
|
||||
// primer scaffolding. Five exchanges is enough to establish thread
|
||||
// continuity ("we were just discussing the Acme project") without
|
||||
// blowing out the prompt budget.
|
||||
const MaxPrimerTurns = 5
|
||||
|
||||
// MaxPrimerCharsPerSide caps the length, in bytes, of EACH side of an
// exchange (the user message and the assistant response are capped
// separately) that goes into the primer. Longer text from prior turns
// is truncated with an ellipsis so a runaway brick of text doesn't
// dominate the primer block.
|
||||
const MaxPrimerCharsPerSide = 600
|
||||
|
||||
// buildPrimerIfFresh assembles the `[primer …][/primer]` block that
|
||||
// gets prepended to the user envelope when the tmux pane was just
|
||||
// created (or is unreachable for any other reason and we expect Claude
|
||||
// to lack context). Returns "" when:
|
||||
//
|
||||
// - isFresh=false (existing pane has the conversation in memory)
|
||||
// - no req.SessionID (legacy turn — nothing to recover)
|
||||
// - the DB has no prior turns for this session (genuinely first turn)
|
||||
// - the lookup itself errors (we degrade silently rather than block
|
||||
// the user's actual question)
|
||||
//
|
||||
// The format SKILL.md parses:
|
||||
//
|
||||
// [primer last=<N>]
|
||||
// U: <user message>
|
||||
// A: <assistant response>
|
||||
// …
|
||||
// [/primer]
|
||||
//
|
||||
// SKILL.md treats the primer as authoritative recap, not as questions
|
||||
// to re-answer. See ~/.claude/skills/paliadin/SKILL.md for the
|
||||
// behaviour contract.
|
||||
func (s *paliadinDB) buildPrimerIfFresh(ctx context.Context, isFresh bool, req TurnRequest) string {
|
||||
if !isFresh || req.SessionID == "" {
|
||||
return ""
|
||||
}
|
||||
rows, err := s.ListHistoryForSession(ctx, req.UserID, req.SessionID, MaxPrimerTurns)
|
||||
if err != nil {
|
||||
// Log + degrade silently. The user's actual question still gets
|
||||
// sent; they just lose the conversation continuity for this one
|
||||
// turn.
|
||||
log.Printf("paliadin: primer history lookup: %v", err)
|
||||
return ""
|
||||
}
|
||||
if len(rows) == 0 {
|
||||
return ""
|
||||
}
|
||||
// rows are oldest → newest. Keep the newest MaxPrimerTurns; for the
|
||||
// recovery use-case more recent context matters more.
|
||||
if len(rows) > MaxPrimerTurns {
|
||||
rows = rows[len(rows)-MaxPrimerTurns:]
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "[primer last=%d] ", len(rows))
|
||||
for _, row := range rows {
|
||||
userMsg := truncateForPrimer(row.UserMessage)
|
||||
b.WriteString("U: ")
|
||||
b.WriteString(userMsg)
|
||||
b.WriteString(" \\n ")
|
||||
if row.Response != nil && *row.Response != "" {
|
||||
assistantMsg := truncateForPrimer(*row.Response)
|
||||
b.WriteString("A: ")
|
||||
b.WriteString(assistantMsg)
|
||||
b.WriteString(" \\n ")
|
||||
}
|
||||
}
|
||||
b.WriteString("[/primer] ")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// truncateForPrimer normalises a message for the primer block: strips
|
||||
// newlines (envelope is a single-line keystroke), collapses repeated
|
||||
// whitespace, and truncates with an ellipsis when over the per-side
|
||||
// cap. The output stays single-line so the tmux send-keys command
|
||||
// doesn't fragment it.
|
||||
func truncateForPrimer(s string) string {
|
||||
s = strings.ReplaceAll(s, "\r", " ")
|
||||
s = strings.ReplaceAll(s, "\n", " ")
|
||||
// Collapse repeated whitespace.
|
||||
for strings.Contains(s, " ") {
|
||||
s = strings.ReplaceAll(s, " ", " ")
|
||||
}
|
||||
s = strings.TrimSpace(s)
|
||||
if len(s) > MaxPrimerCharsPerSide {
|
||||
s = s[:MaxPrimerCharsPerSide] + "…"
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// PaliadinStats is the aggregate view shown on /admin/paliadin.
|
||||
type PaliadinStats struct {
|
||||
TotalTurns int `json:"total_turns"`
|
||||
@@ -645,7 +744,13 @@ func (s *paliadinDB) Stats(ctx context.Context, callerID uuid.UUID) (*PaliadinSt
|
||||
// skill on first user turn (Claude's skill router auto-matches the
|
||||
// `[PALIADIN:` envelope), so no in-process system-prompt send is
|
||||
// required.
|
||||
func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID) (string, error) {
|
||||
//
|
||||
// The second return value (isFresh) is true when the pane was just now
|
||||
// created (no prior @paliadin-scope=chat window existed). RunTurn uses
|
||||
// this signal to prime the new pane with prior conversation context
|
||||
// from paliad.paliadin_turns so a tmux/mRiver reboot doesn't strand
|
||||
// users with a Claude that has no memory.
|
||||
func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID) (string, bool, error) {
|
||||
session := s.sessionNameFor(userID)
|
||||
|
||||
s.mu.Lock()
|
||||
@@ -653,21 +758,21 @@ func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID)
|
||||
|
||||
// Cheap path: cached target still alive? Reuse.
|
||||
if cached, ok := s.panes[session]; ok && cached != "" && s.paneAlive(ctx, cached) {
|
||||
return cached, nil
|
||||
return cached, false, nil
|
||||
}
|
||||
|
||||
// Ensure session.
|
||||
if err := runTmux(ctx, "has-session", "-t", session); err != nil {
|
||||
// Create detached.
|
||||
if err := runTmux(ctx, "new-session", "-d", "-s", session); err != nil {
|
||||
return "", fmt.Errorf("new-session: %w", err)
|
||||
return "", false, fmt.Errorf("new-session: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Look for an existing window tagged with @paliadin-scope=chat.
|
||||
if existing := s.findChatWindow(ctx, session); existing != "" {
|
||||
s.panes[session] = existing
|
||||
return existing, nil
|
||||
return existing, false, nil
|
||||
}
|
||||
|
||||
// No window — create one running `claude` in a fresh pane. Must be
|
||||
@@ -678,7 +783,7 @@ func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID)
|
||||
"-P", "-F", "#{window_index}",
|
||||
"claude")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("new-window claude: %w", err)
|
||||
return "", false, fmt.Errorf("new-window claude: %w", err)
|
||||
}
|
||||
idx := strings.TrimSpace(out)
|
||||
target := fmt.Sprintf("%s:%s", session, idx)
|
||||
@@ -688,7 +793,7 @@ func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID)
|
||||
// pane has a "❯" prompt glyph or "│" sidebar visible. We give it
|
||||
// 30 s, which is generous.
|
||||
if err := s.waitForPaneReady(ctx, target, 30*time.Second); err != nil {
|
||||
return "", fmt.Errorf("wait-for-ready: %w", err)
|
||||
return "", false, fmt.Errorf("wait-for-ready: %w", err)
|
||||
}
|
||||
|
||||
// Tag the window so a re-discover next boot finds it.
|
||||
@@ -696,7 +801,7 @@ func (s *LocalPaliadinService) ensurePane(ctx context.Context, userID uuid.UUID)
|
||||
_ = runTmux(ctx, "set-window-option", "-t", target, "@fix-name", "claude-paliadin")
|
||||
|
||||
s.panes[session] = target
|
||||
return target, nil
|
||||
return target, true, nil
|
||||
}
|
||||
|
||||
func (s *LocalPaliadinService) findChatWindow(ctx context.Context, session string) string {
|
||||
|
||||
@@ -72,6 +72,15 @@ type RemotePaliadinService struct {
|
||||
healthMu sync.Mutex
|
||||
health map[string]healthCacheEntry
|
||||
|
||||
// Crash-recovery primer: per-session "have we already primed this
|
||||
// pane in this Go-process lifetime?" cache. Cleared on Reset, on
|
||||
// healthGate failure, and (implicitly) on Go-process restart. False
|
||||
// → next turn includes the primer block; true → skip. The local
|
||||
// service uses ensurePane's isFresh signal directly; remote can't
|
||||
// see across the SSH boundary so we approximate with this cache.
|
||||
primedMu sync.Mutex
|
||||
primed map[string]bool
|
||||
|
||||
// Hook for tests — when non-nil, callShim delegates here instead
|
||||
// of exec'ing ssh. Production code never sets this.
|
||||
callShimHook func(ctx context.Context, args ...string) ([]byte, error)
|
||||
@@ -103,6 +112,7 @@ func NewRemotePaliadinService(db *sqlx.DB, users *UserService, cfg RemotePaliadi
|
||||
paliadinDB: paliadinDB{db: db, users: users},
|
||||
cfg: cfg,
|
||||
health: make(map[string]healthCacheEntry),
|
||||
primed: make(map[string]bool),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,10 +165,24 @@ func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*
|
||||
// router auto-matches the [PALIADIN: envelope so no in-process
|
||||
// bootstrap (system-prompt-via-tmux-keystroke) is needed any more.
|
||||
|
||||
// Crash-recovery primer (t-paliad-161 follow-up): if we haven't
|
||||
// primed THIS Go-process for this session yet, build the primer
|
||||
// block from prior paliadin_turns so a fresh tmux pane on mRiver
|
||||
// (after reboot, OOM, manual kill, etc.) wakes with conversation
|
||||
// context instead of cold. We can't see across the SSH boundary
|
||||
// to know the pane's true freshness — `primed[session]=true`
|
||||
// after the first successful turn approximates "this pane has
|
||||
// our context", and we re-prime when Reset / healthGate failure
|
||||
// clears the flag.
|
||||
primer := ""
|
||||
if !s.isPrimed(session) {
|
||||
primer = s.buildPrimerIfFresh(ctx, true, req)
|
||||
}
|
||||
|
||||
// Prepend the structured-context envelope (t-paliad-161) before the
|
||||
// user message so SKILL.md sees `[ctx route=… entity=… selection=…]`
|
||||
// before parsing the actual question. Empty when req.Context is nil.
|
||||
msg := req.Context.EnvelopePrefix() + sanitiseForTmux(req.UserMessage)
|
||||
msg := primer + req.Context.EnvelopePrefix() + sanitiseForTmux(req.UserMessage)
|
||||
msgB64 := base64.StdEncoding.EncodeToString([]byte(msg))
|
||||
|
||||
body, err := s.callShim(ctx, "run-turn", session, turnID.String(), msgB64)
|
||||
@@ -166,6 +190,10 @@ func (s *RemotePaliadinService) RunTurn(ctx context.Context, req TurnRequest) (*
|
||||
_ = s.markTurnError(ctx, turnID, classifySSHError(err))
|
||||
return nil, err
|
||||
}
|
||||
// First successful turn → mark this pane primed for the rest of
|
||||
// the Go-process lifetime. ResetSession + healthGate failure both
|
||||
// clear the flag.
|
||||
s.markPrimed(session)
|
||||
|
||||
// Same trailer parse + audit completion as the local path.
|
||||
cleanBody, meta := splitTrailer(string(body))
|
||||
@@ -202,12 +230,42 @@ func (s *RemotePaliadinService) ResetSession(ctx context.Context, userID uuid.UU
|
||||
delete(s.health, session)
|
||||
s.healthMu.Unlock()
|
||||
|
||||
// Reset clears the primer cache so the next turn rebuilds context
|
||||
// from the DB into the new claude pane.
|
||||
s.clearPrimed(session)
|
||||
|
||||
if _, err := s.callShim(ctx, "reset", session); err != nil {
|
||||
return fmt.Errorf("paliadin: reset %s: %w", session, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// isPrimed reports whether we've already injected a primer for this
|
||||
// session in this Go-process lifetime. False on first call, on calls
|
||||
// after clearPrimed (Reset / health failure), and after a process
|
||||
// restart.
|
||||
func (s *RemotePaliadinService) isPrimed(session string) bool {
|
||||
s.primedMu.Lock()
|
||||
defer s.primedMu.Unlock()
|
||||
return s.primed[session]
|
||||
}
|
||||
|
||||
// markPrimed records a successful primer-prepended turn for this
|
||||
// session, so subsequent turns in the same process skip the primer.
|
||||
func (s *RemotePaliadinService) markPrimed(session string) {
|
||||
s.primedMu.Lock()
|
||||
defer s.primedMu.Unlock()
|
||||
s.primed[session] = true
|
||||
}
|
||||
|
||||
// clearPrimed wipes the primer flag for a session so the next turn
|
||||
// rebuilds context. Called by ResetSession and on healthGate failure.
|
||||
func (s *RemotePaliadinService) clearPrimed(session string) {
|
||||
s.primedMu.Lock()
|
||||
defer s.primedMu.Unlock()
|
||||
delete(s.primed, session)
|
||||
}
|
||||
|
||||
// healthGate runs the shim's `health <session>` verb at most once per
|
||||
// 10 s per session. Returns ErrMRiverUnreachable wrapping the
|
||||
// underlying error on miss.
|
||||
@@ -224,8 +282,11 @@ func (s *RemotePaliadinService) healthGate(ctx context.Context, session string)
|
||||
out, err := s.callShim(probeCtx, "health", session)
|
||||
if err != nil {
|
||||
// Don't cache failures — re-probe on every miss so a recovery
|
||||
// surfaces immediately.
|
||||
// surfaces immediately. Also clear the primer cache: an
|
||||
// unreachable mRiver may have lost its tmux session, so the
|
||||
// next successful turn should re-prime the new pane.
|
||||
delete(s.health, session)
|
||||
s.clearPrimed(session)
|
||||
return fmt.Errorf("%w: %v", ErrMRiverUnreachable, err)
|
||||
}
|
||||
if strings.TrimSpace(string(out)) != "ok" {
|
||||
|
||||
@@ -5,6 +5,48 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestTruncateForPrimer pins the per-side truncation contract used by
|
||||
// buildPrimerIfFresh — the primer block must stay single-line so the
|
||||
// tmux send-keys -l command doesn't fragment it, and runaway prior
|
||||
// answers must collapse to a manageable size. t-paliad-161 follow-up.
|
||||
func TestTruncateForPrimer(t *testing.T) {
|
||||
t.Run("collapses newlines + tabs to spaces", func(t *testing.T) {
|
||||
got := truncateForPrimer("hello\nworld\ttab")
|
||||
if got != "hello world\ttab" && got != "hello world tab" {
|
||||
// truncateForPrimer normalises \r and \n but leaves tabs;
|
||||
// either result above is acceptable as long as no \n leaks.
|
||||
t.Errorf("got %q", got)
|
||||
}
|
||||
if strings.ContainsAny(got, "\r\n") {
|
||||
t.Errorf("control chars leaked: %q", got)
|
||||
}
|
||||
})
|
||||
t.Run("collapses repeated spaces", func(t *testing.T) {
|
||||
got := truncateForPrimer("a b c")
|
||||
if got != "a b c" {
|
||||
t.Errorf("got %q; want %q", got, "a b c")
|
||||
}
|
||||
})
|
||||
t.Run("truncates oversized input with ellipsis", func(t *testing.T) {
|
||||
long := strings.Repeat("x", MaxPrimerCharsPerSide+50)
|
||||
got := truncateForPrimer(long)
|
||||
if !strings.HasSuffix(got, "…") {
|
||||
t.Errorf("missing ellipsis: %q", got[len(got)-10:])
|
||||
}
|
||||
// The 'x' count should be exactly MaxPrimerCharsPerSide
|
||||
// (ellipsis adds bytes but no x).
|
||||
if strings.Count(got, "x") != MaxPrimerCharsPerSide {
|
||||
t.Errorf("got %d x's; want %d", strings.Count(got, "x"), MaxPrimerCharsPerSide)
|
||||
}
|
||||
})
|
||||
t.Run("leaves short input untouched", func(t *testing.T) {
|
||||
got := truncateForPrimer("Was steht heute an?")
|
||||
if got != "Was steht heute an?" {
|
||||
t.Errorf("short input mangled: %q", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// TestTurnContext_EnvelopePrefix pins the bracket-block format the
|
||||
// SKILL.md parser branches on. Wrong format = the inline widget's
|
||||
// page-context never reaches Paliadin. t-paliad-161.
|
||||
|
||||
Reference in New Issue
Block a user