mAi: #8 - imagen.jobs queue + worker subcommand (flexsiebels write path)

Async write path for the flexsiebels owner-mode UI: flexsiebels INSERTs into imagen.jobs, the worker on mRiver claims pending rows via LISTEN/NOTIFY + 5s safety poll, runs the same generate pipeline that `imagen generate` uses, and writes the result through internal/cloud into imagen.images.

- Schema migration imagen_jobs_init: table + status CHECK + two indexes + owner-scoped RLS + grants + AFTER INSERT trigger publishing on the imagen_jobs channel via pg_notify.
- internal/worker: DB-agnostic loop over a Queue interface. Drains the whole pending backlog on each wake. Job-scoped contexts are derived from Background so SIGTERM lets the in-flight generation finish (no half-state). ResetStaleRunning at startup unsticks rows left over from a previous crash. Eight unit tests cover the done / failed / missing-id / drain / NOTIFY-wake / shutdown / transient-error paths against a fake queue (no real Postgres in CI).
- cmd/imagen/worker.go: pgx-backed Queue (one dedicated conn for LISTEN + UPDATE), plus the workerPipeline that reuses buildBackend + attachUsageSink + prompt.Apply + buildWriter + maybeCloudSync. The per-job owner_user_id overrides the env-level fallback so each row in imagen.images is attributed correctly.
- maybeCloudSync now returns (*cloud.SyncResult, error) so the worker can link imagen.jobs.image_id to the inserted imagen.images row. The CLI generate path keeps printing its stderr summary unchanged.
- scripts/imagen-worker.service + .env.example for the systemd --user unit on mRiver. EnvironmentFile lives in ~/.dotfiles and is never committed.
- docs/setup-worker-mriver.md walks through installation + the spec's SQL-INSERT smoke; docs/architecture.md grows an "async write path" section.
- worker_integration_test.go (env-guarded by IMAGEN_WORKER_INTEGRATION=1) drives one real job through the full pipeline against msupabase using the mock backend, then verifies imagen.images + Storage object landed and the row flipped to done with image_id linked.

Verified end-to-end: pickup latency ~7ms, total 74ms, failure path captures error text.
2026-05-11 10:23:33 +02:00
parent cb6656c436
commit 2758c5a500
13 changed files with 1205 additions and 27 deletions
--- a/internal/worker/worker.go
+++ b/internal/worker/worker.go
@@ -0,0 +1,213 @@
+// Package worker consumes the imagen.jobs queue. It claims pending rows via
+// an UPDATE-returning lock (single source of truth, no double-claim window),
+// runs the supplied generation pipeline, then writes status + image_id back.
+//
+// The package is DB-agnostic: it talks to two small interfaces (Queue +
+// Pipeline) so unit tests can drive the claim/transition logic with no real
+// Postgres connection. cmd/imagen wires the pgx implementation.
+package worker
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+)
+
+// Job is the slice of an imagen.jobs row the worker needs to drive a
+// generation. Null columns from the DB are represented as zero values; the
+// pipeline treats zero values as "use backend default" (same convention as
+// backend.Request).
+//
+// The worker loop itself only reads ID (status updates, logging) and Backend
+// (logging); the whole struct is handed to Pipeline.Run by value.
+type Job struct {
+	ID          string
+	OwnerUserID string
+	Prompt      string
+	Backend     string
+	Model       string
+	Width       int
+	Height      int
+	Steps       int
+	Seed        int64
+	Style       string
+}
+
+// Outcome is what the pipeline reports back per job. ImageID is the
+// imagen.images.id the cloud-sync produced. Empty ImageID with nil Err means
+// the cloud-sync was skipped (config off) — we treat that as a failure for
+// the worker since flexsiebels needs the image_id to render the result.
+//
+// The worker inspects Err first: a non-nil Err marks the job failed with
+// Err.Error() as the message, regardless of ImageID.
+type Outcome struct {
+	ImageID string
+	Err     error
+}
+
+// Queue is the persistence layer for the imagen.jobs table. Implementations
+// must be safe for serialised single-worker use (concurrent claim across
+// multiple worker processes is out of scope for v1 — the FOR UPDATE SKIP
+// LOCKED clause in the pgx claim query covers it cheaply anyway).
+//
+// Worker calls every method from the goroutine that runs Worker.Run, one
+// call at a time.
+type Queue interface {
+	// ClaimNextPending atomically marks the oldest pending row 'running' and
+	// returns it. Returns (nil, nil) when the queue is empty. A non-nil
+	// error ends the current drain pass; the worker logs it and tries again
+	// after the next wake.
+	ClaimNextPending(ctx context.Context) (*Job, error)
+	// MarkDone records success: status='done', image_id, completed_at=now().
+	// The worker calls it with a 30-second context that is independent of
+	// the shutdown context.
+	MarkDone(ctx context.Context, jobID, imageID string) error
+	// MarkFailed records failure: status='failed', error=msg, completed_at=now().
+	// Same context rules as MarkDone.
+	MarkFailed(ctx context.Context, jobID, errMsg string) error
+	// WaitForJob blocks until either a NOTIFY arrives on imagen_jobs, the
+	// timeout expires, or ctx is cancelled. Returns nil on notification or
+	// timeout; returns ctx.Err() on cancellation. Transient connection errors
+	// are returned so the caller can decide to reconnect; the worker logs
+	// them and pauses for one poll interval before looping.
+	WaitForJob(ctx context.Context, timeout time.Duration) error
+	// ResetStaleRunning marks any rows stuck in 'running' (e.g. left over
+	// from a crash before this process started) back to 'pending'. Called
+	// once at worker startup so the cold-start safety poll can pick them up.
+	// An error is logged by the worker but does not stop it.
+	ResetStaleRunning(ctx context.Context) error
+}
+
+// Pipeline runs one generation and reports back the imagen.images.id (or an
+// error). The implementation owns backend dispatch, prompt enrichment, disk
+// write, and cloud-sync; the worker only orchestrates queue state.
+//
+// The ctx handed to Run is derived from context.Background with
+// Config.JobTimeout as its deadline, so it is not cancelled when the worker
+// is asked to shut down.
+type Pipeline interface {
+	Run(ctx context.Context, job Job) Outcome
+}
+
+// Config is the runtime knob set for the worker loop. New replaces
+// non-positive durations with defaults.
+type Config struct {
+	// PollInterval is the safety-poll cadence between LISTEN wakeups. Picking
+	// this too low wastes DB roundtrips; too high lets a dropped NOTIFY
+	// stall the queue. 5s is the spec'd default (applied by New when the
+	// value is <= 0).
+	PollInterval time.Duration
+	// JobTimeout caps any single Pipeline.Run. A backend hang shouldn't
+	// freeze the queue forever. New applies 5 minutes when the value is <= 0.
+	JobTimeout time.Duration
+	// Logger receives one-line status events. nil means silent.
+	Logger func(format string, args ...any)
+}
+
+// Worker is the orchestration loop: it claims jobs from a Queue, runs them
+// through a Pipeline and writes the outcome back. It is not reusable across
+// Run calls. Construct it with New so the Config defaults are applied.
+type Worker struct {
+	q   Queue
+	p   Pipeline
+	cfg Config
+
+	// processingMu is held for the whole of each processOne call. Run
+	// already processes jobs synchronously on its own goroutine, so a
+	// shutdown waits for the in-flight job either way; the mutex
+	// additionally keeps jobs serialised should processOne ever be reached
+	// from more than one goroutine.
+	processingMu sync.Mutex
+}
+
+// New constructs a Worker.
+func New(q Queue, p Pipeline, cfg Config) *Worker {
+	if cfg.PollInterval <= 0 {
+		cfg.PollInterval = 5 * time.Second
+	}
+	if cfg.JobTimeout <= 0 {
+		cfg.JobTimeout = 5 * time.Minute
+	}
+	return &Worker{q: q, p: p, cfg: cfg}
+}
+
+// Run drives the consume loop until ctx is done, whether by cancellation or
+// by deadline. Queue errors are never fatal: a failed startup reset, a
+// failed claim and a failed LISTEN wait are all logged and retried, so a
+// temporary network blip doesn't take the worker down. As a result Run
+// currently always returns nil; the error result is kept so a fatal
+// condition can be reported later without changing the signature.
+func (w *Worker) Run(ctx context.Context) error {
+	if err := w.q.ResetStaleRunning(ctx); err != nil {
+		w.log("worker: reset stale running rows: %v", err)
+		// Don't return — we'd rather keep serving fresh jobs than crash
+		// here. Rows still marked 'running' are simply not picked up by the
+		// pending-row claim until something resets or resubmits them.
+	}
+	for {
+		if ctx.Err() != nil {
+			return nil
+		}
+		// Drain the queue: claim and process until empty. An error caused by
+		// our own shutdown (cancel or deadline) is expected and not logged.
+		if err := w.drain(ctx); err != nil && ctx.Err() == nil && !errors.Is(err, context.Canceled) {
+			w.log("worker: drain: %v", err)
+		}
+		if ctx.Err() != nil {
+			return nil
+		}
+		// Wait for the next wake. WaitForJob covers both LISTEN and the
+		// timeout-based poll fallback; either returns nil and we loop.
+		if err := w.q.WaitForJob(ctx, w.cfg.PollInterval); err != nil {
+			// ctx.Err() catches deadline expiry as well as cancellation;
+			// the errors.Is check keeps the previous handling of a wrapped
+			// context.Canceled.
+			if ctx.Err() != nil || errors.Is(err, context.Canceled) {
+				return nil
+			}
+			w.log("worker: wait: %v (continuing)", err)
+			// Pace the retries so a totally-broken DB doesn't busy-spin.
+			select {
+			case <-ctx.Done():
+				return nil
+			case <-time.After(w.cfg.PollInterval):
+			}
+		}
+	}
+}
+
+// drain empties the pending backlog: it keeps claiming the next pending job
+// and processing it until the queue reports empty (nil job), a claim fails
+// (error wrapped with "claim: "), or ctx is done (ctx.Err() is returned).
+// ctx only gates claiming; each job runs under its own Background-derived
+// context inside processOne, so a SIGTERM mid-job still lets the pipeline
+// finish — the "no half-state on shutdown" guarantee the issue calls for.
+func (w *Worker) drain(ctx context.Context) error {
+	for ctx.Err() == nil {
+		next, claimErr := w.q.ClaimNextPending(ctx)
+		switch {
+		case claimErr != nil:
+			return fmt.Errorf("claim: %w", claimErr)
+		case next == nil:
+			return nil
+		}
+		w.processOne(*next)
+	}
+	return ctx.Err()
+}
+
+// processOne executes the pipeline for a job that has already been claimed
+// and records the result on the queue. Neither the pipeline context nor the
+// status-update context descends from the worker's shutdown context, so an
+// in-flight job runs to completion (bounded by JobTimeout) after SIGTERM.
+//
+// Result handling, in order: a pipeline error marks the job failed with the
+// error text; a nil error with an empty ImageID is also a failure, because
+// flexsiebels has nothing to link; otherwise the job is marked done.
+// Failures to write the status are logged and otherwise ignored.
+func (w *Worker) processOne(job Job) {
+	w.processingMu.Lock()
+	defer w.processingMu.Unlock()
+
+	w.log("worker: processing job %s backend=%s", job.ID, job.Backend)
+
+	runCtx, stopRun := context.WithTimeout(context.Background(), w.cfg.JobTimeout)
+	defer stopRun()
+	result := w.p.Run(runCtx, job)
+
+	// The status write gets its own short-lived Background context: the
+	// outcome must always be recordable, otherwise the row sits in
+	// 'running' forever.
+	writeCtx, stopWrite := context.WithTimeout(context.Background(), 30*time.Second)
+	defer stopWrite()
+
+	recordFailure := func(reason string) {
+		if err := w.q.MarkFailed(writeCtx, job.ID, reason); err != nil {
+			w.log("worker: mark failed for %s: %v", job.ID, err)
+		}
+	}
+
+	switch {
+	case result.Err != nil:
+		w.log("worker: job %s failed: %v", job.ID, result.Err)
+		recordFailure(result.Err.Error())
+	case result.ImageID == "":
+		const msg = "pipeline did not return an imagen.images id (cloud sync misconfigured?)"
+		w.log("worker: job %s: %s", job.ID, msg)
+		recordFailure(msg)
+	default:
+		if err := w.q.MarkDone(writeCtx, job.ID, result.ImageID); err != nil {
+			w.log("worker: mark done for %s: %v", job.ID, err)
+			return
+		}
+		w.log("worker: job %s done image_id=%s", job.ID, result.ImageID)
+	}
+}
+
+// log forwards one status line to the configured Logger; with no Logger set
+// it does nothing.
+func (w *Worker) log(format string, args ...any) {
+	logf := w.cfg.Logger
+	if logf == nil {
+		return
+	}
+	logf(format, args...)
+}
--- a/internal/worker/worker_test.go
+++ b/internal/worker/worker_test.go
@@ -0,0 +1,332 @@
+package worker
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"testing"
+	"time"
+)
+
+// fakeQueue is a hand-rolled in-memory queue that mirrors the contract of a
+// real Postgres-backed implementation: ClaimNextPending atomically takes one
+// pending row and flips its status to "running", MarkDone/MarkFailed
+// overwrite the job's status and recorded value (and bump a counter on every
+// successful call), WaitForJob blocks until notified or until the timeout
+// elapses.
+type fakeQueue struct {
+	// mu is held by the queue methods while they touch pending, state, last,
+	// the injected errors and the counters.
+	mu      sync.Mutex
+	pending []Job
+	state   map[string]string // jobID -> status
+	last    map[string]string // jobID -> error msg or image_id
+	notify  chan struct{}
+
+	// Injected errors: when non-nil, the matching method returns the error.
+	// ClaimNextPending, MarkDone and MarkFailed return it before changing
+	// any state; ResetStaleRunning still counts the call.
+	claimErr error
+	doneErr  error
+	failErr  error
+	resetErr error
+
+	// Counters: claimed/done/failed count successful calls only; resets
+	// counts every ResetStaleRunning call.
+	claimed int
+	done    int
+	failed  int
+	resets  int
+}
+
+// newFakeQueue returns a queue pre-loaded with jobs, in order, each recorded
+// as "pending". The notify channel is buffered (16) so pings sent before the
+// worker waits are not lost.
+func newFakeQueue(jobs ...Job) *fakeQueue {
+	fq := &fakeQueue{
+		state:  map[string]string{},
+		last:   map[string]string{},
+		notify: make(chan struct{}, 16),
+	}
+	fq.pending = append(fq.pending, jobs...)
+	for i := range jobs {
+		fq.state[jobs[i].ID] = "pending"
+	}
+	return fq
+}
+
+// ClaimNextPending pops the oldest pending job, records it as "running" and
+// returns a pointer to a copy. It returns the injected claimErr if one is
+// set, and (nil, nil) when nothing is pending. ctx is ignored.
+func (q *fakeQueue) ClaimNextPending(ctx context.Context) (*Job, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	switch {
+	case q.claimErr != nil:
+		return nil, q.claimErr
+	case len(q.pending) == 0:
+		return nil, nil
+	}
+	head := q.pending[0]
+	q.pending = q.pending[1:]
+	q.state[head.ID] = "running"
+	q.claimed++
+	return &head, nil
+}
+
+// MarkDone records jobID as "done" with imageID as its stored value and
+// bumps the done counter, unless doneErr is injected, in which case nothing
+// is changed and that error is returned. ctx is ignored.
+func (q *fakeQueue) MarkDone(ctx context.Context, jobID, imageID string) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if err := q.doneErr; err != nil {
+		return err
+	}
+	q.done++
+	q.state[jobID], q.last[jobID] = "done", imageID
+	return nil
+}
+
+// MarkFailed records jobID as "failed" with msg as its stored value and
+// bumps the failed counter, unless failErr is injected, in which case
+// nothing is changed and that error is returned. ctx is ignored.
+func (q *fakeQueue) MarkFailed(ctx context.Context, jobID, msg string) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if err := q.failErr; err != nil {
+		return err
+	}
+	q.failed++
+	q.state[jobID], q.last[jobID] = "failed", msg
+	return nil
+}
+
+// WaitForJob blocks until a ping arrives on the notify channel or timeout
+// elapses (both return nil), or until ctx is done (returns ctx.Err()).
+func (q *fakeQueue) WaitForJob(ctx context.Context, timeout time.Duration) error {
+	poll := time.NewTimer(timeout)
+	defer poll.Stop()
+	select {
+	case <-q.notify:
+	case <-poll.C:
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+	return nil
+}
+
+// ResetStaleRunning only counts the call and returns the injected resetErr
+// (nil by default); the fake holds no stale rows to reset. ctx is ignored.
+func (q *fakeQueue) ResetStaleRunning(ctx context.Context) error {
+	q.mu.Lock()
+	q.resets++
+	err := q.resetErr
+	q.mu.Unlock()
+	return err
+}
+
+// pingNotify simulates an INSERT-trigger NOTIFY by waking WaitForJob. The
+// send is non-blocking: if the notify buffer is already full the ping is
+// dropped rather than stalling the caller.
+func (q *fakeQueue) pingNotify() {
+	select {
+	case q.notify <- struct{}{}:
+	default:
+	}
+}
+
+// fakePipeline is a stub Pipeline: Run returns a canned Outcome looked up by
+// job ID, optionally after a delay, and records how it was called.
+type fakePipeline struct {
+	// mu is held by Run while it updates calls/lastJob and reads
+	// delay/results.
+	mu      sync.Mutex
+	results map[string]Outcome // by job.ID; "" key = default outcome
+	// calls is the number of Run invocations so far.
+	calls   int
+	// delay, when > 0, makes Run wait that long (or until its ctx is done)
+	// before returning.
+	delay   time.Duration
+	// lastJob is the job passed to the most recent Run call.
+	lastJob Job
+}
+
+// Run records the call, then returns the canned Outcome for job.ID (falling
+// back to the "" entry). With a positive delay it first waits that long; if
+// ctx finishes during the wait it returns Outcome{Err: ctx.Err()} instead.
+func (p *fakePipeline) Run(ctx context.Context, job Job) Outcome {
+	p.mu.Lock()
+	p.calls++
+	p.lastJob = job
+	wait := p.delay
+	result, found := p.results[job.ID]
+	if !found {
+		result = p.results[""]
+	}
+	p.mu.Unlock()
+
+	if wait <= 0 {
+		return result
+	}
+	timer := time.NewTimer(wait)
+	defer timer.Stop()
+	select {
+	case <-timer.C:
+		return result
+	case <-ctx.Done():
+		return Outcome{Err: ctx.Err()}
+	}
+}
+
+// TestWorker_DonePath runs one job whose pipeline succeeds and checks that
+// the job ends "done" with its image id, that the pipeline ran exactly once
+// and that the startup reset was called exactly once.
+func TestWorker_DonePath(t *testing.T) {
+	queue := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
+	pipe := &fakePipeline{results: map[string]Outcome{"j1": {ImageID: "img-1"}}}
+	w := New(queue, pipe, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})
+
+	// Stop the worker after 80ms; by then the single job is long finished.
+	ctx, cancel := context.WithCancel(context.Background())
+	stop := time.AfterFunc(80*time.Millisecond, cancel)
+	defer stop.Stop()
+
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+
+	status, imageID := queue.state["j1"], queue.last["j1"]
+	switch {
+	case status != "done":
+		t.Fatalf("state=%q want done", status)
+	case imageID != "img-1":
+		t.Fatalf("image_id=%q want img-1", imageID)
+	case queue.done != 1 || queue.failed != 0:
+		t.Fatalf("counts: done=%d failed=%d", queue.done, queue.failed)
+	case pipe.calls != 1:
+		t.Fatalf("pipeline calls=%d want 1", pipe.calls)
+	case queue.resets != 1:
+		t.Fatalf("ResetStaleRunning calls=%d want 1", queue.resets)
+	}
+}
+
+// TestWorker_FailedPath_RecordsErrorText checks that a pipeline error marks
+// the job failed and stores the error's text verbatim, and that the failure
+// does not make Run itself return an error.
+func TestWorker_FailedPath_RecordsErrorText(t *testing.T) {
+	q := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
+	p := &fakePipeline{results: map[string]Outcome{"j1": {Err: errors.New("backend unreachable")}}}
+	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() { time.Sleep(80 * time.Millisecond); cancel() }()
+	// A failed job is a per-row outcome, not a worker failure.
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+
+	if got := q.state["j1"]; got != "failed" {
+		t.Fatalf("state=%q want failed", got)
+	}
+	if got := q.last["j1"]; got != "backend unreachable" {
+		t.Fatalf("error=%q want %q", got, "backend unreachable")
+	}
+	if q.done != 0 || q.failed != 1 {
+		t.Fatalf("counts: done=%d failed=%d", q.done, q.failed)
+	}
+}
+
+// TestWorker_MissingImageID_TreatedAsFailure checks that an Outcome with
+// neither an error nor an image id is recorded as a failed job with a
+// non-empty explanation.
+func TestWorker_MissingImageID_TreatedAsFailure(t *testing.T) {
+	q := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
+	// Outcome has neither Err nor ImageID — pipeline silently swallowed
+	// cloud-sync. flexsiebels needs the image_id; without it, fail the job.
+	p := &fakePipeline{results: map[string]Outcome{"j1": {}}}
+	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() { time.Sleep(80 * time.Millisecond); cancel() }()
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+
+	if got := q.state["j1"]; got != "failed" {
+		t.Fatalf("state=%q want failed", got)
+	}
+	if q.last["j1"] == "" {
+		t.Fatalf("expected non-empty error explanation for missing image_id")
+	}
+}
+
+// TestWorker_DrainsMultipleBeforeWaiting checks that a backlog of three jobs
+// is fully processed in one drain pass: the worker is cancelled after 60ms,
+// well before the 200ms poll interval could trigger a second pass.
+func TestWorker_DrainsMultipleBeforeWaiting(t *testing.T) {
+	q := newFakeQueue(
+		Job{ID: "j1", Backend: "mock"},
+		Job{ID: "j2", Backend: "mock"},
+		Job{ID: "j3", Backend: "mock"},
+	)
+	p := &fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}}
+	w := New(q, p, Config{PollInterval: 200 * time.Millisecond, JobTimeout: time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() { time.Sleep(60 * time.Millisecond); cancel() }()
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+
+	for _, id := range []string{"j1", "j2", "j3"} {
+		if got := q.state[id]; got != "done" {
+			t.Fatalf("%s state=%q want done", id, got)
+		}
+	}
+	if q.done != 3 {
+		t.Fatalf("done=%d want 3", q.done)
+	}
+}
+
+// TestWorker_NotifyWakesEarlierThanPoll checks that a job enqueued while the
+// worker is idle is picked up via the notify wake, long before the 5s poll
+// interval would fire.
+func TestWorker_NotifyWakesEarlierThanPoll(t *testing.T) {
+	q := newFakeQueue()
+	p := &fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}}
+	// Set poll interval high so a working LISTEN is required to see the job
+	// promptly. Without NOTIFY plumbing this test would time out the worker
+	// before drain ever runs.
+	w := New(q, p, Config{PollInterval: 5 * time.Second, JobTimeout: time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	done := make(chan struct{})
+	go func() {
+		// Run's result is irrelevant here; its exit is observed via done.
+		_ = w.Run(ctx)
+		close(done)
+	}()
+
+	// Enqueue only once the worker is idle. If the job were appended right
+	// away, the worker's initial drain would normally claim it and the test
+	// would pass without the wake path being exercised at all. So: wait
+	// until startup has been observed, then give the worker a moment to
+	// finish its empty drain and block in WaitForJob. The pause is a
+	// heuristic — a slow scheduler can only make the test weaker, never
+	// make it fail spuriously.
+	startBy := time.Now().Add(500 * time.Millisecond)
+	for {
+		q.mu.Lock()
+		started := q.resets > 0
+		q.mu.Unlock()
+		if started {
+			break
+		}
+		if time.Now().After(startBy) {
+			t.Fatalf("worker did not start within 500ms")
+		}
+		time.Sleep(time.Millisecond)
+	}
+	time.Sleep(20 * time.Millisecond)
+
+	// Append a job and ping the wake channel.
+	q.mu.Lock()
+	q.pending = append(q.pending, Job{ID: "late", Backend: "mock"})
+	q.state["late"] = "pending"
+	q.mu.Unlock()
+	q.pingNotify()
+
+	// Give the worker a beat to claim + process.
+	deadline := time.Now().Add(500 * time.Millisecond)
+	for time.Now().Before(deadline) {
+		q.mu.Lock()
+		s := q.state["late"]
+		q.mu.Unlock()
+		if s == "done" {
+			cancel()
+			<-done
+			return
+		}
+		time.Sleep(5 * time.Millisecond)
+	}
+	t.Fatalf("worker did not pick up the late job within the 500ms window — NOTIFY wake-up path is broken")
+}
+
+// TestWorker_HonoursContextCancellation verifies that Run on an empty queue
+// returns nil shortly after its context expires (30ms timeout, 200ms budget).
+func TestWorker_HonoursContextCancellation(t *testing.T) {
+	w := New(
+		newFakeQueue(),
+		&fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}},
+		Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second},
+	)
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond)
+	defer cancel()
+
+	began := time.Now()
+	runErr := w.Run(ctx)
+	elapsed := time.Since(began)
+
+	if runErr != nil {
+		t.Fatalf("Run: %v", runErr)
+	}
+	if elapsed > 200*time.Millisecond {
+		t.Fatalf("worker did not exit promptly on ctx cancel: %v", elapsed)
+	}
+}
+
+// TestWorker_InflightJobFinishesAfterShutdown cancels the worker 30ms into a
+// 120ms job and checks that the job still completes and is marked done.
+func TestWorker_InflightJobFinishesAfterShutdown(t *testing.T) {
+	q := newFakeQueue(Job{ID: "long", Backend: "mock"})
+	p := &fakePipeline{
+		results: map[string]Outcome{"long": {ImageID: "img-long"}},
+		delay:   120 * time.Millisecond,
+	}
+	// Short JobTimeout would also kill the in-flight job; give it enough
+	// budget so the test exercises the shutdown-during-job path.
+	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: 5 * time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+	go func() {
+		// Let the job start, then cancel mid-flight.
+		time.Sleep(30 * time.Millisecond)
+		cancel()
+	}()
+	// Shutdown during a job is a clean exit, so Run must still return nil.
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run: %v", err)
+	}
+	if got := q.state["long"]; got != "done" {
+		t.Fatalf("state=%q want done (in-flight job should finish even on shutdown)", got)
+	}
+}
+
+// TestWorker_TransientClaimErrorDoesNotKillLoop starts the worker with a
+// failing claim, clears the failure after 40ms and expects the job to be
+// processed on a later wake, with Run returning nil once cancelled at 200ms.
+func TestWorker_TransientClaimErrorDoesNotKillLoop(t *testing.T) {
+	// The first claims fail; the loop must log and retry on the next wake
+	// rather than propagate the error and exit.
+	queue := newFakeQueue(Job{ID: "j1", Backend: "mock"})
+	queue.claimErr = fmt.Errorf("transient: connection reset")
+	pipe := &fakePipeline{results: map[string]Outcome{"j1": {ImageID: "img"}}}
+	w := New(queue, pipe, Config{PollInterval: 20 * time.Millisecond, JobTimeout: time.Second})
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Heal the claim error after a beat so a later drain succeeds.
+	heal := time.AfterFunc(40*time.Millisecond, func() {
+		queue.mu.Lock()
+		defer queue.mu.Unlock()
+		queue.claimErr = nil
+	})
+	defer heal.Stop()
+	stop := time.AfterFunc(200*time.Millisecond, cancel)
+	defer stop.Stop()
+
+	if err := w.Run(ctx); err != nil {
+		t.Fatalf("Run returned: %v (transient claim errors should not kill the loop)", err)
+	}
+	if status := queue.state["j1"]; status != "done" {
+		t.Fatalf("state=%q want done", status)
+	}
+}