mAi: #8 - imagen.jobs queue + worker subcommand (flexsiebels write path)

Async write path for the flexsiebels owner-mode UI: flexsiebels INSERTs into
imagen.jobs, the worker on mRiver claims pending rows via LISTEN/NOTIFY +
5s safety poll, runs the same generate pipeline imagen generate uses, and
writes the result through internal/cloud into imagen.images.

- Schema migration imagen_jobs_init: table + status CHECK + two indexes +
  owner-scoped RLS + grants + AFTER INSERT trigger publishing on the
  imagen_jobs channel via pg_notify.
- internal/worker: DB-agnostic loop over a Queue interface. Drains the
  whole pending backlog on each wake. Job-scoped contexts are derived
  from Background so SIGTERM lets the in-flight generation finish (no
  half-state). ResetStaleRunning at startup unsticks rows left over from
  a previous crash. Eight unit tests cover the done / failed / missing-id /
  drain / NOTIFY-wake / shutdown / transient-error paths against a fake
  queue (no real Postgres in CI).
- cmd/imagen/worker.go: pgx-backed Queue (one dedicated conn for LISTEN +
  UPDATE), plus the workerPipeline that reuses buildBackend +
  attachUsageSink + prompt.Apply + buildWriter + maybeCloudSync. The
  per-job owner_user_id overrides the env-level fallback so each row in
  imagen.images is attributed correctly.
- maybeCloudSync now returns (*cloud.SyncResult, error) so the worker can
  link imagen.jobs.image_id to the inserted imagen.images row. The CLI
  generate path keeps printing its stderr summary unchanged.
- scripts/imagen-worker.service + .env.example for the systemd --user unit
  on mRiver. EnvironmentFile lives in ~/.dotfiles and is never committed.
- docs/setup-worker-mriver.md walks through installation + the spec's
  SQL-INSERT smoke; docs/architecture.md grows an "async write path"
  section.
- worker_integration_test.go (env-guarded by IMAGEN_WORKER_INTEGRATION=1)
  drives one real job through the full pipeline against msupabase using
  the mock backend, then verifies imagen.images + Storage object landed
  and the row flipped to done with image_id linked. Verified end-to-end:
  pickup latency ~7ms, total 74ms, failure path captures error text.
This commit is contained in:
mAi
2026-05-11 10:23:33 +02:00
parent cb6656c436
commit 2758c5a500
13 changed files with 1205 additions and 27 deletions

213
internal/worker/worker.go Normal file
View File

@@ -0,0 +1,213 @@
// Package worker consumes the imagen.jobs queue. It claims pending rows via
// an UPDATE-returning lock (single source of truth, no double-claim window),
// runs the supplied generation pipeline, then writes status + image_id back.
//
// The package is DB-agnostic: it talks to two small interfaces (Queue +
// Pipeline) so unit tests can drive the claim/transition logic with no real
// Postgres connection. cmd/imagen wires the pgx implementation.
package worker
import (
"context"
"errors"
"fmt"
"sync"
"time"
)
// Job carries the columns of one imagen.jobs row that the worker needs in
// order to run a generation. A NULL column arrives as the Go zero value, and
// the pipeline reads a zero value as "use backend default" (the same
// convention backend.Request follows).
type Job struct {
	// Row identifier and the owner the resulting image is attributed to.
	ID, OwnerUserID string
	// What to generate and which backend/model should do it.
	Prompt, Backend, Model string
	// Generation parameters; zero means "backend default".
	Width, Height, Steps int
	Seed                 int64
	Style                string
}
// Outcome is the per-job result a Pipeline hands back to the worker.
//
// ImageID is the imagen.images.id produced by the cloud-sync step. An empty
// ImageID together with a nil Err means the cloud-sync was skipped (config
// off); the worker records that as a failure because flexsiebels needs the
// image_id to render the result.
type Outcome struct {
	ImageID string // imagen.images.id of the stored image; empty when none was produced
	Err     error  // non-nil when the generation failed
}
// Queue abstracts persistence for the imagen.jobs table.
//
// An implementation only has to be safe for one worker calling it serially.
// Several worker processes claiming concurrently is out of scope for v1,
// although the FOR UPDATE SKIP LOCKED clause in the pgx claim query covers
// that case cheaply anyway.
type Queue interface {
	// ClaimNextPending atomically flips the oldest pending row to 'running'
	// and returns it. An empty queue is reported as (nil, nil).
	ClaimNextPending(ctx context.Context) (*Job, error)

	// MarkDone records a success: status='done', image_id set,
	// completed_at=now().
	MarkDone(ctx context.Context, jobID, imageID string) error

	// MarkFailed records a failure: status='failed', error=errMsg,
	// completed_at=now().
	MarkFailed(ctx context.Context, jobID, errMsg string) error

	// WaitForJob blocks until a NOTIFY arrives on imagen_jobs, the timeout
	// elapses, or ctx is cancelled. Notification and timeout both yield nil;
	// cancellation yields ctx.Err(). Transient connection errors are passed
	// back so the caller can decide whether to reconnect.
	WaitForJob(ctx context.Context, timeout time.Duration) error

	// ResetStaleRunning moves rows stuck in 'running' (for example left
	// behind by a crash of a previous process) back to 'pending'. The worker
	// calls it once at startup so the cold-start safety poll can pick those
	// rows up again.
	ResetStaleRunning(ctx context.Context) error
}
// Pipeline executes a single generation and returns its Outcome: either the
// imagen.images.id of the stored image or an error. Backend dispatch, prompt
// enrichment, the disk write and the cloud-sync all belong to the
// implementation; the worker itself only orchestrates queue state.
type Pipeline interface {
	Run(ctx context.Context, job Job) Outcome
}
// Config holds the runtime settings of the worker loop.
type Config struct {
	// PollInterval is how long the worker waits for a LISTEN wake-up before
	// falling back to a safety poll. Too low wastes DB round trips; too high
	// lets a dropped NOTIFY stall the queue. New substitutes the spec'd 5s
	// default when the value is not positive.
	PollInterval time.Duration

	// JobTimeout bounds a single Pipeline.Run so a hanging backend cannot
	// freeze the queue forever. New substitutes 5 minutes when the value is
	// not positive.
	JobTimeout time.Duration

	// Logger receives one-line status events. A nil Logger keeps the worker
	// silent.
	Logger func(format string, args ...any)
}
// Worker runs the orchestration loop that connects a Queue to a Pipeline.
// A Worker must not be reused across Run calls.
type Worker struct {
	q   Queue    // source of jobs and sink for status updates
	p   Pipeline // performs the actual generation
	cfg Config   // runtime settings, with defaults filled in by New

	// processingMu is held by processOne for the whole lifetime of a job,
	// so that a SIGTERM-triggered shutdown waits for the in-flight job to
	// complete before returning.
	processingMu sync.Mutex
}
// New builds a Worker around the given queue and pipeline. Non-positive
// durations in cfg are replaced by the defaults: a 5s poll interval and a
// 5 minute job timeout.
func New(q Queue, p Pipeline, cfg Config) *Worker {
	const (
		defaultPollInterval = 5 * time.Second
		defaultJobTimeout   = 5 * time.Minute
	)

	w := &Worker{q: q, p: p, cfg: cfg}
	if w.cfg.PollInterval <= 0 {
		w.cfg.PollInterval = defaultPollInterval
	}
	if w.cfg.JobTimeout <= 0 {
		w.cfg.JobTimeout = defaultJobTimeout
	}
	return w
}
// Run drives the consume loop until ctx ends, whether by cancellation or by
// an expired deadline. It always returns nil: queue errors from the startup
// reset, from draining, or from waiting are logged and the loop carries on,
// so a temporary network blip does not take the worker down.
//
// Each iteration drains every pending job and then blocks in WaitForJob
// until a NOTIFY or the safety-poll timeout wakes it up again.
func (w *Worker) Run(ctx context.Context) error {
	if err := w.q.ResetStaleRunning(ctx); err != nil {
		w.log("worker: reset stale running rows: %v", err)
		// Don't return — a stale row will eventually be visible to the poll
		// path once flexsiebels gives up and resubmits, and we'd rather keep
		// serving fresh jobs than crash here.
	}

	// shuttingDown reports whether err merely reflects the end of ctx.
	// Besides context.Canceled it recognises ctx's own termination error,
	// so a deadline-bound ctx shuts down as quietly as a cancelled one
	// instead of being logged as a queue failure.
	shuttingDown := func(err error) bool {
		if errors.Is(err, context.Canceled) {
			return true
		}
		cause := ctx.Err()
		return cause != nil && errors.Is(err, cause)
	}

	for {
		if ctx.Err() != nil {
			return nil
		}

		// Drain the queue: claim and process until empty.
		if err := w.drain(ctx); err != nil && !shuttingDown(err) {
			w.log("worker: drain: %v", err)
		}
		if ctx.Err() != nil {
			return nil
		}

		// Wait for the next wake. WaitForJob covers both LISTEN and the
		// timeout-based poll fallback; either returns nil and we loop.
		err := w.q.WaitForJob(ctx, w.cfg.PollInterval)
		if err == nil {
			continue
		}
		if shuttingDown(err) {
			return nil
		}
		w.log("worker: wait: %v (continuing)", err)

		// Pace the retries so a totally-broken DB doesn't busy-spin.
		select {
		case <-ctx.Done():
			return nil
		case <-time.After(w.cfg.PollInterval):
		}
	}
}
// drain claims and processes jobs one after another until the queue reports
// that nothing is pending. It returns nil once the queue is empty, ctx.Err()
// if ctx ended before the next claim, and a wrapped "claim: ..." error if
// claiming failed. ctx only gates claiming: an already-claimed job is handed
// to processOne, which runs it to completion regardless.
func (w *Worker) drain(ctx context.Context) error {
	for ctx.Err() == nil {
		next, err := w.q.ClaimNextPending(ctx)
		switch {
		case err != nil:
			return fmt.Errorf("claim: %w", err)
		case next == nil:
			// Queue is empty.
			return nil
		}
		w.processOne(*next)
	}
	return ctx.Err()
}
// processOne executes the pipeline for a job that has already been claimed
// and persists the result. Both the pipeline run and the status write use
// contexts derived from context.Background(), not from the caller's ctx, so
// an in-flight job can finish and be recorded even after SIGTERM.
//
// A job is marked failed when the pipeline returns an error or when it
// returns no error but also no image id; otherwise it is marked done.
func (w *Worker) processOne(job Job) {
	w.processingMu.Lock()
	defer w.processingMu.Unlock()

	w.log("worker: processing job %s backend=%s", job.ID, job.Backend)

	runCtx, stopRun := context.WithTimeout(context.Background(), w.cfg.JobTimeout)
	defer stopRun()
	result := w.p.Run(runCtx, job)

	// The status write gets its own short-lived Background context: the
	// outcome must always be recordable, otherwise the row would sit in
	// 'running' forever.
	writeCtx, stopWrite := context.WithTimeout(context.Background(), 30*time.Second)
	defer stopWrite()

	var failure string
	switch {
	case result.Err != nil:
		w.log("worker: job %s failed: %v", job.ID, result.Err)
		failure = result.Err.Error()
	case result.ImageID == "":
		// Success without an imagen.images row is still a failure, because
		// flexsiebels has nothing to link.
		failure = "pipeline did not return an imagen.images id (cloud sync misconfigured?)"
		w.log("worker: job %s: %s", job.ID, failure)
	default:
		if err := w.q.MarkDone(writeCtx, job.ID, result.ImageID); err != nil {
			w.log("worker: mark done for %s: %v", job.ID, err)
			return
		}
		w.log("worker: job %s done image_id=%s", job.ID, result.ImageID)
		return
	}

	if err := w.q.MarkFailed(writeCtx, job.ID, failure); err != nil {
		w.log("worker: mark failed for %s: %v", job.ID, err)
	}
}
// log forwards a formatted status line to the configured Logger and does
// nothing when no Logger is set.
func (w *Worker) log(format string, args ...any) {
	logger := w.cfg.Logger
	if logger == nil {
		return
	}
	logger(format, args...)
}

View File

@@ -0,0 +1,332 @@
package worker
import (
"context"
"errors"
"fmt"
"sync"
"testing"
"time"
)
// fakeQueue is a hand-written in-memory Queue for unit tests. It follows the
// contract expected of a Postgres-backed implementation: ClaimNextPending
// takes one pending job under the lock and flips it to "running",
// MarkDone/MarkFailed record the terminal status, and WaitForJob blocks
// until it is notified or the timeout elapses. The *Err fields inject
// failures; the counters record how often each operation succeeded or ran.
type fakeQueue struct {
	mu      sync.Mutex
	pending []Job
	// state maps jobID -> status; last maps jobID -> error msg or image_id.
	state, last map[string]string
	notify      chan struct{}
	// Errors to return from the corresponding method when non-nil.
	claimErr, doneErr, failErr, resetErr error
	// Call counters.
	claimed, done, failed, resets int
}
// newFakeQueue returns a fakeQueue preloaded with jobs, each registered as
// "pending", and a buffered notify channel.
func newFakeQueue(jobs ...Job) *fakeQueue {
	fq := &fakeQueue{
		pending: append([]Job(nil), jobs...),
		state:   make(map[string]string),
		last:    make(map[string]string),
		notify:  make(chan struct{}, 16),
	}
	for i := range jobs {
		fq.state[jobs[i].ID] = "pending"
	}
	return fq
}
// ClaimNextPending pops the first pending job, marks it "running" and
// returns a pointer to a copy of it. It returns the injected claimErr when
// one is set, and (nil, nil) when nothing is pending.
func (q *fakeQueue) ClaimNextPending(ctx context.Context) (*Job, error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	switch {
	case q.claimErr != nil:
		return nil, q.claimErr
	case len(q.pending) == 0:
		return nil, nil
	}

	head := q.pending[0]
	q.pending = q.pending[1:]
	q.claimed++
	q.state[head.ID] = "running"
	return &head, nil
}
// MarkDone records jobID as "done" with the given image id and bumps the
// done counter, unless doneErr is injected, in which case it returns that
// error and changes nothing.
func (q *fakeQueue) MarkDone(ctx context.Context, jobID, imageID string) error {
	q.mu.Lock()
	defer q.mu.Unlock()

	if err := q.doneErr; err != nil {
		return err
	}
	q.done++
	q.state[jobID], q.last[jobID] = "done", imageID
	return nil
}
// MarkFailed records jobID as "failed" with the given message and bumps the
// failed counter, unless failErr is injected, in which case it returns that
// error and changes nothing.
func (q *fakeQueue) MarkFailed(ctx context.Context, jobID, msg string) error {
	q.mu.Lock()
	defer q.mu.Unlock()

	if err := q.failErr; err != nil {
		return err
	}
	q.failed++
	q.state[jobID], q.last[jobID] = "failed", msg
	return nil
}
// WaitForJob blocks until ctx is done, a notification is pending on the
// notify channel, or timeout elapses. It returns ctx.Err() in the first case
// and nil in the other two.
func (q *fakeQueue) WaitForJob(ctx context.Context, timeout time.Duration) error {
	timer := time.NewTimer(timeout)
	defer timer.Stop()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-q.notify:
	case <-timer.C:
	}
	return nil
}
// ResetStaleRunning only counts the call and returns the injected resetErr
// (nil by default); the fake has no stale rows to reset.
func (q *fakeQueue) ResetStaleRunning(ctx context.Context) error {
	q.mu.Lock()
	q.resets++
	err := q.resetErr
	q.mu.Unlock()
	return err
}
// pingNotify stands in for the INSERT trigger's NOTIFY: it queues one
// wake-up for WaitForJob. The send never blocks; when the channel buffer is
// already full the ping is dropped.
func (q *fakeQueue) pingNotify() {
	select {
	case q.notify <- struct{}{}:
		// Wake-up queued.
	default:
		// Buffer full: wake-ups are already waiting to be consumed.
	}
}
// fakePipeline is a stub Pipeline that returns canned outcomes.
type fakePipeline struct {
	mu sync.Mutex
	// results holds the outcome per job.ID; the "" key is the default
	// outcome used for any job without its own entry.
	results map[string]Outcome
	calls   int           // number of Run invocations
	delay   time.Duration // simulated run time; <= 0 means return at once
	lastJob Job           // job passed to the most recent Run
}
// Run records the call, looks up the canned outcome for job.ID (falling back
// to the "" entry) and returns it, after the configured delay if there is
// one. If ctx ends during the delay, the outcome carries ctx.Err() instead.
func (p *fakePipeline) Run(ctx context.Context, job Job) Outcome {
	p.mu.Lock()
	p.calls++
	p.lastJob = job
	wait := p.delay
	result, found := p.results[job.ID]
	if !found {
		result = p.results[""]
	}
	p.mu.Unlock()

	if wait <= 0 {
		return result
	}
	select {
	case <-ctx.Done():
		return Outcome{Err: ctx.Err()}
	case <-time.After(wait):
		return result
	}
}
// TestWorker_DonePath checks the happy path: a single job is run once, is
// marked done with the image id the pipeline returned, and the startup
// reset is invoked exactly once.
func TestWorker_DonePath(t *testing.T) {
	q := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
	p := &fakePipeline{results: map[string]Outcome{"j1": {ImageID: "img-1"}}}
	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})

	// Stop the worker after it has had time to process the job.
	ctx, cancel := context.WithCancel(context.Background())
	stop := time.AfterFunc(80*time.Millisecond, cancel)
	defer stop.Stop()

	if err := w.Run(ctx); err != nil {
		t.Fatalf("Run: %v", err)
	}

	if status := q.state["j1"]; status != "done" {
		t.Fatalf("state=%q want done", status)
	}
	if imageID := q.last["j1"]; imageID != "img-1" {
		t.Fatalf("image_id=%q want img-1", imageID)
	}
	if q.done != 1 || q.failed != 0 {
		t.Fatalf("counts: done=%d failed=%d", q.done, q.failed)
	}
	if p.calls != 1 {
		t.Fatalf("pipeline calls=%d want 1", p.calls)
	}
	if q.resets != 1 {
		t.Fatalf("ResetStaleRunning calls=%d want 1", q.resets)
	}
}
// TestWorker_FailedPath_RecordsErrorText checks that a pipeline error marks
// the job failed and that the error text is stored verbatim.
func TestWorker_FailedPath_RecordsErrorText(t *testing.T) {
	const wantMsg = "backend unreachable"

	q := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
	p := &fakePipeline{results: map[string]Outcome{"j1": {Err: errors.New(wantMsg)}}}
	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})

	ctx, cancel := context.WithCancel(context.Background())
	stop := time.AfterFunc(80*time.Millisecond, cancel)
	defer stop.Stop()

	// Run's return value is not under test here.
	_ = w.Run(ctx)

	if status := q.state["j1"]; status != "failed" {
		t.Fatalf("state=%q want failed", status)
	}
	if msg := q.last["j1"]; msg != wantMsg {
		t.Fatalf("error=%q want %q", msg, wantMsg)
	}
	if q.done != 0 || q.failed != 1 {
		t.Fatalf("counts: done=%d failed=%d", q.done, q.failed)
	}
}
// TestWorker_MissingImageID_TreatedAsFailure checks that an outcome with
// neither an error nor an image id fails the job with a non-empty
// explanation.
func TestWorker_MissingImageID_TreatedAsFailure(t *testing.T) {
	q := newFakeQueue(Job{ID: "j1", Prompt: "a", Backend: "mock"})
	// The empty Outcome models a pipeline that silently skipped cloud-sync.
	// flexsiebels needs the image_id, so the worker must fail the job.
	var silent Outcome
	p := &fakePipeline{results: map[string]Outcome{"j1": silent}}
	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})

	ctx, cancel := context.WithCancel(context.Background())
	stop := time.AfterFunc(80*time.Millisecond, cancel)
	defer stop.Stop()

	// Run's return value is not under test here.
	_ = w.Run(ctx)

	if status := q.state["j1"]; status != "failed" {
		t.Fatalf("state=%q want failed", status)
	}
	if explanation := q.last["j1"]; explanation == "" {
		t.Fatalf("expected non-empty error explanation for missing image_id")
	}
}
// TestWorker_DrainsMultipleBeforeWaiting checks that a backlog of three jobs
// is fully processed even though the worker is stopped well before the first
// poll interval (200ms) could elapse.
func TestWorker_DrainsMultipleBeforeWaiting(t *testing.T) {
	ids := []string{"j1", "j2", "j3"}
	q := newFakeQueue(
		Job{ID: ids[0], Backend: "mock"},
		Job{ID: ids[1], Backend: "mock"},
		Job{ID: ids[2], Backend: "mock"},
	)
	p := &fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}}
	w := New(q, p, Config{PollInterval: 200 * time.Millisecond, JobTimeout: time.Second})

	ctx, cancel := context.WithCancel(context.Background())
	stop := time.AfterFunc(60*time.Millisecond, cancel)
	defer stop.Stop()

	// Run's return value is not under test here.
	_ = w.Run(ctx)

	for _, id := range ids {
		if status := q.state[id]; status != "done" {
			t.Fatalf("%s state=%q want done", id, status)
		}
	}
	if q.done != 3 {
		t.Fatalf("done=%d want 3", q.done)
	}
}
// TestWorker_NotifyWakesEarlierThanPoll checks that a notification wakes the
// worker long before the poll interval would: a job enqueued while the
// worker is idle must be done within 500ms even though the poll interval is
// 5s.
func TestWorker_NotifyWakesEarlierThanPoll(t *testing.T) {
	q := newFakeQueue()
	p := &fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}}
	// Set poll interval high so a working LISTEN is required to see the job
	// promptly: without the wake-up, the job would sit until the 5s poll,
	// far beyond the 500ms window below.
	w := New(q, p, Config{PollInterval: 5 * time.Second, JobTimeout: time.Second})

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan struct{})
	go func() {
		defer close(done)
		_ = w.Run(ctx)
	}()
	// Always stop the worker and wait for it to exit, on the failure path
	// too (t.Fatalf still runs deferred calls), so no goroutine outlives
	// the test.
	defer func() {
		cancel()
		<-done
	}()

	// Give the worker time to finish its initial, empty drain and park in
	// WaitForJob. Enqueueing right away would let that first drain claim
	// the job, and the test would pass even with a broken wake-up path.
	// This is best-effort: on a very slow machine the initial drain may
	// still win, which weakens the check but never fails it spuriously.
	time.Sleep(50 * time.Millisecond)

	// Append a job and ping the wake channel.
	q.mu.Lock()
	q.pending = append(q.pending, Job{ID: "late", Backend: "mock"})
	q.state["late"] = "pending"
	q.mu.Unlock()
	q.pingNotify()

	// Give the worker a beat to claim + process.
	deadline := time.Now().Add(500 * time.Millisecond)
	for time.Now().Before(deadline) {
		q.mu.Lock()
		status := q.state["late"]
		q.mu.Unlock()
		if status == "done" {
			return
		}
		time.Sleep(5 * time.Millisecond)
	}
	t.Fatalf("worker did not pick up the late job within the 500ms window — NOTIFY wake-up path is broken")
}
// TestWorker_HonoursContextCancellation checks that Run on an empty queue
// returns nil shortly after its context expires (30ms timeout, 200ms
// allowance).
func TestWorker_HonoursContextCancellation(t *testing.T) {
	q := newFakeQueue()
	p := &fakePipeline{results: map[string]Outcome{"": {ImageID: "img"}}}
	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: time.Second})

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond)
	defer cancel()

	began := time.Now()
	err := w.Run(ctx)
	elapsed := time.Since(began)

	if err != nil {
		t.Fatalf("Run: %v", err)
	}
	if elapsed > 200*time.Millisecond {
		t.Fatalf("worker did not exit promptly on ctx cancel: %v", elapsed)
	}
}
// TestWorker_InflightJobFinishesAfterShutdown checks that a job already
// running when the worker's context is cancelled still completes and is
// marked done.
func TestWorker_InflightJobFinishesAfterShutdown(t *testing.T) {
	q := newFakeQueue(Job{ID: "long", Backend: "mock"})
	p := &fakePipeline{
		results: map[string]Outcome{"long": {ImageID: "img-long"}},
		delay:   120 * time.Millisecond,
	}
	// A short JobTimeout would also kill the in-flight job, so the budget is
	// generous: only the shutdown-during-job path is being exercised.
	w := New(q, p, Config{PollInterval: 10 * time.Millisecond, JobTimeout: 5 * time.Second})

	// Cancel 30ms in: after the 120ms job has started, before it finishes.
	ctx, cancel := context.WithCancel(context.Background())
	stop := time.AfterFunc(30*time.Millisecond, cancel)
	defer stop.Stop()

	// Run's return value is not under test here.
	_ = w.Run(ctx)

	if status := q.state["long"]; status != "done" {
		t.Fatalf("state=%q want done (in-flight job should finish even on shutdown)", status)
	}
}
// TestWorker_TransientClaimErrorDoesNotKillLoop checks that a failing claim
// is survived: the loop must keep running and process the job on a later
// wake once the claim error has cleared, rather than propagate the error
// and exit.
func TestWorker_TransientClaimErrorDoesNotKillLoop(t *testing.T) {
	q := newFakeQueue(Job{ID: "j1", Backend: "mock"})
	q.claimErr = fmt.Errorf("transient: connection reset")
	p := &fakePipeline{results: map[string]Outcome{"j1": {ImageID: "img"}}}
	w := New(q, p, Config{PollInterval: 20 * time.Millisecond, JobTimeout: time.Second})

	ctx, cancel := context.WithCancel(context.Background())

	// Heal the claim error after a beat so a later drain succeeds.
	heal := time.AfterFunc(40*time.Millisecond, func() {
		q.mu.Lock()
		defer q.mu.Unlock()
		q.claimErr = nil
	})
	defer heal.Stop()

	// Shut the worker down once it has had time to retry.
	stop := time.AfterFunc(200*time.Millisecond, cancel)
	defer stop.Stop()

	if err := w.Run(ctx); err != nil {
		t.Fatalf("Run returned: %v (transient claim errors should not kill the loop)", err)
	}
	if status := q.state["j1"]; status != "done" {
		t.Fatalf("state=%q want done", status)
	}
}