// Package worker consumes the imagen.jobs queue. It claims pending rows via
// an UPDATE-returning lock (single source of truth, no double-claim window),
// runs the supplied generation pipeline, then writes status + image_id back.
//
// The package is DB-agnostic: it talks to two small interfaces (Queue +
// Pipeline) so unit tests can drive the claim/transition logic with no real
// Postgres connection. cmd/imagen wires the pgx implementation.
package worker

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"
)

// Job is the slice of an imagen.jobs row the worker needs to drive a
// generation. Null columns from the DB are represented as zero values; the
// pipeline treats zero values as "use backend default" (same convention as
// backend.Request).
type Job struct {
	// ID identifies the row; the worker passes it to MarkDone / MarkFailed.
	ID          string
	OwnerUserID string
	Prompt      string
	// Backend is included in the worker's per-job log line.
	Backend string
	Model   string
	Width   int
	Height  int
	Steps   int
	Seed    int64
	Style   string
}

// Outcome is what the pipeline reports back per job. ImageID is the
// imagen.images.id the cloud-sync produced. Empty ImageID with nil Err means
// the cloud-sync was skipped (config off) — the worker treats that as a
// failure, since flexsiebels needs the image_id to render the result.
type Outcome struct {
	// ImageID is written back via Queue.MarkDone when Err is nil and the ID
	// is non-empty.
	ImageID string
	// Err, when non-nil, marks the job failed; its message is stored via
	// Queue.MarkFailed.
	Err error
}

// Queue is the persistence layer for the imagen.jobs table. Implementations
// must be safe for serialised single-worker use (concurrent claim across
// multiple worker processes is out of scope for v1 — the FOR UPDATE SKIP
// LOCKED clause in the pgx claim query covers it cheaply anyway).
type Queue interface {
	// ClaimNextPending atomically marks the oldest pending row 'running' and
	// returns it. Returns (nil, nil) when the queue is empty.
	ClaimNextPending(ctx context.Context) (*Job, error)

	// MarkDone records success: status='done', image_id, completed_at=now().
	MarkDone(ctx context.Context, jobID, imageID string) error

	// MarkFailed records failure: status='failed', error=msg, completed_at=now().
	MarkFailed(ctx context.Context, jobID, errMsg string) error

	// WaitForJob blocks until either a NOTIFY arrives on imagen_jobs, the
	// timeout expires, or ctx is cancelled. Returns nil on notification or
	// timeout; returns ctx.Err() on cancellation. Transient connection errors
	// are returned so the caller can decide to reconnect.
	WaitForJob(ctx context.Context, timeout time.Duration) error

	// ResetStaleRunning marks any rows stuck in 'running' (e.g. left over
	// from a crash before this process started) back to 'pending'. Called
	// once at worker startup so the cold-start safety poll can pick them up.
	ResetStaleRunning(ctx context.Context) error
}

// Pipeline runs one generation and reports back the imagen.images.id (or an
// error). The implementation owns backend dispatch, prompt enrichment, disk
// write, and cloud-sync; the worker only orchestrates queue state.
type Pipeline interface {
	// Run executes a single job. The worker passes a context bounded by
	// Config.JobTimeout.
	Run(ctx context.Context, job Job) Outcome
}

// Config is the runtime knob set for the worker loop.
type Config struct {
	// PollInterval is the safety-poll cadence between LISTEN wakeups. Picking
	// this too low wastes DB roundtrips; too high lets a dropped NOTIFY
	// stall the queue. 5s is the spec'd default; New applies it when the
	// value is zero or negative.
	PollInterval time.Duration

	// JobTimeout caps any single Pipeline.Run. A backend hang shouldn't
	// freeze the queue forever. New defaults it to 5 minutes when the value
	// is zero or negative.
	JobTimeout time.Duration

	// Logger receives one-line status events. nil means silent.
	Logger func(format string, args ...any)
}

// Worker is the orchestration loop. It is not reusable across Run calls.
// Construct it with New so the Config defaults are applied.
type Worker struct {
	q   Queue
	p   Pipeline
	cfg Config

	// processingMu guards the in-flight job so SIGTERM-triggered shutdown
	// waits for it to complete before returning. processOne holds it for the
	// full duration of a job.
	// NOTE(review): nothing else in this file acquires it — confirm whether a
	// shutdown path elsewhere relies on it.
	processingMu sync.Mutex
}

// New constructs a Worker. A non-positive PollInterval defaults to 5s and a
// non-positive JobTimeout to 5 minutes; cfg is taken by value, so the
// caller's Config is not modified.
func New(q Queue, p Pipeline, cfg Config) *Worker { if cfg.PollInterval <= 0 { cfg.PollInterval = 5 * time.Second } if cfg.JobTimeout <= 0 { cfg.JobTimeout = 5 * time.Minute } return &Worker{q: q, p: p, cfg: cfg} } // Run drives the consume loop until ctx is cancelled or a fatal queue error // (e.g. unrecoverable DB drop) is returned. A LISTEN wait can fail with a // transient transport error; the worker logs and continues so a temporary // network blip doesn't take it down. func (w *Worker) Run(ctx context.Context) error { if err := w.q.ResetStaleRunning(ctx); err != nil { w.log("worker: reset stale running rows: %v", err) // Don't return — a stale row will eventually be visible to the poll // path once flexsiebels gives up and resubmits, and we'd rather keep // serving fresh jobs than crash here. } for { if err := ctx.Err(); err != nil { return nil } // Drain the queue: claim and process until empty. if err := w.drain(ctx); err != nil && !errors.Is(err, context.Canceled) { w.log("worker: drain: %v", err) } if err := ctx.Err(); err != nil { return nil } // Wait for the next wake. WaitForJob covers both LISTEN and the // timeout-based poll fallback; either returns nil and we loop. if err := w.q.WaitForJob(ctx, w.cfg.PollInterval); err != nil { if errors.Is(err, context.Canceled) { return nil } w.log("worker: wait: %v (continuing)", err) // Pace the retries so a totally-broken DB doesn't busy-spin. select { case <-ctx.Done(): return nil case <-time.After(w.cfg.PollInterval): } } } } // drain claims and processes every currently-pending job. The job-scoped // context is derived from context.Background() so that a SIGTERM mid-job // still lets the pipeline finish — that's the "no half-state on shutdown" // guarantee the issue calls for. 
func (w *Worker) drain(ctx context.Context) error { for { if err := ctx.Err(); err != nil { return err } job, err := w.q.ClaimNextPending(ctx) if err != nil { return fmt.Errorf("claim: %w", err) } if job == nil { return nil } w.processOne(*job) } } // processOne runs the pipeline for one already-claimed job and writes the // outcome back to the queue. The job context is independent of the outer // ctx so an in-flight job can finish even after SIGTERM. func (w *Worker) processOne(job Job) { w.processingMu.Lock() defer w.processingMu.Unlock() w.log("worker: processing job %s backend=%s", job.ID, job.Backend) jobCtx, cancel := context.WithTimeout(context.Background(), w.cfg.JobTimeout) defer cancel() out := w.p.Run(jobCtx, job) // Status-update uses Background ctx with a short timeout — we must // always be able to record the outcome, otherwise the row sits in // 'running' forever. updCtx, updCancel := context.WithTimeout(context.Background(), 30*time.Second) defer updCancel() if out.Err != nil { w.log("worker: job %s failed: %v", job.ID, out.Err) if err := w.q.MarkFailed(updCtx, job.ID, out.Err.Error()); err != nil { w.log("worker: mark failed for %s: %v", job.ID, err) } return } if out.ImageID == "" { // Pipeline reported success but no imagen.images row — treat as // failure because flexsiebels has nothing to link. const msg = "pipeline did not return an imagen.images id (cloud sync misconfigured?)" w.log("worker: job %s: %s", job.ID, msg) if err := w.q.MarkFailed(updCtx, job.ID, msg); err != nil { w.log("worker: mark failed for %s: %v", job.ID, err) } return } if err := w.q.MarkDone(updCtx, job.ID, out.ImageID); err != nil { w.log("worker: mark done for %s: %v", job.ID, err) return } w.log("worker: job %s done image_id=%s", job.ID, out.ImageID) } func (w *Worker) log(format string, args ...any) { if w.cfg.Logger != nil { w.cfg.Logger(format, args...) } }