Schema (applied via migration imagen_series_init):
- imagen.series parent table (prompt + params + count CHECK 1..10 + selected_image_id)
- imagen.jobs += series_id (FK) + series_idx
- imagen.images += series_id (FK)
- Owner-scoped RLS on series (SELECT/INSERT/UPDATE) + grants
- Partial indexes WHERE series_id IS NOT NULL on both child tables

Worker pipeline:
- worker.Job += SeriesID, populated from imagen.jobs.series_id via the claim query.
- cloud.SyncRequest += SeriesID; insertRow writes series_id when non-empty, omits the key when empty so solo runs leave the column NULL.
- maybeCloudSync threads seriesID from job.SeriesID through to the cloud sink. generate.go (CLI) always passes "" — solo path unchanged.

Tests:
- worker: SeriesID propagates from Job to fakePipeline.lastJob unchanged; a solo job keeps it empty.
- cloud: SyncRequest.SeriesID lands as row.series_id in the POST body; empty SeriesID omits the key entirely.

Refs ImaGen#9.
218 lines
7.7 KiB
Go
218 lines
7.7 KiB
Go
// Package worker consumes the imagen.jobs queue. It claims pending rows via
// an UPDATE ... RETURNING lock (single source of truth, no double-claim
// window), runs the supplied generation pipeline, then writes status and
// image_id back.
//
// The package is DB-agnostic: it talks to two small interfaces (Queue and
// Pipeline) so unit tests can drive the claim/transition logic with no real
// Postgres connection. cmd/imagen wires the pgx implementation.
|
|
package worker
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Job carries the columns of one imagen.jobs row that the worker hands to
// the pipeline. A NULL column is stored as the Go zero value, which the
// pipeline interprets as "use backend default" (the same convention
// backend.Request follows).
type Job struct {
	// Row id and the id of the owning user.
	ID, OwnerUserID string

	// Generation parameters; zero values defer to the backend default.
	Prompt, Backend, Model string
	Width, Height, Steps   int
	Seed                   int64
	Style                  string

	// SeriesID names the parent imagen.series row when this job is one of N
	// tries in a batch. It is empty for a solo run, in which case the
	// pipeline must not put a series_id on the resulting imagen.images row.
	SeriesID string
}
|
|
|
|
// Outcome is the per-job result a Pipeline hands back to the worker.
//
// ImageID is the imagen.images.id produced by the cloud-sync. An empty
// ImageID together with a nil Err means the cloud-sync was skipped (config
// off); the worker still treats that as a failure, because flexsiebels needs
// the image id to render the result.
type Outcome struct {
	ImageID string
	Err     error
}
|
|
|
|
// Queue is the persistence layer for the imagen.jobs table. Implementations
|
|
// must be safe for serialised single-worker use (concurrent claim across
|
|
// multiple worker processes is out of scope for v1 — the FOR UPDATE SKIP
|
|
// LOCKED clause in the pgx claim query covers it cheaply anyway).
|
|
type Queue interface {
|
|
// ClaimNextPending atomically marks the oldest pending row 'running' and
|
|
// returns it. Returns (nil, nil) when the queue is empty.
|
|
ClaimNextPending(ctx context.Context) (*Job, error)
|
|
// MarkDone records success: status='done', image_id, completed_at=now().
|
|
MarkDone(ctx context.Context, jobID, imageID string) error
|
|
// MarkFailed records failure: status='failed', error=msg, completed_at=now().
|
|
MarkFailed(ctx context.Context, jobID, errMsg string) error
|
|
// WaitForJob blocks until either a NOTIFY arrives on imagen_jobs, the
|
|
// timeout expires, or ctx is cancelled. Returns nil on notification or
|
|
// timeout; returns ctx.Err() on cancellation. Transient connection errors
|
|
// are returned so the caller can decide to reconnect.
|
|
WaitForJob(ctx context.Context, timeout time.Duration) error
|
|
// ResetStaleRunning marks any rows stuck in 'running' (e.g. left over
|
|
// from a crash before this process started) back to 'pending'. Called
|
|
// once at worker startup so the cold-start safety poll can pick them up.
|
|
ResetStaleRunning(ctx context.Context) error
|
|
}
|
|
|
|
// Pipeline runs one generation and reports back the imagen.images.id (or an
|
|
// error). The implementation owns backend dispatch, prompt enrichment, disk
|
|
// write, and cloud-sync; the worker only orchestrates queue state.
|
|
type Pipeline interface {
|
|
Run(ctx context.Context, job Job) Outcome
|
|
}
|
|
|
|
// Config holds the runtime knobs of the worker loop.
type Config struct {
	// PollInterval is how often the safety poll runs between LISTEN
	// wakeups. Too low wastes DB roundtrips; too high lets a dropped NOTIFY
	// stall the queue. New substitutes 5s (the spec'd default) for any value
	// <= 0.
	PollInterval time.Duration

	// JobTimeout bounds a single Pipeline.Run so that a hanging backend
	// cannot freeze the queue forever. New substitutes 5 minutes for any
	// value <= 0.
	JobTimeout time.Duration

	// Logger receives one-line status events. A nil Logger keeps the worker
	// silent.
	Logger func(format string, args ...any)
}
|
|
|
|
// Worker is the orchestration loop. It is not reusable across Run calls.
|
|
type Worker struct {
|
|
q Queue
|
|
p Pipeline
|
|
cfg Config
|
|
|
|
// processingMu guards the in-flight job so SIGTERM-triggered shutdown
|
|
// waits for it to complete before returning.
|
|
processingMu sync.Mutex
|
|
}
|
|
|
|
// New constructs a Worker.
|
|
func New(q Queue, p Pipeline, cfg Config) *Worker {
|
|
if cfg.PollInterval <= 0 {
|
|
cfg.PollInterval = 5 * time.Second
|
|
}
|
|
if cfg.JobTimeout <= 0 {
|
|
cfg.JobTimeout = 5 * time.Minute
|
|
}
|
|
return &Worker{q: q, p: p, cfg: cfg}
|
|
}
|
|
|
|
// Run drives the consume loop until ctx is cancelled or a fatal queue error
|
|
// (e.g. unrecoverable DB drop) is returned. A LISTEN wait can fail with a
|
|
// transient transport error; the worker logs and continues so a temporary
|
|
// network blip doesn't take it down.
|
|
func (w *Worker) Run(ctx context.Context) error {
|
|
if err := w.q.ResetStaleRunning(ctx); err != nil {
|
|
w.log("worker: reset stale running rows: %v", err)
|
|
// Don't return — a stale row will eventually be visible to the poll
|
|
// path once flexsiebels gives up and resubmits, and we'd rather keep
|
|
// serving fresh jobs than crash here.
|
|
}
|
|
for {
|
|
if err := ctx.Err(); err != nil {
|
|
return nil
|
|
}
|
|
// Drain the queue: claim and process until empty.
|
|
if err := w.drain(ctx); err != nil && !errors.Is(err, context.Canceled) {
|
|
w.log("worker: drain: %v", err)
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return nil
|
|
}
|
|
// Wait for the next wake. WaitForJob covers both LISTEN and the
|
|
// timeout-based poll fallback; either returns nil and we loop.
|
|
if err := w.q.WaitForJob(ctx, w.cfg.PollInterval); err != nil {
|
|
if errors.Is(err, context.Canceled) {
|
|
return nil
|
|
}
|
|
w.log("worker: wait: %v (continuing)", err)
|
|
// Pace the retries so a totally-broken DB doesn't busy-spin.
|
|
select {
|
|
case <-ctx.Done():
|
|
return nil
|
|
case <-time.After(w.cfg.PollInterval):
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// drain claims and processes every currently-pending job. The job-scoped
|
|
// context is derived from context.Background() so that a SIGTERM mid-job
|
|
// still lets the pipeline finish — that's the "no half-state on shutdown"
|
|
// guarantee the issue calls for.
|
|
func (w *Worker) drain(ctx context.Context) error {
|
|
for {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
job, err := w.q.ClaimNextPending(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("claim: %w", err)
|
|
}
|
|
if job == nil {
|
|
return nil
|
|
}
|
|
w.processOne(*job)
|
|
}
|
|
}
|
|
|
|
// processOne runs the pipeline for one already-claimed job and writes the
|
|
// outcome back to the queue. The job context is independent of the outer
|
|
// ctx so an in-flight job can finish even after SIGTERM.
|
|
func (w *Worker) processOne(job Job) {
|
|
w.processingMu.Lock()
|
|
defer w.processingMu.Unlock()
|
|
|
|
w.log("worker: processing job %s backend=%s", job.ID, job.Backend)
|
|
jobCtx, cancel := context.WithTimeout(context.Background(), w.cfg.JobTimeout)
|
|
defer cancel()
|
|
out := w.p.Run(jobCtx, job)
|
|
|
|
// Status-update uses Background ctx with a short timeout — we must
|
|
// always be able to record the outcome, otherwise the row sits in
|
|
// 'running' forever.
|
|
updCtx, updCancel := context.WithTimeout(context.Background(), 30*time.Second)
|
|
defer updCancel()
|
|
if out.Err != nil {
|
|
w.log("worker: job %s failed: %v", job.ID, out.Err)
|
|
if err := w.q.MarkFailed(updCtx, job.ID, out.Err.Error()); err != nil {
|
|
w.log("worker: mark failed for %s: %v", job.ID, err)
|
|
}
|
|
return
|
|
}
|
|
if out.ImageID == "" {
|
|
// Pipeline reported success but no imagen.images row — treat as
|
|
// failure because flexsiebels has nothing to link.
|
|
const msg = "pipeline did not return an imagen.images id (cloud sync misconfigured?)"
|
|
w.log("worker: job %s: %s", job.ID, msg)
|
|
if err := w.q.MarkFailed(updCtx, job.ID, msg); err != nil {
|
|
w.log("worker: mark failed for %s: %v", job.ID, err)
|
|
}
|
|
return
|
|
}
|
|
if err := w.q.MarkDone(updCtx, job.ID, out.ImageID); err != nil {
|
|
w.log("worker: mark done for %s: %v", job.ID, err)
|
|
return
|
|
}
|
|
w.log("worker: job %s done image_id=%s", job.ID, out.ImageID)
|
|
}
|
|
|
|
func (w *Worker) log(format string, args ...any) {
|
|
if w.cfg.Logger != nil {
|
|
w.cfg.Logger(format, args...)
|
|
}
|
|
}
|