// ImaGen/internal/worker/worker.go

// Package worker consumes the imagen.jobs queue. It claims pending rows via
// an UPDATE-returning lock (single source of truth, no double-claim window),
// runs the supplied generation pipeline, then writes status + image_id back.
//
// The package is DB-agnostic: it talks to two small interfaces (Queue +
// Pipeline) so unit tests can drive the claim/transition logic with no real
// Postgres connection. cmd/imagen wires the pgx implementation.
package worker

import (
	"context"
	"errors"
	"fmt"
	"sync"
	"time"
)

// Job is the slice of an imagen.jobs row the worker needs to drive a
// generation. Null columns from the DB are represented as zero values; the
// pipeline treats zero values as "use backend default" (same convention as
// backend.Request).
//
// Inside this package only ID and Backend are read directly (status writes
// and log lines); the value as a whole is handed to Pipeline.Run unchanged.
type Job struct {
	// ID is the job identifier the worker passes to Queue.MarkDone and
	// Queue.MarkFailed when it records the outcome.
	ID          string
	OwnerUserID string
	Prompt      string
	Backend     string
	Model       string
	Width       int
	Height      int
	Steps       int
	Seed        int64
	Style       string
	// SeriesID is the parent imagen.series row when this job is one of N
	// tries in a batch. Empty means a solo run — the pipeline must not
	// propagate a series_id onto the resulting imagen.images row.
	SeriesID string
}

// Outcome is what the pipeline reports back per job. ImageID is the
// imagen.images.id the cloud-sync produced. Empty ImageID with nil Err means
// the cloud-sync was skipped (config off) — we treat that as a failure for
// the worker since flexsiebels needs the image_id to render the result.
//
// The worker checks Err first: a non-nil Err marks the job failed with
// Err.Error() as the stored message, regardless of ImageID. Only a nil Err
// together with a non-empty ImageID is recorded as done.
type Outcome struct {
	ImageID string
	Err     error
}

// Queue is the persistence layer for the imagen.jobs table. Implementations
// must be safe for serialised single-worker use (concurrent claim across
// multiple worker processes is out of scope for v1 — the FOR UPDATE SKIP
// LOCKED clause in the pgx claim query covers it cheaply anyway).
//
// Call pattern from the worker: ResetStaleRunning once at the start of Run,
// then alternating rounds of ClaimNextPending (until it reports an empty
// queue) and WaitForJob. MarkDone and MarkFailed are invoked with a context
// derived from context.Background(), not the context passed to Run, so they
// still execute while the worker is shutting down.
type Queue interface {
	// ClaimNextPending atomically marks the oldest pending row 'running' and
	// returns it. Returns (nil, nil) when the queue is empty. The worker
	// stops its current drain pass on a nil job or on any error.
	ClaimNextPending(ctx context.Context) (*Job, error)
	// MarkDone records success: status='done', image_id, completed_at=now().
	// A returned error is only logged by the worker; it is not retried.
	MarkDone(ctx context.Context, jobID, imageID string) error
	// MarkFailed records failure: status='failed', error=msg, completed_at=now().
	// A returned error is only logged by the worker; it is not retried.
	MarkFailed(ctx context.Context, jobID, errMsg string) error
	// WaitForJob blocks until either a NOTIFY arrives on imagen_jobs, the
	// timeout expires, or ctx is cancelled. Returns nil on notification or
	// timeout; returns ctx.Err() on cancellation. Transient connection errors
	// are returned so the caller can decide to reconnect.
	WaitForJob(ctx context.Context, timeout time.Duration) error
	// ResetStaleRunning marks any rows stuck in 'running' (e.g. left over
	// from a crash before this process started) back to 'pending'. Called
	// once at worker startup so the cold-start safety poll can pick them up.
	// A failure here is logged by Run but does not stop the worker.
	ResetStaleRunning(ctx context.Context) error
}

// Pipeline runs one generation and reports back the imagen.images.id (or an
// error). The implementation owns backend dispatch, prompt enrichment, disk
// write, and cloud-sync; the worker only orchestrates queue state.
//
// The ctx handed to Run is built from context.Background() with a deadline
// of Config.JobTimeout, so cancelling the worker's own context does not
// cancel an in-flight generation; only the timeout does.
type Pipeline interface {
	// Run executes the generation for job and returns its Outcome. The
	// worker records a non-nil Outcome.Err or an empty Outcome.ImageID as a
	// failed job.
	Run(ctx context.Context, job Job) Outcome
}

// Config is the runtime knob set for the worker loop. New copies the value
// and replaces non-positive durations with the defaults noted per field.
type Config struct {
	// PollInterval is the safety-poll cadence between LISTEN wakeups. Picking
	// this too low wastes DB roundtrips; too high lets a dropped NOTIFY
	// stall the queue. 5s is the spec'd default, applied by New when the
	// value is zero or negative. The same interval paces retries after a
	// failed WaitForJob.
	PollInterval time.Duration
	// JobTimeout caps any single Pipeline.Run. A backend hang shouldn't
	// freeze the queue forever. New defaults it to 5 minutes when the value
	// is zero or negative.
	JobTimeout time.Duration
	// Logger receives one-line status events. nil means silent.
	Logger func(format string, args ...any)
}

// Worker is the orchestration loop. It is not reusable across Run calls.
// Build it with New so the Config defaults are filled in; because it embeds a
// mutex it must be used through a pointer and never copied.
type Worker struct {
	q   Queue    // job persistence: claim, wait, status writes
	p   Pipeline // runs the actual generation for a claimed job
	cfg Config   // poll interval, job timeout and logger

	// processingMu is held by processOne for the whole lifetime of a job
	// (pipeline run plus the status write). Code that must not proceed while
	// a job is in flight — such as a shutdown path — can acquire it to wait
	// for that job to finish.
	// NOTE(review): nothing else in this file takes the lock; Run already
	// waits for the in-flight job because processOne is called synchronously.
	processingMu sync.Mutex
}

// New builds a Worker around the given queue and pipeline. cfg is copied;
// a zero or negative PollInterval becomes 5 seconds and a zero or negative
// JobTimeout becomes 5 minutes. The caller's Config value is not modified.
func New(q Queue, p Pipeline, cfg Config) *Worker {
	const (
		defaultPollInterval = 5 * time.Second
		defaultJobTimeout   = 5 * time.Minute
	)

	w := &Worker{q: q, p: p, cfg: cfg}
	if w.cfg.PollInterval <= 0 {
		w.cfg.PollInterval = defaultPollInterval
	}
	if w.cfg.JobTimeout <= 0 {
		w.cfg.JobTimeout = defaultJobTimeout
	}
	return w
}

// Run drives the consume loop until ctx is done (cancelled or past its
// deadline). It always returns nil: queue errors are logged and retried
// rather than returned, so a temporary network blip doesn't take the worker
// down. The error result is kept so callers can treat Run like any other
// long-running task.
//
// Shutdown is decided solely by ctx's own state, never by the identity of an
// error coming back from the queue. An error that merely wraps
// context.Canceled while ctx is still live is handled like any other
// transient failure: logged, then retried after PollInterval.
func (w *Worker) Run(ctx context.Context) error {
	if err := w.q.ResetStaleRunning(ctx); err != nil {
		w.log("worker: reset stale running rows: %v", err)
		// Don't return — a stale row will eventually be visible to the poll
		// path once flexsiebels gives up and resubmits, and we'd rather keep
		// serving fresh jobs than crash here.
	}
	for {
		if ctx.Err() != nil {
			return nil
		}
		// Drain the queue: claim and process until empty. An error that is
		// just our own ctx ending is expected during shutdown and not worth
		// a log line; everything else is.
		if err := w.drain(ctx); err != nil {
			if cerr := ctx.Err(); cerr == nil || !errors.Is(err, cerr) {
				w.log("worker: drain: %v", err)
			}
		}
		if ctx.Err() != nil {
			return nil
		}
		// Wait for the next wake. WaitForJob covers both LISTEN and the
		// timeout-based poll fallback; either returns nil and we loop.
		err := w.q.WaitForJob(ctx, w.cfg.PollInterval)
		if err == nil {
			continue
		}
		if ctx.Err() != nil {
			// The wait failed because we are shutting down.
			return nil
		}
		w.log("worker: wait: %v (continuing)", err)
		// Pace the retries so a totally-broken DB doesn't busy-spin.
		select {
		case <-ctx.Done():
			return nil
		case <-time.After(w.cfg.PollInterval):
		}
	}
}

// drain empties the queue: it keeps claiming the next pending job and
// handing it to processOne until the queue reports nothing left.
//
// It returns nil once ClaimNextPending yields no job, ctx.Err() if ctx is
// already done before a claim attempt, and the claim error wrapped as
// "claim: …" otherwise. ctx only gates claiming; processOne builds its own
// job context from context.Background(), so a job that has already been
// claimed runs to completion even if ctx is cancelled meanwhile — that's the
// "no half-state on shutdown" guarantee the issue calls for.
func (w *Worker) drain(ctx context.Context) error {
	for {
		if stopped := ctx.Err(); stopped != nil {
			return stopped
		}

		next, claimErr := w.q.ClaimNextPending(ctx)
		switch {
		case claimErr != nil:
			return fmt.Errorf("claim: %w", claimErr)
		case next == nil:
			// Queue is empty; nothing more to do this round.
			return nil
		}

		w.processOne(*next)
	}
}

// processOne executes the pipeline for a job that has already been claimed
// and persists the result. It holds processingMu for the entire call.
//
// Both contexts used here hang off context.Background() rather than the
// worker's shutdown context: the pipeline run is bounded by cfg.JobTimeout,
// the status write by a fixed 30 seconds. That way an in-flight job can
// finish, and its outcome can be recorded, even after SIGTERM.
//
// Outcome handling: a non-nil Err is stored via MarkFailed with Err.Error();
// a nil Err with an empty ImageID is also stored as a failure with a fixed
// message; anything else goes to MarkDone. Failures of the status write
// itself are logged only.
func (w *Worker) processOne(job Job) {
	w.processingMu.Lock()
	defer w.processingMu.Unlock()

	w.log("worker: processing job %s backend=%s", job.ID, job.Backend)

	runCtx, stopRun := context.WithTimeout(context.Background(), w.cfg.JobTimeout)
	defer stopRun()
	result := w.p.Run(runCtx, job)

	// The write-back gets its own short-lived Background context — we must
	// always be able to record the outcome, otherwise the row sits in
	// 'running' forever.
	writeCtx, stopWrite := context.WithTimeout(context.Background(), 30*time.Second)
	defer stopWrite()

	var failure string
	switch {
	case result.Err != nil:
		w.log("worker: job %s failed: %v", job.ID, result.Err)
		failure = result.Err.Error()
	case result.ImageID == "":
		// Pipeline claims success but produced no imagen.images row — that
		// is a failure because flexsiebels has nothing to link.
		failure = "pipeline did not return an imagen.images id (cloud sync misconfigured?)"
		w.log("worker: job %s: %s", job.ID, failure)
	default:
		if err := w.q.MarkDone(writeCtx, job.ID, result.ImageID); err != nil {
			w.log("worker: mark done for %s: %v", job.ID, err)
		} else {
			w.log("worker: job %s done image_id=%s", job.ID, result.ImageID)
		}
		return
	}

	if err := w.q.MarkFailed(writeCtx, job.ID, failure); err != nil {
		w.log("worker: mark failed for %s: %v", job.ID, err)
	}
}

// log forwards a printf-style status line to cfg.Logger. With no Logger
// configured the call is a no-op.
func (w *Worker) log(format string, args ...any) {
	sink := w.cfg.Logger
	if sink == nil {
		return
	}
	sink(format, args...)
}