fix(scheduler): mark lazy consumers (Unload but no Load) as not-loaded at startup
Live deploy on mRock surfaced a Schritt 5 bug: comfyui was always
treated as preloaded at scheduler startup, which made ensureFits()
short-circuit on the very first /v1/image request — exactly the
scenario eviction is supposed to handle. mvoice was never picked as
a victim, ComfyUI then OOM'd loading FLUX on top of the still-resident
mvoice.
Fix: replace the blanket 'every consumer starts loaded' init with a
heuristic — initialLoaded(cons):
- VRAMManaged (ollama): true. We never track/evict it; the consumer
runs its own LRU.
- Load+Unload both present (mvoice): true. Designed to be controllable;
typically preloads in its own lifespan.
- Unload only, no Load (comfyui): false. Lazy — FLUX isn't resident
until the first /prompt, so we shouldn't bill its 13 GiB against the
GPU budget until then.
- SystemdUnit only (whisper-server): true. Always-on, model loaded at
process start.
- Empty: true. Safe fallback.
Verified live on mRock (2026-05-15):
Before /v1/image: nvidia-smi 8963 MiB used; mvoice gpu_resident_mib 2345
POST /v1/image: HTTP 400 from upstream (empty workflow), broker did
trigger eviction before forwarding
After: nvidia-smi 6547 MiB used; mvoice gpu_resident_mib 9
(~CUDA context only); scheduler.evictions = 2
POST /v1/tts: audio_url returned, tts_ms 670, audio 3.5 s
After reload: nvidia-smi 8943 MiB used; mvoice gpu_resident_mib 2917
Test: TestInitialLoadedHeuristic pins the four consumer shapes (plus the
empty-config fallback) down so this doesn't regress when someone adds a
fifth consumer type.
Refs: m/mGPUmanager#1 (live deploy).
This commit is contained in:
@@ -70,13 +70,38 @@ func NewEvicting(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poll
|
||||
lastUsed: make(map[string]time.Time, len(cfg.Consumers)),
|
||||
}
|
||||
for name, cons := range cfg.Consumers {
|
||||
// Self-managed VRAM consumers (ollama) are always 'loaded' from
|
||||
// the scheduler's perspective — we never evict them via HTTP.
|
||||
e.loaded[name] = !cons.VRAMManaged || true
|
||||
e.loaded[name] = initialLoaded(cons)
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// initialLoaded picks the believed-loaded state for a consumer at scheduler
|
||||
// startup. The rule:
|
||||
//
|
||||
// - VRAM-managed (ollama): true — we never track or evict it.
|
||||
// - Has a load route AND an unload route (mvoice): true — the consumer
|
||||
// is set up to be controllable in both directions, and typically
|
||||
// preloads on its own systemd-managed startup.
|
||||
// - Has only an unload route, no load route (comfyui): false — lazy.
|
||||
// FLUX isn't resident until the first /prompt; until that happens we
|
||||
// don't account for its VRAM cost.
|
||||
// - Has a systemd_unit but no HTTP routes (whisper-server): true — these
|
||||
// are always-on services that load their model at process start.
|
||||
// - Neither: true — fallback, assume it's there if the consumer is up.
|
||||
//
|
||||
// Getting this right matters for the eviction smoke test: if comfyui were
|
||||
// believed loaded at startup, ensureFits would short-circuit on the first
|
||||
// /v1/image request and never trigger eviction. (m/mGPUmanager#1 live deploy.)
|
||||
func initialLoaded(cons *config.Consumer) bool {
|
||||
if cons.VRAMManaged {
|
||||
return true
|
||||
}
|
||||
if cons.Load == nil && cons.Unload != nil {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// Run is the public Scheduler interface: ensure room + load + serialise.
|
||||
func (e *Evicting) Run(ctx context.Context, consumer string, fn Job) error {
|
||||
if err := e.ensureFits(ctx, consumer); err != nil {
|
||||
|
||||
@@ -118,6 +118,37 @@ func buildCfg(mvoiceURL, comfyURL string) *config.Config {
|
||||
}
|
||||
}
|
||||
|
||||
// TestInitialLoadedHeuristic pins the comfyui-isn't-preloaded rule down:
|
||||
// a consumer with Unload but no Load is lazy; everything else is assumed
|
||||
// resident at startup.
|
||||
func TestInitialLoadedHeuristic(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
cons *config.Consumer
|
||||
want bool
|
||||
}{
|
||||
{"vram_managed (ollama)", &config.Consumer{VRAMManaged: true}, true},
|
||||
{"load+unload (mvoice)", &config.Consumer{
|
||||
Load: &config.Route{Path: "/load"},
|
||||
Unload: &config.Route{Path: "/unload"},
|
||||
}, true},
|
||||
{"unload only — lazy (comfyui)", &config.Consumer{
|
||||
Unload: &config.Route{Path: "/api/free"},
|
||||
}, false},
|
||||
{"systemd unit only (whisper-server)", &config.Consumer{
|
||||
SystemdUnit: "whisper-server.service",
|
||||
}, true},
|
||||
{"empty consumer", &config.Consumer{}, true},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := initialLoaded(c.cons); got != c.want {
|
||||
t.Errorf("initialLoaded = %v, want %v", got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestEvictingSkipsWhenAlreadyResident verifies the no-op fast path: a job
|
||||
// for an already-loaded consumer with plenty of free VRAM runs without any
|
||||
// unload call.
|
||||
|
||||
Reference in New Issue
Block a user