Live deploy on mRock surfaced a Schritt 5 bug: comfyui was always
treated as preloaded at scheduler startup, which made ensureFits()
short-circuit on the very first /v1/image request — exactly the
scenario eviction is supposed to handle. mvoice was never picked as
a victim, ComfyUI then OOM'd loading FLUX on top of the still-resident
mvoice.
Fix: replace the blanket 'every consumer starts loaded' init with a
heuristic — initialLoaded(cons):
- VRAMManaged (ollama): true. We never track/evict it; the consumer
runs its own LRU.
- Load+Unload both present (mvoice): true. Designed to be controllable;
typically preloads in its own lifespan.
- Unload only, no Load (comfyui): false. Lazy — FLUX isn't resident
until the first /prompt, so we shouldn't bill its 13 GiB against the
GPU budget until then.
- SystemdUnit only (whisper-server): true. Always-on, model loaded at
process start.
- Empty: true. Safe fallback.
Verified live on mRock (2026-05-15):
Before /v1/image: nvidia-smi 8963 MiB used; mvoice gpu_resident_mib 2345
POST /v1/image: HTTP 400 from upstream (empty workflow), broker did
trigger eviction before forwarding
After: nvidia-smi 6547 MiB used; mvoice gpu_resident_mib 9
(~CUDA context only); scheduler.evictions = 2
POST /v1/tts: audio_url returned, tts_ms 670, audio 3.5 s
After reload: nvidia-smi 8943 MiB used; mvoice gpu_resident_mib 2917
Test: TestInitialLoadedHeuristic pins down all five cases (the four
consumer types plus the empty fallback) so this doesn't regress when
someone adds a fifth consumer type.
Refs: m/mGPUmanager#1 (live deploy).
279 lines
9.4 KiB
Go
279 lines
9.4 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"context"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"net/http/httptest"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/gpu"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
)
|
|
|
|
// silentLogger returns a logger for tests: a text handler that writes to
// io.Discard with its minimum level set to Error, so nothing reaches the
// test output.
func silentLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	handler := slog.NewTextHandler(io.Discard, opts)
	return slog.New(handler)
}
|
|
|
|
// GPU test double: the evicting scheduler only needs a small part of
// gpu.Poller's surface, and there is no interface for it yet, so the tests
// use the real Poller type and hand-load a Sample into it.
//
// In practice gpu.Poller's internal sample is set via NewPoller + a goroutine.
// Tests could drive that path with a fake nvidia-smi, but the simpler route
// is to construct a Poller, store a Sample directly, and skip Run. The
// makeGPU helper below does exactly that.
|
|
|
|
// makeGPU returns a Poller pre-loaded with the given free/total values.
|
|
// It never calls nvidia-smi.
|
|
func makeGPU(t *testing.T, freeMiB, totalMiB int) *gpu.Poller {
|
|
t.Helper()
|
|
p := gpu.NewPoller(time.Hour, silentLogger())
|
|
// gpu.Poller.Last() reads from an internal Sample. We can't poke it
|
|
// directly without exporting state, so we use a sub-test trick: run
|
|
// sampleOnce against a fake nvidia-smi command. But that needs a PATH
|
|
// override and is brittle. Instead, expose a SetForTest helper.
|
|
gpu.SetSampleForTest(p, freeMiB, totalMiB)
|
|
return p
|
|
}
|
|
|
|
// fakeConsumer is a test double for a GPU consumer. Its server hosts
// /api/admin/{load,unload} (among others) so the evicting scheduler can
// exercise the HTTP eviction path.
type fakeConsumer struct {
	// srv is the httptest server the scheduler talks to.
	srv *httptest.Server

	// Request counters, bumped from the HTTP handlers and read by the test
	// body. unloadHit counts POSTs to both /api/admin/unload and /api/free;
	// loadHit counts POSTs to /api/admin/load.
	unloadHit, loadHit atomic.Int32
}
|
|
|
|
func newFakeConsumer(t *testing.T) *fakeConsumer {
|
|
t.Helper()
|
|
fc := &fakeConsumer{}
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("GET /api/health", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"loaded":true,"gpu_resident_mib":2800}`))
|
|
})
|
|
mux.HandleFunc("POST /api/admin/unload", func(w http.ResponseWriter, _ *http.Request) {
|
|
fc.unloadHit.Add(1)
|
|
w.WriteHeader(200)
|
|
})
|
|
mux.HandleFunc("POST /api/admin/load", func(w http.ResponseWriter, _ *http.Request) {
|
|
fc.loadHit.Add(1)
|
|
w.WriteHeader(200)
|
|
})
|
|
mux.HandleFunc("POST /prompt", func(w http.ResponseWriter, _ *http.Request) {
|
|
w.WriteHeader(200)
|
|
})
|
|
mux.HandleFunc("POST /api/free", func(w http.ResponseWriter, _ *http.Request) {
|
|
fc.unloadHit.Add(1)
|
|
w.WriteHeader(200)
|
|
})
|
|
fc.srv = httptest.NewServer(mux)
|
|
return fc
|
|
}
|
|
|
|
func buildCfg(mvoiceURL, comfyURL string) *config.Config {
|
|
return &config.Config{
|
|
Listen: "127.0.0.1:0",
|
|
GPU: config.GPU{TotalMiB: 16376, ReservedMiB: 1024, PollIntervalSeconds: 2},
|
|
Routing: map[config.EndpointKind]string{
|
|
config.KindTTS: "mvoice",
|
|
config.KindImage: "comfyui",
|
|
},
|
|
Consumers: map[string]*config.Consumer{
|
|
"mvoice": {
|
|
URL: mvoiceURL,
|
|
Health: config.Route{Method: "GET", Path: "/api/health"},
|
|
Paths: map[config.EndpointKind]config.Route{
|
|
config.KindTTS: {Method: "POST", Path: "/api/synthesize"},
|
|
},
|
|
VRAMResidentMiB: 2800,
|
|
Load: &config.Route{Method: "POST", Path: "/api/admin/load"},
|
|
Unload: &config.Route{Method: "POST", Path: "/api/admin/unload"},
|
|
CanCoexistWith: []string{"whisper-server", "ollama"},
|
|
Priority: 3,
|
|
MaxConcurrency: 1,
|
|
},
|
|
"comfyui": {
|
|
URL: comfyURL,
|
|
Health: config.Route{Method: "GET", Path: "/system_stats"},
|
|
Paths: map[config.EndpointKind]config.Route{
|
|
config.KindImage: {Method: "POST", Path: "/prompt"},
|
|
},
|
|
VRAMResidentMiB: 13000,
|
|
Unload: &config.Route{
|
|
Method: "POST",
|
|
Path: "/api/free",
|
|
Body: `{"unload_models":true,"free_memory":true}`,
|
|
},
|
|
CanCoexistWith: []string{},
|
|
Priority: 1,
|
|
MaxConcurrency: 1,
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
// TestInitialLoadedHeuristic pins the comfyui-isn't-preloaded rule down:
|
|
// a consumer with Unload but no Load is lazy; everything else is assumed
|
|
// resident at startup.
|
|
func TestInitialLoadedHeuristic(t *testing.T) {
|
|
cases := []struct {
|
|
name string
|
|
cons *config.Consumer
|
|
want bool
|
|
}{
|
|
{"vram_managed (ollama)", &config.Consumer{VRAMManaged: true}, true},
|
|
{"load+unload (mvoice)", &config.Consumer{
|
|
Load: &config.Route{Path: "/load"},
|
|
Unload: &config.Route{Path: "/unload"},
|
|
}, true},
|
|
{"unload only — lazy (comfyui)", &config.Consumer{
|
|
Unload: &config.Route{Path: "/api/free"},
|
|
}, false},
|
|
{"systemd unit only (whisper-server)", &config.Consumer{
|
|
SystemdUnit: "whisper-server.service",
|
|
}, true},
|
|
{"empty consumer", &config.Consumer{}, true},
|
|
}
|
|
for _, c := range cases {
|
|
t.Run(c.name, func(t *testing.T) {
|
|
if got := initialLoaded(c.cons); got != c.want {
|
|
t.Errorf("initialLoaded = %v, want %v", got, c.want)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestEvictingSkipsWhenAlreadyResident verifies the no-op fast path: a job
|
|
// for an already-loaded consumer with plenty of free VRAM runs without any
|
|
// unload call.
|
|
func TestEvictingSkipsWhenAlreadyResident(t *testing.T) {
|
|
mvoice := newFakeConsumer(t)
|
|
defer mvoice.srv.Close()
|
|
comfy := newFakeConsumer(t)
|
|
defer comfy.srv.Close()
|
|
|
|
cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
|
|
reg := registry.New(cfg, silentLogger())
|
|
g := makeGPU(t, 8192, 16376) // plenty of headroom
|
|
e := NewEvicting(cfg, reg, g, silentLogger())
|
|
|
|
if err := e.Run(context.Background(), "mvoice", func(ctx context.Context) error { return nil }); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if mvoice.unloadHit.Load() != 0 {
|
|
t.Errorf("unexpected unload hits on mvoice: %d", mvoice.unloadHit.Load())
|
|
}
|
|
if comfy.unloadHit.Load() != 0 {
|
|
t.Errorf("unexpected unload hits on comfyui: %d", comfy.unloadHit.Load())
|
|
}
|
|
}
|
|
|
|
// TestEvictingFreesNonCoexistentVictim simulates the canonical scenario from
|
|
// the design: a TTS request comes in while comfyui is hogging 13 GiB. mvoice
|
|
// is not coexistent with comfyui (per cfg), so the scheduler must call
|
|
// comfyui's /api/free before letting the TTS job run.
|
|
func TestEvictingFreesNonCoexistentVictim(t *testing.T) {
|
|
mvoice := newFakeConsumer(t)
|
|
defer mvoice.srv.Close()
|
|
comfy := newFakeConsumer(t)
|
|
defer comfy.srv.Close()
|
|
|
|
cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
|
|
reg := registry.New(cfg, silentLogger())
|
|
|
|
// Only 1 GiB free — mvoice (2.8 GiB) won't fit until comfyui (13 GiB)
|
|
// is evicted.
|
|
g := makeGPU(t, 1024, 16376)
|
|
e := NewEvicting(cfg, reg, g, silentLogger())
|
|
|
|
// Force the believed-loaded state so eviction kicks in (Run treats
|
|
// 'already loaded' as a no-op fast path).
|
|
e.SetLoadedForTest("mvoice", false)
|
|
e.SetLoadedForTest("comfyui", true)
|
|
|
|
// After the eviction unload call lands, we want fits() to return true
|
|
// for the next iteration — patch the GPU sample to reflect the freed
|
|
// memory by swapping the poller before the second fits() check is hit.
|
|
// We accomplish that by stubbing the unload handler to also bump the
|
|
// sample.
|
|
comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, func() {
|
|
gpu.SetSampleForTest(g, 14000, 16376)
|
|
})
|
|
|
|
if err := e.Run(context.Background(), "mvoice", func(ctx context.Context) error { return nil }); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if got := comfy.unloadHit.Load(); got != 1 {
|
|
t.Errorf("comfyui unload hit count = %d, want 1", got)
|
|
}
|
|
if got := mvoice.loadHit.Load(); got != 1 {
|
|
t.Errorf("mvoice load hit count = %d, want 1", got)
|
|
}
|
|
if got := e.Stats().Evictions; got != 1 {
|
|
t.Errorf("stats.Evictions = %d, want 1", got)
|
|
}
|
|
}
|
|
|
|
// TestEvictingHonoursCoexistence ensures we never evict a consumer that the
|
|
// target declared compatible. mvoice can coexist with ollama, so ollama must
|
|
// not be picked even if it's the LRU candidate.
|
|
func TestEvictingHonoursCoexistence(t *testing.T) {
|
|
mvoice := newFakeConsumer(t)
|
|
defer mvoice.srv.Close()
|
|
comfy := newFakeConsumer(t)
|
|
defer comfy.srv.Close()
|
|
|
|
cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
|
|
// Add a stub ollama with an unload endpoint, mark coexistent.
|
|
ollama := newFakeConsumer(t)
|
|
defer ollama.srv.Close()
|
|
cfg.Consumers["ollama"] = &config.Consumer{
|
|
URL: ollama.srv.URL,
|
|
Health: config.Route{Method: "GET", Path: "/api/health"},
|
|
Paths: map[config.EndpointKind]config.Route{},
|
|
VRAMResidentMiB: 2000,
|
|
Unload: &config.Route{Method: "POST", Path: "/api/admin/unload"},
|
|
CanCoexistWith: []string{"mvoice"},
|
|
MaxConcurrency: 1,
|
|
}
|
|
|
|
reg := registry.New(cfg, silentLogger())
|
|
g := makeGPU(t, 1000, 16376)
|
|
e := NewEvicting(cfg, reg, g, silentLogger())
|
|
e.SetLoadedForTest("mvoice", false)
|
|
e.SetLoadedForTest("comfyui", true)
|
|
e.SetLoadedForTest("ollama", true)
|
|
|
|
comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, func() {
|
|
gpu.SetSampleForTest(g, 14000, 16376)
|
|
})
|
|
|
|
if err := e.Run(context.Background(), "mvoice", func(ctx context.Context) error { return nil }); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if got := ollama.unloadHit.Load(); got != 0 {
|
|
t.Errorf("ollama (coexistent) unloaded %d times; should be 0", got)
|
|
}
|
|
if got := comfy.unloadHit.Load(); got != 1 {
|
|
t.Errorf("comfyui unload hit count = %d, want 1", got)
|
|
}
|
|
}
|
|
|
|
// ───── helpers ────────────────────────────────────────────────────────────
|
|
|
|
// withHook returns a handler that calls hook() and then delegates to h. The
// hook runs unconditionally on every request, before the wrapped handler
// sees it. The tests use it to simulate VRAM being freed the instant
// comfyui's /api/free is served.
func withHook(h http.Handler, hook func()) http.Handler {
	hooked := func(w http.ResponseWriter, r *http.Request) {
		hook()
		h.ServeHTTP(w, r)
	}
	return http.HandlerFunc(hooked)
}
|