Files
mGPUmanager/internal/scheduler/evicting_test.go
mAi ca9bb1773f feat: Schritt 5 — VRAM-pressure eviction + coexistence groups
scheduler.Evicting wraps the Locked scheduler with the design's
LRU-with-coexistence eviction loop. main.go switches to it.

Per-job flow:
1. ensureFits — compare cons.vram_resident_mib + 256 MiB cushion against
   the live nvidia-smi free reading. If insufficient, pick the LRU
   loaded consumer NOT in cons.can_coexist_with, NOT VRAM-managed
   (ollama is excluded from eviction by design — it runs its own LRU),
   and NOT the target itself, then call its unload route. Wait 1s for
   VRAM to actually free. Repeat up to 5 times.
2. ensureLoaded — if the target was previously unloaded, call its
   /api/admin/load (mvoice). Consumers without a load route are
   assumed to cold-start implicitly on first request.
3. inner.Run — global GPU lock + job execution.

State:
- scheduler-local 'loaded' map + scheduler-local 'lastUsed' map. The
  registry's health-derived Loaded field is the source of truth for
  consumers that report it, but we need our own state for the seconds
  between an unload call and the next probe.
- Stats.Evictions counts successful unload calls and surfaces through
  /v1/status.

LRU pick order:
- Scheduler-local lastUsed (set on successful Run completion) takes
  precedence over registry.LastUsed (set on health probes) because the
  former reflects real GPU work, not health chatter. Consumers with a
  zero lastUsed time (never used) are picked for eviction first.

Tests:
- Already-resident target: no eviction calls.
- 13 GiB comfyui evicted to fit 2.8 GiB mvoice → 1 unload + 1 load,
  Stats.Evictions = 1.
- Coexistent consumer (ollama, in mvoice.can_coexist_with) is never
  picked even if it's the LRU candidate; the non-coexistent comfyui
  is unloaded instead.

Race detector clean.

Refs: m/mGPUmanager#1 (Schritt 5).
2026-05-11 13:37:03 +02:00

248 lines
8.5 KiB
Go

package scheduler
import (
"context"
"io"
"log/slog"
"net/http"
"net/http/httptest"
"sync/atomic"
"testing"
"time"
"mgit.msbls.de/m/mGPUmanager/internal/config"
"mgit.msbls.de/m/mGPUmanager/internal/gpu"
"mgit.msbls.de/m/mGPUmanager/internal/registry"
)
// silentLogger returns a logger for tests that must not produce output: it
// writes through a text handler into io.Discard, and the handler's minimum
// level is set to Error.
func silentLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	handler := slog.NewTextHandler(io.Discard, opts)
	return slog.New(handler)
}
// makeGPU returns a real *gpu.Poller whose sample has been seeded with the
// given free/total values via gpu.SetSampleForTest, so the evicting scheduler
// can be exercised without invoking nvidia-smi.
//
// The Poller is constructed with time.Hour as its first argument (presumably
// the poll interval — confirm against gpu.NewPoller) and this helper does not
// start any polling on it; the seeded sample is all a test gets unless it
// re-seeds the poller itself.
func makeGPU(t *testing.T, freeMiB, totalMiB int) *gpu.Poller {
	t.Helper()

	poller := gpu.NewPoller(time.Hour, silentLogger())
	// The Poller's sample is not reachable from this package, and driving a
	// sampling pass against a fake nvidia-smi would need a brittle PATH
	// override, so the sample is injected through the test-only setter.
	gpu.SetSampleForTest(poller, freeMiB, totalMiB)
	return poller
}
// fakeConsumer is a stand-in for a GPU consumer service: an httptest server
// plus counters that record how often its unload and load routes were hit,
// so tests can assert on the HTTP eviction path.
type fakeConsumer struct {
	// srv is the backing test server; callers are responsible for closing it.
	srv *httptest.Server

	// unloadHit and loadHit count requests to the unload-style and load
	// routes respectively. They are atomic because the handlers run on the
	// server's goroutines while the test goroutine reads them.
	unloadHit, loadHit atomic.Int32
}
// newFakeConsumer starts an httptest server that answers the routes the
// evicting scheduler may call on a consumer and returns it wrapped in a
// fakeConsumer. The caller must close fc.srv.
//
// Routes served (method-qualified ServeMux patterns):
//   - GET  /api/health        → JSON body reporting loaded=true, 2800 MiB
//   - POST /api/admin/unload  → 200, increments unloadHit
//   - POST /api/admin/load    → 200, increments loadHit
//   - POST /prompt            → 200, no counter
//   - POST /api/free          → 200, increments unloadHit
func newFakeConsumer(t *testing.T) *fakeConsumer {
	t.Helper()

	fc := new(fakeConsumer)

	// Both unload-style routes share one counter, so a single handler
	// serves /api/admin/unload and /api/free.
	countUnload := func(w http.ResponseWriter, _ *http.Request) {
		fc.unloadHit.Add(1)
		w.WriteHeader(http.StatusOK)
	}
	countLoad := func(w http.ResponseWriter, _ *http.Request) {
		fc.loadHit.Add(1)
		w.WriteHeader(http.StatusOK)
	}
	health := func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		// A write error here only means the client went away; ignore it.
		_, _ = w.Write([]byte(`{"loaded":true,"gpu_resident_mib":2800}`))
	}
	accept := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}

	mux := http.NewServeMux()
	mux.HandleFunc("GET /api/health", health)
	mux.HandleFunc("POST /api/admin/unload", countUnload)
	mux.HandleFunc("POST /api/admin/load", countLoad)
	mux.HandleFunc("POST /prompt", accept)
	mux.HandleFunc("POST /api/free", countUnload)

	fc.srv = httptest.NewServer(mux)
	return fc
}
// buildCfg returns the configuration shared by the eviction tests: a GPU
// with 16376 MiB total / 1024 MiB reserved and two consumers reachable at
// the given base URLs.
//
//   - "mvoice" handles TTS, declares 2800 MiB resident VRAM, has both a load
//     and an unload route, and lists whisper-server and ollama as consumers
//     it can coexist with.
//   - "comfyui" handles image requests, declares 13000 MiB resident VRAM,
//     has only an unload route (POST /api/free with a JSON body), and lists
//     no coexistence partners.
func buildCfg(mvoiceURL, comfyURL string) *config.Config {
	mvoice := &config.Consumer{
		URL:    mvoiceURL,
		Health: config.Route{Method: "GET", Path: "/api/health"},
		Paths: map[config.EndpointKind]config.Route{
			config.KindTTS: {Method: "POST", Path: "/api/synthesize"},
		},
		VRAMResidentMiB: 2800,
		Load:            &config.Route{Method: "POST", Path: "/api/admin/load"},
		Unload:          &config.Route{Method: "POST", Path: "/api/admin/unload"},
		CanCoexistWith:  []string{"whisper-server", "ollama"},
		Priority:        3,
		MaxConcurrency:  1,
	}

	comfyui := &config.Consumer{
		URL:    comfyURL,
		Health: config.Route{Method: "GET", Path: "/system_stats"},
		Paths: map[config.EndpointKind]config.Route{
			config.KindImage: {Method: "POST", Path: "/prompt"},
		},
		VRAMResidentMiB: 13000,
		Unload: &config.Route{
			Method: "POST",
			Path:   "/api/free",
			Body:   `{"unload_models":true,"free_memory":true}`,
		},
		// Deliberately an empty, non-nil list: comfyui coexists with nobody.
		CanCoexistWith: []string{},
		Priority:       1,
		MaxConcurrency: 1,
	}

	cfg := &config.Config{
		Listen: "127.0.0.1:0",
		GPU:    config.GPU{TotalMiB: 16376, ReservedMiB: 1024, PollIntervalSeconds: 2},
		Routing: map[config.EndpointKind]string{
			config.KindTTS:   "mvoice",
			config.KindImage: "comfyui",
		},
		Consumers: map[string]*config.Consumer{
			"mvoice":  mvoice,
			"comfyui": comfyui,
		},
	}
	return cfg
}
// TestEvictingSkipsWhenAlreadyResident covers the no-op fast path: with
// 8192 MiB reported free, a job for mvoice must run without any unload
// request reaching either fake consumer.
//
// NOTE(review): the test does not force a loaded state, so it relies on
// whatever NewEvicting assumes initially — confirm that matches the
// "already resident" premise in the name.
func TestEvictingSkipsWhenAlreadyResident(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
	reg := registry.New(cfg, silentLogger())
	poller := makeGPU(t, 8192, 16376) // plenty of headroom
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	noop := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", noop)
	if err != nil {
		t.Fatal(err)
	}

	if hits := mvoice.unloadHit.Load(); hits != 0 {
		t.Errorf("unexpected unload hits on mvoice: %d", hits)
	}
	if hits := comfy.unloadHit.Load(); hits != 0 {
		t.Errorf("unexpected unload hits on comfyui: %d", hits)
	}
}
// TestEvictingFreesNonCoexistentVictim reproduces the canonical scenario
// from the design: a TTS job arrives while comfyui occupies most of the GPU.
// comfyui is not in mvoice's coexistence list (see buildCfg), so the test
// expects exactly one unload request on comfyui, exactly one load request on
// mvoice, and Stats().Evictions == 1.
func TestEvictingFreesNonCoexistentVictim(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
	reg := registry.New(cfg, silentLogger())

	// 1024 MiB free is not enough for mvoice's 2800 MiB; room only appears
	// once comfyui (13000 MiB) has been evicted.
	poller := makeGPU(t, 1024, 16376)
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	// Pin the scheduler's believed state: the target is unloaded and the
	// would-be victim is loaded, so Run cannot take the already-loaded
	// fast path.
	sched.SetLoadedForTest("mvoice", false)
	sched.SetLoadedForTest("comfyui", true)

	// Simulate the memory actually being released: any request that reaches
	// comfyui's server re-seeds the same poller with 14000 MiB free, so the
	// scheduler's next fit check after the unload call succeeds. The hook is
	// not path-specific; it fires for every request to this server.
	releaseVRAM := func() {
		gpu.SetSampleForTest(poller, 14000, 16376)
	}
	comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, releaseVRAM)

	job := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", job)
	if err != nil {
		t.Fatal(err)
	}

	if got := comfy.unloadHit.Load(); got != 1 {
		t.Errorf("comfyui unload hit count = %d, want 1", got)
	}
	if got := mvoice.loadHit.Load(); got != 1 {
		t.Errorf("mvoice load hit count = %d, want 1", got)
	}
	if got := sched.Stats().Evictions; got != 1 {
		t.Errorf("stats.Evictions = %d, want 1", got)
	}
}
// TestEvictingHonoursCoexistence checks that a consumer listed in the
// target's coexistence set is never chosen as an eviction victim. mvoice
// lists ollama as coexistent, so with both ollama and comfyui marked loaded
// and too little free VRAM, the test expects zero unload requests on ollama
// and exactly one on comfyui.
func TestEvictingHonoursCoexistence(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)

	// Register a third consumer, ollama, that exposes an unload route and
	// is declared mutually coexistent with mvoice.
	ollama := newFakeConsumer(t)
	defer ollama.srv.Close()
	ollamaCfg := &config.Consumer{
		URL:             ollama.srv.URL,
		Health:          config.Route{Method: "GET", Path: "/api/health"},
		Paths:           map[config.EndpointKind]config.Route{},
		VRAMResidentMiB: 2000,
		Unload:          &config.Route{Method: "POST", Path: "/api/admin/unload"},
		CanCoexistWith:  []string{"mvoice"},
		MaxConcurrency:  1,
	}
	cfg.Consumers["ollama"] = ollamaCfg

	reg := registry.New(cfg, silentLogger())
	poller := makeGPU(t, 1000, 16376)
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	// Target unloaded; both potential victims believed loaded.
	sched.SetLoadedForTest("mvoice", false)
	sched.SetLoadedForTest("comfyui", true)
	sched.SetLoadedForTest("ollama", true)

	// Any request reaching comfyui's server re-seeds the poller with
	// 14000 MiB free, standing in for the VRAM released by its unload.
	releaseVRAM := func() {
		gpu.SetSampleForTest(poller, 14000, 16376)
	}
	comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, releaseVRAM)

	job := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", job)
	if err != nil {
		t.Fatal(err)
	}

	if got := ollama.unloadHit.Load(); got != 0 {
		t.Errorf("ollama (coexistent) unloaded %d times; should be 0", got)
	}
	if got := comfy.unloadHit.Load(); got != 1 {
		t.Errorf("comfyui unload hit count = %d, want 1", got)
	}
}
// ───── helpers ────────────────────────────────────────────────────────────
// withHook returns a handler that runs hook() and then hands the request to
// h. The hook runs on every request, regardless of path or method, and
// always before h sees the request. The tests use it to re-seed the GPU
// sample when a request reaches comfyui's server, standing in for VRAM
// being freed by its unload call.
func withHook(h http.Handler, hook func()) http.Handler {
	hooked := func(w http.ResponseWriter, r *http.Request) {
		hook()
		h.ServeHTTP(w, r)
	}
	return http.HandlerFunc(hooked)
}