// mGPUmanager/internal/scheduler/evicting_test.go

package scheduler

import (
	"context"
	"io"
	"log/slog"
	"net/http"
	"net/http/httptest"
	"sync/atomic"
	"testing"
	"time"

	"mgit.msbls.de/m/mGPUmanager/internal/config"
	"mgit.msbls.de/m/mGPUmanager/internal/gpu"
	"mgit.msbls.de/m/mGPUmanager/internal/registry"
)

// silentLogger returns a logger suitable for tests: records are formatted by
// a text handler whose output goes to io.Discard, and the handler's minimum
// level is Error, so anything below Error is not even formatted.
func silentLogger() *slog.Logger {
	opts := &slog.HandlerOptions{Level: slog.LevelError}
	handler := slog.NewTextHandler(io.Discard, opts)
	return slog.New(handler)
}

// The evicting scheduler takes the concrete *gpu.Poller (there is no
// interface to stub yet), so these tests cannot substitute a fake GPU type.
//
// Driving a real Poller through Run would need a fake nvidia-smi on PATH,
// which is brittle. The simpler path used here is to construct a Poller,
// never call Run, and store a Sample in it directly through the
// gpu.SetSampleForTest helper; see makeGPU below.

// makeGPU builds a *gpu.Poller whose current sample reports freeMiB free out
// of totalMiB total. The poller is only constructed and seeded here; this
// helper never runs it, so the values stay as given until a test overwrites
// them with gpu.SetSampleForTest.
func makeGPU(t *testing.T, freeMiB, totalMiB int) *gpu.Poller {
	t.Helper()

	// time.Hour is presumably the poll interval; it is irrelevant here
	// because the poller is never run by this helper.
	poller := gpu.NewPoller(time.Hour, silentLogger())

	// The Poller's sample is internal state, so it is seeded through the
	// test-only setter rather than by sampling a (fake) nvidia-smi binary,
	// which would require a brittle PATH override.
	gpu.SetSampleForTest(poller, freeMiB, totalMiB)

	return poller
}

// fakeConsumer is an in-process HTTP stand-in for a GPU consumer. It serves
// the health, load and unload routes so the evicting scheduler can exercise
// its HTTP eviction path, and it counts load/unload requests so tests can
// assert on them.
type fakeConsumer struct {
	srv       *httptest.Server
	unloadHit atomic.Int32 // POSTs to /api/admin/unload and /api/free
	loadHit   atomic.Int32 // POSTs to /api/admin/load
}

// newFakeConsumer starts a fakeConsumer on a loopback httptest server.
//
// Routes served:
//   - GET  /api/health        -> 200 with a fixed JSON body
//   - POST /api/admin/unload  -> 200, increments unloadHit
//   - POST /api/admin/load    -> 200, increments loadHit
//   - POST /api/free          -> 200, increments unloadHit
//   - POST /prompt            -> 200
//
// Any other request falls through to the ServeMux default (404/405).
//
// The server is closed automatically via t.Cleanup when the owning test (or
// subtest) finishes. httptest.Server.Close may be called more than once, so
// callers that additionally `defer fc.srv.Close()` keep working unchanged.
func newFakeConsumer(t *testing.T) *fakeConsumer {
	t.Helper()
	fc := &fakeConsumer{}

	// counted returns a handler that bumps hits and replies 200 with no body.
	counted := func(hits *atomic.Int32) http.HandlerFunc {
		return func(w http.ResponseWriter, _ *http.Request) {
			hits.Add(1)
			w.WriteHeader(http.StatusOK)
		}
	}

	mux := http.NewServeMux()
	mux.HandleFunc("GET /api/health", func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		// A failed write only means the client went away; there is nothing
		// useful a test fake can do about it.
		_, _ = w.Write([]byte(`{"loaded":true,"gpu_resident_mib":2800}`))
	})
	mux.HandleFunc("POST /api/admin/unload", counted(&fc.unloadHit))
	mux.HandleFunc("POST /api/admin/load", counted(&fc.loadHit))
	mux.HandleFunc("POST /prompt", func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	})
	mux.HandleFunc("POST /api/free", counted(&fc.unloadHit))

	fc.srv = httptest.NewServer(mux)
	// Tie the server's lifetime to the test so a forgotten Close cannot leak
	// the listener and its serving goroutines.
	t.Cleanup(fc.srv.Close)
	return fc
}

// buildCfg assembles the two-consumer configuration shared by the scheduler
// tests, pointing each consumer at the given base URL.
//
//   - "mvoice" serves TTS, is sized at 2800 MiB, has both load and unload
//     routes, lists whisper-server and ollama as coexistence partners, and
//     has priority 3.
//   - "comfyui" serves image requests, is sized at 13000 MiB, has only an
//     unload route (POST /api/free with a JSON body), lists no coexistence
//     partners, and has priority 1.
//
// The GPU is declared as 16376 MiB total with 1024 MiB reserved.
func buildCfg(mvoiceURL, comfyURL string) *config.Config {
	mvoice := &config.Consumer{
		URL:    mvoiceURL,
		Health: config.Route{Method: "GET", Path: "/api/health"},
		Paths: map[config.EndpointKind]config.Route{
			config.KindTTS: {Method: "POST", Path: "/api/synthesize"},
		},
		VRAMResidentMiB: 2800,
		Load:            &config.Route{Method: "POST", Path: "/api/admin/load"},
		Unload:          &config.Route{Method: "POST", Path: "/api/admin/unload"},
		CanCoexistWith:  []string{"whisper-server", "ollama"},
		Priority:        3,
		MaxConcurrency:  1,
	}

	comfyui := &config.Consumer{
		URL:    comfyURL,
		Health: config.Route{Method: "GET", Path: "/system_stats"},
		Paths: map[config.EndpointKind]config.Route{
			config.KindImage: {Method: "POST", Path: "/prompt"},
		},
		VRAMResidentMiB: 13000,
		// No Load route: comfyui can only be asked to free its memory.
		Unload: &config.Route{
			Method: "POST",
			Path:   "/api/free",
			Body:   `{"unload_models":true,"free_memory":true}`,
		},
		CanCoexistWith: []string{},
		Priority:       1,
		MaxConcurrency: 1,
	}

	return &config.Config{
		Listen: "127.0.0.1:0",
		GPU: config.GPU{
			TotalMiB:            16376,
			ReservedMiB:         1024,
			PollIntervalSeconds: 2,
		},
		Routing: map[config.EndpointKind]string{
			config.KindTTS:   "mvoice",
			config.KindImage: "comfyui",
		},
		Consumers: map[string]*config.Consumer{
			"mvoice":  mvoice,
			"comfyui": comfyui,
		},
	}
}

// TestInitialLoadedHeuristic fixes the expected result of initialLoaded for
// each consumer shape: a consumer that declares an Unload route but no Load
// route is expected to start out not loaded (the comfyui case); every other
// shape listed here is expected to be treated as resident at startup.
func TestInitialLoadedHeuristic(t *testing.T) {
	type scenario struct {
		label    string
		consumer *config.Consumer
		resident bool
	}

	loadAndUnload := &config.Consumer{
		Load:   &config.Route{Path: "/load"},
		Unload: &config.Route{Path: "/unload"},
	}
	unloadOnly := &config.Consumer{
		Unload: &config.Route{Path: "/api/free"},
	}
	systemdOnly := &config.Consumer{
		SystemdUnit: "whisper-server.service",
	}

	scenarios := []scenario{
		{label: "vram_managed (ollama)", consumer: &config.Consumer{VRAMManaged: true}, resident: true},
		{label: "load+unload (mvoice)", consumer: loadAndUnload, resident: true},
		{label: "unload only — lazy (comfyui)", consumer: unloadOnly, resident: false},
		{label: "systemd unit only (whisper-server)", consumer: systemdOnly, resident: true},
		{label: "empty consumer", consumer: &config.Consumer{}, resident: true},
	}

	for _, sc := range scenarios {
		t.Run(sc.label, func(t *testing.T) {
			got := initialLoaded(sc.consumer)
			if got == sc.resident {
				return
			}
			t.Errorf("initialLoaded = %v, want %v", got, sc.resident)
		})
	}
}

// TestEvictingSkipsWhenAlreadyResident checks the no-eviction path: with the
// scheduler in its freshly constructed state and the GPU reporting 8192 MiB
// free, running a job for mvoice must succeed without a single unload
// request reaching either fake consumer.
func TestEvictingSkipsWhenAlreadyResident(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
	reg := registry.New(cfg, silentLogger())
	poller := makeGPU(t, 8192, 16376) // far more free VRAM than mvoice needs
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	job := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", job)
	if err != nil {
		t.Fatal(err)
	}

	if hits := mvoice.unloadHit.Load(); hits != 0 {
		t.Errorf("unexpected unload hits on mvoice: %d", hits)
	}
	if hits := comfy.unloadHit.Load(); hits != 0 {
		t.Errorf("unexpected unload hits on comfyui: %d", hits)
	}
}

// TestEvictingFreesNonCoexistentVictim covers the canonical eviction case: a
// TTS job for mvoice arrives while comfyui is believed to be loaded and the
// GPU has too little free memory. mvoice's config does not list comfyui as a
// coexistence partner, so the test expects exactly one unload request on
// comfyui, exactly one load request on mvoice, and an eviction count of 1.
func TestEvictingFreesNonCoexistentVictim(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)
	reg := registry.New(cfg, silentLogger())

	// Start with 1 GiB free: too little for mvoice (2800 MiB) while comfyui
	// (13000 MiB) is resident.
	poller := makeGPU(t, 1024, 16376)
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	// Override the scheduler's believed residency so the Run below has to
	// load mvoice and has comfyui available as an eviction candidate.
	sched.SetLoadedForTest("mvoice", false)
	sched.SetLoadedForTest("comfyui", true)

	// Simulate VRAM being released by the unload call. The hook wraps the
	// fake's entire handler, so any request reaching the comfyui fake
	// rewrites the GPU sample to 14000 MiB free before the request itself is
	// served; the scheduler's next look at the sample then sees the room.
	vramFreed := func() {
		gpu.SetSampleForTest(poller, 14000, 16376)
	}
	comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, vramFreed)

	job := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", job)
	if err != nil {
		t.Fatal(err)
	}

	if got := comfy.unloadHit.Load(); got != 1 {
		t.Errorf("comfyui unload hit count = %d, want 1", got)
	}
	if got := mvoice.loadHit.Load(); got != 1 {
		t.Errorf("mvoice load hit count = %d, want 1", got)
	}
	if got := sched.Stats().Evictions; got != 1 {
		t.Errorf("stats.Evictions = %d, want 1", got)
	}
}

// TestEvictingHonoursCoexistence checks that a consumer the target lists as
// a coexistence partner is never chosen as an eviction victim. mvoice lists
// ollama in CanCoexistWith, so with both ollama and comfyui marked loaded
// the test expects zero unload requests on ollama and exactly one on
// comfyui.
func TestEvictingHonoursCoexistence(t *testing.T) {
	mvoice := newFakeConsumer(t)
	defer mvoice.srv.Close()
	comfy := newFakeConsumer(t)
	defer comfy.srv.Close()

	cfg := buildCfg(mvoice.srv.URL, comfy.srv.URL)

	// Register a third consumer, ollama, that could be unloaded over HTTP
	// and that declares itself coexistent with mvoice.
	ollama := newFakeConsumer(t)
	defer ollama.srv.Close()
	ollamaCfg := &config.Consumer{
		URL:             ollama.srv.URL,
		Health:          config.Route{Method: "GET", Path: "/api/health"},
		Paths:           map[config.EndpointKind]config.Route{},
		VRAMResidentMiB: 2000,
		Unload:          &config.Route{Method: "POST", Path: "/api/admin/unload"},
		CanCoexistWith:  []string{"mvoice"},
		MaxConcurrency:  1,
	}
	cfg.Consumers["ollama"] = ollamaCfg

	reg := registry.New(cfg, silentLogger())
	poller := makeGPU(t, 1000, 16376)
	sched := NewEvicting(cfg, reg, poller, silentLogger())

	// mvoice needs loading; both potential victims are believed resident.
	sched.SetLoadedForTest("mvoice", false)
	sched.SetLoadedForTest("comfyui", true)
	sched.SetLoadedForTest("ollama", true)

	// Any request reaching the comfyui fake makes the GPU report 14000 MiB
	// free, standing in for the memory released by its unload.
	vramFreed := func() {
		gpu.SetSampleForTest(poller, 14000, 16376)
	}
	comfy.srv.Config.Handler = withHook(comfy.srv.Config.Handler, vramFreed)

	job := func(ctx context.Context) error { return nil }
	err := sched.Run(context.Background(), "mvoice", job)
	if err != nil {
		t.Fatal(err)
	}

	if got := ollama.unloadHit.Load(); got != 0 {
		t.Errorf("ollama (coexistent) unloaded %d times; should be 0", got)
	}
	if got := comfy.unloadHit.Load(); got != 1 {
		t.Errorf("comfyui unload hit count = %d, want 1", got)
	}
}

// ───── helpers ────────────────────────────────────────────────────────────

// withHook returns a handler that calls hook and then hands the request to
// h, unchanged, on every invocation. The tests use it to make a side effect
// (VRAM appearing to be freed) happen before the wrapped handler serves the
// request. The hook runs for every request, whatever its method or path.
func withHook(h http.Handler, hook func()) http.Handler {
	hooked := func(w http.ResponseWriter, r *http.Request) {
		hook()
		h.ServeHTTP(w, r)
	}
	return http.HandlerFunc(hooked)
}