Replaces the MVP Passthrough with scheduler.Locked: a capacity-1 channel serialises every consumer's GPU work end-to-end. main.go switches to it. Behavioural contract: - Jobs that arrive while another job holds the GPU block on the channel until the holder finishes. Context cancellation aborts the wait cleanly (no leaked tokens, queue depth decremented). - Stats track queue_depth, in_flight, total_jobs, last_wait_ms, last_run_ms, oldest_queued — surfaced through /v1/status. - One lock for ALL consumers (not per-consumer): the design (§4.3) is explicit that coarse-grained locking beats GPU-stream-granular locking on single-GPU single-user hardware. mvoice + ollama + comfyui never run truly concurrently any more, which is the whole point — concurrent runs are what produced the CUDA-OOM under load. Tests: - 5 goroutines hammer the scheduler concurrently → max in-flight = 1. - Cancellation while parked on the lock returns ctx.Err() and frees the queue slot. - Stats reflect in-flight + queue-depth transitions correctly. - Race detector clean. Step 5 (Schritt 5) will compose this with VRAM-pressure eviction: before acquiring the lock, check if the target consumer's resident cost fits under the current GPU headroom; if not, unload the LRU non-coexistent consumer first. Refs: m/mGPUmanager#1 (Schritt 4).
165 lines
3.8 KiB
Go
165 lines
3.8 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"log/slog"
|
|
"io"
|
|
"sync"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
)
|
|
|
|
func newReg() *registry.Registry {
|
|
cfg := &config.Config{
|
|
Consumers: map[string]*config.Consumer{
|
|
"mvoice": {
|
|
URL: "http://localhost:8766",
|
|
Health: config.Route{Method: "GET", Path: "/api/health"},
|
|
},
|
|
},
|
|
}
|
|
return registry.New(cfg, slog.New(slog.NewTextHandler(io.Discard, nil)))
|
|
}
|
|
|
|
// TestLockedSerialisesConcurrentJobs is the regression test for the
|
|
// CUDA-OOM-from-parallel-loaders class: two TTS calls that arrive at the
|
|
// same time must run sequentially, not concurrently.
|
|
func TestLockedSerialisesConcurrentJobs(t *testing.T) {
|
|
sched := NewLocked(newReg(), 1)
|
|
|
|
var maxConcurrent atomic.Int32
|
|
var inFlight atomic.Int32
|
|
|
|
job := func(ctx context.Context) error {
|
|
now := inFlight.Add(1)
|
|
// Update max in a CAS loop (small N, never contested in practice).
|
|
for {
|
|
cur := maxConcurrent.Load()
|
|
if now <= cur || maxConcurrent.CompareAndSwap(cur, now) {
|
|
break
|
|
}
|
|
}
|
|
time.Sleep(20 * time.Millisecond)
|
|
inFlight.Add(-1)
|
|
return nil
|
|
}
|
|
|
|
var wg sync.WaitGroup
|
|
const n = 5
|
|
for range n {
|
|
wg.Go(func() {
|
|
if err := sched.Run(context.Background(), "mvoice", job); err != nil {
|
|
t.Errorf("Run: %v", err)
|
|
}
|
|
})
|
|
}
|
|
wg.Wait()
|
|
|
|
if got := maxConcurrent.Load(); got != 1 {
|
|
t.Fatalf("max concurrent jobs = %d, want 1", got)
|
|
}
|
|
if got := sched.Stats().TotalJobs; got != n {
|
|
t.Errorf("Stats.TotalJobs = %d, want %d", got, n)
|
|
}
|
|
}
|
|
|
|
func TestLockedRespectsContextCancel(t *testing.T) {
|
|
sched := NewLocked(newReg(), 1)
|
|
|
|
// Hold the lock with a long-running job.
|
|
started := make(chan struct{})
|
|
done := make(chan struct{})
|
|
go func() {
|
|
_ = sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
|
|
close(started)
|
|
<-done
|
|
return nil
|
|
})
|
|
}()
|
|
<-started
|
|
|
|
// Now try to run with a context that we'll cancel.
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
errCh := make(chan error, 1)
|
|
go func() {
|
|
errCh <- sched.Run(ctx, "mvoice", func(ctx context.Context) error {
|
|
t.Error("second job should not run after cancellation")
|
|
return nil
|
|
})
|
|
}()
|
|
|
|
// Give the second job time to queue.
|
|
time.Sleep(10 * time.Millisecond)
|
|
cancel()
|
|
|
|
select {
|
|
case err := <-errCh:
|
|
if !errors.Is(err, context.Canceled) {
|
|
t.Fatalf("got err=%v, want context.Canceled", err)
|
|
}
|
|
case <-time.After(time.Second):
|
|
t.Fatal("cancelled Run did not return within 1s")
|
|
}
|
|
|
|
// Release the holder.
|
|
close(done)
|
|
}
|
|
|
|
func TestLockedStatsTrackInFlightAndQueue(t *testing.T) {
|
|
sched := NewLocked(newReg(), 1)
|
|
|
|
jobStart := make(chan struct{})
|
|
jobBlock := make(chan struct{})
|
|
go func() {
|
|
_ = sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
|
|
close(jobStart)
|
|
<-jobBlock
|
|
return nil
|
|
})
|
|
}()
|
|
<-jobStart
|
|
|
|
// Inside the holding job: InFlight==1, QueueDepth==0.
|
|
s := sched.Stats()
|
|
if s.InFlight != 1 {
|
|
t.Errorf("InFlight while holding = %d, want 1", s.InFlight)
|
|
}
|
|
if s.QueueDepth != 0 {
|
|
t.Errorf("QueueDepth = %d, want 0", s.QueueDepth)
|
|
}
|
|
|
|
// Queue a waiter and verify QueueDepth grows.
|
|
waitStarted := make(chan struct{})
|
|
waitDone := make(chan struct{})
|
|
go func() {
|
|
close(waitStarted)
|
|
_ = sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
|
|
return nil
|
|
})
|
|
close(waitDone)
|
|
}()
|
|
<-waitStarted
|
|
// Wait for the waiter to actually be parked on the channel.
|
|
deadline := time.Now().Add(time.Second)
|
|
for time.Now().Before(deadline) {
|
|
if sched.Stats().QueueDepth == 1 {
|
|
break
|
|
}
|
|
time.Sleep(2 * time.Millisecond)
|
|
}
|
|
if got := sched.Stats().QueueDepth; got != 1 {
|
|
t.Errorf("QueueDepth with one waiter = %d, want 1", got)
|
|
}
|
|
|
|
close(jobBlock)
|
|
<-waitDone
|
|
if got := sched.Stats().InFlight; got != 0 {
|
|
t.Errorf("InFlight after both done = %d, want 0", got)
|
|
}
|
|
}
|