Replaces the MVP Passthrough with scheduler.Locked: a capacity-1 channel serialises every consumer's GPU work end-to-end. main.go switches to it.

Behavioural contract:
- Jobs that arrive while another job holds the GPU block on the channel until the holder finishes. Context cancellation aborts the wait cleanly (no leaked tokens, queue depth decremented).
- Stats track queue_depth, in_flight, total_jobs, last_wait_ms, last_run_ms, oldest_queued — surfaced through /v1/status.
- One lock for ALL consumers (not per-consumer): the design (§4.3) is explicit that coarse-grained ("grobgranular") locking beats GPU-stream-granular locking on single-GPU single-user hardware. mvoice + ollama + comfyui never run truly concurrently any more, which is the whole point — concurrent runs are what produced the CUDA-OOM under load.

Tests:
- 5 goroutines hammer the scheduler concurrently → max in-flight = 1.
- Cancellation while parked on the lock returns ctx.Err() and frees the queue slot.
- Stats reflect in-flight + queue-depth transitions correctly.
- Race detector clean.

Schritt 5 (step 5) will compose this with VRAM-pressure eviction: before acquiring the lock, check if the target consumer's resident cost fits under the current GPU headroom; if not, unload the LRU non-coexistent consumer first.

Refs: m/mGPUmanager#1 (Schritt 4).
125 lines
3.1 KiB
Go
125 lines
3.1 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
)
|
|
|
|
// Locked is the Schritt 4 scheduler: a single capacity-1 semaphore serialises
|
|
// every consumer's GPU work. Jobs wait FIFO-ish at the channel until the lock
|
|
// is available, then run to completion.
|
|
//
|
|
// Why one global lock instead of per-stream or per-consumer:
|
|
// - mRock is single-GPU + single-user. Theoretical parallelism (e.g. mvoice
|
|
// + ollama small model coexisting) is given up to gain predictability:
|
|
// no more CUDA-OOM races between concurrent loaders.
|
|
// - The design (§4.3) is explicit: "Der Lock ist grobgranular (ein Mutex)
|
|
// […]. Wir verschenken theoretische Parallelität, gewinnen dafür
|
|
// Vorhersagbarkeit."
|
|
//
|
|
// Schritt 5 wraps this with eviction logic that runs before sem acquire when
|
|
// the requested consumer's resident cost would exceed available headroom.
|
|
type Locked struct {
|
|
reg *registry.Registry
|
|
gpuLock chan struct{} // capacity-1 = global mutex with cancellable acquire
|
|
|
|
mu sync.Mutex
|
|
inFlight int
|
|
queueDepth int
|
|
total int64
|
|
lastWaitMS int64
|
|
lastRunMS int64
|
|
oldestQueued time.Time
|
|
}
|
|
|
|
// NewLocked returns the serialising scheduler. capacity is the number of
|
|
// concurrent jobs allowed on the GPU (Phase 1 wires this as 1).
|
|
func NewLocked(reg *registry.Registry, capacity int) *Locked {
|
|
if capacity < 1 {
|
|
capacity = 1
|
|
}
|
|
return &Locked{
|
|
reg: reg,
|
|
gpuLock: make(chan struct{}, capacity),
|
|
}
|
|
}
|
|
|
|
// Run acquires the global GPU lock, executes fn while holding it, and
|
|
// releases. Cancellation via ctx aborts the wait without leaking a token.
|
|
func (s *Locked) Run(ctx context.Context, consumer string, fn Job) error {
|
|
release := s.reg.MarkActive(consumer)
|
|
defer release()
|
|
|
|
queuedAt := time.Now()
|
|
s.mu.Lock()
|
|
s.queueDepth++
|
|
if s.queueDepth == 1 || s.oldestQueued.IsZero() {
|
|
s.oldestQueued = queuedAt
|
|
}
|
|
s.mu.Unlock()
|
|
|
|
// Acquire global lock or bail on cancellation.
|
|
select {
|
|
case s.gpuLock <- struct{}{}:
|
|
case <-ctx.Done():
|
|
s.mu.Lock()
|
|
s.queueDepth--
|
|
if s.queueDepth == 0 {
|
|
s.oldestQueued = time.Time{}
|
|
}
|
|
s.mu.Unlock()
|
|
return ctx.Err()
|
|
}
|
|
waitMS := time.Since(queuedAt).Milliseconds()
|
|
|
|
s.mu.Lock()
|
|
s.queueDepth--
|
|
if s.queueDepth == 0 {
|
|
s.oldestQueued = time.Time{}
|
|
}
|
|
s.inFlight++
|
|
s.total++
|
|
s.lastWaitMS = waitMS
|
|
s.mu.Unlock()
|
|
|
|
defer func() {
|
|
<-s.gpuLock
|
|
s.mu.Lock()
|
|
s.inFlight--
|
|
s.mu.Unlock()
|
|
}()
|
|
|
|
start := time.Now()
|
|
err := fn(ctx)
|
|
runMS := time.Since(start).Milliseconds()
|
|
s.mu.Lock()
|
|
s.lastRunMS = runMS
|
|
s.mu.Unlock()
|
|
return err
|
|
}
|
|
|
|
// Stats reports current depth + last timings for /v1/status.
|
|
func (s *Locked) Stats() Stats {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
return Stats{
|
|
QueueDepth: s.queueDepth,
|
|
InFlight: s.inFlight,
|
|
TotalJobs: s.total,
|
|
LastWaitMS: s.lastWaitMS,
|
|
LastRunMS: s.lastRunMS,
|
|
OldestQueued: s.oldestQueued,
|
|
}
|
|
}
|
|
|
|
// Compile-time interface guard.
|
|
var _ Scheduler = (*Locked)(nil)
|
|
|
|
// Unused import guard — keeps the config package edge live for Schritt 5's
|
|
// VRAM-pressure evaluation, which reads cfg in this same package.
|
|
var _ = config.KindTTS
|