package scheduler

import (
	"context"
	"sync"
	"time"

	"mgit.msbls.de/m/mGPUmanager/internal/config"
	"mgit.msbls.de/m/mGPUmanager/internal/registry"
)

// Locked is the step 4 scheduler: a single capacity-1 semaphore serialises
// every consumer's GPU work. Jobs wait FIFO-ish at the channel until the lock
// is available, then run to completion.
//
// Why one global lock instead of per-stream or per-consumer:
//   - mRock is single-GPU + single-user. Theoretical parallelism (e.g. mvoice
//     + ollama small model coexisting) is given up to gain predictability:
//     no more CUDA-OOM races between concurrent loaders.
//   - The design (§4.3) is explicit: "The lock is coarse-grained (a single
//     mutex) […]. We give up theoretical parallelism and gain predictability
//     in return."
//
// Step 5 wraps this with eviction logic that runs before the semaphore is
// acquired when the requested consumer's resident cost would exceed the
// available headroom.
type Locked struct {
	reg     *registry.Registry
	gpuLock chan struct{} // buffered semaphore; capacity 1 = global mutex with cancellable acquire

	mu sync.Mutex // guards every field below

	inFlight   int   // jobs currently holding a gpuLock token
	total      int64 // jobs that have acquired the lock so far
	lastWaitMS int64 // queue wait of the most recent job to acquire the lock
	lastRunMS  int64 // run time of the most recent job to return from fn

	// waiting maps a per-call ticket to the time that call started waiting.
	// An entry exists exactly while the call is blocked on gpuLock, so the
	// queue depth and the oldest enqueue time are always derived from the
	// callers that are still waiting.
	waiting    map[uint64]time.Time
	nextTicket uint64
}

// NewLocked returns the serialising scheduler. capacity is the number of
// concurrent jobs allowed on the GPU (Phase 1 wires this as 1); values below
// 1 are raised to 1.
func NewLocked(reg *registry.Registry, capacity int) *Locked {
	if capacity < 1 {
		capacity = 1
	}
	return &Locked{
		reg:     reg,
		gpuLock: make(chan struct{}, capacity),
		waiting: make(map[uint64]time.Time),
	}
}

// Run acquires the global GPU lock, executes fn while holding it, and
// releases. Cancellation via ctx aborts the wait without leaking a token.
//
// The consumer is marked active in the registry for the whole call, including
// the time spent waiting. Run returns ctx.Err() if ctx is done before the lock
// is acquired; otherwise it returns whatever fn returns.
func (s *Locked) Run(ctx context.Context, consumer string, fn Job) error {
	release := s.reg.MarkActive(consumer)
	defer release()

	queuedAt := time.Now()
	ticket := s.enqueue(queuedAt)

	// Acquire global lock or bail on cancellation.
	select {
	case s.gpuLock <- struct{}{}:
	case <-ctx.Done():
		s.mu.Lock()
		delete(s.waiting, ticket)
		s.mu.Unlock()
		return ctx.Err()
	}

	waitMS := time.Since(queuedAt).Milliseconds()
	s.mu.Lock()
	delete(s.waiting, ticket)
	s.inFlight++
	s.total++
	s.lastWaitMS = waitMS
	s.mu.Unlock()

	defer func() {
		// Decrement before handing the token back: the next waiter increments
		// inFlight only after it has acquired the token, so this ordering
		// keeps the reported in-flight count from exceeding the capacity.
		s.mu.Lock()
		s.inFlight--
		s.mu.Unlock()
		<-s.gpuLock
	}()

	start := time.Now()
	err := fn(ctx)
	runMS := time.Since(start).Milliseconds()

	s.mu.Lock()
	s.lastRunMS = runMS
	s.mu.Unlock()
	return err
}

// enqueue records a new waiter that started waiting at the given time and
// returns the ticket under which it is stored in s.waiting.
func (s *Locked) enqueue(at time.Time) uint64 {
	s.mu.Lock()
	defer s.mu.Unlock()
	if s.waiting == nil {
		// Tolerate a Locked that was not built by NewLocked.
		s.waiting = make(map[uint64]time.Time)
	}
	s.nextTicket++
	s.waiting[s.nextTicket] = at
	return s.nextTicket
}

// oldestWaitingLocked returns the earliest enqueue time among the callers
// still waiting, or the zero time when nobody waits. s.mu must be held.
func (s *Locked) oldestWaitingLocked() time.Time {
	var oldest time.Time
	for _, at := range s.waiting {
		if oldest.IsZero() || at.Before(oldest) {
			oldest = at
		}
	}
	return oldest
}

// Stats reports current depth + last timings for /v1/status.
// OldestQueued is the zero time when no job is waiting.
func (s *Locked) Stats() Stats {
	s.mu.Lock()
	defer s.mu.Unlock()
	return Stats{
		QueueDepth:   len(s.waiting),
		InFlight:     s.inFlight,
		TotalJobs:    s.total,
		LastWaitMS:   s.lastWaitMS,
		LastRunMS:    s.lastRunMS,
		OldestQueued: s.oldestWaitingLocked(),
	}
}

// Compile-time interface guard.
var _ Scheduler = (*Locked)(nil)

// Unused import guard — keeps the config package edge live for step 5's
// VRAM-pressure evaluation, which reads cfg in this same package.
var _ = config.KindTTS