// mGPUmanager/internal/scheduler/locked.go

package scheduler

import (
	"context"
	"sync"
	"time"

	"mgit.msbls.de/m/mGPUmanager/internal/config"
	"mgit.msbls.de/m/mGPUmanager/internal/registry"
)

// Locked is the step 4 ("Schritt 4") scheduler: a buffered channel used as a
// semaphore serialises every consumer's GPU work. Jobs wait FIFO-ish at the
// channel until a slot is available, then run to completion. The channel's
// capacity is chosen by NewLocked; with capacity 1 it acts as a global mutex.
//
// Why one global lock instead of per-stream or per-consumer:
//   - mRock is single-GPU + single-user. Theoretical parallelism (e.g. mvoice
//     + ollama small model coexisting) is given up to gain predictability:
//     no more CUDA-OOM races between concurrent loaders.
//   - The design (§4.3) is explicit (translated from German): "The lock is
//     coarse-grained (one mutex) […]. We give up theoretical parallelism and
//     gain predictability in return."
//
// Step 5 ("Schritt 5") wraps this with eviction logic that runs before sem
// acquire when the requested consumer's resident cost would exceed available
// headroom.
type Locked struct {
	// reg is the consumer registry; Run calls reg.MarkActive for each job.
	reg     *registry.Registry
	gpuLock chan struct{} // semaphore: a send acquires a slot, a receive frees it; usable in select, so the wait is cancellable

	// mu guards all bookkeeping fields below; Stats reads them under it.
	mu           sync.Mutex
	// inFlight counts jobs that currently hold a gpuLock slot.
	inFlight     int
	// queueDepth counts Run calls that are waiting for a slot.
	queueDepth   int
	// total counts jobs that have acquired a slot since construction.
	total        int64
	// lastWaitMS is how long the most recently admitted job waited, in ms.
	lastWaitMS   int64
	// lastRunMS is the duration of the most recently finished job, in ms.
	lastRunMS    int64
	// oldestQueued is set when the queue goes from empty to non-empty and is
	// reset to the zero time when the queue drains. It is not advanced when
	// the oldest waiter leaves while others remain, so it is a lower bound
	// on the true oldest enqueue time rather than an exact value.
	oldestQueued time.Time
}

// NewLocked builds a Locked scheduler bound to reg. capacity is the number
// of jobs that may hold the GPU at the same time (Phase 1 wires this as 1);
// any value below 1 is treated as 1.
func NewLocked(reg *registry.Registry, capacity int) *Locked {
	slots := capacity
	if slots < 1 {
		// Zero or negative capacity would make the semaphore unusable;
		// fall back to a plain mutex.
		slots = 1
	}

	s := new(Locked)
	s.reg = reg
	s.gpuLock = make(chan struct{}, slots)
	return s
}

// Run marks consumer as active in the registry, waits for a free GPU slot,
// executes fn while holding that slot, and releases it afterwards.
//
// ctx governs the wait: if it is cancelled before fn starts, Run returns
// ctx.Err() without running fn and without leaking a slot. Once fn has
// started, ctx is only passed through to fn; Run does not interrupt it.
//
// The returned error is either ctx.Err() (job never started) or whatever fn
// returned. The slot and the registry mark are released via defer, so they
// are also cleaned up if fn panics.
func (s *Locked) Run(ctx context.Context, consumer string, fn Job) error {
	release := s.reg.MarkActive(consumer)
	defer release()

	queuedAt := time.Now()
	s.mu.Lock()
	s.queueDepth++
	if s.queueDepth == 1 || s.oldestQueued.IsZero() {
		s.oldestQueued = queuedAt
	}
	s.mu.Unlock()

	// leaveQueue undoes the enqueue bookkeeping above. The caller must hold
	// s.mu so the transition can be combined atomically with other updates.
	leaveQueue := func() {
		s.queueDepth--
		if s.queueDepth == 0 {
			s.oldestQueued = time.Time{}
		}
	}

	// Acquire a slot or bail on cancellation.
	select {
	case s.gpuLock <- struct{}{}:
	case <-ctx.Done():
		s.mu.Lock()
		leaveQueue()
		s.mu.Unlock()
		return ctx.Err()
	}

	// select picks pseudo-randomly when both cases are ready, so a job whose
	// ctx is already cancelled can still win the slot. Re-check and hand the
	// slot straight back instead of running cancelled work on the GPU.
	if err := ctx.Err(); err != nil {
		s.mu.Lock()
		leaveQueue()
		s.mu.Unlock()
		<-s.gpuLock
		return err
	}
	waitMS := time.Since(queuedAt).Milliseconds()

	// Move the job from "queued" to "in flight" under a single lock so Stats
	// never observes it in neither state.
	s.mu.Lock()
	leaveQueue()
	s.inFlight++
	s.total++
	s.lastWaitMS = waitMS
	s.mu.Unlock()

	defer func() {
		// Decrement before freeing the slot: if the slot were released
		// first, the next job could bump inFlight before this decrement and
		// Stats would briefly report more jobs in flight than slots exist.
		s.mu.Lock()
		s.inFlight--
		s.mu.Unlock()
		<-s.gpuLock
	}()

	start := time.Now()
	err := fn(ctx)
	runMS := time.Since(start).Milliseconds()
	s.mu.Lock()
	s.lastRunMS = runMS
	s.mu.Unlock()
	return err
}

// Stats returns a point-in-time snapshot of the scheduler's counters and
// most recent timings, taken under the bookkeeping mutex. It is intended for
// /v1/status.
func (s *Locked) Stats() Stats {
	var snap Stats

	s.mu.Lock()
	snap.QueueDepth = s.queueDepth
	snap.InFlight = s.inFlight
	snap.TotalJobs = s.total
	snap.LastWaitMS = s.lastWaitMS
	snap.LastRunMS = s.lastRunMS
	snap.OldestQueued = s.oldestQueued
	s.mu.Unlock()

	return snap
}

var (
	// Compile-time check that *Locked satisfies the Scheduler interface.
	_ Scheduler = (*Locked)(nil)

	// Blank reference that keeps the config import in use. Per the original
	// note, the edge is kept live for Schritt 5's VRAM-pressure evaluation,
	// which is expected to read cfg in this package.
	_ = config.KindTTS
)