// mGPUmanager/internal/scheduler/locked_test.go

package scheduler

import (
	"context"
	"errors"
	"log/slog"
	"io"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"mgit.msbls.de/m/mGPUmanager/internal/config"
	"mgit.msbls.de/m/mGPUmanager/internal/registry"
)

// newReg returns a registry built from a config that declares a single
// "mvoice" consumer. The registry is handed a logger whose output is
// discarded so tests stay quiet.
func newReg() *registry.Registry {
	mvoice := &config.Consumer{
		URL:    "http://localhost:8766",
		Health: config.Route{Method: "GET", Path: "/api/health"},
	}
	cfg := &config.Config{
		Consumers: map[string]*config.Consumer{"mvoice": mvoice},
	}
	quiet := slog.New(slog.NewTextHandler(io.Discard, nil))
	return registry.New(cfg, quiet)
}

// TestLockedSerialisesConcurrentJobs guards against the
// CUDA-OOM-from-parallel-loaders class of regressions: several TTS calls
// submitted at the same moment have to execute one after another, never
// side by side.
func TestLockedSerialisesConcurrentJobs(t *testing.T) {
	const jobs = 5

	var (
		running atomic.Int32 // jobs currently executing their body
		peak    atomic.Int32 // highest value running has ever reached
	)

	// work notes how many jobs overlap with it, then lingers briefly so
	// that any overlap has a chance to be observed by the others.
	work := func(context.Context) error {
		active := running.Add(1)
		// Raise peak to active unless another job already recorded a
		// value at least as high.
		for prev := peak.Load(); active > prev; prev = peak.Load() {
			if peak.CompareAndSwap(prev, active) {
				break
			}
		}
		time.Sleep(20 * time.Millisecond)
		running.Add(-1)
		return nil
	}

	sched := NewLocked(newReg(), 1)

	var wg sync.WaitGroup
	for i := 0; i < jobs; i++ {
		wg.Go(func() {
			err := sched.Run(context.Background(), "mvoice", work)
			if err != nil {
				t.Errorf("Run: %v", err)
			}
		})
	}
	wg.Wait()

	if got := peak.Load(); got != 1 {
		t.Fatalf("max concurrent jobs = %d, want 1", got)
	}
	if got := sched.Stats().TotalJobs; got != jobs {
		t.Errorf("Stats.TotalJobs = %d, want %d", got, jobs)
	}
}

// TestLockedRespectsContextCancel verifies that a Run call which is stuck
// behind another job returns context.Canceled once its own context is
// cancelled, and that its job function is never invoked.
func TestLockedRespectsContextCancel(t *testing.T) {
	sched := NewLocked(newReg(), 1)

	// Hold the lock with a job that blocks until release is closed.
	started := make(chan struct{})
	release := make(chan struct{})
	holderErr := make(chan error, 1) // buffered: the holder never blocks on send
	go func() {
		holderErr <- sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
			close(started)
			<-release
			return nil
		})
	}()
	// Deferred so the holder is released and joined on every exit path,
	// including t.Fatal (which unwinds via runtime.Goexit and still runs
	// deferred calls). Without this a failing assertion would leave the
	// holder goroutine parked forever.
	defer func() {
		close(release)
		select {
		case err := <-holderErr:
			if err != nil {
				t.Errorf("holder Run: %v", err)
			}
		case <-time.After(time.Second):
			t.Error("holder Run did not return within 1s of being released")
		}
	}()
	<-started

	// Now try to run with a context that we'll cancel. The job records
	// that it ran in a flag instead of calling t.Error directly: this
	// goroutine may outlive the test on the timeout path, and touching t
	// after the test has completed panics.
	var ran atomic.Bool
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	errCh := make(chan error, 1)
	go func() {
		errCh <- sched.Run(ctx, "mvoice", func(ctx context.Context) error {
			ran.Store(true)
			return nil
		})
	}()

	// Wait until the scheduler reports the second job as queued, so the
	// cancellation hits a waiter rather than racing the call to Run. If
	// that is never reported we cancel anyway once the deadline passes.
	deadline := time.Now().Add(time.Second)
	for sched.Stats().QueueDepth != 1 && time.Now().Before(deadline) {
		time.Sleep(2 * time.Millisecond)
	}
	cancel()

	select {
	case err := <-errCh:
		// Run has returned, so whether the job ran is now settled.
		if ran.Load() {
			t.Error("second job should not run after cancellation")
		}
		if !errors.Is(err, context.Canceled) {
			t.Fatalf("got err=%v, want context.Canceled", err)
		}
	case <-time.After(time.Second):
		t.Fatal("cancelled Run did not return within 1s")
	}
}

// TestLockedStatsTrackInFlightAndQueue checks the counters reported by
// Stats while one job holds the scheduler and a second one waits behind
// it, and again after both have finished.
func TestLockedStatsTrackInFlightAndQueue(t *testing.T) {
	sched := NewLocked(newReg(), 1)

	// Start a job that occupies the scheduler until jobBlock is closed.
	// Run results travel over buffered channels so the goroutines never
	// block on send and never touch t after the test has returned.
	jobStart := make(chan struct{})
	jobBlock := make(chan struct{})
	holderErr := make(chan error, 1)
	go func() {
		holderErr <- sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
			close(jobStart)
			<-jobBlock
			return nil
		})
	}()
	<-jobStart

	// Inside the holding job: InFlight==1, QueueDepth==0.
	s := sched.Stats()
	if s.InFlight != 1 {
		t.Errorf("InFlight while holding = %d, want 1", s.InFlight)
	}
	if s.QueueDepth != 0 {
		t.Errorf("QueueDepth = %d, want 0", s.QueueDepth)
	}

	// Queue a waiter and verify QueueDepth grows.
	waiterErr := make(chan error, 1)
	go func() {
		waiterErr <- sched.Run(context.Background(), "mvoice", func(ctx context.Context) error {
			return nil
		})
	}()
	// The waiter goroutine is scheduled asynchronously, so poll (bounded
	// to one second) until the scheduler reports it as queued.
	deadline := time.Now().Add(time.Second)
	for sched.Stats().QueueDepth != 1 && time.Now().Before(deadline) {
		time.Sleep(2 * time.Millisecond)
	}
	if got := sched.Stats().QueueDepth; got != 1 {
		t.Errorf("QueueDepth with one waiter = %d, want 1", got)
	}

	// Release the holder and join both Run calls under one shared
	// timeout, so a broken hand-off fails fast instead of hanging until
	// the global test timeout.
	close(jobBlock)
	timeout := time.After(time.Second)
	await := func(name string, ch <-chan error) {
		t.Helper()
		select {
		case err := <-ch:
			if err != nil {
				t.Errorf("%s Run: %v", name, err)
			}
		case <-timeout:
			t.Fatalf("%s Run did not return within 1s of releasing the holder", name)
		}
	}
	await("holder", holderErr)
	await("waiter", waiterErr)

	// Both Run calls have returned, so the counter must be back to zero.
	if got := sched.Stats().InFlight; got != 0 {
		t.Errorf("InFlight after both done = %d, want 0", got)
	}
}