Files
mGPUmanager/internal/server/server.go
mAi 3b3d828e9e feat: Schritt 4 — Locked scheduler (global GPU lock, queue, stats)
Replaces the MVP Passthrough with scheduler.Locked: a capacity-1 channel
serialises every consumer's GPU work end-to-end. main.go switches to it.

Behavioural contract:
- Jobs that arrive while another job holds the GPU block on the channel
  until the holder finishes. Context cancellation aborts the wait
  cleanly (no leaked tokens, queue depth decremented).
- Stats track queue_depth, in_flight, total_jobs, last_wait_ms,
  last_run_ms, oldest_queued — surfaced through /v1/status.
- One lock for ALL consumers (not per-consumer): the design (§4.3) is
  explicit that coarse-grained locking beats GPU-stream-granular locking
  on single-GPU, single-user hardware. mvoice + ollama + comfyui never
  run truly concurrently any more, which is the whole point — concurrent
  runs are what produced the CUDA-OOM under load.

Tests:
- 5 goroutines hammer the scheduler concurrently → max in-flight = 1.
- Cancellation while parked on the lock returns ctx.Err() and frees
  the queue slot.
- Stats reflect in-flight + queue-depth transitions correctly.
- Race detector clean.

Schritt 5 will compose this with VRAM-pressure eviction: before
acquiring the lock, check if the target consumer's resident cost fits
under the current GPU headroom; if not, unload the LRU non-coexistent
consumer first.

Refs: m/mGPUmanager#1 (Schritt 4).
2026-05-11 13:33:39 +02:00

380 lines
12 KiB
Go

// Package server is the HTTP façade of mGPUmanager.
//
// It exposes:
// - POST /v1/tts, /v1/stt, /v1/llm, /v1/image — pass-through proxy to the
// consumer named in config.Routing[kind].
// - GET /audio/* — proxy to config.AudioProxy (mvoice's audio directory).
// - GET /v1/status — live snapshot of consumers + GPU + scheduler.
// - GET /healthz — broker liveness (200 if process is up).
//
// Every /v1/<kind> proxy call goes through the Scheduler so that queueing
// (step 4) and eviction (step 5) can be added without touching server.go.
// The /audio proxy calls its consumer directly and bypasses the Scheduler.
package server
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"log/slog"
"net/http"
"net/url"
"strings"
"time"
"mgit.msbls.de/m/mGPUmanager/internal/config"
"mgit.msbls.de/m/mGPUmanager/internal/gpu"
"mgit.msbls.de/m/mGPUmanager/internal/registry"
"mgit.msbls.de/m/mGPUmanager/internal/scheduler"
)
// Server bundles the HTTP handlers together with the dependencies they use.
// It is constructed by New and exposes its routes through Handler.
type Server struct {
// cfg is the loaded configuration: routing, consumers, audio proxy and GPU totals.
cfg *config.Config
// reg supplies per-consumer state for the health gate and for /v1/status.
reg *registry.Registry
// gpu supplies the most recent GPU sample reported by /v1/status.
gpu *gpu.Poller
// sched wraps every /v1/<kind> proxy call and reports stats for /v1/status.
sched scheduler.Scheduler
// client is the shared HTTP client used for all upstream requests.
client *http.Client
// logger receives one log entry per request; nil disables request logging.
logger *slog.Logger
}
// New assembles a Server from its collaborators. The registry, GPU poller and
// scheduler remain owned by the caller; New only creates the shared upstream
// HTTP client.
func New(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poller, sched scheduler.Scheduler, logger *slog.Logger) *Server {
	// TTS can take 5-10s and image generation up to 60s, so the upstream
	// timeout is deliberately generous.
	upstreamClient := &http.Client{Timeout: 120 * time.Second}

	srv := new(Server)
	srv.cfg = cfg
	srv.reg = reg
	srv.gpu = gpuPoller
	srv.sched = sched
	srv.client = upstreamClient
	srv.logger = logger
	return srv
}
// Handler builds the root mux with every route registered and wraps it in the
// request-logging middleware. The caller is expected to serve it from an
// http.Server.
func (s *Server) Handler() http.Handler {
	mux := http.NewServeMux()

	// Inference endpoints: one POST route per endpoint kind.
	inference := []struct {
		pattern string
		kind    config.EndpointKind
	}{
		{"POST /v1/tts", config.KindTTS},
		{"POST /v1/stt", config.KindSTT},
		{"POST /v1/llm", config.KindLLM},
		{"POST /v1/image", config.KindImage},
	}
	for _, ep := range inference {
		mux.HandleFunc(ep.pattern, s.handleEndpoint(ep.kind))
	}

	// The audio proxy is only mounted when both its consumer and its path
	// prefix are configured.
	if prefix := s.cfg.AudioPathPrefix; s.cfg.AudioProxy != "" && prefix != "" {
		mux.HandleFunc("GET "+prefix, s.handleAudio)
	}

	mux.HandleFunc("GET /v1/status", s.handleStatus)
	mux.HandleFunc("GET /healthz", s.handleHealthz)
	mux.HandleFunc("GET /", s.handleRoot)

	return logMiddleware(s.logger, mux)
}
// ───── error envelope ─────────────────────────────────────────────────────
// errorBody is the broker's structured error envelope. Every non-2xx response
// that mGPUmanager produces itself uses this shape. (Pass-through 4xx/5xx from
// consumers are forwarded verbatim so callers see the original payload.)
type errorBody struct {
// Error is the short machine-readable code, e.g. "no_consumer".
Error string `json:"error"`
// Message is the human-readable detail.
Message string `json:"message"`
// Consumer names the consumer involved; omitted from the JSON when empty.
Consumer string `json:"consumer,omitempty"`
// Retryable is the retry hint passed by the caller of writeErr.
Retryable bool `json:"retryable"`
}
// writeErr sends the broker's error envelope as JSON with the given HTTP
// status. code, msg, consumer and retryable populate the envelope fields.
func writeErr(w http.ResponseWriter, status int, code, msg, consumer string, retryable bool) {
	envelope := errorBody{
		Error:     code,
		Message:   msg,
		Consumer:  consumer,
		Retryable: retryable,
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// Best effort: the status line is already out, so an encode or write
	// failure here cannot be reported to the client any more.
	enc := json.NewEncoder(w)
	_ = enc.Encode(envelope)
}
// ───── endpoint proxy ─────────────────────────────────────────────────────
// handleEndpoint builds the handler for one /v1/<kind> endpoint. The handler
// resolves the consumer and route for kind, rejects the request when the
// consumer's last probe failed, and otherwise runs the proxy call through the
// scheduler.
func (s *Server) handleEndpoint(kind config.EndpointKind) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		name, consumer := s.cfg.ConsumerForKind(kind)
		if consumer == nil {
			detail := fmt.Sprintf("no consumer routes %s", kind)
			writeErr(w, http.StatusNotImplemented, "no_consumer", detail, "", false)
			return
		}

		route, found := consumer.Paths[kind]
		if !found {
			detail := fmt.Sprintf("consumer %s lacks paths.%s", name, kind)
			writeErr(w, http.StatusNotImplemented, "no_consumer", detail, name, false)
			return
		}

		// Refuse fast when the consumer has been probed and the probe failed;
		// this keeps Felix-Banholzer-style silent fallback impossible. A
		// consumer that has never been probed is let through.
		state := s.reg.Get(name)
		probed := !state.LastProbe.IsZero()
		if probed && !state.Healthy {
			detail := fmt.Sprintf("consumer %s last probe failed: %s", name, state.LastError)
			writeErr(w, http.StatusServiceUnavailable, "consumer_unreachable", detail, name, true)
			return
		}

		job := func(ctx context.Context) error {
			return s.proxyRequest(ctx, w, r, consumer, route, name)
		}
		runErr := s.sched.Run(r.Context(), name, job)
		if runErr == nil || responseStarted(w) {
			return
		}
		writeErr(w, http.StatusInternalServerError, "scheduler_error",
			runErr.Error(), name, true)
	}
}
// proxyRequest forwards the inbound HTTP request to a consumer route and
// streams the response back. Failures that happen before the consumer responds
// are written to w as the broker's structured error envelope; once the
// consumer has begun responding, its status, headers and body are passed
// through unchanged (minus hop-by-hop headers).
//
// ctx bounds the upstream call. cons and route describe the target, and
// consumer is the consumer's name, used only in error envelopes. The function
// always returns nil: every failure has already been reported on w.
func (s *Server) proxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, cons *config.Consumer, route config.Route, consumer string) error {
	target, err := url.Parse(cons.URL)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
			err.Error(), consumer, false)
		return nil
	}
	target.Path = route.Path
	// Forward inbound query string verbatim.
	target.RawQuery = r.URL.RawQuery

	// The route may pin the upstream method; otherwise mirror the inbound one.
	method := route.Method
	if method == "" {
		method = r.Method
	}

	upstream, err := http.NewRequestWithContext(ctx, method, target.String(), r.Body)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_request",
			err.Error(), consumer, false)
		return nil
	}

	// Carry the body length through the ContentLength field. The client
	// transport ignores a Content-Length placed in Header and frames the body
	// from this field alone; left at 0 with a non-nil body it means "unknown",
	// which would send every forwarded body chunked.
	upstream.ContentLength = r.ContentLength

	// Copy through Content-Type and the Accept headers (don't carry Host).
	for _, h := range []string{"Content-Type", "Accept", "Accept-Encoding"} {
		if v := r.Header.Get(h); v != "" {
			upstream.Header.Set(h, v)
		}
	}

	resp, err := s.client.Do(upstream)
	if err != nil {
		writeErr(w, http.StatusBadGateway, "consumer_unreachable",
			fmt.Sprintf("upstream %s: %v", target.Host, err), consumer, true)
		return nil
	}
	defer resp.Body.Close()

	// Stream response. Hop-by-hop headers describe the upstream connection,
	// not ours, so they are not forwarded.
	for k, vs := range resp.Header {
		if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") {
			continue
		}
		for _, v := range vs {
			w.Header().Add(k, v)
		}
	}
	w.WriteHeader(resp.StatusCode)
	// A copy error means one side went away mid-stream; the status line is
	// already sent, so there is nothing left to report to the client.
	_, _ = io.Copy(w, resp.Body)
	return nil
}
// ───── audio proxy ────────────────────────────────────────────────────────
// handleAudio forwards GET /audio/<file> to the audio_proxy consumer (mvoice).
// wa.sh fetches the rendered .wav via this path after /v1/tts returns its URL.
//
// The inbound path and query string are forwarded unchanged. The call goes
// straight to the consumer: it does not pass through the scheduler and is not
// gated on the consumer's health state. Broker-side failures are reported with
// the structured error envelope; the consumer's own response is passed through
// (minus hop-by-hop headers).
func (s *Server) handleAudio(w http.ResponseWriter, r *http.Request) {
	if s.cfg.AudioProxy == "" {
		writeErr(w, http.StatusNotFound, "no_audio_proxy",
			"audio_proxy is not configured", "", false)
		return
	}
	cons, ok := s.cfg.Consumers[s.cfg.AudioProxy]
	if !ok {
		writeErr(w, http.StatusInternalServerError, "no_audio_proxy",
			"audio_proxy points at unknown consumer", s.cfg.AudioProxy, false)
		return
	}

	target, err := url.Parse(cons.URL)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
			err.Error(), s.cfg.AudioProxy, false)
		return
	}
	target.Path = r.URL.Path
	target.RawQuery = r.URL.RawQuery

	upstream, err := http.NewRequestWithContext(r.Context(), http.MethodGet, target.String(), nil)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_request",
			err.Error(), s.cfg.AudioProxy, false)
		return
	}

	resp, err := s.client.Do(upstream)
	if err != nil {
		writeErr(w, http.StatusBadGateway, "consumer_unreachable",
			fmt.Sprintf("upstream %s: %v", target.Host, err), s.cfg.AudioProxy, true)
		return
	}
	defer resp.Body.Close()

	// Hop-by-hop headers describe the upstream connection, not ours, so they
	// are dropped — the same filter proxyRequest applies.
	for k, vs := range resp.Header {
		if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") {
			continue
		}
		for _, v := range vs {
			w.Header().Add(k, v)
		}
	}
	w.WriteHeader(resp.StatusCode)
	// A copy error means one side went away mid-stream; the status line is
	// already sent, so there is nothing left to report to the client.
	_, _ = io.Copy(w, resp.Body)
}
// ───── status ─────────────────────────────────────────────────────────────
// statusResponse is the JSON document served by GET /v1/status.
type statusResponse struct {
// Listen echoes the configured listen address.
Listen string `json:"listen"`
// Time is the moment the snapshot was assembled.
Time time.Time `json:"time"`
// GPU combines configured GPU totals with the latest poller sample.
GPU statusGPU `json:"gpu"`
// Routing is the configured endpoint-kind to consumer-name mapping.
Routing map[config.EndpointKind]string `json:"routing"`
// Consumers has one entry per configured consumer, ordered by name.
Consumers []statusConsumer `json:"consumers"`
// Scheduler carries the scheduler's own stats.
Scheduler scheduler.Stats `json:"scheduler"`
}
// statusGPU is the GPU section of the /v1/status response.
type statusGPU struct {
// TotalMiB is the configured total; handleStatus falls back to the sampled
// total when the configured value is 0.
TotalMiB int `json:"total_mib"`
// UsedMiB and FreeMiB come from the latest GPU sample.
UsedMiB int `json:"used_mib"`
FreeMiB int `json:"free_mib"`
// ReservedMiB is taken from the configuration.
ReservedMiB int `json:"reserved_mib"`
// LastSample is the timestamp of the latest GPU sample.
LastSample time.Time `json:"last_sample"`
// Err is the sample's error text; omitted from the JSON when empty.
Err string `json:"err,omitempty"`
}
// statusConsumer is one consumer's entry in the /v1/status response. It merges
// static configuration (URL, VRAM budget, priority, coexistence list) with the
// registry's live state for that consumer.
type statusConsumer struct {
// Name is the consumer's key in the configuration.
Name string `json:"name"`
URL string `json:"url"`
// Healthy, Loaded, GPUResidentMiB, Active, TotalRequests, LastUsed,
// LastProbe and LastError are copied from the registry snapshot.
Healthy bool `json:"healthy"`
Loaded bool `json:"loaded"`
GPUResidentMiB int `json:"gpu_resident_mib"`
// VRAMBudgetMiB is the consumer's configured VRAMResidentMiB.
VRAMBudgetMiB int `json:"vram_budget_mib"`
Active int `json:"active"`
TotalRequests int64 `json:"total_requests"`
LastUsed time.Time `json:"last_used,omitzero"`
LastProbe time.Time `json:"last_probe,omitzero"`
LastError string `json:"last_error,omitempty"`
// Priority and CanCoexistWith are copied from the configuration.
Priority int `json:"priority"`
CanCoexistWith []string `json:"can_coexist_with"`
}
// handleStatus serves GET /v1/status: a JSON snapshot combining the latest GPU
// sample, the registry's per-consumer state, the configured routing and the
// scheduler's stats. Consumers are listed in name order so the output is
// stable between calls.
func (s *Server) handleStatus(w http.ResponseWriter, r *http.Request) {
	sample := s.gpu.Last()
	snap := s.reg.Snapshot()

	gpuView := statusGPU{
		TotalMiB:    s.cfg.GPU.TotalMiB,
		UsedMiB:     sample.UsedMiB,
		FreeMiB:     sample.FreeMiB,
		ReservedMiB: s.cfg.GPU.ReservedMiB,
		LastSample:  sample.At,
		Err:         sample.Err,
	}
	// With no configured total, report the sampled one if there is any.
	if gpuView.TotalMiB == 0 && sample.TotalMiB > 0 {
		gpuView.TotalMiB = sample.TotalMiB
	}

	out := statusResponse{
		Listen:    s.cfg.Listen,
		Time:      time.Now(),
		GPU:       gpuView,
		Routing:   s.cfg.Routing,
		Scheduler: s.sched.Stats(),
	}

	// Map iteration order is random; sort the configured names first.
	ordered := make([]string, 0, len(s.cfg.Consumers))
	for name := range s.cfg.Consumers {
		ordered = append(ordered, name)
	}
	sortStrings(ordered)

	for i := range ordered {
		name := ordered[i]
		cons, st := s.cfg.Consumers[name], snap[name]
		entry := statusConsumer{
			Name:           name,
			URL:            cons.URL,
			Healthy:        st.Healthy,
			Loaded:         st.Loaded,
			GPUResidentMiB: st.GPUResidentMiB,
			VRAMBudgetMiB:  cons.VRAMResidentMiB,
			Active:         st.Active,
			TotalRequests:  st.TotalRequests,
			LastUsed:       st.LastUsed,
			LastProbe:      st.LastProbe,
			LastError:      st.LastError,
			Priority:       cons.Priority,
			CanCoexistWith: cons.CanCoexistWith,
		}
		out.Consumers = append(out.Consumers, entry)
	}

	w.Header().Set("Content-Type", "application/json")
	// Best effort: an encode or write failure cannot be reported once the
	// body has started.
	enc := json.NewEncoder(w)
	_ = enc.Encode(out)
}
// handleHealthz serves GET /healthz. It answers with a fixed JSON body and the
// default 200 status without consulting any dependency.
func (s *Server) handleHealthz(w http.ResponseWriter, _ *http.Request) {
	const okBody = `{"status":"ok"}`

	w.Header().Set("Content-Type", "application/json")
	// Best effort: a failed write means the client is gone.
	_, _ = io.WriteString(w, okBody)
}
// handleRoot serves the catch-all "GET /" route with a one-line plain-text
// banner pointing at the status and inference endpoints.
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
	const banner = "mGPUmanager — see GET /v1/status for live state, POST /v1/{tts,stt,llm,image} for inference\n"

	w.Header().Set("Content-Type", "text/plain")
	// Best effort: a failed write means the client is gone.
	_, _ = io.Copy(w, bytes.NewBufferString(banner))
}
// ───── helpers ────────────────────────────────────────────────────────────
// responseStarted is meant to report whether headers have already been
// written, since after that point the error envelope can no longer be sent.
// It is currently a stub that always reports false: the proxy path writes
// headers only inside proxyRequest, which handles its own errors before that
// point.
func responseStarted(_ http.ResponseWriter) bool {
	const started = false
	return started
}
// sortStrings sorts s in place in ascending byte-wise order using insertion
// sort. It exists so this file does not need to import "sort" for the handful
// of names it orders.
func sortStrings(s []string) {
	for next := 1; next < len(s); next++ {
		// Lift the element out, shift larger ones right, then drop it into
		// the gap.
		current := s[next]
		slot := next
		for slot > 0 && s[slot-1] > current {
			s[slot] = s[slot-1]
			slot--
		}
		s[slot] = current
	}
}
// logMiddleware wraps next so that each request produces one structured log
// entry with method, path, response status and duration in milliseconds. A nil
// logger disables logging and returns next unwrapped.
func logMiddleware(logger *slog.Logger, next http.Handler) http.Handler {
	if logger == nil {
		return next
	}

	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		// Start at 200: that is what the client gets when the handler never
		// calls WriteHeader.
		capture := &statusCapture{ResponseWriter: w, code: 200}

		next.ServeHTTP(capture, r)

		elapsedMs := time.Since(began).Milliseconds()
		logger.Info("http",
			"method", r.Method,
			"path", r.URL.Path,
			"status", capture.code,
			"ms", elapsedMs,
		)
	}
	return http.HandlerFunc(logged)
}
// statusCapture wraps an http.ResponseWriter and remembers the status code
// that was actually sent, so the request log can report it.
type statusCapture struct {
	http.ResponseWriter
	// code is the status to log. The creator seeds it with the default (200).
	code int
	// wroteHeader is set once a final status has been sent, explicitly or
	// implicitly through the first Write. Later WriteHeader calls are ignored
	// by net/http and must not change the logged status.
	wroteHeader bool
}

// WriteHeader records the first final status code and forwards the call.
// Informational 1xx codes (other than 101) may precede the final status, so
// they are forwarded without being latched.
func (s *statusCapture) WriteHeader(code int) {
	informational := code >= 100 && code <= 199 && code != http.StatusSwitchingProtocols
	if !s.wroteHeader && !informational {
		s.code = code
		s.wroteHeader = true
	}
	s.ResponseWriter.WriteHeader(code)
}

// Write forwards the body bytes. A first Write without a prior WriteHeader
// sends an implicit 200, so the status is latched here as well.
func (s *statusCapture) Write(p []byte) (int, error) {
	if !s.wroteHeader {
		if s.code == 0 {
			s.code = http.StatusOK
		}
		s.wroteHeader = true
	}
	return s.ResponseWriter.Write(p)
}

// Unwrap returns the wrapped writer so http.ResponseController can reach the
// optional interfaces (flushing, deadlines) of the underlying ResponseWriter.
func (s *statusCapture) Unwrap() http.ResponseWriter {
	return s.ResponseWriter
}