// Package server is the HTTP façade of mGPUmanager. // // It exposes: // - POST /v1/tts, /v1/stt, /v1/llm, /v1/image — pass-through proxy to the // consumer named in config.Routing[kind]. // - GET /audio/* — proxy to config.AudioProxy (mvoice's audio directory). // - GET /v1/status — live snapshot of consumers + GPU + scheduler. // - GET /healthz — broker liveness (200 if process is up). // // Every proxy call goes through the Scheduler so that, in Schritt 4 and 5, // queueing and eviction can be added without touching server.go. package server import ( "bytes" "context" "encoding/json" "fmt" "io" "log/slog" "net/http" "net/url" "strings" "time" "mgit.msbls.de/m/mGPUmanager/internal/config" "mgit.msbls.de/m/mGPUmanager/internal/gpu" "mgit.msbls.de/m/mGPUmanager/internal/registry" "mgit.msbls.de/m/mGPUmanager/internal/scheduler" ) // Server bundles the HTTP handlers + dependencies. type Server struct { cfg *config.Config reg *registry.Registry gpu *gpu.Poller sched scheduler.Scheduler client *http.Client logger *slog.Logger } // New builds a Server. Caller owns the lifecycle of reg/gpu/sched. func New(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poller, sched scheduler.Scheduler, logger *slog.Logger) *Server { return &Server{ cfg: cfg, reg: reg, gpu: gpuPoller, sched: sched, client: &http.Client{Timeout: 120 * time.Second}, // TTS can take 5-10s; image gen up to 60s logger: logger, } } // Handler returns the root mux. Caller wraps it in http.Server. 
func (s *Server) Handler() http.Handler { mux := http.NewServeMux() mux.HandleFunc("POST /v1/tts", s.handleEndpoint(config.KindTTS)) mux.HandleFunc("POST /v1/stt", s.handleEndpoint(config.KindSTT)) mux.HandleFunc("POST /v1/llm", s.handleEndpoint(config.KindLLM)) mux.HandleFunc("POST /v1/image", s.handleEndpoint(config.KindImage)) mux.HandleFunc("GET /audio/", s.handleAudio) mux.HandleFunc("GET /v1/status", s.handleStatus) mux.HandleFunc("GET /healthz", s.handleHealthz) mux.HandleFunc("GET /", s.handleRoot) return logMiddleware(s.logger, mux) } // ───── error envelope ───────────────────────────────────────────────────── // errorBody is the broker's structured error envelope. Every non-2xx response // from mGPUmanager itself uses this shape. (Pass-through 4xx/5xx from // consumers are forwarded verbatim so callers see the original payload.) type errorBody struct { Error string `json:"error"` Message string `json:"message"` Consumer string `json:"consumer,omitempty"` Retryable bool `json:"retryable"` } func writeErr(w http.ResponseWriter, status int, code, msg, consumer string, retryable bool) { w.Header().Set("Content-Type", "application/json") w.WriteHeader(status) _ = json.NewEncoder(w).Encode(errorBody{ Error: code, Message: msg, Consumer: consumer, Retryable: retryable, }) } // ───── endpoint proxy ───────────────────────────────────────────────────── // handleEndpoint returns the http.HandlerFunc for a /v1/ endpoint. 
func (s *Server) handleEndpoint(kind config.EndpointKind) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { consName, cons := s.cfg.ConsumerForKind(kind) if cons == nil { writeErr(w, http.StatusNotImplemented, "no_consumer", fmt.Sprintf("no consumer routes %s", kind), "", false) return } route, ok := cons.Paths[kind] if !ok { writeErr(w, http.StatusNotImplemented, "no_consumer", fmt.Sprintf("consumer %s lacks paths.%s", consName, kind), consName, false) return } // Refuse fast if the consumer is unhealthy (last probe failed) — keeps // Felix-Banholzer-style silent-fallback impossible. st := s.reg.Get(consName) if !st.Healthy && !st.LastProbe.IsZero() { writeErr(w, http.StatusServiceUnavailable, "consumer_unreachable", fmt.Sprintf("consumer %s last probe failed: %s", consName, st.LastError), consName, true) return } err := s.sched.Run(r.Context(), consName, func(ctx context.Context) error { return s.proxyRequest(ctx, w, r, cons, route, consName) }) if err != nil && !responseStarted(w) { writeErr(w, http.StatusInternalServerError, "scheduler_error", err.Error(), consName, true) } } } // proxyRequest forwards the inbound HTTP request to a consumer route and // streams the response back. Errors before the consumer responds are surfaced // as the broker's structured error envelope; once the consumer has begun // responding we stream its bytes through unchanged. func (s *Server) proxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, cons *config.Consumer, route config.Route, consumer string) error { target, err := url.Parse(cons.URL) if err != nil { writeErr(w, http.StatusInternalServerError, "bad_consumer_url", err.Error(), consumer, false) return nil } target.Path = route.Path // Forward inbound query string verbatim. 
target.RawQuery = r.URL.RawQuery method := route.Method if method == "" { method = r.Method } upstream, err := http.NewRequestWithContext(ctx, method, target.String(), r.Body) if err != nil { writeErr(w, http.StatusInternalServerError, "bad_request", err.Error(), consumer, false) return nil } // Copy through Content-Type, Content-Length and Accept (don't carry Host). for _, h := range []string{"Content-Type", "Content-Length", "Accept", "Accept-Encoding"} { if v := r.Header.Get(h); v != "" { upstream.Header.Set(h, v) } } resp, err := s.client.Do(upstream) if err != nil { writeErr(w, http.StatusBadGateway, "consumer_unreachable", fmt.Sprintf("upstream %s: %v", target.Host, err), consumer, true) return nil } defer resp.Body.Close() // Stream response. for k, vs := range resp.Header { if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") { continue } for _, v := range vs { w.Header().Add(k, v) } } w.WriteHeader(resp.StatusCode) _, _ = io.Copy(w, resp.Body) return nil } // ───── audio proxy ──────────────────────────────────────────────────────── // handleAudio forwards GET /audio/ to the audio_proxy consumer (mvoice). // wa.sh fetches the rendered .wav via this path after /v1/tts returns its URL. 
func (s *Server) handleAudio(w http.ResponseWriter, r *http.Request) { if s.cfg.AudioProxy == "" { writeErr(w, http.StatusNotFound, "no_audio_proxy", "audio_proxy is not configured", "", false) return } cons, ok := s.cfg.Consumers[s.cfg.AudioProxy] if !ok { writeErr(w, http.StatusInternalServerError, "no_audio_proxy", "audio_proxy points at unknown consumer", s.cfg.AudioProxy, false) return } target, err := url.Parse(cons.URL) if err != nil { writeErr(w, http.StatusInternalServerError, "bad_consumer_url", err.Error(), s.cfg.AudioProxy, false) return } target.Path = r.URL.Path target.RawQuery = r.URL.RawQuery upstream, err := http.NewRequestWithContext(r.Context(), http.MethodGet, target.String(), nil) if err != nil { writeErr(w, http.StatusInternalServerError, "bad_request", err.Error(), s.cfg.AudioProxy, false) return } resp, err := s.client.Do(upstream) if err != nil { writeErr(w, http.StatusBadGateway, "consumer_unreachable", fmt.Sprintf("upstream %s: %v", target.Host, err), s.cfg.AudioProxy, true) return } defer resp.Body.Close() for k, vs := range resp.Header { for _, v := range vs { w.Header().Add(k, v) } } w.WriteHeader(resp.StatusCode) _, _ = io.Copy(w, resp.Body) } // ───── status ───────────────────────────────────────────────────────────── type statusResponse struct { Listen string `json:"listen"` Time time.Time `json:"time"` GPU statusGPU `json:"gpu"` Routing map[config.EndpointKind]string `json:"routing"` Consumers []statusConsumer `json:"consumers"` Scheduler scheduler.Stats `json:"scheduler"` } type statusGPU struct { TotalMiB int `json:"total_mib"` UsedMiB int `json:"used_mib"` FreeMiB int `json:"free_mib"` ReservedMiB int `json:"reserved_mib"` LastSample time.Time `json:"last_sample"` Err string `json:"err,omitempty"` } type statusConsumer struct { Name string `json:"name"` URL string `json:"url"` Healthy bool `json:"healthy"` Loaded bool `json:"loaded"` GPUResidentMiB int `json:"gpu_resident_mib"` VRAMBudgetMiB int `json:"vram_budget_mib"` Active 
int `json:"active"` TotalRequests int64 `json:"total_requests"` LastUsed time.Time `json:"last_used,omitzero"` LastProbe time.Time `json:"last_probe,omitzero"` LastError string `json:"last_error,omitempty"` Priority int `json:"priority"` CanCoexistWith []string `json:"can_coexist_with"` } func (s *Server) handleStatus(w http.ResponseWriter, r *http.Request) { sample := s.gpu.Last() snap := s.reg.Snapshot() resp := statusResponse{ Listen: s.cfg.Listen, Time: time.Now(), Routing: s.cfg.Routing, GPU: statusGPU{ TotalMiB: s.cfg.GPU.TotalMiB, UsedMiB: sample.UsedMiB, FreeMiB: sample.FreeMiB, ReservedMiB: s.cfg.GPU.ReservedMiB, LastSample: sample.At, Err: sample.Err, }, Scheduler: s.sched.Stats(), } if resp.GPU.TotalMiB == 0 && sample.TotalMiB > 0 { resp.GPU.TotalMiB = sample.TotalMiB } // Stable ordering by config-declared name. names := make([]string, 0, len(s.cfg.Consumers)) for n := range s.cfg.Consumers { names = append(names, n) } sortStrings(names) for _, n := range names { cons := s.cfg.Consumers[n] st := snap[n] resp.Consumers = append(resp.Consumers, statusConsumer{ Name: n, URL: cons.URL, Healthy: st.Healthy, Loaded: st.Loaded, GPUResidentMiB: st.GPUResidentMiB, VRAMBudgetMiB: cons.VRAMResidentMiB, Active: st.Active, TotalRequests: st.TotalRequests, LastUsed: st.LastUsed, LastProbe: st.LastProbe, LastError: st.LastError, Priority: cons.Priority, CanCoexistWith: cons.CanCoexistWith, }) } w.Header().Set("Content-Type", "application/json") _ = json.NewEncoder(w).Encode(resp) } func (s *Server) handleHealthz(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "application/json") _, _ = w.Write([]byte(`{"status":"ok"}`)) } func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) { w.Header().Set("Content-Type", "text/plain") _, _ = io.Copy(w, bytes.NewReader([]byte( "mGPUmanager — see GET /v1/status for live state, POST /v1/{tts,stt,llm,image} for inference\n", ))) } // ───── helpers 
// responseStarted reports whether a response has already begun on w, i.e.
// whether it is too late to switch to the error envelope. It can only tell for
// writers wrapped by logMiddleware (a *statusCapture); for any other writer it
// conservatively returns false.
func responseStarted(w http.ResponseWriter) bool {
	sc, ok := w.(*statusCapture)
	return ok && sc.started
}

// sortStrings sorts s in place in ascending order using insertion sort.
// It exists to avoid pulling in "sort" everywhere this file uses ordering.
func sortStrings(s []string) {
	for i := 1; i < len(s); i++ {
		// Sink s[i] leftwards until the prefix s[:i+1] is ordered.
		for j := i; j > 0 && s[j-1] > s[j]; j-- {
			s[j-1], s[j] = s[j], s[j-1]
		}
	}
}

// logMiddleware wraps next so that every request runs against a
// *statusCapture writer, and emits one structured log record per request
// (method, path, status, duration in ms) when logger is non-nil. The wrapping
// happens even without a logger so that responseStarted works either way.
func logMiddleware(logger *slog.Logger, next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		// 200 is what net/http sends when the handler never calls WriteHeader.
		lw := &statusCapture{ResponseWriter: w, code: http.StatusOK}
		next.ServeHTTP(lw, r)
		if logger == nil {
			return
		}
		logger.Info("http",
			"method", r.Method,
			"path", r.URL.Path,
			"status", lw.code,
			"ms", time.Since(start).Milliseconds(),
		)
	})
}

// statusCapture is an http.ResponseWriter wrapper that remembers the response
// status and whether the response has started.
type statusCapture struct {
	http.ResponseWriter
	code    int  // final status code; defaults to the value set at construction
	started bool // true once a final status or any body bytes were written
}

// WriteHeader records the status and forwards the call. Only the first final
// (non-1xx) status is kept, so a superfluous later call cannot change what
// gets logged.
func (s *statusCapture) WriteHeader(code int) {
	if !s.started {
		s.code = code
		// 1xx responses are informational; a final status still follows.
		s.started = code >= 200
	}
	s.ResponseWriter.WriteHeader(code)
}

// Write forwards p to the wrapped writer. A write without a prior final
// WriteHeader sends an implicit 200, which is recorded here.
func (s *statusCapture) Write(p []byte) (int, error) {
	if !s.started {
		s.code = http.StatusOK
		s.started = true
	}
	return s.ResponseWriter.Write(p)
}

// Flush forwards to the wrapped writer when it implements http.Flusher and is
// a no-op otherwise. Flushing sends the headers, so it marks the response as
// started.
func (s *statusCapture) Flush() {
	f, ok := s.ResponseWriter.(http.Flusher)
	if !ok {
		return
	}
	if !s.started {
		s.code = http.StatusOK
		s.started = true
	}
	f.Flush()
}

// Unwrap returns the wrapped writer so that http.ResponseController can reach
// the optional interfaces of the underlying ResponseWriter.
func (s *statusCapture) Unwrap() http.ResponseWriter {
	return s.ResponseWriter
}