Replaces the MVP Passthrough with scheduler.Locked: a capacity-1 channel serialises every consumer's GPU work end-to-end. main.go switches to it.

Behavioural contract:
- Jobs that arrive while another job holds the GPU block on the channel until the holder finishes. Context cancellation aborts the wait cleanly (no leaked tokens, queue depth decremented).
- Stats track queue_depth, in_flight, total_jobs, last_wait_ms, last_run_ms, oldest_queued — surfaced through /v1/status.
- One lock for ALL consumers (not per-consumer): the design (§4.3) is explicit that coarse-grained ("grobgranular") locking beats GPU-stream-granular locking on single-GPU single-user hardware. mvoice + ollama + comfyui never run truly concurrently any more, which is the whole point — concurrent execution is what produced the CUDA-OOM under load.

Tests:
- 5 goroutines hammer the scheduler concurrently → max in-flight = 1.
- Cancellation while parked on the lock returns ctx.Err() and frees the queue slot.
- Stats reflect in-flight + queue-depth transitions correctly.
- Race detector clean.

Step 5 (Schritt 5) will compose this with VRAM-pressure eviction: before acquiring the lock, check if the target consumer's resident cost fits under the current GPU headroom; if not, unload the LRU non-coexistent consumer first.

Refs: m/mGPUmanager#1 (Schritt 4).
91 lines
2.0 KiB
YAML
91 lines
2.0 KiB
YAML
# Address and port the manager listens on (loopback only, as written).
# Quoted so the host:port value can never be re-typed by a YAML 1.1 parser.
listen: '127.0.0.1:8770'

# GPU capacity settings. The *_mib key names indicate values in MiB.
gpu:
  total_mib: 16376  # RTX 4070 Ti SUPER
  reserved_mib: 1024  # headroom for system/desktop
  # NOTE(review): presumably the interval at which GPU state is polled;
  # confirm against the config loader.
  poll_interval_seconds: 2

# Capability -> consumer mapping. Each value names an entry under
# `consumers`, and each key matches a capability listed in that consumer's
# `paths` section.
routing:
  tts: mvoice
  stt: mvoice  # whisper-server is alternative if explicitly requested
  llm: ollama
  image: comfyui

# Audio download proxy: any GET under audio_path_prefix is forwarded to this
# consumer at the same path. wa.sh fetches mvoice's generated WAV this way.
# NOTE(review): the original indentation was lost in this copy; both keys are
# kept at top level as shown. Confirm against the config loader whether they
# belong under `routing` instead.
audio_proxy: mvoice
audio_path_prefix: /api/audio/

# Backend services that use the GPU. Per consumer:
#   url               base URL of the service
#   health            HTTP method + path used as the health probe
#   paths             capability name -> HTTP method + path on this service
#   vram_resident_mib VRAM figure for the consumer, in MiB per the key name
#   load / unload     optional admin endpoints (method + path, optional body)
#   can_coexist_with  other consumers listed as allowed alongside this one
# NOTE(review): the semantics of `priority` (which direction wins) and
# `max_concurrency` are not visible in this file; confirm against the
# scheduler before changing them.
consumers:
  mvoice:
    url: http://localhost:8766
    health:
      method: GET
      path: /api/health
    paths:
      tts:
        method: POST
        path: /api/synthesize
      stt:
        method: POST
        path: /api/transcribe
    vram_resident_mib: 2800
    load:
      method: POST
      path: /api/admin/load
    unload:
      method: POST
      path: /api/admin/unload
    can_coexist_with: [whisper-server, ollama]
    priority: 3
    max_concurrency: 1

  whisper-server:
    url: http://localhost:8178
    health:
      method: GET
      path: /
    paths:
      stt:
        method: POST
        path: /inference
    vram_resident_mib: 2050
    # No HTTP unload; mGPUmanager evicts via systemd restart
    # (planned for "Schritt 5", i.e. step 5).
    systemd_unit: whisper-server.service
    can_coexist_with: [mvoice, ollama]
    priority: 2
    max_concurrency: 1

  ollama:
    url: http://localhost:11434
    health:
      method: GET
      path: /api/tags
    paths:
      llm:
        method: POST
        path: /api/generate
    # Ollama runs its own LRU keep_alive; we don't track resident VRAM.
    # NOTE(review): `vram_managed: true` presumably means the consumer manages
    # its own VRAM (hence no vram_resident_mib here); confirm with the loader.
    vram_managed: true
    can_coexist_with: [mvoice, whisper-server]
    priority: 2
    max_concurrency: 1

  comfyui:
    url: http://localhost:8188
    health:
      method: GET
      path: /system_stats
    paths:
      image:
        method: POST
        path: /prompt
    vram_resident_mib: 13000
    unload:
      method: POST
      path: /api/free
      # JSON request body sent with the unload call; single-quoted so the
      # braces and double quotes stay a literal string.
      body: '{"unload_models":true,"free_memory":true}'
    can_coexist_with: []
    priority: 1
    max_concurrency: 1