# mGPUmanager/config/consumers.yaml

# host:port address for the `listen` setting.
# NOTE(review): presumably the address mGPUmanager itself binds to -- confirm.
# Quoted so the colon-separated value is always read as a plain string.
listen: '127.0.0.1:8770'

# GPU memory figures, in MiB, plus the polling period.
gpu:
  # Total VRAM of the card (RTX 4070 Ti SUPER).
  total_mib: 16376
  # Headroom left for the system/desktop.
  reserved_mib: 1024
  # Polling period in seconds.
  # NOTE(review): presumably how often GPU memory usage is sampled -- confirm.
  poll_interval_seconds: 2

# Request kind -> consumer. Every value names an entry under `consumers`.
routing:
  tts: mvoice
  # whisper-server is an alternative for stt if explicitly requested.
  stt: mvoice
  llm: ollama
  image: comfyui

# Audio download proxy: GET /audio/* requests are forwarded to the consumer
# named here (must match a key under `consumers`).
audio_proxy: mvoice

consumers:
  # mvoice: exposes both a tts and an stt endpoint, plus explicit
  # load/unload admin endpoints.
  mvoice:
    url: 'http://localhost:8766'
    health:
      method: GET
      path: /api/health
    # One entry per request kind this consumer serves.
    paths:
      tts:
        method: POST
        path: /api/synthesize
      stt:
        method: POST
        path: /api/transcribe
    # VRAM figure in MiB.
    # NOTE(review): presumably the footprint while models are loaded -- confirm.
    vram_resident_mib: 2800
    load:
      method: POST
      path: /api/admin/load
    unload:
      method: POST
      path: /api/admin/unload
    # Names of other consumers (keys under `consumers`).
    # NOTE(review): presumably those allowed to be resident at the same time
    # as this one -- confirm.
    can_coexist_with:
      - whisper-server
      - ollama
    # NOTE(review): ordering direction of `priority` is not visible here --
    # confirm whether higher or lower wins.
    priority: 3
    max_concurrency: 1

  # whisper-server: exposes an stt endpoint only.
  whisper-server:
    url: 'http://localhost:8178'
    health:
      method: GET
      path: /
    # One entry per request kind this consumer serves.
    paths:
      stt:
        method: POST
        path: /inference
    # VRAM figure in MiB.
    # NOTE(review): presumably the footprint while the model is loaded --
    # confirm.
    vram_resident_mib: 2050
    # No HTTP unload endpoint; mGPUmanager evicts this consumer by restarting
    # its systemd unit (step 5).
    systemd_unit: whisper-server.service
    # Names of other consumers (keys under `consumers`).
    # NOTE(review): presumably those allowed to be resident at the same time
    # as this one -- confirm.
    can_coexist_with:
      - mvoice
      - ollama
    # NOTE(review): ordering direction of `priority` is not visible here --
    # confirm whether higher or lower wins.
    priority: 2
    max_concurrency: 1

  # ollama: exposes an llm endpoint only.
  ollama:
    url: 'http://localhost:11434'
    health:
      method: GET
      path: /api/tags
    # One entry per request kind this consumer serves.
    paths:
      llm:
        method: POST
        path: /api/generate
    # Ollama runs its own LRU keep_alive, so resident VRAM is not tracked
    # here (no vram_resident_mib for this consumer).
    vram_managed: true
    # Names of other consumers (keys under `consumers`).
    # NOTE(review): presumably those allowed to be resident at the same time
    # as this one -- confirm.
    can_coexist_with:
      - mvoice
      - whisper-server
    # NOTE(review): ordering direction of `priority` is not visible here --
    # confirm whether higher or lower wins.
    priority: 2
    max_concurrency: 1

  # comfyui: exposes an image endpoint and an unload endpoint; its
  # coexistence list is empty.
  comfyui:
    url: 'http://localhost:8188'
    health:
      method: GET
      path: /system_stats
    # One entry per request kind this consumer serves.
    paths:
      image:
        method: POST
        path: /prompt
    # VRAM figure in MiB.
    # NOTE(review): presumably the footprint while models are loaded -- confirm.
    vram_resident_mib: 13000
    unload:
      method: POST
      path: /api/free
      # JSON text kept in single quotes so it is passed through verbatim.
      # NOTE(review): presumably sent as the request body -- confirm.
      body: '{"unload_models":true,"free_memory":true}'
    # Explicit empty list: no other consumer is listed as able to coexist.
    can_coexist_with: []
    # NOTE(review): ordering direction of `priority` is not visible here --
    # confirm whether higher or lower wins.
    priority: 1
    max_concurrency: 1