Fixed eGPU deploy?

2025-12-27 14:26:56 -05:00
parent 2215d8c2e8
commit b104dc4640
1 changed files with 15 additions and 12 deletions
--- a/stacks/ai/ai-backend.nomad
+++ b/stacks/ai/ai-backend.nomad
@@ -23,27 +23,30 @@ job "ai-backend" {
        OLLAMA_HOST    = "0.0.0.0"
        OLLAMA_ORIGINS = "*"
-        # CRITICAL FOR 6900XT:
+        # Unlock the 6900XT (Navi 21) for ROCm
        # This tells ROCm to treat the card like a supported Pro workstation card
        HSA_OVERRIDE_GFX_VERSION = "10.3.0"
        # Debugging enabled so we can confirm it worked
        OLLAMA_DEBUG = "1"
      }
      config {
        # Standard image (contains ROCm libraries)
        image = "docker.io/ollama/ollama:latest"
        ports = ["api"]
        # Required for hardware access
        privileged = true
-        # Pass the graphics hardware to the container
+        # --- THE FIX: STRICT MAPPING ---
        # Only map the eGPU (renderD128) and the Compute interface (kfd)
        devices = [
            "/dev/kfd",
            "/dev/dri/renderD128"
        ]
        # Do NOT map the whole /dev/dri folder, or it might peek at the others
        # NOTE(review): the bind mounts of /dev/kfd and /dev/dri below, together
        # with privileged = true, appear to expose more than the two devices
        # listed above; confirm whether they can be dropped.
        volumes = [
          # Every element except the last must end with a comma; HCL does not
          # accept newline-separated list items.
          "/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama",
          "/dev/kfd:/dev/kfd",
          "/dev/dri:/dev/dri"
        ]
      }
      service {
        name = "ollama"
        port = "api"