Fixed eGPU deploy?

2025-12-27 14:26:56 -05:00
parent 2215d8c2e8
commit b104dc4640


@@ -16,34 +16,37 @@ job "ai-backend" {
port "api" { static = 11434 }
}
task "ollama" {
task "ollama" {
driver = "podman"
env {
OLLAMA_HOST = "0.0.0.0"
OLLAMA_ORIGINS = "*"
# CRITICAL FOR 6900XT:
# This tells ROCm to treat the card like a supported Pro workstation card
# Unlock the 6900XT (Navi 21) for ROCm
HSA_OVERRIDE_GFX_VERSION = "10.3.0"
# Debugging enabled so we can confirm it worked
OLLAMA_DEBUG = "1"
}
config {
# Standard image (contains ROCm libraries)
image = "docker.io/ollama/ollama:latest"
ports = ["api"]
# Required for hardware access
privileged = true
# Pass the graphics hardware to the container
# --- THE FIX: STRICT MAPPING ---
# Only map the eGPU (renderD128) and the Compute interface (kfd)
devices = [
"/dev/kfd",
"/dev/dri/renderD128"
]
# Do NOT map the whole /dev/dri folder, or it might peek at the others
volumes = [
"/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama",
"/dev/kfd:/dev/kfd",
"/dev/dri:/dev/dri"
"/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama"
]
}
service {
name = "ollama"
port = "api"