fgsfg

2025-12-27 14:45:58 -05:00
parent 99b50a5902
commit 619124a30c
1 changed files with 19 additions and 7 deletions
--- a/stacks/ai/ai-backend.nomad
+++ b/stacks/ai/ai-backend.nomad
@@ -13,6 +13,7 @@ job "ai-backend" {
    }

    network {
+      # Static port ensures the API is always reachable on 11434 on the P52
      port "api" { static = 11434 }
    }

@@ -26,7 +27,7 @@ job "ai-backend" {
        # 1. Unlock the 6900XT (Navi 21) for ROCm
        HSA_OVERRIDE_GFX_VERSION = "10.3.0"
        
-        # 2. Enable Debugging
+        # 2. Enable Debugging (Check logs for "ROCm compute capability detected")
        OLLAMA_DEBUG = "1"
      }

@@ -37,13 +38,24 @@ job "ai-backend" {
        # Required to talk to hardware
        privileged = true

-        # --- THE FIX: CLEAN VOLUMES ONLY ---
-        # We mount the compute interface and the entire graphics directory.
-        # This avoids the 'devices' syntax error entirely.
+        # --- CRITICAL FIXES FOR AMD ROCM ---
+        # 1. Share the host IPC namespace to allow shared memory access (Required for eGPU communication)
+        ipc_mode = "host"
+        
+        # 2. Disable security labels that might block device access
+        security_opt = ["label=disable"]
+
+        # 3. Explicit Device Mapping (meant to hide Nvidia/Intel cards from Ollama; NOTE: privileged = true still exposes all host devices)
+        # We map the Compute interface, the Physical Card (card1), and the Render Node (renderD128)
+        devices = [
+            "/dev/kfd",
+            "/dev/dri/card1",
+            "/dev/dri/renderD128"
+        ]
+
+        # 4. Storage Volume (only the Ollama data directory; device nodes are now passed via `devices` above)
        volumes = [
-          "/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama",
-          "/dev/kfd:/dev/kfd",
-          "/dev/dri:/dev/dri"
+          "/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama"
        ]
      }