diff --git a/stacks/ai/ai-backend.nomad b/stacks/ai/ai-backend.nomad
index 37a8375..636eecf 100644
--- a/stacks/ai/ai-backend.nomad
+++ b/stacks/ai/ai-backend.nomad
@@ -13,6 +13,7 @@ job "ai-backend" {
     }
     network {
+      # Static port ensures it's always on 11434 on the P52
       port "api" {
         static = 11434
       }
     }
@@ -26,7 +27,7 @@ job "ai-backend" {
         # 1. Unlock the 6900XT (Navi 21) for ROCm
         HSA_OVERRIDE_GFX_VERSION = "10.3.0"
 
-        # 2. Enable Debugging
+        # 2. Enable Debugging (Check logs for "ROCm compute capability detected")
         OLLAMA_DEBUG = "1"
       }
 
@@ -37,13 +38,24 @@ job "ai-backend" {
 
         # Required to talk to hardware
        privileged = true
 
-        # --- THE FIX: CLEAN VOLUMES ONLY ---
-        # We mount the compute interface and the entire graphics directory.
-        # This avoids the 'devices' syntax error entirely.
+        # --- CRITICAL FIXES FOR AMD ROCM ---
+        # 1. Allow shared memory access (Required for eGPU communication)
+        ipc_mode = "host"
+
+        # 2. Disable security labels that might block device access
+        security_opt = ["label=disable"]
+
+        # 3. Explicit Device Mapping (Hides Nvidia/Intel cards from Ollama)
+        # We map the Compute interface, the Physical Card (card1), and the Render Node (renderD128)
+        devices = [
+          "/dev/kfd",
+          "/dev/dri/card1",
+          "/dev/dri/renderD128"
+        ]
+
+        # 4. Storage Volume (Updated to your request)
         volumes = [
-          "/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama",
-          "/dev/kfd:/dev/kfd",
-          "/dev/dri:/dev/dri"
+          "/mnt/local-ssd/nomad/stacks/ai/ai-backend/ollama:/root/.ollama"
         ]
       }
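
One caveat on the devices stanza: the comment removed by this patch refers to an earlier 'devices' syntax error, and the Nomad Docker driver documents devices as a list of objects (host_path/container_path) rather than bare path strings. If the string form above is rejected when the job is planned, a minimal sketch of the same three mappings in the documented object form would be:

        # Same AMD-only pass-through as the patch, written as device objects
        # per the Nomad Docker driver docs (container_path mirrors host_path).
        devices = [
          {
            host_path      = "/dev/kfd"
            container_path = "/dev/kfd"
          },
          {
            host_path      = "/dev/dri/card1"
            container_path = "/dev/dri/card1"
          },
          {
            host_path      = "/dev/dri/renderD128"
            container_path = "/dev/dri/renderD128"
          }
        ]

Either way, only the AMD compute interface (/dev/kfd) and the 6900XT's card/render nodes are exposed, which keeps the other GPUs in the machine out of the Ollama container.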