fix: llama3-70b-agg recipe model download failure (#4290)

bf21cc03 · Thomas Montfort · GitHub · f817c595 · bf21cc03 · bf21cc03
Unverified commit bf21cc03, authored Nov 13, 2025 by Thomas Montfort; committed by GitHub on Nov 13, 2025.
6 changed files
--- a/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
+++ b/recipes/gpt-oss-120b/trtllm/agg/deploy.yaml
@@ -54,7 +54,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -92,7 +92,9 @@ spec:
          - name: ENGINE_ARGS
            value: "/opt/dynamo/configs/config.yaml"
          - name: MODEL_PATH
-            value: "/root/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
+            value: "/opt/models/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
+          - name: HF_HOME
+            value: /opt/models
          volumeMounts:
          - mountPath: /opt/dynamo/configs
            name: llm-config

--- a/recipes/llama-3-70b/vllm/agg/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml
@@ -15,11 +15,14 @@ spec:
      dynamoNamespace: llama3-70b-agg
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
+      envs:
+        - name: HF_HOME
+          value: /opt/models
      replicas: 1
    VllmPrefillWorker:
      componentType: worker
@@ -27,7 +30,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 20Gi
      extraPodSpec:
@@ -36,7 +39,9 @@ spec:
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
-              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+              value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+            - name: HF_HOME
+              value: /opt/models
          args:
          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
@@ -15,11 +15,14 @@ spec:
      dynamoNamespace: llama3-70b-disagg-mn
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
+      envs:
+        - name: HF_HOME
+          value: /opt/models
      replicas: 1
    VllmPrefillWorker:
      componentType: worker
@@ -27,7 +30,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -36,7 +39,9 @@ spec:
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
-              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+              value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+            - name: HF_HOME
+              value: /opt/models
          args:
          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
@@ -56,7 +61,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -65,7 +70,9 @@ spec:
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
-              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+              value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+            - name: HF_HOME
+              value: /opt/models
          args:
          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:

--- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
@@ -15,11 +15,14 @@ spec:
      dynamoNamespace: llama3-70b-disagg-sn
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
          workingDir: /workspace/examples/backends/vllm
+      envs:
+        - name: HF_HOME
+          value: /opt/models
      replicas: 1
    VllmPrefillWorker:
      componentType: worker
@@ -27,7 +30,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -48,7 +51,9 @@ spec:
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
-              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+              value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+            - name: HF_HOME
+              value: /opt/models
          args:
          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
@@ -68,7 +73,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -89,7 +94,9 @@ spec:
            - name: SERVED_MODEL_NAME
              value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
            - name: MODEL_PATH
-              value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+              value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
+            - name: HF_HOME
+              value: /opt/models
          args:
          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:

--- a/recipes/qwen3-32b-fp8/trtllm/agg/deploy.yaml
+++ b/recipes/qwen3-32b-fp8/trtllm/agg/deploy.yaml
@@ -70,7 +70,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -106,6 +106,8 @@ spec:
            value: "/opt/dynamo/configs/config.yaml"
          - name: MODEL_PATH
            value: "Qwen/Qwen3-32B-FP8"
+          - name: HF_HOME
+            value: "/opt/models"
          volumeMounts:
          - mountPath: /opt/dynamo/configs
            name: llm-config

--- a/recipes/qwen3-32b-fp8/trtllm/disagg/deploy.yaml
+++ b/recipes/qwen3-32b-fp8/trtllm/disagg/deploy.yaml
@@ -228,7 +228,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -265,6 +265,8 @@ spec:
            value: "/opt/dynamo/configs/config-prefill.yaml"
          - name: MODEL_PATH
            value: "Qwen/Qwen3-32B-FP8"
+          - name: HF_HOME
+            value: "/opt/models"
          volumeMounts:
          - mountPath: /opt/dynamo/configs
            name: llm-config-prefill
@@ -287,7 +289,7 @@ spec:
      envFromSecret: hf-token-secret
      volumeMounts:
        - name: model-cache
-          mountPoint: /root/.cache/huggingface
+          mountPoint: /opt/models
      sharedMemory:
        size: 80Gi
      extraPodSpec:
@@ -324,6 +326,8 @@ spec:
            value: "/opt/dynamo/configs/config-decode.yaml"
          - name: MODEL_PATH
            value: "Qwen/Qwen3-32B-FP8"
+          - name: HF_HOME
+            value: "/opt/models"
          volumeMounts:
          - mountPath: /opt/dynamo/configs
            name: llm-config-decode