Unverified Commit bf21cc03 authored by Thomas Montfort's avatar Thomas Montfort Committed by GitHub
Browse files

fix: llama3-70-b-agg recipe model download failure (#4290)

parent f817c595
...@@ -54,7 +54,7 @@ spec: ...@@ -54,7 +54,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -92,7 +92,9 @@ spec: ...@@ -92,7 +92,9 @@ spec:
- name: ENGINE_ARGS - name: ENGINE_ARGS
value: "/opt/dynamo/configs/config.yaml" value: "/opt/dynamo/configs/config.yaml"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a" value: "/opt/models/hub/models--openai--gpt-oss-120b/snapshots/b5c939de8f754692c1647ca79fbf85e8c1e70f8a"
- name: HF_HOME
value: /opt/models
volumeMounts: volumeMounts:
- mountPath: /opt/dynamo/configs - mountPath: /opt/dynamo/configs
name: llm-config name: llm-config
......
...@@ -15,11 +15,14 @@ spec: ...@@ -15,11 +15,14 @@ spec:
dynamoNamespace: llama3-70b-agg dynamoNamespace: llama3-70b-agg
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
envs:
- name: HF_HOME
value: /opt/models
replicas: 1 replicas: 1
VllmPrefillWorker: VllmPrefillWorker:
componentType: worker componentType: worker
...@@ -27,7 +30,7 @@ spec: ...@@ -27,7 +30,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 20Gi size: 20Gi
extraPodSpec: extraPodSpec:
...@@ -36,7 +39,9 @@ spec: ...@@ -36,7 +39,9 @@ spec:
- name: SERVED_MODEL_NAME - name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
- name: HF_HOME
value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
......
...@@ -15,11 +15,14 @@ spec: ...@@ -15,11 +15,14 @@ spec:
dynamoNamespace: llama3-70b-disagg-mn dynamoNamespace: llama3-70b-disagg-mn
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
envs:
- name: HF_HOME
value: /opt/models
replicas: 1 replicas: 1
VllmPrefillWorker: VllmPrefillWorker:
componentType: worker componentType: worker
...@@ -27,7 +30,7 @@ spec: ...@@ -27,7 +30,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -36,7 +39,9 @@ spec: ...@@ -36,7 +39,9 @@ spec:
- name: SERVED_MODEL_NAME - name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
- name: HF_HOME
value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command: command:
...@@ -56,7 +61,7 @@ spec: ...@@ -56,7 +61,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -65,7 +70,9 @@ spec: ...@@ -65,7 +70,9 @@ spec:
- name: SERVED_MODEL_NAME - name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
- name: HF_HOME
value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
......
...@@ -15,11 +15,14 @@ spec: ...@@ -15,11 +15,14 @@ spec:
dynamoNamespace: llama3-70b-disagg-sn dynamoNamespace: llama3-70b-disagg-sn
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
extraPodSpec: extraPodSpec:
mainContainer: mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
workingDir: /workspace/examples/backends/vllm workingDir: /workspace/examples/backends/vllm
envs:
- name: HF_HOME
value: /opt/models
replicas: 1 replicas: 1
VllmPrefillWorker: VllmPrefillWorker:
componentType: worker componentType: worker
...@@ -27,7 +30,7 @@ spec: ...@@ -27,7 +30,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -48,7 +51,9 @@ spec: ...@@ -48,7 +51,9 @@ spec:
- name: SERVED_MODEL_NAME - name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
- name: HF_HOME
value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command: command:
...@@ -68,7 +73,7 @@ spec: ...@@ -68,7 +73,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -89,7 +94,9 @@ spec: ...@@ -89,7 +94,9 @@ spec:
- name: SERVED_MODEL_NAME - name: SERVED_MODEL_NAME
value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic" value: "RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic"
- name: MODEL_PATH - name: MODEL_PATH
value: "/root/.cache/huggingface/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd" value: "/opt/models/hub/models--RedHatAI--Llama-3.3-70B-Instruct-FP8-dynamic/snapshots/ddb4128556dfcff99e0c41aee159ea6c3e655dcd"
- name: HF_HOME
value: /opt/models
args: args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128" - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command: command:
......
...@@ -70,7 +70,7 @@ spec: ...@@ -70,7 +70,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -106,6 +106,8 @@ spec: ...@@ -106,6 +106,8 @@ spec:
value: "/opt/dynamo/configs/config.yaml" value: "/opt/dynamo/configs/config.yaml"
- name: MODEL_PATH - name: MODEL_PATH
value: "Qwen/Qwen3-32B-FP8" value: "Qwen/Qwen3-32B-FP8"
- name: HF_HOME
value: "/opt/models"
volumeMounts: volumeMounts:
- mountPath: /opt/dynamo/configs - mountPath: /opt/dynamo/configs
name: llm-config name: llm-config
......
...@@ -228,7 +228,7 @@ spec: ...@@ -228,7 +228,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -265,6 +265,8 @@ spec: ...@@ -265,6 +265,8 @@ spec:
value: "/opt/dynamo/configs/config-prefill.yaml" value: "/opt/dynamo/configs/config-prefill.yaml"
- name: MODEL_PATH - name: MODEL_PATH
value: "Qwen/Qwen3-32B-FP8" value: "Qwen/Qwen3-32B-FP8"
- name: HF_HOME
value: "/opt/models"
volumeMounts: volumeMounts:
- mountPath: /opt/dynamo/configs - mountPath: /opt/dynamo/configs
name: llm-config-prefill name: llm-config-prefill
...@@ -287,7 +289,7 @@ spec: ...@@ -287,7 +289,7 @@ spec:
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
volumeMounts: volumeMounts:
- name: model-cache - name: model-cache
mountPoint: /root/.cache/huggingface mountPoint: /opt/models
sharedMemory: sharedMemory:
size: 80Gi size: 80Gi
extraPodSpec: extraPodSpec:
...@@ -324,6 +326,8 @@ spec: ...@@ -324,6 +326,8 @@ spec:
value: "/opt/dynamo/configs/config-decode.yaml" value: "/opt/dynamo/configs/config-decode.yaml"
- name: MODEL_PATH - name: MODEL_PATH
value: "Qwen/Qwen3-32B-FP8" value: "Qwen/Qwen3-32B-FP8"
- name: HF_HOME
value: "/opt/models"
volumeMounts: volumeMounts:
- mountPath: /opt/dynamo/configs - mountPath: /opt/dynamo/configs
name: llm-config-decode name: llm-config-decode
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment