Unverified commit 31f31e8e, authored by Ziqi Fan, committed by GitHub
Browse files

fix: remove small --gpu-memory-utilization to avoid OOM due to vllm upgrade (#4899)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent 5e96c9a8
...@@ -40,8 +40,6 @@ spec: ...@@ -40,8 +40,6 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.45"
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
......
...@@ -33,8 +33,6 @@ spec: ...@@ -33,8 +33,6 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
...@@ -65,8 +63,6 @@ spec: ...@@ -65,8 +63,6 @@ spec:
- --model - --model
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --is-prefill-worker - --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
......
...@@ -33,8 +33,6 @@ spec: ...@@ -33,8 +33,6 @@ spec:
args: args:
- --model - --model
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
...@@ -65,8 +63,6 @@ spec: ...@@ -65,8 +63,6 @@ spec:
- --model - --model
- Qwen/Qwen3-8B - Qwen/Qwen3-8B
- --is-prefill-worker - --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --max-model-len - --max-model-len
- "32000" - "32000"
- --enforce-eager - --enforce-eager
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment