Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
105 additions
and
9 deletions
+105
-9
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+1
-2
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
...vals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...als/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+1
-1
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+1
-2
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+12
-0
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
+11
-0
tests/evals/gsm8k/configs/models-blackwell.txt
tests/evals/gsm8k/configs/models-blackwell.txt
+2
-0
tests/evals/gsm8k/configs/models-h200.txt
tests/evals/gsm8k/configs/models-h200.txt
+2
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
...moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+5
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
...r-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
+8
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
...r-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
+9
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
...oe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
+8
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
...-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
+8
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
...-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
+9
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
...e-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
+8
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
...or-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
+8
-0
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
...moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+8
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
View file @
7e63ef82
...
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
...
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
accuracy_threshold
:
0.72
accuracy_threshold
:
0.72
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
View file @
7e63ef82
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
accuracy_threshold
:
0.74
accuracy_threshold
:
0.74
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
View file @
7e63ef82
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
accuracy_threshold
:
0.31
accuracy_threshold
:
0.31
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
View file @
7e63ef82
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
...
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
accuracy_threshold
:
0.45
accuracy_threshold
:
0.45
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max
_
model
_
len
:
4096
server_args
:
"
--enforce-eager
--
max
-
model
-
len
4096
"
tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
View file @
7e63ef82
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
...
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
accuracy_threshold
:
0.60
accuracy_threshold
:
0.60
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
View file @
7e63ef82
...
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
...
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
accuracy_threshold
:
0.375
accuracy_threshold
:
0.375
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
\ No newline at end of file
tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
View file @
7e63ef82
...
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
...
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
accuracy_threshold
:
0.89
accuracy_threshold
:
0.89
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
max_model_len
:
4096
server_args
:
"
--enforce-eager
--max-model-len
4096"
tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
accuracy_threshold
:
0.75
num_questions
:
1319
num_fewshot
:
5
server_args
:
>-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
env
:
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
accuracy_threshold
:
0.85
num_questions
:
1319
num_fewshot
:
5
server_args
:
>-
--max-model-len 4096
--tensor-parallel-size 2
--enable-expert-parallel
--async-scheduling
env
:
VLLM_USE_FLASHINFER_MOE_FP8
:
"
1"
tests/evals/gsm8k/configs/models-blackwell.txt
View file @
7e63ef82
...
@@ -3,3 +3,5 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
...
@@ -3,3 +3,5 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-CT.yaml
Qwen1.5-MoE-W4A16-CT.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
DeepSeek-V2-Lite-Instruct-FP8.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-30B-A3B-NVFP4.yaml
Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
Qwen3-Next-FP8-EP2.yaml
tests/evals/gsm8k/configs/models-h200.txt
0 → 100644
View file @
7e63ef82
DeepSeek-R1-TP.yaml
DeepSeek-R1-DP.yaml
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
accuracy_threshold
:
0.92
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold
:
0.88
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel
--all2all-backend
deepep_high_throughput"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold
:
0.88
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel
--all2all-backend
deepep_low_latency
--disable-uvicorn-access-log"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
VLLM_USE_DEEP_GEMM_E8M0
:
"
0"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
accuracy_threshold
:
0.88
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
RedHatAI/Qwen3-30B-A3B-FP8-block"
accuracy_threshold
:
0.85
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel
--all2all-backend
deepep_high_throughput"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
RedHatAI/Qwen3-30B-A3B-FP8-block"
accuracy_threshold
:
0.85
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel
--all2all-backend
deepep_low_latency
--disable-uvicorn-access-log"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
VLLM_USE_DEEP_GEMM_E8M0
:
"
0"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
RedHatAI/Qwen3-30B-A3B-FP8-block"
accuracy_threshold
:
0.85
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel"
env
:
VLLM_USE_DEEP_GEMM
:
"
1"
VLLM_USE_DEEP_GEMM_MOE
:
"
1"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
RedHatAI/Qwen3-30B-A3B-NVFP4"
accuracy_threshold
:
0.88
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel
--all2all-backend
deepep_low_latency"
env
:
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
VLLM_FLASHINFER_MOE_BACKEND
:
"
masked_gemm"
tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
0 → 100644
View file @
7e63ef82
model_name
:
"
RedHatAI/Qwen3-30B-A3B-NVFP4"
accuracy_threshold
:
0.88
num_questions
:
1319
num_fewshot
:
5
server_args
:
"
--enforce-eager
--max-model-len
8192
--data-parallel-size
2
--enable-expert-parallel"
env
:
VLLM_USE_FLASHINFER_MOE_FP4
:
"
1"
VLLM_FLASHINFER_MOE_BACKEND
:
"
throughput"
Prev
1
…
15
16
17
18
19
20
21
22
23
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment