Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
+++ b/tests/evals/gsm8k/configs/DeepSeek-V2-Lite-Instruct-FP8.yaml
@@ -2,5 +2,4 @@ model_name: "RedHatAI/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 accuracy_threshold: 0.72
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
--- a/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3-8B-Instruct-nonuniform-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 accuracy_threshold: 0.74
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
+++ b/tests/evals/gsm8k/configs/Llama-3.2-1B-Instruct-INT8-CT.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8"
 accuracy_threshold: 0.31
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
+++ b/tests/evals/gsm8k/configs/Qwen1.5-MoE-W4A16-CT.yaml
@@ -2,4 +2,4 @@ model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 accuracy_threshold: 0.45
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
--- a/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
+++ b/tests/evals/gsm8k/configs/Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
@@ -2,4 +2,4 @@ model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 accuracy_threshold: 0.60
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-0.6B-FP8.yaml
@@ -2,4 +2,4 @@ model_name: "Qwen/Qwen3-0.6B-FP8"
 accuracy_threshold: 0.375
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
\ No newline at end of file
--- a/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-30B-A3B-NVFP4.yaml
@@ -2,5 +2,4 @@ model_name: "nvidia/Qwen3-30B-A3B-FP4"
 accuracy_threshold: 0.89
 num_questions: 1319
 num_fewshot: 5
-max_model_len: 4096
+server_args: "--enforce-eager --max-model-len 4096"
--- a/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+model_name: "nm-testing/Qwen3-Next-80B-A3B-Instruct-NVFP4"
+accuracy_threshold: 0.75
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --enforce-eager
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}'
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
--- a/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
+++ b/tests/evals/gsm8k/configs/Qwen3-Next-FP8-EP2.yaml
+model_name: "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --tensor-parallel-size 2
+  --enable-expert-parallel
+  --async-scheduling
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "1"
--- a/tests/evals/gsm8k/configs/models-blackwell.txt
+++ b/tests/evals/gsm8k/configs/models-blackwell.txt
@@ -3,3 +3,5 @@ Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-CT.yaml
 DeepSeek-V2-Lite-Instruct-FP8.yaml
 Qwen3-30B-A3B-NVFP4.yaml
+Qwen3-Next-80B-A3B-NVFP4-EP2.yaml
+Qwen3-Next-FP8-EP2.yaml
--- a/tests/evals/gsm8k/configs/models-h200.txt
+++ b/tests/evals/gsm8k/configs/models-h200.txt
+DeepSeek-R1-TP.yaml
+DeepSeek-R1-DP.yaml
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml
+model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
+accuracy_threshold: 0.92
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ht.yaml
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_high_throughput"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --disable-uvicorn-access-log"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
+  VLLM_USE_DEEP_GEMM_E8M0: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml
+model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ht.yaml
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_high_throughput"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm-deepep-ll.yaml
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency --disable-uvicorn-access-log"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
+  VLLM_USE_DEEP_GEMM_E8M0: "0"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml
+model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block"
+accuracy_threshold: 0.85
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_DEEP_GEMM: "1"
+  VLLM_USE_DEEP_GEMM_MOE: "1"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutedsl-deepep-ll.yaml
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel --all2all-backend deepep_low_latency"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "masked_gemm"
--- a/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+++ b/tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml
+model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4"
+accuracy_threshold: 0.88
+num_questions: 1319
+num_fewshot: 5
+server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP4: "1"
+  VLLM_FLASHINFER_MOE_BACKEND: "throughput"