Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

0da93439 · zhuwenwen · 25f2f756 · 298e5108 · 0da93439 · 0da93439
Commit 0da93439 authored Mar 26, 2026 by zhuwenwen
20 changed files
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /tests/weight_loading @mgoin @youkaichao @yewentao256
@@ -171,6 +171,7 @@ mkdocs.yaml @hmellor

 # Pooling models
 /examples/pooling @noooop
+/docs/models/pooling_models @noooop
 /tests/models/*/pooling* @noooop
 /tests/entrypoints/pooling @noooop
 /vllm/config/pooler.py @noooop

--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -260,7 +260,7 @@ pull_request_rules:
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/structured_outputs/structured_outputs.py
      - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/entrypoints/llm/test_struct_output_generate.py
      - files~=^vllm/v1/structured_output/
  actions:
    label:
@@ -333,9 +333,10 @@ pull_request_rules:
    - label != stale
    - or:
      - files~=^tests/tool_use/
-      - files~=^tests/entrypoints/openai/tool_parsers/
-      - files=tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
-      - files~=^vllm/entrypoints/openai/tool_parsers/
+      - files~=^tests/tool_parsers/
+      - files~=^tests/entrypoints/openai/.*tool.*
+      - files~=^tests/entrypoints/anthropic/.*tool.*
+      - files~=^vllm/tool_parsers/
      - files=docs/features/tool_calling.md
      - files~=^examples/tool_chat_*
      - files=examples/offline_inference/chat_with_tools.py
@@ -381,7 +382,7 @@ pull_request_rules:
    - or:
      - files~=^vllm/model_executor/model_loader/tensorizer.py
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
-      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+      - files~=^tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
      - files~=^tests/model_executor/model_loader/tensorizer_loader/
  actions:
    assign:

--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
-#!/bin/bash
-
-set -eu
-
-# ensure 1 argument is passed
-if [ "$#" -ne 1 ]; then
-    echo "Usage: $0 <pr_number>"
-    exit 1
-fi
-
-PR_NUMBER=$1
-OLD=/tmp/orig_pr_body.txt
-NEW=/tmp/new_pr_body.txt
-
-gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
-cp "${OLD}" "${NEW}"
-
-# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
-sed -i '/<!--.*-->$/d' "${NEW}"
-
-# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
-sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
-
-# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
-sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
-
-# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
-python3 - <<EOF
-import regex as re
-
-with open("${NEW}", "r") as file:
-    content = file.read()
-
-pattern = re.compile(r'(---\n\n)?<details>.*?<summary>.*?PR Checklist \(Click to Expand\).*?</summary>.*?</details>', re.DOTALL)
-content = re.sub(pattern, '', content)
-
-with open("${NEW}", "w") as file:
-    file.write(content)
-EOF
-
-# Run this only if ${NEW} is different than ${OLD}
-if ! cmp -s "${OLD}" "${NEW}"; then
-    gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
-    echo
-    echo "Updated PR body:"
-    echo
-    cat "${NEW}"
-else
-    echo "No changes needed"
-fi
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
-name: Cleanup PR Body
-
-on:
-  pull_request_target:
-    types: [opened, reopened, edited]
-
-permissions:
-  pull-requests: write
-
-jobs:
-  update-description:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
-
-      - name: Set up Python
-        uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
-        with:
-          python-version: '3.12'
-          cache: 'pip'
-
-      - name: Install Python dependencies
-        run: |
-          python3 -m pip install --upgrade pip
-          python3 -m pip install regex
-
-      - name: Update PR description
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -383,4 +383,107 @@ jobs:
                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
                }
              }
-            }
\ No newline at end of file
+            }
+
+      - name: Request missing ROCm info from issue author
+        if: contains(steps.label-step.outputs.labels_added, 'rocm') && contains(toJSON(github.event.issue.labels.*.name), 'bug')
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
+        with:
+          script: |
+            const body = (context.payload.issue.body || '').toLowerCase();
+
+            // Check for existing bot comments to avoid duplicate requests
+            const comments = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            const botAlreadyAsked = comments.data.some(
+              c => c.user.type === 'Bot' && c.body.includes('<!-- rocm-info-request -->')
+            );
+            if (botAlreadyAsked) {
+              core.notice('ROCm info request already posted, skipping');
+              return;
+            }
+
+            // Define required information and detection patterns
+            const requiredInfo = [
+              {
+                name: 'Reproducer',
+                patterns: [
+                  /reproduc/i, /minimal.?example/i, /repro\b/i, /steps to reproduce/i,
+                  /code.?snippet/i, /sample.?code/i,
+                  /```python[\s\S]*?```/, /```bash[\s\S]*?```/, /```sh[\s\S]*?```/,
+                ],
+                ask: 'A minimal reproducer (code snippet or script that triggers the issue)',
+              },
+              {
+                name: 'Error message',
+                patterns: [
+                  /error/i, /traceback/i, /exception/i, /fault/i, /crash/i,
+                  /failed/i, /abort/i, /panic/i,
+                ],
+                ask: 'The full error message or traceback',
+              },
+              {
+                name: 'Installation method',
+                patterns: [
+                  /docker/i, /rocm\/pytorch/i, /dockerfile/i, /from source/i,
+                  /pip install/i, /build.?from/i, /container/i, /image/i,
+                  /wheel/i, /\.whl/i, /nightly/i,
+                ],
+                ask: 'How you installed vLLM (Docker image name, pip install, or build from source steps)',
+              },
+              {
+                name: 'Command',
+                patterns: [
+                  /vllm serve/i, /python\s+\S+\.py/i, /```bash[\s\S]*?```/,
+                  /```sh[\s\S]*?```/, /command/i, /launch/i, /run\s/i,
+                  /--model/i, /--tensor-parallel/i, /--gpu-memory/i,
+                ],
+                ask: 'The command you used to launch vLLM (e.g., `vllm serve ...` or the Python script)',
+              },
+              {
+                name: 'GFX architecture',
+                patterns: [
+                  /gfx\d{3,4}/i, /mi\d{3}/i, /mi\d{2}\b/i, /radeon/i,
+                  /gpu.?arch/i, /rocm-smi/i, /rocminfo/i, /navi/i,
+                  /instinct/i,
+                ],
+                ask: 'Your GPU model and GFX architecture (e.g., MI300X / gfx942) — run `rocminfo | grep gfx`',
+              },
+            ];
+
+            const issueBody = context.payload.issue.body || '';
+            const missing = requiredInfo.filter(info =>
+              !info.patterns.some(p => p.test(issueBody))
+            );
+
+            if (missing.length === 0) {
+              core.notice('All required ROCm info appears to be present');
+              return;
+            }
+
+            const author = context.payload.issue.user.login;
+            const checklist = requiredInfo.map(info => {
+              const found = !missing.includes(info);
+              return `- [${found ? 'x' : ' '}] ${info.ask}`;
+            }).join('\n');
+            const message = [
+              '<!-- rocm-info-request -->',
+              `Hi @${author}, thanks for reporting this ROCm issue!`,
+              '',
+              'To help us investigate, please make sure the following information is included:',
+              '',
+              checklist,
+              '',
+              'Please provide any unchecked items above. This will help us reproduce and resolve the issue faster. Thank you!',
+            ].join('\n');
+
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              body: message,
+            });
+            core.notice(`Requested missing ROCm info from @${author}: ${missing.map(m => m.name).join(', ')}`);
\ No newline at end of file
--- a/.github/workflows/macos-smoke-test.yml
+++ b/.github/workflows/macos-smoke-test.yml
 name: macOS Apple Silicon Smoke Test

 on:
-  push:
-    branches:
-      - main
+  schedule:
+    # Daily at 2:30 AM UTC
+    - cron: '30 2 * * *'
  workflow_dispatch:  # Manual trigger

 permissions:

--- a/.github/workflows/new_pr_bot.yml
+++ b/.github/workflows/new_pr_bot.yml
+name: New PR Bot
+
+on:
+  pull_request_target:
+    types: [opened]
+
+permissions:
+  pull-requests: write
+
+jobs:
+  update-description:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Update PR description
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const pr_number = context.issue.number;
+
+            const { data: pr } = await github.rest.pulls.get({
+              owner,
+              repo,
+              pull_number: pr_number,
+            });
+
+            let body = pr.body || '';
+            const original = body;
+
+            // Remove markdown comments (<!-- ... -->)
+            body = body.replace(/^<!--.*-->$/gm, '');
+
+            // Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ..."
+            body = body.replace(/^PLEASE FILL IN THE PR DESCRIPTION HERE.*$/gm, '');
+
+            // Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ ..."
+            body = body.replace(/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*[\s\S]*$/, '');
+
+            // Remove <details> section containing "PR Checklist (Click to Expand)"
+            body = body.replace(/(---\n\n)?<details>[\s\S]*?<summary>[\s\S]*?PR Checklist \(Click to Expand\)[\s\S]*?<\/summary>[\s\S]*?<\/details>/g, '');
+
+            if (body !== original) {
+              await github.rest.pulls.update({
+                owner,
+                repo,
+                pull_number: pr_number,
+                body,
+              });
+              console.log('Updated PR body');
+            } else {
+              console.log('No changes needed');
+            }
+
+  reminder-comment:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Post welcome comment for first-time contributors
+        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        with:
+          script: |
+            const { owner, repo } = context.repo;
+            const prAuthor = context.payload.pull_request.user.login;
+
+            const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+              q: `repo:${owner}/${repo} type:pr author:${prAuthor}`,
+              per_page: 1,
+            });
+
+            const authorPRCount = searchResults.total_count;
+            console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
+
+            if (authorPRCount === 1) {
+              console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
+              await github.rest.issues.createComment({
+                owner,
+                repo,
+                issue_number: context.issue.number,
+                body: [
+                  '\u{1f44b} Hi! Thank you for contributing to the vLLM project.',
+                  '',
+                  '\u{1f4ac} Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.',
+                  '',
+                  'Just a reminder: PRs would not trigger full CI run by default.',
+                  '',
+                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.',
+                  '',
+                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.',
+                  '',
+                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.',
+                  '',
+                  '\u{1f680}',
+                ].join('\n'),
+              });
+            } else {
+              console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
+            }
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -11,9 +11,39 @@ concurrency:

 permissions:
  contents: read
+  pull-requests: read

 jobs:
+  pre-run-check:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+    - name: Check PR label and author merge count
+      uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+      with:
+        script: |
+          const { data: pr } = await github.rest.pulls.get({
+            ...context.repo,
+            pull_number: context.payload.pull_request.number,
+          });
+
+          const hasReadyLabel = pr.labels.some(l => l.name === 'ready');
+
+          const { data: mergedPRs } = await github.rest.search.issuesAndPullRequests({
+            q: `repo:${context.repo.owner}/${context.repo.repo} is:pr is:merged author:${pr.user.login}`,
+            per_page: 4,
+          });
+          const mergedCount = mergedPRs.total_count;
+
+          if (hasReadyLabel || mergedCount >= 4) {
+            core.info(`Check passed: ready label=${hasReadyLabel}, 4+ merged PRs=${mergedCount >= 4}`);
+          } else {
+            core.setFailed(`PR must have the 'ready' label or the author must have at least 4 merged PRs (found ${mergedCount}).`);
+          }
+
  pre-commit:
+    needs: pre-run-check
+    if: always() && (needs.pre-run-check.result == 'success' || needs.pre-run-check.result == 'skipped')
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1

--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
-name: PR Reminder Comment Bot
-permissions:
-  pull-requests: write
-on:
-  pull_request_target:
-    types: [opened]
-jobs:
-  pr_reminder:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Remind to run full CI on PR
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
-        with:
-          script: |
-            try {
-              // Get the PR author
-              const prAuthor = context.payload.pull_request.user.login;
-              
-              // Check if this is the author's first PR in this repository
-              // Use GitHub's search API to find all PRs by this author
-              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
-                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
-                per_page: 100  
-              });
-              
-              const authorPRCount = searchResults.total_count;
-              
-              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
-              
-              // Only post comment if this is the first PR (only one PR by this author)
-              if (authorPRCount === 1) {
-                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
-                await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: context.issue.number,
-                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
-                  'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
-                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
-                  '🚀'
-                });
-              } else {
-                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
-              }
-            } catch (error) {
-              console.error('Error checking PR history or posting comment:', error);
-              // Don't fail the workflow, just log the error
-            }
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -340,7 +340,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

  list(APPEND VLLM_EXT_SRC
    "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/permute_cols.cu"
    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
@@ -986,6 +985,48 @@ define_extension_target(
 # Setting this variable sidesteps the issue by calling the driver directly.
 target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)

+# add OR VLLM_GPU_LANG STREQUAL "HIP" here once
+# https://github.com/vllm-project/vllm/issues/35163 is resolved
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  #
+  # _C_stable_libtorch extension (ops registered via STABLE_TORCH_LIBRARY)
+  #
+  set(VLLM_STABLE_EXT_SRC
+    "csrc/libtorch_stable/torch_bindings.cpp")
+
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    list(APPEND VLLM_STABLE_EXT_SRC "csrc/libtorch_stable/permute_cols.cu")
+  endif()
+
+  if(VLLM_GPU_LANG STREQUAL "CUDA")
+    set_gencode_flags_for_srcs(
+      SRCS "${VLLM_STABLE_EXT_SRC}"
+      CUDA_ARCHS "${CUDA_ARCHS}")
+  endif()
+
+  message(STATUS "Enabling C_stable extension.")
+  define_extension_target(
+    _C_stable_libtorch
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${VLLM_STABLE_EXT_SRC}
+    COMPILE_FLAGS ${VLLM_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    USE_SABI 3
+    WITH_SOABI)
+
+  # Set TORCH_TARGET_VERSION for stable ABI compatibility.
+  # This ensures we only use C-shim APIs available in PyTorch 2.10.
+  # _C_stable_libtorch is abi compatible with PyTorch >= TORCH_TARGET_VERSION
+  # which is currently set to 2.10.
+  target_compile_definitions(_C_stable_libtorch PRIVATE
+    TORCH_TARGET_VERSION=0x020A000000000000ULL)
+
+  # Needed to use cuda APIs from C-shim
+  target_compile_definitions(_C_stable_libtorch PRIVATE
+    USE_CUDA)
+endif()
+
 #
 # _moe_C extension
 #
@@ -999,6 +1040,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC
    "csrc/moe/moe_wna16.cu"
    "csrc/moe/grouped_topk_kernels.cu"
+    "csrc/moe/gpt_oss_router_gemm.cu"
    "csrc/moe/router_gemm.cu")
 endif()


--- a/benchmarks/attention_benchmarks/benchmark.py
+++ b/benchmarks/attention_benchmarks/benchmark.py
@@ -47,6 +47,8 @@ from common import (
    is_mla_backend,
 )

+from vllm.v1.worker.workspace import init_workspace_manager
+

 def run_standard_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
    """Run standard attention benchmark (Flash/Triton/FlashInfer)."""
@@ -462,7 +464,7 @@ def main():
    parser.add_argument(
        "--batch-specs",
        nargs="+",
-        default=["q2k", "8q1s1k"],
+        default=None,
        help="Batch specifications using extended grammar",
    )

@@ -478,6 +480,21 @@ def main():
    parser.add_argument("--repeats", type=int, default=1, help="Repetitions")
    parser.add_argument("--warmup-iters", type=int, default=3, help="Warmup iterations")
    parser.add_argument("--profile-memory", action="store_true", help="Profile memory")
+    parser.add_argument(
+        "--kv-cache-dtype",
+        default="auto",
+        choices=["auto", "fp8"],
+        help="KV cache dtype: auto or fp8",
+    )
+    parser.add_argument(
+        "--cuda-graphs",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Launch kernels with CUDA graphs to eliminate CPU overhead"
+            "in measurements (default: True)"
+        ),
+    )

    # Parameter sweep (use YAML config for advanced sweeps)
    parser.add_argument(
@@ -536,21 +553,24 @@ def main():

        # Batch specs and sizes
        # Support both explicit batch_specs and generated batch_spec_ranges
-        if "batch_spec_ranges" in yaml_config:
-            # Generate batch specs from ranges
-            generated_specs = generate_batch_specs_from_ranges(
-                yaml_config["batch_spec_ranges"]
-            )
-            # Combine with any explicit batch_specs
-            if "batch_specs" in yaml_config:
-                args.batch_specs = yaml_config["batch_specs"] + generated_specs
-            else:
-                args.batch_specs = generated_specs
-            console.print(
-                f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
-            )
-        elif "batch_specs" in yaml_config:
-            args.batch_specs = yaml_config["batch_specs"]
+        # CLI --batch-specs takes precedence over YAML when provided.
+        cli_batch_specs_provided = args.batch_specs is not None
+        if not cli_batch_specs_provided:
+            if "batch_spec_ranges" in yaml_config:
+                # Generate batch specs from ranges
+                generated_specs = generate_batch_specs_from_ranges(
+                    yaml_config["batch_spec_ranges"]
+                )
+                # Combine with any explicit batch_specs
+                if "batch_specs" in yaml_config:
+                    args.batch_specs = yaml_config["batch_specs"] + generated_specs
+                else:
+                    args.batch_specs = generated_specs
+                console.print(
+                    f"[dim]Generated {len(generated_specs)} batch specs from ranges[/]"
+                )
+            elif "batch_specs" in yaml_config:
+                args.batch_specs = yaml_config["batch_specs"]

        if "batch_sizes" in yaml_config:
            args.batch_sizes = yaml_config["batch_sizes"]
@@ -575,6 +595,10 @@ def main():
            args.warmup_iters = yaml_config["warmup_iters"]
        if "profile_memory" in yaml_config:
            args.profile_memory = yaml_config["profile_memory"]
+        if "kv_cache_dtype" in yaml_config:
+            args.kv_cache_dtype = yaml_config["kv_cache_dtype"]
+        if "cuda_graphs" in yaml_config:
+            args.cuda_graphs = yaml_config["cuda_graphs"]

        # Parameter sweep configuration
        if "parameter_sweep" in yaml_config:
@@ -629,12 +653,18 @@ def main():
    # Determine backends
    backends = args.backends or ([args.backend] if args.backend else ["flash"])
    prefill_backends = getattr(args, "prefill_backends", None)
+    if not args.batch_specs:
+        args.batch_specs = ["q2k", "8q1s1k"]
    console.print(f"Backends: {', '.join(backends)}")
    if prefill_backends:
        console.print(f"Prefill backends: {', '.join(prefill_backends)}")
    console.print(f"Batch specs: {', '.join(args.batch_specs)}")
+    console.print(f"KV cache dtype: {args.kv_cache_dtype}")
+    console.print(f"CUDA graphs: {args.cuda_graphs}")
    console.print()

+    init_workspace_manager(args.device)
+
    # Run benchmarks
    all_results = []

@@ -687,6 +717,8 @@ def main():
                        repeats=args.repeats,
                        warmup_iters=args.warmup_iters,
                        profile_memory=args.profile_memory,
+                        kv_cache_dtype=args.kv_cache_dtype,
+                        use_cuda_graphs=args.cuda_graphs,
                    )

                    # Add decode pipeline config
@@ -839,6 +871,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_model_parameter_sweep(
            backends,
@@ -861,6 +895,8 @@ def main():
            "repeats": args.repeats,
            "warmup_iters": args.warmup_iters,
            "profile_memory": args.profile_memory,
+            "kv_cache_dtype": args.kv_cache_dtype,
+            "use_cuda_graphs": args.cuda_graphs,
        }
        all_results = run_parameter_sweep(
            backends, args.batch_specs, base_config_args, args.parameter_sweep, console
@@ -891,6 +927,8 @@ def main():
                            repeats=args.repeats,
                            warmup_iters=args.warmup_iters,
                            profile_memory=args.profile_memory,
+                            kv_cache_dtype=args.kv_cache_dtype,
+                            use_cuda_graphs=args.cuda_graphs,
                        )

                        result = run_benchmark(config)

--- a/benchmarks/attention_benchmarks/common.py
+++ b/benchmarks/attention_benchmarks/common.py
@@ -213,6 +213,9 @@ class BenchmarkConfig:
    profile_memory: bool = False
    use_cuda_graphs: bool = False

+    # "auto" or "fp8"
+    kv_cache_dtype: str = "auto"
+
    # MLA-specific
    prefill_backend: str | None = None
    kv_lora_rank: int | None = None
@@ -369,6 +372,7 @@ class ResultsFormatter:
                    "backend",
                    "batch_spec",
                    "num_layers",
+                    "kv_cache_dtype",
                    "mean_time",
                    "std_time",
                    "throughput",
@@ -382,6 +386,7 @@ class ResultsFormatter:
                        "backend": r.config.backend,
                        "batch_spec": r.config.batch_spec,
                        "num_layers": r.config.num_layers,
+                        "kv_cache_dtype": r.config.kv_cache_dtype,
                        "mean_time": r.mean_time,
                        "std_time": r.std_time,
                        "throughput": r.throughput_tokens_per_sec or 0,

--- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml
@@ -30,9 +30,9 @@ batch_specs:
  - "2q16k_32q1s4k"         # 2 very large prefill + 32 decode

  # Context extension + decode
-  - "2q1kkv2k_16q1s1k"       # 2 extend + 16 decode
-  - "4q2kkv4k_32q1s2k"       # 4 extend + 32 decode
-  - "2q1kkv8k_32q1s2k"       # 2 large extend + 32 decode
+  - "2q1ks2k_16q1s1k"       # 2 extend + 16 decode
+  - "4q2ks4k_32q1s2k"       # 4 extend + 32 decode
+  - "2q1ks8k_32q1s2k"       # 2 large extend + 32 decode

  # Explicitly chunked prefill
  - "q8k"           # 8k prefill with chunking hint

--- a/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
+++ b/benchmarks/attention_benchmarks/configs/mla_sparse_decode.yaml
+# MLA decode-only benchmark configuration
+
+model:
+  name: "deepseek-v3"
+  num_layers: 60
+  num_q_heads: 128  # Base value, can be swept for TP simulation
+  num_kv_heads: 1  # MLA uses single latent KV
+  head_dim: 576
+  kv_lora_rank: 512
+  qk_nope_head_dim: 128
+  qk_rope_head_dim: 64
+  v_head_dim: 128
+  block_size: 128  # CUTLASS MLA and FlashAttn MLA use 128
+
+# Model parameter sweep: simulate tensor parallelism by varying num_q_heads
+# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads
+model_parameter_sweep:
+  param_name: "num_q_heads"
+  values: [128, 64, 32, 16]
+  label_format: "{backend}_{value}h"
+
+batch_specs:
+  # Small batches, varying sequence lengths
+  - "16q1s512"     # 16 requests, 512 KV cache
+  - "16q1s1k"      # 16 requests, 1k KV cache
+  - "16q1s2k"      # 16 requests, 2k KV cache
+  - "16q1s4k"      # 16 requests, 4k KV cache
+
+  # Medium batches
+  - "32q1s1k"      # 32 requests, 1k KV cache
+  - "32q1s2k"      # 32 requests, 2k KV cache
+  - "32q1s4k"      # 32 requests, 4k KV cache
+  - "32q1s8k"      # 32 requests, 8k KV cache
+
+  # Large batches
+  - "64q1s1k"      # 64 requests, 1k KV cache
+  - "64q1s2k"      # 64 requests, 2k KV cache
+  - "64q1s4k"      # 64 requests, 4k KV cache
+  - "64q1s8k"      # 64 requests, 8k KV cache
+
+  # Very large batches
+  - "128q1s1k"     # 128 requests, 1k KV cache
+  - "128q1s2k"     # 128 requests, 2k KV cache
+  - "128q1s4k"     # 128 requests, 4k KV cache
+  - "128q1s8k"     # 128 requests, 8k KV cache
+
+  # Long context
+  - "32q1s16k"     # 32 requests, 16k KV cache
+  - "32q1s32k"     # 32 requests, 32k KV cache
+
+backends:
+  - FLASHMLA_SPARSE
+  - FLASHINFER_MLA_SPARSE
+
+device: "cuda:0"
+repeats: 100
+warmup_iters: 10
+profile_memory: true
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -60,9 +60,11 @@ def create_minimal_vllm_config(
    model_name: str = "deepseek-v3",
    block_size: int = 128,
    max_num_seqs: int = 256,
+    max_num_batched_tokens: int = 8192,
    mla_dims: dict | None = None,
    index_topk: int | None = None,
    prefill_backend: str | None = None,
+    kv_cache_dtype: str = "auto",
 ) -> VllmConfig:
    """
    Create minimal VllmConfig for MLA benchmarks.
@@ -149,13 +151,13 @@ def create_minimal_vllm_config(
    cache_config = CacheConfig(
        block_size=block_size,
        gpu_memory_utilization=0.9,
-        cache_dtype="auto",
+        cache_dtype=kv_cache_dtype,
        enable_prefix_caching=False,
    )

    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
-        max_num_batched_tokens=8192,
+        max_num_batched_tokens=max(max_num_batched_tokens, max_num_seqs),
        max_model_len=32768,
        is_encoder_decoder=False,
        enable_chunked_prefill=True,
@@ -535,6 +537,7 @@ def _create_backend_impl(
    device: torch.device,
    max_num_tokens: int = 8192,
    index_topk: int | None = None,
+    kv_cache_dtype: str = "auto",
 ):
    """
    Create backend implementation instance.
@@ -583,7 +586,7 @@ def _create_backend_impl(
        "num_kv_heads": mla_dims["num_kv_heads"],
        "alibi_slopes": None,
        "sliding_window": None,
-        "kv_cache_dtype": "auto",
+        "kv_cache_dtype": kv_cache_dtype,
        "logits_soft_cap": None,
        "attn_type": "decoder",
        "kv_sharing_target_layer_name": None,
@@ -701,6 +704,7 @@ def _run_single_benchmark(
    mla_dims: dict,
    device: torch.device,
    indexer=None,
+    kv_cache_dtype: str | None = None,
 ) -> BenchmarkResult:
    """
    Run a single benchmark iteration.
@@ -734,49 +738,124 @@ def _run_single_benchmark(
    )

    # Create KV cache
-    kv_cache = torch.zeros(
-        num_blocks,
-        block_size,
-        mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"],
-        device=device,
-        dtype=torch.bfloat16,
-    )
+    if kv_cache_dtype is None:
+        kv_cache_dtype = getattr(config, "kv_cache_dtype", "auto")
+    head_size = mla_dims["kv_lora_rank"] + mla_dims["qk_rope_head_dim"]
+    if kv_cache_dtype == "fp8_ds_mla":
+        # FlashMLA sparse custom format: 656 bytes per token, stored as uint8.
+        # Layout: kv_lora_rank fp8 bytes + 4 float32 tile scales
+        #         + 2*rope_dim bf16 bytes
+        # = 512 + 16 + 128 = 656 bytes for DeepSeek dims.
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            656,
+            device=device,
+            dtype=torch.uint8,
+        )
+    elif kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform

-    # Create input tensors for both decode and prefill modes
-    decode_inputs, prefill_inputs = _create_input_tensors(
-        total_q,
-        mla_dims,
-        backend_cfg["query_format"],
-        device,
-        torch.bfloat16,
-    )
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.uint8,
+        ).view(current_platform.fp8_dtype())
+    else:
+        kv_cache = torch.zeros(
+            num_blocks,
+            block_size,
+            head_size,
+            device=device,
+            dtype=torch.bfloat16,
+        )

    # Fill indexer with random indices for sparse backends
    is_sparse = backend_cfg.get("is_sparse", False)
    if is_sparse and indexer is not None:
        indexer.fill_random_indices(total_q, max_kv_len)

-    # Determine which forward method to use based on metadata
-    if metadata.decode is not None:
-        forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer)
-    elif metadata.prefill is not None:
-        forward_fn = lambda: impl.forward_mha(
-            prefill_inputs["q"],
-            prefill_inputs["k_c_normed"],
-            prefill_inputs["k_pe"],
-            kv_cache,
-            metadata,
-            prefill_inputs["k_scale"],
-            prefill_inputs["output"],
-        )
-    else:
+    # Determine which forward methods to use based on metadata.
+    # Sparse MLA backends always use forward_mqa
+    has_decode = is_sparse or getattr(metadata, "decode", None) is not None
+    has_prefill = not is_sparse and getattr(metadata, "prefill", None) is not None
+    if not has_decode and not has_prefill:
        raise RuntimeError("Metadata has neither decode nor prefill metadata")

+    num_decode = (
+        metadata.num_decode_tokens
+        if (has_decode and has_prefill)
+        else total_q
+        if has_decode
+        else 0
+    )
+    num_prefill = total_q - num_decode
+
+    # Some backends requires fp8 queries when using fp8 KV cache.
+    is_fp8_kvcache = kv_cache_dtype.startswith("fp8")
+    quantize_query = is_fp8_kvcache and getattr(
+        impl, "supports_quant_query_input", False
+    )
+
+    # quantize_query forces concat format
+    query_fmt = "concat" if quantize_query else backend_cfg["query_format"]
+
+    # Create decode query tensors
+    if has_decode:
+        decode_inputs, _ = _create_input_tensors(
+            num_decode, mla_dims, query_fmt, device, torch.bfloat16
+        )
+        # Cast decode query to fp8 if the backend supports it
+        if quantize_query:
+            from vllm.platforms import current_platform
+
+            if isinstance(decode_inputs, tuple):
+                decode_inputs = torch.cat(list(decode_inputs), dim=-1)
+            decode_inputs = decode_inputs.to(current_platform.fp8_dtype())
+
+    # Create prefill input tensors
+    if has_prefill:
+        _, prefill_inputs = _create_input_tensors(
+            num_prefill, mla_dims, query_fmt, device, torch.bfloat16
+        )
+
+    # Build forward function
+    def forward_fn():
+        results = []
+        if has_decode:
+            results.append(impl.forward_mqa(decode_inputs, kv_cache, metadata, layer))
+        if has_prefill:
+            results.append(
+                impl.forward_mha(
+                    prefill_inputs["q"],
+                    prefill_inputs["k_c_normed"],
+                    prefill_inputs["k_pe"],
+                    kv_cache,
+                    metadata,
+                    prefill_inputs["k_scale"],
+                    prefill_inputs["output"],
+                )
+            )
+        return results[0] if len(results) == 1 else tuple(results)
+
    # Warmup
    for _ in range(config.warmup_iters):
        forward_fn()
    torch.accelerator.synchronize()

+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            forward_fn()
+        benchmark_fn = graph.replay
+    else:
+        benchmark_fn = forward_fn
+
    # Benchmark
    times = []
    for _ in range(config.repeats):
@@ -785,7 +864,7 @@ def _run_single_benchmark(

        start.record()
        for _ in range(config.num_layers):
-            forward_fn()
+            benchmark_fn()
        end.record()

        torch.accelerator.synchronize()
@@ -852,13 +931,30 @@ def _run_mla_benchmark_batched(
    # Determine if this is a sparse backend
    is_sparse = backend_cfg.get("is_sparse", False)

+    # Extract kv_cache_dtype from the first config
+    kv_cache_dtype = getattr(first_config, "kv_cache_dtype", "auto")
+
+    # FlashMLA sparse only supports "fp8_ds_mla" internally (not generic "fp8").
+    # Remap here so the user can pass --kv-cache-dtype fp8 regardless of backend.
+    if backend.upper() == "FLASHMLA_SPARSE" and kv_cache_dtype == "fp8":
+        kv_cache_dtype = "fp8_ds_mla"
+
+    # Compute max total_q across all configs so the metadata builder buffer
+    # and scheduler config are large enough for all batch specs.
+    max_total_q = max(
+        sum(r.q_len for r in parse_batch_spec(cfg.batch_spec))
+        for cfg, *_ in configs_with_params
+    )
+
    # Create and set vLLM config for MLA (reused across all benchmarks)
    vllm_config = create_minimal_vllm_config(
        model_name="deepseek-v3",  # Used only for model path
        block_size=block_size,
+        max_num_batched_tokens=max_total_q,
        mla_dims=mla_dims,  # Use custom dims from config or default
        index_topk=index_topk if is_sparse else None,
        prefill_backend=prefill_backend,
+        kv_cache_dtype=kv_cache_dtype,
    )

    results = []
@@ -883,7 +979,9 @@ def _run_mla_benchmark_batched(
            mla_dims,
            vllm_config,
            device,
+            max_num_tokens=max_total_q,
            index_topk=index_topk if is_sparse else None,
+            kv_cache_dtype=kv_cache_dtype,
        )

        # Verify the actual prefill backend matches what was requested
@@ -942,6 +1040,7 @@ def _run_mla_benchmark_batched(
                    mla_dims,
                    device,
                    indexer=indexer,
+                    kv_cache_dtype=kv_cache_dtype,
                )
                results.append(result)


--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -140,7 +140,7 @@ def _create_vllm_config(

    cache_config = CacheConfig(
        block_size=config.block_size,
-        cache_dtype="auto",
+        cache_dtype=config.kv_cache_dtype,
    )
    cache_config.num_gpu_blocks = max_num_blocks
    cache_config.num_cpu_blocks = 0
@@ -215,7 +215,7 @@ def _create_backend_impl(
        num_kv_heads=config.num_kv_heads,
        alibi_slopes=None,
        sliding_window=None,
-        kv_cache_dtype="auto",
+        kv_cache_dtype=config.kv_cache_dtype,
    )

    kv_cache_spec = FullAttentionSpec(
@@ -288,12 +288,22 @@ def _create_input_tensors(
    total_q: int,
    device: torch.device,
    dtype: torch.dtype,
+    quantize_query: bool = False,
 ) -> tuple:
-    """Create Q, K, V input tensors for all layers."""
+    """Create Q, K, V input tensors for all layers.
+
+    When quantize_query is True, queries are cast to fp8 to match backends
+    that require query/key/value dtype consistency.
+    """
+    q_dtype = dtype
+    if quantize_query:
+        from vllm.platforms import current_platform
+
+        q_dtype = current_platform.fp8_dtype()
    q_list = [
        torch.randn(
            total_q, config.num_q_heads, config.head_dim, device=device, dtype=dtype
-        )
+        ).to(q_dtype)
        for _ in range(config.num_layers)
    ]
    k_list = [
@@ -344,10 +354,17 @@ def _create_kv_cache(
    # Compute inverse permutation to get back to logical view
    inv_order = [stride_order.index(i) for i in range(len(stride_order))]

+    # Use fp8 dtype for cache when requested.
+    cache_dtype = dtype
+    if config.kv_cache_dtype == "fp8":
+        from vllm.platforms import current_platform
+
+        cache_dtype = current_platform.fp8_dtype()
+
    cache_list = []
    for _ in range(config.num_layers):
        # Allocate in physical layout order (contiguous in memory)
-        cache = torch.zeros(*physical_shape, device=device, dtype=dtype)
+        cache = torch.zeros(*physical_shape, device=device, dtype=cache_dtype)
        # Permute to logical view
        cache = cache.permute(*inv_order)
        cache_list.append(cache)
@@ -392,6 +409,37 @@ def _run_single_benchmark(
            )
    torch.accelerator.synchronize()

+    # Optionally capture a CUDA graph after warmup.
+    # Graph replay eliminates CPU launch overhead so timings reflect pure
+    # kernel time.
+    if config.use_cuda_graphs:
+        graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(graph):
+            for i in range(config.num_layers):
+                impl.forward(
+                    layer,
+                    q_list[i],
+                    k_list[i],
+                    v_list[i],
+                    cache_list[i],
+                    attn_metadata,
+                    output=out,
+                )
+        benchmark_fn = graph.replay
+    else:
+
+        def benchmark_fn():
+            for i in range(config.num_layers):
+                impl.forward(
+                    layer,
+                    q_list[i],
+                    k_list[i],
+                    v_list[i],
+                    cache_list[i],
+                    attn_metadata,
+                    output=out,
+                )
+
    # Benchmark
    times = []
    for _ in range(config.repeats):
@@ -399,16 +447,7 @@ def _run_single_benchmark(
        end = torch.cuda.Event(enable_timing=True)

        start.record()
-        for i in range(config.num_layers):
-            impl.forward(
-                layer,
-                q_list[i],
-                k_list[i],
-                v_list[i],
-                cache_list[i],
-                attn_metadata,
-                output=out,
-            )
+        benchmark_fn()
        end.record()

        torch.accelerator.synchronize()
@@ -502,8 +541,12 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
                common_attn_metadata=common_metadata,
            )

+            # Only quantize queries when the impl supports it
+            quantize_query = config.kv_cache_dtype.startswith("fp8") and getattr(
+                impl, "supports_quant_query_input", False
+            )
            q_list, k_list, v_list = _create_input_tensors(
-                config, total_q, device, dtype
+                config, total_q, device, dtype, quantize_query=quantize_query
            )

            cache_list = _create_kv_cache(

--- a/benchmarks/benchmark_long_document_qa_throughput.py
+++ b/benchmarks/benchmark_long_document_qa_throughput.py
@@ -40,9 +40,9 @@ LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
 details.
 """

-import dataclasses
 import random
 import time
+from dataclasses import fields

 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -124,7 +124,7 @@ def main(args):

    # Create the LLM engine
    engine_args = EngineArgs.from_cli_args(args)
-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

    print("------warm up------")

--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -32,6 +32,7 @@ import dataclasses
 import json
 import random
 import time
+from dataclasses import fields

 from transformers import PreTrainedTokenizerBase

@@ -196,7 +197,7 @@ def main(args):

    engine_args = EngineArgs.from_cli_args(args)

-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})

    sampling_params = SamplingParams(
        temperature=0,

--- a/benchmarks/benchmark_prioritization.py
+++ b/benchmarks/benchmark_prioritization.py
@@ -3,10 +3,10 @@
 """Benchmark offline prioritization."""

 import argparse
-import dataclasses
 import json
 import random
 import time
+from dataclasses import fields

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -79,7 +79,7 @@ def run_vllm(
 ) -> float:
    from vllm import LLM, SamplingParams

-    llm = LLM(**dataclasses.asdict(engine_args))
+    llm = LLM(**{f.name: getattr(engine_args, f.name) for f in fields(engine_args)})

    assert all(
        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -750,17 +750,20 @@ def get_weight_block_size_safety(config, default_value=None):


 def get_model_params(config):
-    if config.architectures[0] == "DbrxForCausalLM":
+    architectures = getattr(config, "architectures", None) or [type(config).__name__]
+    architecture = architectures[0]
+
+    if architecture == "DbrxForCausalLM":
        E = config.ffn_config.moe_num_experts
        topk = config.ffn_config.moe_top_k
        intermediate_size = config.ffn_config.ffn_hidden_size
        hidden_size = config.hidden_size
-    elif config.architectures[0] == "JambaForCausalLM":
+    elif architecture == "JambaForCausalLM":
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.intermediate_size
        hidden_size = config.hidden_size
-    elif config.architectures[0] in (
+    elif architecture in (
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
@@ -774,7 +777,7 @@ def get_model_params(config):
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        hidden_size = config.hidden_size
-    elif config.architectures[0] in (
+    elif architecture in (
        "Qwen2MoeForCausalLM",
        "Qwen3MoeForCausalLM",
        "Qwen3NextForCausalLM",
@@ -783,23 +786,27 @@ def get_model_params(config):
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
        hidden_size = config.hidden_size
-    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+    elif architecture in (
+        "Qwen3VLMoeForConditionalGeneration",
+        "Qwen3_5MoeForConditionalGeneration",
+        "Qwen3_5MoeTextConfig",
+    ):
        text_config = config.get_text_config()
        E = text_config.num_experts
        topk = text_config.num_experts_per_tok
        intermediate_size = text_config.moe_intermediate_size
        hidden_size = text_config.hidden_size
-    elif config.architectures[0] == "HunYuanMoEV1ForCausalLM":
+    elif architecture == "HunYuanMoEV1ForCausalLM":
        E = config.num_experts
        topk = config.moe_topk[0]
        intermediate_size = config.moe_intermediate_size[0]
        hidden_size = config.hidden_size
-    elif config.architectures[0] == "Qwen3OmniMoeForConditionalGeneration":
+    elif architecture == "Qwen3OmniMoeForConditionalGeneration":
        E = config.thinker_config.text_config.num_experts
        topk = config.thinker_config.text_config.num_experts_per_tok
        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
        hidden_size = config.thinker_config.text_config.hidden_size
-    elif config.architectures[0] == "PixtralForConditionalGeneration":
+    elif architecture == "PixtralForConditionalGeneration":
        # Pixtral can contain different LLM architectures,
        # recurse to get their parameters
        return get_model_params(config.get_text_config())
@@ -814,6 +821,23 @@ def get_model_params(config):
    return E, topk, intermediate_size, hidden_size


+def resolve_dtype(config) -> torch.dtype:
+    if current_platform.is_rocm():
+        return torch.float16
+
+    dtype = getattr(config, "dtype", None)
+    if dtype is not None:
+        return dtype
+
+    if hasattr(config, "get_text_config"):
+        text_config = config.get_text_config()
+        dtype = getattr(text_config, "dtype", None)
+        if dtype is not None:
+            return dtype
+
+    return torch.bfloat16
+
+
 def get_quantization_group_size(config) -> int | None:
    """Extract the quantization group size from the HF model config.

@@ -861,7 +885,7 @@ def main(args: argparse.Namespace):
    else:
        ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
+    dtype = resolve_dtype(config)
    use_fp8_w8a8 = args.dtype == "fp8_w8a8"
    use_int8_w8a16 = args.dtype == "int8_w8a16"
    use_int4_w4a16 = args.dtype == "int4_w4a16"