Unverified Commit f72b2097 authored by Tyler Michael Smith, committed by GitHub
Browse files

[Bugfix] Reject non-nvfp4 dtypes when using the flashinfer_nvlink_one_sided...


[Bugfix] Reject non-nvfp4 dtypes when using the flashinfer_nvlink_one_sided all2all backend (#39717)
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 610a3efc
......@@ -225,6 +225,14 @@ def maybe_make_prepare_finalize(
elif moe.use_fi_nvl_one_sided_kernels:
assert quant_config is not None
if quant_config.quant_dtype != "nvfp4":
raise ValueError(
"The 'flashinfer_nvlink_one_sided' all2all backend only "
"supports nvfp4 activation quantization, but got "
f"quant_dtype={quant_config.quant_dtype!r}. Use a different "
"all2all backend (e.g. 'flashinfer_nvlink_two_sided' or "
"'allgather_reducescatter') for non-nvfp4 models."
)
max_num_tokens = (
get_current_vllm_config().scheduler_config.max_num_batched_tokens
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment