From d9bede0314ba19a3f8336dcaeeeaf9e2c5487053 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sun, 8 Feb 2026 23:15:46 -0800 Subject: [PATCH 0001/1166] [BugFix] Fix `fastsafetensors` TP all procs using all GPUs (#34070) Signed-off-by: Nick Hill Co-authored-by: Cyrus Leung --- vllm/model_executor/model_loader/weight_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 13a60c7b7..d43656c4f 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -801,8 +801,8 @@ def runai_safetensors_weights_iterator( yield from tensor_iter -def _init_loader( - pg: torch.distributed.ProcessGroup, +def _init_fastsafetensors_loader( + pg: "torch.distributed.ProcessGroup", device: torch.device, f_list: list[str], *, @@ -825,13 +825,16 @@ def fastsafetensors_weights_iterator( else: pg = SingleGroup() - device = torch.device(f"cuda:{pg.rank()}") + device = torch.device(f"cuda:{current_platform.current_device()}") weight_files_sub_lists = [ hf_weights_files[i : i + pg.size()] for i in range(0, len(hf_weights_files), pg.size()) ] - nogds = False + # Use nogds=True for TP > 1 to avoid cuFileDriverOpen() which + # initializes the GDS DMA subsystem for all visible GPUs, creating + # unwanted CUDA contexts on every device. 
+ nogds = pg.size() > 1 for f_list in tqdm( weight_files_sub_lists, @@ -839,7 +842,7 @@ def fastsafetensors_weights_iterator( disable=not enable_tqdm(use_tqdm_on_load), bar_format=_BAR_FORMAT, ): - loader = _init_loader(pg, device, f_list, nogds=nogds) + loader = _init_fastsafetensors_loader(pg, device, f_list, nogds=nogds) try: try: fb = loader.copy_files_to_device() @@ -853,7 +856,7 @@ def fastsafetensors_weights_iterator( "GDS not enabled, setting `nogds=True`.\n" "For more information, see: https://github.com/foundation-model-stack/fastsafetensors?tab=readme-ov-file#basic-api-usages" ) - loader = _init_loader(pg, device, f_list, nogds=nogds) + loader = _init_fastsafetensors_loader(pg, device, f_list, nogds=nogds) fb = loader.copy_files_to_device() try: -- GitLab From 5a5c43511ac98299856d0fee6c619fdd8bcdd2ef Mon Sep 17 00:00:00 2001 From: ihb2032 <40718643+ihb2032@users.noreply.github.com> Date: Mon, 9 Feb 2026 16:55:41 +0800 Subject: [PATCH 0002/1166] fix(cpu): fix mla_decode compilation on x86 without AVX512 (#34052) Signed-off-by: ihb2032 Co-authored-by: root --- csrc/cpu/mla_decode.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/csrc/cpu/mla_decode.cpp b/csrc/cpu/mla_decode.cpp index bd489b463..564055ef5 100644 --- a/csrc/cpu/mla_decode.cpp +++ b/csrc/cpu/mla_decode.cpp @@ -38,16 +38,7 @@ struct KernelVecType { using qk_vec_type = vec_op::BF16Vec32; using v_load_vec_type = vec_op::BF16Vec16; }; - -#elif defined(__s390x__) -template <> -struct KernelVecType { - using qk_load_vec_type = vec_op::BF16Vec16; - using qk_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; - -#elif defined(__aarch64__) +#else template <> struct KernelVecType { using qk_load_vec_type = vec_op::BF16Vec16; -- GitLab From 978a37c82387ce4a40aaadddcdbaf4a06fc4d590 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 9 Feb 2026 17:32:52 +0800 Subject: [PATCH 0003/1166] [Model] GLM adaptation (#34124) --- 
benchmarks/kernels/benchmark_moe.py | 1 + tests/models/registry.py | 3 +++ tests/models/test_initialization.py | 2 +- vllm/config/speculative.py | 2 +- vllm/model_executor/models/deepseek_v2.py | 6 +++++- vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/model_arch_config_convertor.py | 1 + 7 files changed, 13 insertions(+), 3 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 773926bff..c35cdb121 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -686,6 +686,7 @@ def get_model_params(config): "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM", + "GlmMoeDsaForCausalLM", "Glm4MoeForCausalLM", "Glm4MoeLiteForCausalLM", "NemotronHForCausalLM", diff --git a/tests/models/registry.py b/tests/models/registry.py index 8ae94d080..f688985ce 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -275,6 +275,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "zai-org/GLM-4.7-Flash", min_transformers_version="5.0.0", ), + "GlmMoeDsaForCausalLM": _HfExamplesInfo( + "zai-org/GLM-5", min_transformers_version="5.0.1", is_available_online=False + ), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo( "bigcode/starcoder", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 0e5272d50..4ee86416a 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -97,7 +97,7 @@ def can_initialize( "pickle error when loading `transformers.models.auto.CONFIG_MAPPING`" ) - if model_arch == "DeepseekV32ForCausalLM": + if model_arch in ["DeepseekV32ForCausalLM", "GlmMoeDsaForCausalLM"]: from vllm.platforms import current_platform capability = current_platform.get_device_capability() diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 5a2fe8eeb..8a54dbb6d 100644 --- 
a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -181,7 +181,7 @@ class SpeculativeConfig: @staticmethod def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: initial_architecture = hf_config.architectures[0] - if hf_config.model_type in ("deepseek_v3", "deepseek_v32"): + if hf_config.model_type in ("deepseek_v3", "deepseek_v32", "glm_moe_dsa"): hf_config.model_type = "deepseek_mtp" if hf_config.model_type == "deepseek_mtp": n_predict = getattr(hf_config, "num_nextn_predict_layers", None) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 464518a3d..ab4f498b9 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module): qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, - is_neox_style=True, + is_neox_style=not getattr(config, "indexer_rope_interleave", True), ) self.indexer = Indexer( vllm_config, @@ -1499,6 +1499,10 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): pass +class GlmMoeDsaForCausalLM(DeepseekV2ForCausalLM): + pass + + # Compatibility with # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/configuration_deepseek.py def get_spec_layer_idx_from_weight_name( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c310f6f17..6e68b24ba 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -114,6 +114,7 @@ _TEXT_GENERATION_MODELS = { "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), "Glm4MoeLiteForCausalLM": ("glm4_moe_lite", "Glm4MoeLiteForCausalLM"), + "GlmMoeDsaForCausalLM": ("deepseek_v2", "GlmMoeDsaForCausalLM"), "GptOssForCausalLM": ("gpt_oss", "GptOssForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", 
"GPTBigCodeForCausalLM"), diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py index bd6b7376e..f82186639 100644 --- a/vllm/transformers_utils/model_arch_config_convertor.py +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -237,6 +237,7 @@ class ModelArchConfigConvertorBase: "deepseek_v3", "deepseek_v32", "deepseek_mtp", + "glm_moe_dsa", "glm4_moe_lite", "glm4_moe_lite_mtp", "kimi_k2", -- GitLab From 3025b3cebb1f019ccd6918cc54da1ca32f53a777 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Mon, 9 Feb 2026 03:37:04 -0600 Subject: [PATCH 0004/1166] [CI] Remove empty image_size_factors for fuyu, glm4_1v, glm_ocr (#34107) Signed-off-by: Andreas Karatzas --- tests/models/multimodal/generation/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 4dab4b7d9..d9b7a2821 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -377,7 +377,7 @@ VLM_TEST_SETTINGS = { use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[large_gpu_mark(min_gb=32)], ), "gemma3": VLMTestInfo( @@ -437,7 +437,7 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], num_logprobs=10, - image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], auto_cls=AutoModelForImageTextToText, marks=[large_gpu_mark(min_gb=32)], ), @@ -468,7 +468,7 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, get_stop_token_ids=lambda tok: [151329, 151336, 151338], num_logprobs=10, - image_size_factors=[(), (0.25,), (0.25, 
0.25, 0.25), (0.25, 0.2, 0.15)], + image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], auto_cls=AutoModelForImageTextToText, marks=[large_gpu_mark(min_gb=32)], ), -- GitLab From 1d5922fadeebc5ec133dc1c88eb1e85605a5510c Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Mon, 9 Feb 2026 05:02:37 -0500 Subject: [PATCH 0005/1166] [ASR] Fix audio benchmark and add RTFx metric (#32300) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Co-authored-by: Nicolò Lucchesi --- docs/benchmarking/cli.md | 17 +++++ vllm/benchmarks/datasets.py | 74 +++++++++++++++----- vllm/benchmarks/lib/endpoint_request_func.py | 7 +- vllm/benchmarks/serve.py | 11 +++ 4 files changed, 90 insertions(+), 19 deletions(-) diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md index 43b6052de..7bb91239c 100644 --- a/docs/benchmarking/cli.md +++ b/docs/benchmarking/cli.md @@ -30,6 +30,7 @@ th { | HuggingFace-Other | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` | | HuggingFace-MTBench | ✅ | ✅ | `philschmid/mt-bench` | | HuggingFace-Blazedit | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` | +| HuggingFace-ASR | ✅ | ✅ | `openslr/librispeech_asr`, `facebook/voxpopuli`, `LIUM/tedlium`, `edinburghcstr/ami`, `speechcolab/gigaspeech`, `kensho/spgispeech` | | Spec Bench | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` | | Custom | ✅ | ✅ | Local file: `data.jsonl` | | Custom MM | ✅ | ✅ | Local file: `mm_data.jsonl` | @@ -299,6 +300,22 @@ vllm bench serve \ --blazedit-max-distance 0.99 ``` +`openslr/librispeech_asr`, `facebook/voxpopuli`, `LIUM/tedlium`, `edinburghcstr/ami`, `speechcolab/gigaspeech`, `kensho/spgispeech` + +```bash +vllm bench serve \ + --model openai/whisper-large-v3-turbo \ + --backend openai-audio 
\ + --dataset-name hf \ + --dataset-path facebook/voxpopuli --hf-subset en --hf-split test --no-stream --trust-remote-code \ + --num-prompts 99999999 \ + --no-oversample \ + --endpoint /v1/audio/transcriptions \ + --ready-check-timeout-sec 600 \ + --save-result \ + --max-concurrency 512 +``` + #### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index d437e26ad..7148d90dc 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1443,6 +1443,20 @@ def add_dataset_parser(parser: FlexibleArgumentParser): help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0", ) + asr_group = parser.add_argument_group("asr dataset options") + asr_group.add_argument( + "--asr-max-audio-len-sec", + type=float, + default=float("inf"), + help="Maximum audio length in seconds for ASR dataset.", + ) + asr_group.add_argument( + "--asr-min-audio-len-sec", + type=float, + default=0.0, + help="Minimum audio length in seconds for ASR dataset.", + ) + random_group = parser.add_argument_group("random dataset options") add_random_dataset_base_args(random_group) @@ -1744,27 +1758,27 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS ): dataset_class = VisionArenaDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" args.hf_subset = None elif ( args.dataset_path in MMVUDataset.SUPPORTED_DATASET_PATHS or args.hf_name in MMVUDataset.SUPPORTED_DATASET_PATHS ): dataset_class = MMVUDataset - args.hf_split = "validation" + args.hf_split = args.hf_split if args.hf_split else "validation" args.hf_subset = None elif ( args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS ): dataset_class = InstructCoderDataset - args.hf_split = "train" + args.hf_split = 
args.hf_split if args.hf_split else "train" elif ( args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS ): dataset_class = MTBenchDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" elif ( args.dataset_path in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS or args.hf_name in MultiModalConversationDataset.SUPPORTED_DATASET_PATHS @@ -1780,22 +1794,26 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS ): dataset_class = AIMODataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" elif ( args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501 or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS ): dataset_class = NextEditPredictionDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" elif ( args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS ): dataset_class = ASRDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" + hf_kwargs = { + "asr_min_audio_len_sec": args.asr_min_audio_len_sec, + "asr_max_audio_len_sec": args.asr_max_audio_len_sec, + } elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS: dataset_class = BlazeditDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" hf_kwargs = { "min_distance": args.blazedit_min_distance, "max_distance": args.blazedit_max_distance, @@ -1805,13 +1823,13 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS ): dataset_class = MLPerfDataset - args.hf_split = "train" + args.hf_split = args.hf_split if args.hf_split else "train" elif ( args.dataset_path in 
MMStarDataset.SUPPORTED_DATASET_PATHS or args.hf_name in MMStarDataset.SUPPORTED_DATASET_PATHS ): dataset_class = MMStarDataset - args.hf_split = "val" + args.hf_split = args.hf_split if args.hf_split else "val" args.hf_subset = None else: supported_datasets = set( @@ -1847,6 +1865,7 @@ def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: no_stream=args.no_stream, hf_name=args.hf_name, disable_shuffle=args.disable_shuffle, + trust_remote_code=args.trust_remote_code, ).sample( num_requests=args.num_prompts, tokenizer=tokenizer, @@ -2405,6 +2424,7 @@ class HuggingFaceDataset(BenchmarkDataset): no_stream: bool = False, dataset_subset: str | None = None, hf_name: str | None = None, + trust_remote_code: bool = False, **kwargs, ) -> None: super().__init__(dataset_path=dataset_path, **kwargs) @@ -2413,6 +2433,7 @@ class HuggingFaceDataset(BenchmarkDataset): self.dataset_subset = dataset_subset self.load_stream = not no_stream self.hf_name = hf_name or dataset_path + self.trust_remote_code = trust_remote_code self.load_data() def load_data(self) -> None: @@ -2422,6 +2443,7 @@ class HuggingFaceDataset(BenchmarkDataset): name=self.dataset_subset, split=self.dataset_split, streaming=self.load_stream, + trust_remote_code=self.trust_remote_code, ) if not getattr(self, "disable_shuffle", False): self.data = self.data.shuffle(seed=self.random_seed) @@ -3071,13 +3093,9 @@ class ASRDataset(HuggingFaceDataset): "kensho/spgispeech", } - DEFAULT_OUTPUT_LEN = 128 + DEFAULT_OUTPUT_LEN = 1024 IS_MULTIMODAL = True - # TODO Whisper-specific. Abstract interface when more models are supported. 
- TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" - skip_long_audios: bool = True - def sample( self, tokenizer: TokenizerLike, @@ -3088,22 +3106,28 @@ class ASRDataset(HuggingFaceDataset): **kwargs, ) -> list: output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN - prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + if "openai" in tokenizer.name_or_path: + prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>" + else: + prompt = "" prompt_len = len(tokenizer(prompt).input_ids) sampled_requests = [] ind = 0 skipped = 0 + asr_min_audio_len_sec = kwargs.get("asr_min_audio_len_sec") + asr_max_audio_len_sec = kwargs.get("asr_max_audio_len_sec") + durations = [] for item in self.data: if len(sampled_requests) >= num_requests: break audio = item["audio"] y, sr = audio["array"], audio["sampling_rate"] duration_s = librosa.get_duration(y=y, sr=sr) - # Whisper max supported duration - if self.skip_long_audios and duration_s > 30: + if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec: skipped += 1 continue + durations.append(duration_s) mm_content = {"audio": (y, sr)} sampled_requests.append( SampleRequest( @@ -3122,6 +3146,20 @@ class ASRDataset(HuggingFaceDataset): " what Whisper supports.", skipped, ) + + logger.info("Number of audio samples: %d", len(durations)) + avg_duration = sum(durations) / len(durations) if durations else 0 + min_duration = min(durations) if durations else 0 + max_duration = max(durations) if durations else 0 + median_duration = np.median(durations) if durations else 0 + logger.info( + "Audio duration statistics (s): avg=%.2f, min=%.2f, max=%.2f, median=%.2f", + avg_duration, + min_duration, + max_duration, + median_duration, + ) + self.maybe_oversample_requests( sampled_requests, num_requests, request_id_prefix, no_oversample ) diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index 
987e8a5fd..cccbcdb83 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -93,6 +93,7 @@ class RequestFuncOutput: prompt_len: int = 0 error: str = "" start_time: float = 0.0 + input_audio_duration: float = 0.0 # in seconds class RequestFunc(Protocol): @@ -422,6 +423,8 @@ async def async_request_openai_audio( output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len + output.input_audio_duration = soundfile.info(f).duration + f.seek(0) generated_text = "" ttft = 0.0 @@ -442,7 +445,9 @@ async def async_request_openai_audio( messages = handler.add_chunk(chunk_bytes) for message in messages: - chunk = message.decode("utf-8").removeprefix("data: ") + if type(message) is bytes: + message = message.decode("utf-8") + chunk = message.removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 19d98f659..dd853f15a 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -193,6 +193,7 @@ class BenchmarkMetrics: # Max output tokens per second and concurrent requests at that peak max_output_tokens_per_s: float max_concurrent_requests: int + rtfx: float = 0.0 # Inverse Real-Time Factor for ASR benchmarks @dataclass @@ -412,6 +413,7 @@ def calculate_metrics( all_tpots: list[float] = [] ttfts: list[float] = [] e2els: list[float] = [] + input_audio_duration = 0.0 for i in range(len(outputs)): if outputs[i].success: output_len = outputs[i].output_tokens @@ -439,6 +441,7 @@ def calculate_metrics( itls += outputs[i].itl ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) + input_audio_duration += outputs[i].input_audio_duration completed += 1 else: actual_output_lens.append(0) @@ -583,6 +586,7 @@ def calculate_metrics( ], max_output_tokens_per_s=max_output_tokens_per_s, max_concurrent_requests=max_concurrent_requests, + rtfx=input_audio_duration / dur_s, ) return 
metrics, actual_output_lens @@ -937,6 +941,12 @@ async def benchmark( "Peak concurrent requests:", metrics.max_concurrent_requests ) ) + if metrics.rtfx > 0.0: + print( + "{:<40} {:<10.2f}".format( + "RTFx (Inverse Real-Time Factor):", metrics.rtfx + ) + ) print( "{:<40} {:<10.2f}".format( "Total token throughput (tok/s):", metrics.total_token_throughput @@ -963,6 +973,7 @@ async def benchmark( "errors": [output.error for output in outputs], "max_output_tokens_per_s": metrics.max_output_tokens_per_s, "max_concurrent_requests": metrics.max_concurrent_requests, + "rtfx": metrics.rtfx, } else: result = { -- GitLab From caad9f1e01ee04e4f5912d0287031ea3a850f6dc Mon Sep 17 00:00:00 2001 From: Nikhil Gupta Date: Mon, 9 Feb 2026 10:04:41 +0000 Subject: [PATCH 0006/1166] [Fix] [CPU Backend] : Prepack weights for w8a8 oneDNN matmul (#33901) Signed-off-by: nikhil-arm --- csrc/cpu/dnnl_helper.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp index e337e10e1..03944dc0d 100644 --- a/csrc/cpu/dnnl_helper.cpp +++ b/csrc/cpu/dnnl_helper.cpp @@ -237,12 +237,20 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) }; dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, {b_k_stride_, b_n_stride_}); +#ifdef __aarch64__ + // dummy M size for prepacking weights + // Prepacking weights improves performance and avoid runtime reorders + constexpr dnnl_dim_t kProbeM = 128; +#else + constexpr dnnl_dim_t kProbeM = DNNL_RUNTIME_DIM_VAL; +#endif + prepack_weight(args.b_ptr, original_b_md, create_primitive_desc( - MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, + MSizeCacheKey{.a_m_size = kProbeM, .use_bias = false, .bias_type = dnnl::memory::data_type::undef}, - true) + /*first_time=*/true) .weights_desc()); init_runtime_memory_cache(args); } -- GitLab From 9bdb06b4368e304bc5e23c8df2dff8f8b2ccf0f6 Mon Sep 17 00:00:00 2001 From: zofia 
<110436990+zufangzhu@users.noreply.github.com> Date: Mon, 9 Feb 2026 20:17:35 +0800 Subject: [PATCH 0007/1166] [XPU][6/N] add xpu scaled_mm kernel (#34117) Signed-off-by: Zhu, Zufang --- .../scripts/hardware_ci/run-xpu-test.sh | 1 + .../model_executor/layers/quantization/fp8.py | 11 +--- .../kernels/scaled_mm/__init__.py | 6 ++ .../quantization/kernels/scaled_mm/xpu.py | 59 +++++++++++++++++++ 4 files changed, 67 insertions(+), 10 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 56676ee28..b52dd7826 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -39,6 +39,7 @@ docker run \ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --attention-backend=TRITON_ATTN + python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager --quantization fp8 python3 examples/offline_inference/basic/generate.py --model superjob/Qwen3-4B-Instruct-2507-GPTQ-Int4 --block-size 64 --enforce-eager python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 python3 examples/offline_inference/basic/generate.py --model ibm-research/PowerMoE-3b --block-size 64 --enforce-eager -tp 2 --enable-expert-parallel diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index a61239706..80348edcc 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ 
b/vllm/model_executor/layers/quantization/fp8.py @@ -180,18 +180,9 @@ class Fp8Config(QuantizationConfig): weight_block_size=weight_block_size, ) - def get_xpu_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> "QuantizeMethodBase | None": - raise NotImplementedError( - "FP8 quantization is not supported during xpu kernel migration." - ) - def get_quant_method( self, layer: torch.nn.Module, prefix: str ) -> "QuantizeMethodBase | None": - if current_platform.is_xpu(): - return self.get_xpu_quant_method(layer, prefix) if isinstance(layer, LinearBase): if is_layer_skipped( prefix=prefix, @@ -300,7 +291,7 @@ class Fp8LinearMethod(LinearMethodBase): or envs.VLLM_TEST_FORCE_FP8_MARLIN ) # Disable marlin for rocm - if current_platform.is_rocm(): + if current_platform.is_rocm() or current_platform.is_xpu(): self.use_marlin = False if vllm_is_batch_invariant(): self.use_marlin = False diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index e5401ff81..bbd43dd10 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -39,6 +39,9 @@ from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKer from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( TritonInt8ScaledMMLinearKernel, ) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.xpu import ( + XPUFP8ScaledMMLinearKernel, +) from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey from vllm.platforms import PlatformEnum, current_platform @@ -72,6 +75,9 @@ _POSSIBLE_FP8_KERNELS: dict[PlatformEnum, list[type[FP8ScaledMMLinearKernel]]] = PerTensorTorchFP8ScaledMMLinearKernel, ChannelWiseTorchFP8ScaledMMLinearKernel, ], + PlatformEnum.XPU: [ + XPUFP8ScaledMMLinearKernel, + ], } _KernelT = TypeVar("_KernelT", 
bound=ScaledMMLinearKernel) diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py new file mode 100644 index 000000000..5b816a3f5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xpu.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence + +import torch + +from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 + FP8ScaledMMLinearKernel, + FP8ScaledMMLinearLayerConfig, +) +from vllm.platforms import current_platform + + +class XPUFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel): + @classmethod + def is_supported( + cls, compute_capability: int | None = None + ) -> tuple[bool, str | None]: + if not current_platform.is_xpu(): + return False, "XPUFP8ScaledMM only support on XPU" + return True, None + + @classmethod + def can_implement(cls, c: FP8ScaledMMLinearLayerConfig) -> tuple[bool, str | None]: + if c.weight_quant_key.dtype not in {torch.float8_e5m2, torch.float8_e4m3fn}: + return False, "XPUFP8ScaledMM only support FP8 weight dtype" + return True, None + + def __init__( + self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str] + ) -> None: + assert self.can_implement(c)[0] + assert self.is_supported()[0] + self.config = c + self.layer_param_names = layer_param_names + + def apply_weights( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + weight = layer.weight + weight_scale = layer.weight_scale + return torch.ops._xpu_C.fp8_gemm_w8a16(x, weight, weight_scale, bias) + + def apply_scaled_mm( + self, + *, + A: torch.Tensor, + B: torch.Tensor, + out_dtype: torch.dtype, + As: torch.Tensor, + Bs: torch.Tensor, + bias: torch.Tensor | None, + output_shape: list, + ) -> torch.Tensor: + pass -- GitLab From 
9562912cead1f11e8540fb91306c5cbda66f0007 Mon Sep 17 00:00:00 2001 From: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Date: Mon, 9 Feb 2026 21:12:58 +0800 Subject: [PATCH 0008/1166] [MODEL] Adding Support for Qwen3.5 Models (#34110) Signed-off-by: JJJYmmm <1650675829@qq.com> Signed-off-by: JJJYmmm <92386084+JJJYmmm@users.noreply.github.com> Signed-off-by: Roger Wang Co-authored-by: wulipc Co-authored-by: ywang96 Co-authored-by: Isotr0py Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Roger Wang --- docs/models/supported_models.md | 2 + tests/models/registry.py | 20 + vllm/config/model.py | 4 +- vllm/config/speculative.py | 11 + vllm/model_executor/layers/mamba/abstract.py | 3 +- vllm/model_executor/models/qwen3_5.py | 993 ++++++++++++++++++ vllm/model_executor/models/qwen3_5_mtp.py | 447 ++++++++ vllm/model_executor/models/qwen3_next.py | 12 +- vllm/model_executor/models/registry.py | 10 + .../model_arch_config_convertor.py | 6 + vllm/v1/spec_decode/eagle.py | 2 + 11 files changed, 1501 insertions(+), 9 deletions(-) create mode 100644 vllm/model_executor/models/qwen3_5.py create mode 100644 vllm/model_executor/models/qwen3_5_mtp.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 4e5abea8e..ac02e9bde 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -738,6 +738,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. 
| ✅︎ | ✅︎ | | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-3B`, `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎ | +| `Qwen3_5ForConditionalGeneration` | Qwen3.5 | T + IE+ + VE+ | `Qwen/Qwen3.5-9B-Instruct`, etc. | ✅︎ | ✅︎ | +| `Qwen3_5MoeForConditionalGeneration` | Qwen3.5-MOE | T + IE+ + VE+ | `Qwen/Qwen3.5-35B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen3VLForConditionalGeneration` | Qwen3-VL | T + IE+ + VE+ | `Qwen/Qwen3-VL-4B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen3VLMoeForConditionalGeneration` | Qwen3-VL-MOE | T + IE+ + VE+ | `Qwen/Qwen3-VL-30B-A3B-Instruct`, etc. | ✅︎ | ✅︎ | | `Qwen3OmniMoeThinkerForConditionalGeneration` | Qwen3-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen3-Omni-30B-A3B-Instruct`, `Qwen/Qwen3-Omni-30B-A3B-Thinking` | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index f688985ce..d2c67cf7e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -970,6 +970,26 @@ _MULTIMODAL_EXAMPLE_MODELS = { max_model_len=4096, min_transformers_version="4.57", ), + "Qwen3_5ForConditionalGeneration": _HfExamplesInfo( + "Qwen/Qwen3.5-9B-Instruct", + max_model_len=4096, + min_transformers_version="5.1.0", + ), + "Qwen3_5MoeForConditionalGeneration": _HfExamplesInfo( + "Qwen/Qwen3.5-35B-A3B-Instruct", + max_model_len=4096, + min_transformers_version="5.1.0", + ), + "Qwen3_5MTP": _HfExamplesInfo( + "Qwen/Qwen3.5-9B-Instruct", + speculative_model="Qwen/Qwen3.5-9B-Instruct", + min_transformers_version="5.1.0", + ), + "Qwen3_5MoeMTP": _HfExamplesInfo( + "Qwen/Qwen3.5-35B-A3B-Instruct", + speculative_model="Qwen/Qwen3.5-35B-A3B-Instruct", + min_transformers_version="5.1.0", + ), "Qwen3OmniMoeForConditionalGeneration": _HfExamplesInfo( "Qwen/Qwen3-Omni-30B-A3B-Instruct", max_model_len=4096, diff --git a/vllm/config/model.py b/vllm/config/model.py index a359df374..b76d51868 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1218,8 +1218,8 @@ class ModelConfig: if attn_type_list: 
return sum(t == 1 for t in attn_type_list[start:end]) - # Hybrid model Qwen3Next - layer_types_value = getattr(self.hf_config, "layer_types", None) + # Hybrid model Qwen3Next Qwen3.5 Series + layer_types_value = getattr(self.hf_text_config, "layer_types", None) if layer_types_value is not None: if block_type == "attention": return sum( diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 8a54dbb6d..8117349d8 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -37,6 +37,7 @@ MTPModelTypes = Literal[ "ernie_mtp", "exaone_moe_mtp", "qwen3_next_mtp", + "qwen3_5_mtp", "longcat_flash_mtp", "mtp", "pangu_ultra_moe_mtp", @@ -263,6 +264,16 @@ class SpeculativeConfig: {"n_predict": n_predict, "architectures": ["ExaoneMoeMTP"]} ) + if hf_config.model_type in ("qwen3_5", "qwen3_5_moe"): + is_moe = hf_config.model_type == "qwen3_5_moe" + hf_config.model_type = "qwen3_5_mtp" + n_predict = getattr(hf_config, "mtp_num_hidden_layers", None) + hf_config.update( + { + "n_predict": n_predict, + "architectures": ["Qwen3_5MoeMTP" if is_moe else "Qwen3_5MTP"], + } + ) if hf_config.model_type == "longcat_flash": hf_config.model_type = "longcat_flash_mtp" n_predict = getattr(hf_config, "num_nextn_predict_layers", 1) diff --git a/vllm/model_executor/layers/mamba/abstract.py b/vllm/model_executor/layers/mamba/abstract.py index f92ecb6b5..347ce139e 100644 --- a/vllm/model_executor/layers/mamba/abstract.py +++ b/vllm/model_executor/layers/mamba/abstract.py @@ -43,7 +43,8 @@ class MambaBase(AttentionLayerBase): def get_kv_cache_spec(self, vllm_config: VllmConfig) -> KVCacheSpec | None: if ( vllm_config.speculative_config is not None - and vllm_config.model_config.hf_config.model_type not in ["qwen3_next"] + and vllm_config.model_config.hf_config.model_type + not in ["qwen3_next", "qwen3_5", "qwen3_5_moe"] ): raise NotImplementedError( "Mamba with speculative decoding is not supported yet." 
diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py new file mode 100644 index 000000000..d6df7523b --- /dev/null +++ b/vllm/model_executor/models/qwen3_5.py @@ -0,0 +1,993 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen3.5 Series compatible with HuggingFace weights.""" + +import typing +from collections.abc import Callable, Iterable + +import torch +from einops import rearrange +from torch import nn +from transformers.activations import ACT2FN +from transformers.models.qwen3_5.configuration_qwen3_5 import ( + Qwen3_5Config, + Qwen3_5TextConfig, +) +from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import ( + Qwen3_5MoeConfig, + Qwen3_5MoeTextConfig, +) + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import ( + CacheConfig, + ModelConfig, + SpeculativeConfig, + VllmConfig, + get_current_vllm_config, +) +from vllm.distributed import ( + divide, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.layernorm import ( + GemmaRMSNorm as Qwen3_5RMSNorm, +) +from vllm.model_executor.layers.layernorm import RMSNormGated +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.mamba_mixer2 import ( + mamba_v2_sharded_weight_loader, +) +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateCopyFunc, + MambaStateCopyFuncCalculator, + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + sharded_weight_loader, +) +from vllm.model_executor.utils import set_weight_attrs +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors + +from .interfaces import ( + 
HasInnerState, + IsHybrid, + MixtureOfExperts, + MultiModalEmbeddings, + SupportsLoRA, + SupportsPP, + _require_is_multimodal, +) +from .qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP +from .qwen3_next import ( + Qwen3NextAttention, + Qwen3NextDecoderLayer, + Qwen3NextGatedDeltaNet, + Qwen3NextModel, + Qwen3NextSparseMoeBlock, + QwenNextMixtureOfExperts, +) +from .qwen3_vl import ( + Qwen3_VisionTransformer, + Qwen3VLDummyInputsBuilder, + Qwen3VLForConditionalGeneration, + Qwen3VLMultiModalProcessor, + Qwen3VLProcessingInfo, +) +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + _merge_multimodal_embeddings, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) + +logger = init_logger(__name__) + + +class Qwen3_5ProcessingInfo(Qwen3VLProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3_5Config) + + +class Qwen3_5MoeProcessingInfo(Qwen3VLProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3_5MoeConfig) + + +class Qwen3_5GatedDeltaNet(Qwen3NextGatedDeltaNet): + def __init__( + self, + config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig, + model_config: ModelConfig | None = None, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + speculative_config: SpeculativeConfig | None = None, + prefix: str = "", + ) -> None: + super(Qwen3NextGatedDeltaNet, self).__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + self.hidden_size = config.hidden_size + self.num_v_heads = config.linear_num_value_heads + self.num_k_heads = config.linear_num_key_heads + self.head_k_dim = config.linear_key_head_dim + self.head_v_dim = config.linear_value_head_dim + self.key_dim = self.head_k_dim * self.num_k_heads + self.value_dim = self.head_v_dim * self.num_v_heads + + self.conv_kernel_size = config.linear_conv_kernel_dim + self.layer_idx = 
extract_layer_index(prefix) + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.layer_norm_epsilon = config.rms_norm_eps + self.prefix = prefix + + self.config = config + self.model_config = model_config + self.cache_config = cache_config + self.quant_config = quant_config + self.speculative_config = speculative_config + self.num_spec = ( + self.speculative_config.num_speculative_tokens + if self.speculative_config + else 0 + ) + + # QKV + self.conv_dim = self.key_dim * 2 + self.value_dim + self.conv1d = ColumnParallelLinear( + input_size=self.conv_kernel_size, + output_size=self.conv_dim, + bias=False, + prefix=f"{prefix}.conv1d", + ) + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj_qkv = MergedColumnParallelLinear( + input_size=self.hidden_size, + output_sizes=[self.key_dim, self.key_dim, self.value_dim], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_qkv", + ) + self.in_proj_z = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.value_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_z", + ) + self.in_proj_b = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.num_v_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_ba", + ) + self.in_proj_a = ColumnParallelLinear( + input_size=self.hidden_size, + output_size=self.num_v_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.in_proj_a", + ) + + query_key_settings = (self.key_dim, 0, False) + value_settings = (self.value_dim, 0, False) + + delattr(self.conv1d.weight, "weight_loader") + set_weight_attrs( + self.conv1d.weight, + { + "weight_loader": mamba_v2_sharded_weight_loader( + [ + query_key_settings, + query_key_settings, + value_settings, + ], + self.tp_size, + self.tp_rank, + ) + }, + ) + + # selective projection used to make dt, B and C input dependent + + # time step projection (discretization) + # instantiate
once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter( + torch.ones(self.num_v_heads // self.tp_size), + ) + self.A_log = nn.Parameter( + torch.empty( + divide(self.num_v_heads, self.tp_size), + ) + ) + + set_weight_attrs(self.A_log, {"weight_loader": sharded_weight_loader(0)}) + set_weight_attrs(self.dt_bias, {"weight_loader": sharded_weight_loader(0)}) + + self.norm = RMSNormGated( + self.head_v_dim, + eps=self.layer_norm_epsilon, + group_size=None, + norm_before_gate=True, + device=current_platform.current_device(), + dtype=config.dtype, + ) + + self.out_proj = RowParallelLinear( + self.value_dim, + self.hidden_size, + bias=False, + input_is_parallel=True, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + + def fix_query_key_value_ordering( + self, + mixed_qkv, + z, + b, + a, + ): + raise NotImplementedError( + "Qwen3.5 Series dont need to fix query key value ordering" + ) + + def forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + ): + """ + Forward pass with three parts: + 1. Input projection + 2. Core attention (custom op) + 3. 
Output projection + """ + num_tokens = hidden_states.size(0) + + # ============================================================ + # Part 1: Input Projection + # ============================================================ + mixed_qkv, _ = self.in_proj_qkv(hidden_states) + z, _ = self.in_proj_z(hidden_states) + z = z.reshape(z.size(0), -1, self.head_v_dim) + b, _ = self.in_proj_b(hidden_states) + a, _ = self.in_proj_a(hidden_states) + + b = b.contiguous() + a = a.contiguous() + + # ============================================================ + # Part 2: Core Attention (Custom Op) + # ============================================================ + # Note: we should not use torch.empty here like other attention backends, + # see discussions in https://github.com/vllm-project/vllm/pull/28182 + core_attn_out = torch.zeros( + (num_tokens, self.num_v_heads // self.tp_size, self.head_v_dim), + dtype=hidden_states.dtype, + device=hidden_states.device, + ) + + torch.ops.vllm.gdn_attention_core( + mixed_qkv, + b, + a, + core_attn_out, + self.prefix, + ) + + # ============================================================ + # Part 3: Output Projection + # ============================================================ + z_shape_og = z.shape + # Reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1]) + z = z.reshape(-1, z.shape[-1]) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(z_shape_og) + core_attn_out = rearrange(core_attn_out, "... h d -> ... 
(h d)") + output[:num_tokens], _ = self.out_proj(core_attn_out) + + +class Qwen3_5DecoderLayer(Qwen3NextDecoderLayer): + def __init__( + self, + vllm_config: VllmConfig, + layer_type: str, + prefix: str = "", + ) -> None: + super(Qwen3NextDecoderLayer, self).__init__() + + config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + speculative_config = vllm_config.speculative_config + + self.layer_type = layer_type + self.layer_idx = extract_layer_index(prefix) + + if self.layer_type == "linear_attention": + self.linear_attn = Qwen3_5GatedDeltaNet( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + speculative_config=speculative_config, + prefix=f"{prefix}.linear_attn", + ) + elif self.layer_type == "full_attention": + self.self_attn = Qwen3NextAttention( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + else: + raise ValueError(f"Invalid layer_type {self.layer_type}") + + # NOTE: Determine the MLP type based on the model type + # Qwen3.5 uses a dense MLP in every layer; Qwen3.5-MoE uses sparse MoE blocks + if config.model_type == "qwen3_5_moe_text": + self.mlp = Qwen3NextSparseMoeBlock( + vllm_config=vllm_config, + prefix=f"{prefix}.mlp", + ) + elif config.model_type == "qwen3_5_text": + self.mlp = Qwen3NextMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + else: + raise ValueError(f"Invalid model_type {config.model_type}") + + self.input_layernorm = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + self.layer_scale = getattr(config, "layer_scale", False) + if self.layer_scale: +
self.attn_layer_scale = torch.nn.Parameter( + torch.zeros( + 1, + 1, + config.hidden_size, + dtype=config.dtype, + ), + ) + self.ffn_layer_scale = torch.nn.Parameter( + torch.zeros( + 1, + 1, + config.hidden_size, + dtype=config.dtype, + ), + ) + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class Qwen3_5Model(Qwen3NextModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super(Qwen3NextModel, self).__init__() + + config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = ( + vllm_config.model_config.hf_text_config + ) + parallel_config = vllm_config.parallel_config + + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts + + self.config = config + + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + ) + + def get_layer(prefix: str): + return Qwen3_5DecoderLayer( + vllm_config, + layer_type=config.layer_types[extract_layer_index(prefix)], + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers" + ) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + if get_pp_group().is_last_rank: + self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + def load_fused_expert_weights( + self, + name: str, + params_dict: dict, + loaded_weight: torch.Tensor, + shard_id: str, + num_experts: int, + ) -> bool: + param = params_dict[name] + weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False + for expert_id in range(num_experts): + curr_expert_weight = 
loaded_weight[expert_id] + success = weight_loader( + param, + curr_expert_weight, + name, + shard_id, + expert_id, + return_success=True, + ) + if success: + loaded_local_expert = True + + return loaded_local_expert + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + is_fused_expert = False + fused_expert_params_mapping = [ + ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), + ("experts.w2_weight", "experts.down_proj", 0, "w2"), + ] + num_experts = ( + self.config.num_experts if hasattr(self.config, "num_experts") else 0 + ) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if name.startswith("mtp."): + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if "experts.gate_up_proj" in name or "experts.down_proj" in name: + is_fused_expert = True + expert_params_mapping = fused_expert_params_mapping + + if weight_name not in name: + continue + + if "mlp.experts" in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name, self): + continue + # name = apply_attn_prefix(name, params_dict) + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name_mapped, self): + continue + if is_fused_expert: + # qwen3.5 no need to transpose + # loaded_weight = loaded_weight.transpose(-1, -2) + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + success_w1 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[0], + "w1", + num_experts, + ) + success_w3 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[1], + "w3", + num_experts, + ) + success = success_w1 and success_w3 + else: + # down_proj + success = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight, + shard_id, + num_experts, + ) + if success: + name = name_mapped + break + else: + # Skip loading extra bias for GPTQ models. + if ( + name_mapped.endswith(".bias") + or name_mapped.endswith("_bias") + ) and name_mapped not in params_dict: + continue + param = params_dict[name_mapped] + weight_loader = param.weight_loader + success = weight_loader( + param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + logger.warning_once( + f"Parameter {name} not found in params_dict, skip loading" + ) + continue + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3_5ForCausalLMBase( + nn.Module, + HasInnerState, + SupportsLoRA, + SupportsPP, +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["gate_proj", "up_proj"], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_text_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + + scheduler_config = vllm_config.scheduler_config + if cache_config.mamba_cache_mode == "all": + raise NotImplementedError( + "Qwen3.5 currently does not support 'all' prefix caching, " + "please use '--mamba-cache-mode=align' instead" + ) + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = Qwen3_5Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: 
torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ): + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=["mtp."], + ) + return loader.load_weights(weights) + + +class Qwen3_5ForCausalLM(Qwen3_5ForCausalLMBase): + pass + + +class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + # set MoE hyperparameters + self.set_moe_parameters() + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +######################################################## +# Qwen3_5-Dense +######################################################## + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen3VLMultiModalProcessor, + info=Qwen3_5ProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder, +) +class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + # protocols have not __init__ method, so we need to use nn.Module.__init__ + nn.Module.__init__(self) + config: Qwen3_5Config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.video_pruning_rate = multimodal_config.video_pruning_rate + 
self.is_multimodal_pruning_enabled = ( + multimodal_config.is_multimodal_pruning_enabled() + ) + + with self._mark_tower_model(vllm_config, {"image", "video"}): + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + + with self._mark_language_model(vllm_config): + self.language_model = Qwen3_5ForCausalLM( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + inputs_embeds = self._embed_text_input_ids( + input_ids, + self.language_model.embed_input_ids, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: + return inputs_embeds + + is_multimodal = _require_is_multimodal(is_multimodal) + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + """Run forward pass for Qwen3.5. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. + positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen3VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,)`.
+ intermediate_tensors: Intermediate tensors from previous pipeline + stages. + inputs_embeds: Pre-computed input embeddings. + **kwargs: Additional keyword arguments including: + - pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in + LLM. `None` if no images are passed. + - pixel_values_videos: Pixel values of videos to be fed to a + model. `None` if no videos are passed. + - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in + LLM. `None` if no videos are passed. + """ + + if intermediate_tensors is not None: + inputs_embeds = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=["mtp."], + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype + ) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_text_config + tp_size = parallel_config.tensor_parallel_size + num_spec = ( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ) + return MambaStateShapeCalculator.gated_delta_net_state_shape( + tp_size, + hf_config.linear_num_key_heads, + hf_config.linear_num_value_heads, + hf_config.linear_key_head_dim, + hf_config.linear_value_head_dim, + 
hf_config.linear_conv_kernel_dim, + num_spec, + ) + + @classmethod + def get_mamba_state_copy_func(cls) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]: + return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func() + + +######################################################## +# Qwen3_5-MoE +######################################################## + + +class Qwen3_5_MoeMixtureOfExperts(MixtureOfExperts): + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.language_model.model.layers: + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.moe_layers = [] + example_moe = None + for layer in self.language_model.model.layers: + if isinstance(layer, Qwen3_5DecoderLayer) and isinstance( + layer.mlp, Qwen3NextSparseMoeBlock + ): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError( + "No Qwen3_5 layer found in the language_model.model.layers." 
+ ) + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen3VLMultiModalProcessor, + info=Qwen3_5MoeProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder, +) +class Qwen3_5MoeForConditionalGeneration( + Qwen3_5ForConditionalGeneration, Qwen3_5_MoeMixtureOfExperts +): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + # protocols have not __init__ method, so we need to use nn.Module.__init__ + nn.Module.__init__(self) + config: Qwen3_5MoeConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + self.video_pruning_rate = multimodal_config.video_pruning_rate + self.is_multimodal_pruning_enabled = ( + multimodal_config.is_multimodal_pruning_enabled() + ) + + with self._mark_tower_model(vllm_config, {"image", "video"}): + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + + with self._mark_language_model(vllm_config): + self.language_model = Qwen3_5MoeForCausalLM( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + # set MoE hyperparameters + self.set_moe_parameters() diff --git 
a/vllm/model_executor/models/qwen3_5_mtp.py b/vllm/model_executor/models/qwen3_5_mtp.py new file mode 100644 index 000000000..8bd29f352 --- /dev/null +++ b/vllm/model_executor/models/qwen3_5_mtp.py @@ -0,0 +1,447 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inference-only Qwen3_5 MTP model.""" + +import typing +from collections.abc import Callable, Iterable + +import torch +from torch import nn +from transformers.models.qwen3_5.configuration_qwen3_5 import Qwen3_5TextConfig +from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import ( + Qwen3_5MoeTextConfig, +) + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import ColumnParallelLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.qwen3_5 import Qwen3_5DecoderLayer, Qwen3_5RMSNorm +from vllm.model_executor.models.qwen3_next import QwenNextMixtureOfExperts +from vllm.sequence import IntermediateTensors + +from .interfaces import ( + MultiModalEmbeddings, + SupportsMultiModal, + _require_is_multimodal, +) +from .utils import ( + AutoWeightsLoader, + PPMissingLayer, + _merge_multimodal_embeddings, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + maybe_prefix, +) + +logger = init_logger(__name__) + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + "hidden_states": 0, + } +) +class Qwen3_5MultiTokenPredictor(nn.Module): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + model_config = vllm_config.model_config + quant_config = vllm_config.quant_config + + config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = model_config.hf_text_config + + self.config = config + + self.vocab_size = config.vocab_size + + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = getattr(config, "mtp_num_hidden_layers", 1) + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + ) + + self.fc = ColumnParallelLinear( + self.config.hidden_size * 2, + self.config.hidden_size, + gather_output=True, + bias=False, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.fc", + ) + + self.layers = torch.nn.ModuleList( + Qwen3_5DecoderLayer( + vllm_config, + layer_type="full_attention", + prefix=f"{prefix}.layers.{idx}", + ) + for idx in range(self.num_mtp_layers) + ) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_fc_norm_hidden = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.pre_fc_norm_embedding = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if get_pp_group().is_first_rank: + if inputs_embeds is None: + inputs_embeds = self.embed_input_ids(input_ids) + assert 
hidden_states.shape[-1] == inputs_embeds.shape[-1] + inputs_embeds = self.pre_fc_norm_embedding(inputs_embeds) + hidden_states = self.pre_fc_norm_hidden(hidden_states) + hidden_states = torch.cat([inputs_embeds, hidden_states], dim=-1) + hidden_states = self.fc(hidden_states) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + current_step_idx = spec_step_idx % self.num_mtp_layers + hidden_states, residual = self.layers[current_step_idx]( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def load_fused_expert_weights( + self, + name: str, + params_dict: dict, + loaded_weight: torch.Tensor, + shard_id: str, + num_experts: int, + ) -> bool: + param = params_dict[name] + weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False + for expert_id in range(num_experts): + curr_expert_weight = loaded_weight[expert_id] + success = weight_loader( + param, + curr_expert_weight, + name, + shard_id, + expert_id, + return_success=True, + ) + if success: + loaded_local_expert = True + + return loaded_local_expert + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="gate_proj", + 
ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts + if hasattr(self.config, "num_experts") + else 0, + ) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + is_fused_expert = False + fused_expert_params_mapping = [ + ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), + ("experts.w2_weight", "experts.down_proj", 0, "w2"), + ] + num_experts = ( + self.config.num_experts if hasattr(self.config, "num_experts") else 0 + ) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if "experts.gate_up_proj" in name or "experts.down_proj" in name: + is_fused_expert = True + expert_params_mapping = fused_expert_params_mapping + + if weight_name not in name: + continue + + if "mlp.experts" in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + # Skip layers on other devices. 
+ if is_pp_missing_parameter(name_mapped, self): + continue + if is_fused_expert: + # qwen3.5 no need to transpose + # loaded_weight = loaded_weight.transpose(-1, -2) + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + success_w1 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[0], + "w1", + num_experts, + ) + success_w3 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[1], + "w3", + num_experts, + ) + success = success_w1 and success_w3 + else: + # down_proj + success = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight, + shard_id, + num_experts, + ) + if success: + name = name_mapped + break + else: + # Skip loading extra bias for GPTQ models. + if ( + name_mapped.endswith(".bias") + or name_mapped.endswith("_bias") + ) and name_mapped not in params_dict: + continue + param = params_dict[name_mapped] + weight_loader = param.weight_loader + success = weight_loader( + param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + logger.warning_once( + f"Parameter {name} not found in params_dict, skip loading" + ) + continue + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + "hidden_states": 0, + } +) +class Qwen3_5MTP(nn.Module, SupportsMultiModal): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["gate_proj", "up_proj"], # shards merged into gate_up_proj + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_text_config + self.vllm_config = vllm_config + cache_config = vllm_config.cache_config + if cache_config.mamba_cache_mode == "all": + raise NotImplementedError( + "Qwen3_5MTP currently does not support 'all' prefix caching, " + "please use '--mamba-cache-mode=align' instead" + ) + + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.model = Qwen3_5MultiTokenPredictor( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "mtp") + ) + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + inputs_embeds = self._embed_text_input_ids( + input_ids, + self.model.embed_input_ids, + is_multimodal=is_multimodal, + handle_oov_mm_token=handle_oov_mm_token, + ) + + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: + return inputs_embeds + + is_multimodal = _require_is_multimodal(is_multimodal) + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + return inputs_embeds + + def forward( + self, + input_ids:
torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ): + hidden_states = self.model( + input_ids, positions, hidden_states, intermediate_tensors, inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + spec_step_idx: int = 0, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + def remap_weight_names(weights): + for name, weight in weights: + if name.startswith("mtp."): + name = name.replace("mtp.", "model.", 1) # rewrite only the leading prefix + elif any(key in name for key in ["embed_tokens", "lm_head"]): + if "embed_tokens" in name: + name = name.replace("language_model.", "") + else: + continue + yield name, weight + + loader = AutoWeightsLoader(self) + return loader.load_weights(remap_weight_names(weights)) + + +class Qwen3_5MoeMTP(Qwen3_5MTP, QwenNextMixtureOfExperts): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.set_moe_parameters() diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 503b40702..3bcfbacbb 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -105,7 +105,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_text_config parallel_config = vllm_config.parallel_config quant_config = vllm_config.quant_config @@ -176,7 +176,7 @@ class Qwen3NextSparseMoeBlock(nn.Module): hidden_size=config.hidden_size, intermediate_size=config.moe_intermediate_size, reduce_results=False, - renormalize=config.norm_topk_prob, +
renormalize=getattr(config, "norm_topk_prob", True), quant_config=quant_config, prefix=f"{prefix}.experts", enable_eplb=self.enable_eplb, @@ -965,7 +965,7 @@ class Qwen3NextModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config: Qwen3NextConfig = vllm_config.model_config.hf_config + config: Qwen3NextConfig = vllm_config.model_config.hf_text_config parallel_config = vllm_config.parallel_config eplb_config = parallel_config.eplb_config @@ -1042,7 +1042,7 @@ class Qwen3NextModel(nn.Module): ckpt_gate_proj_name="gate_proj", ckpt_down_proj_name="down_proj", ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, + num_experts=getattr(self.config, "num_experts", 0), num_redundant_experts=self.num_redundant_experts, ) @@ -1201,7 +1201,7 @@ class Qwen3NextForCausalLM( } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config + config = vllm_config.model_config.hf_text_config self.vllm_config = vllm_config self.model_config = vllm_config.model_config cache_config = vllm_config.cache_config @@ -1265,7 +1265,7 @@ class Qwen3NextForCausalLM( cls, vllm_config: "VllmConfig" ) -> tuple[tuple[int, int], tuple[int, int]]: parallel_config = vllm_config.parallel_config - hf_config = vllm_config.model_config.hf_config + hf_config = vllm_config.model_config.hf_text_config tp_size = parallel_config.tensor_parallel_size num_spec = ( vllm_config.speculative_config.num_speculative_tokens diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 6e68b24ba..1871591c9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -466,6 +466,14 @@ _MULTIMODAL_MODELS = { "qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration", ), + "Qwen3_5ForConditionalGeneration": ( + "qwen3_5", + "Qwen3_5ForConditionalGeneration", + ), + "Qwen3_5MoeForConditionalGeneration": ( + "qwen3_5", + 
"Qwen3_5MoeForConditionalGeneration", + ), "SkyworkR1VChatModel": ("skyworkr1v", "SkyworkR1VChatModel"), "Step3VLForConditionalGeneration": ("step3_vl", "Step3VLForConditionalGeneration"), # noqa: E501 "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 @@ -509,6 +517,8 @@ _SPECULATIVE_DECODING_MODELS = { "OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"), "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"), "Step3p5MTP": ("step3p5_mtp", "Step3p5MTP"), + "Qwen3_5MTP": ("qwen3_5_mtp", "Qwen3_5MTP"), + "Qwen3_5MoeMTP": ("qwen3_5_mtp", "Qwen3_5MoeMTP"), # Temporarily disabled. # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py index f82186639..5fc737e8e 100644 --- a/vllm/transformers_utils/model_arch_config_convertor.py +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -420,6 +420,11 @@ class Qwen3NextMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) +class Qwen3_5MTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "mtp_num_hidden_layers", 0) + + class PanguUltraMoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): def get_num_hidden_layers(self) -> int: return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) @@ -445,6 +450,7 @@ MODEL_ARCH_CONFIG_CONVERTORS = { "nemotron-nas": NemotronNasModelArchConfigConvertor, "deepseek_mtp": DeepSeekMTPModelArchConfigConvertor, "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor, + "qwen3_5_mtp": Qwen3_5MTPModelArchConfigConvertor, "mimo_mtp": MimoMTPModelArchConfigConvertor, "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor, "glm_ocr_mtp": 
GLM4MoeMTPModelArchConfigConvertor, diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index d4b38d670..d29ee00fa 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1356,6 +1356,8 @@ class SpecDecodeBaseProposer: "Qwen3VLMoeForConditionalGeneration", "HunYuanVLForConditionalGeneration", "GlmOcrForConditionalGeneration", + "Qwen3_5ForConditionalGeneration", + "Qwen3_5MoeForConditionalGeneration", ]: self.model.config.image_token_index = target_model.config.image_token_id elif self.get_model_name(target_model) == "PixtralForConditionalGeneration": -- GitLab From d0d97e2974250edb61fbff6964e95a5b6d22d763 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 9 Feb 2026 06:42:03 -0800 Subject: [PATCH 0009/1166] [Misc] Fix up attention benchmarks (#33810) Signed-off-by: Lucas Wilkinson Signed-off-by: Matthew Bonanni Co-authored-by: Matthew Bonanni --- .buildkite/test_areas/benchmarks.yaml | 11 + benchmarks/attention_benchmarks/batch_spec.py | 37 +++ benchmarks/attention_benchmarks/common.py | 13 +- .../configs/standard_attention.yaml | 12 +- benchmarks/attention_benchmarks/runner.py | 241 +++++++++++------- 5 files changed, 219 insertions(+), 95 deletions(-) diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 574b642d4..57080c46f 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -17,3 +17,14 @@ steps: - tests/benchmarks/ commands: - pytest -v -s benchmarks/ + +- label: Attention Benchmarks Smoke Test (B200) + device: b200 + num_gpus: 2 + optional: true + timeout_in_minutes: 10 + source_file_dependencies: + - benchmarks/attention_benchmarks/ + - vllm/v1/attention/ + commands: + - python benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 diff --git a/benchmarks/attention_benchmarks/batch_spec.py b/benchmarks/attention_benchmarks/batch_spec.py index 
41681796e..9f15f1d80 100644 --- a/benchmarks/attention_benchmarks/batch_spec.py +++ b/benchmarks/attention_benchmarks/batch_spec.py @@ -229,3 +229,40 @@ def get_batch_stats(requests: list[BatchRequest]) -> dict: sum(r.kv_len for r in requests) / len(requests) if requests else 0 ), } + + +def get_batch_type(batch_spec: str, spec_decode_threshold: int = 8) -> str: + """ + Classify a batch spec into a type string. + + Args: + batch_spec: Batch specification string (e.g., "q2k", "8q1s1k", "2q2k_8q1s1k") + spec_decode_threshold: Max q_len to be considered spec-decode vs extend + + Returns: + Type string: "prefill", "decode", "spec-decode", "extend", or "mixed (types...)" + """ + requests = parse_batch_spec(batch_spec) + + # Classify each request + types_present = set() + for req in requests: + if req.is_decode: + types_present.add("decode") + elif req.is_prefill: + types_present.add("prefill") + elif req.is_extend: + # Distinguish spec-decode (small q_len) from extend (chunked prefill) + if req.q_len <= spec_decode_threshold: + types_present.add("spec-decode") + else: + types_present.add("extend") + + if len(types_present) == 1: + return types_present.pop() + elif len(types_present) > 1: + # Sort for consistent output + sorted_types = sorted(types_present) + return f"mixed ({'+'.join(sorted_types)})" + else: + return "unknown" diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 7155bdc3f..190b2f977 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -12,6 +12,7 @@ from typing import Any import numpy as np import torch +from batch_spec import get_batch_type, parse_batch_spec from rich.console import Console from rich.table import Table @@ -316,12 +317,14 @@ class ResultsFormatter: backends: List of backend names being compared compare_to_fastest: Show percentage comparison to fastest """ - # Group by batch spec + # Group by batch spec, preserving first-occurrence 
order by_spec = {} + specs_order = [] for r in results: spec = r.config.batch_spec if spec not in by_spec: by_spec[spec] = {} + specs_order.append(spec) by_spec[spec][r.config.backend] = r # Create shortened backend names for display @@ -337,6 +340,8 @@ class ResultsFormatter: table = Table(title="Attention Benchmark Results") table.add_column("Batch\nSpec", no_wrap=True) + table.add_column("Type", no_wrap=True) + table.add_column("Batch\nSize", justify="right", no_wrap=True) multi = len(backends) > 1 for backend in backends: @@ -350,12 +355,14 @@ class ResultsFormatter: table.add_column(col_rel, justify="right", no_wrap=False) # Add rows - for spec in sorted(by_spec.keys()): + for spec in specs_order: spec_results = by_spec[spec] times = {b: r.mean_time for b, r in spec_results.items() if r.success} best_time = min(times.values()) if times else 0.0 - row = [spec] + batch_type = get_batch_type(spec) + batch_size = len(parse_batch_spec(spec)) + row = [spec, batch_type, str(batch_size)] for backend in backends: if backend in spec_results: r = spec_results[backend] diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index c0bdb98fb..591db6837 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -25,10 +25,18 @@ batch_specs: - "4q1k_16q1s2k" # 4 prefill + 16 decode - "2q4k_32q1s1k" # 2 large prefill + 32 decode - # Context extension - - "q1ks2k" # 1k query, 2k sequence (chunked prefill) + # Speculative decode (q <= 8) + - "16q2s1k" # 16 requests, 2 spec tokens, 1k KV cache + - "16q4s1k" # 16 requests, 4 spec tokens, 1k KV cache + - "16q8s1k" # 16 requests, 8 spec tokens, 1k KV cache + - "32q4s2k" # 32 requests, 4 spec tokens, 2k KV cache + - "8q8s4k" # 8 requests, 8 spec tokens, 4k KV cache + + # Context extension (chunked prefill) + - "q1ks2k" # 1k query, 2k sequence - "2q1ks4k" # 2 
requests: 1k query, 4k sequence +# Available backends: flash, triton, flashinfer backends: - flash - triton diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index bf08a1550..79bfca681 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -8,7 +8,9 @@ This module provides helpers for running standard attention backends (FlashAttention, Triton, FlashInfer) with real vLLM integration. """ +import logging import types +from contextlib import contextmanager import numpy as np import torch @@ -24,8 +26,13 @@ from vllm.config import ( ParallelConfig, SchedulerConfig, VllmConfig, + set_current_vllm_config, +) +from vllm.v1.attention.backends.utils import ( + CommonAttentionMetadata, + get_kv_cache_layout, + set_kv_cache_layout, ) -from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.kv_cache_interface import FullAttentionSpec # ============================================================================ @@ -37,22 +44,14 @@ _BACKEND_CONFIG = { "flash": { "module": "vllm.v1.attention.backends.flash_attn", "backend_class": "FlashAttentionBackend", - "dtype": torch.float16, - "cache_layout": "standard", - # ^ [2, num_blocks, block_size, num_kv_heads, head_dim] }, "triton": { "module": "vllm.v1.attention.backends.triton_attn", "backend_class": "TritonAttentionBackend", - "dtype": torch.float32, - "cache_layout": "standard", }, "flashinfer": { "module": "vllm.v1.attention.backends.flashinfer", "backend_class": "FlashInferBackend", - "dtype": torch.float16, - "cache_layout": "flashinfer", - # ^ [num_blocks, 2, block_size, num_kv_heads, head_dim] }, } @@ -66,6 +65,18 @@ def _get_backend_config(backend: str) -> dict: return _BACKEND_CONFIG[backend] +@contextmanager +def log_warnings_and_errors_only(): + """Temporarily set vLLM logger to WARNING level.""" + logger = logging.getLogger("vllm") + old_level = logger.level + logger.setLevel(logging.WARNING) + 
try: + yield + finally: + logger.setLevel(old_level) + + # ============================================================================ # Metadata Building Helpers # ============================================================================ @@ -88,11 +99,7 @@ def _build_common_attn_metadata( query_start_loc_cpu = query_start_loc.cpu() seq_lens = torch.tensor(kv_lens, dtype=torch.int32, device=device) - seq_lens_cpu = seq_lens.cpu() - max_seq_len = int(seq_lens_cpu.max()) - - context_lens = [kv - q for kv, q in zip(kv_lens, q_lens)] - num_computed_tokens_cpu = torch.tensor(context_lens, dtype=torch.int32) + max_seq_len = int(seq_lens.max().item()) max_blocks = (max(kv_lens) + block_size - 1) // block_size num_blocks = batch_size * max_blocks @@ -107,8 +114,6 @@ def _build_common_attn_metadata( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=batch_size, num_actual_tokens=total_tokens, max_query_len=max_query_len, @@ -121,7 +126,6 @@ def _build_common_attn_metadata( def _create_vllm_config( config: BenchmarkConfig, - dtype: torch.dtype, max_num_blocks: int, ) -> VllmConfig: """Create a VllmConfig for benchmarking with mock model methods.""" @@ -129,7 +133,7 @@ def _create_vllm_config( model="meta-llama/Meta-Llama-3-8B", tokenizer="meta-llama/Meta-Llama-3-8B", trust_remote_code=False, - dtype=dtype, + dtype="auto", # Use model's native dtype seed=0, max_model_len=1024, ) @@ -198,6 +202,7 @@ def _create_backend_impl( backend_cfg: dict, config: BenchmarkConfig, device: torch.device, + dtype: torch.dtype, ): """Create backend implementation instance.""" import importlib @@ -206,7 +211,6 @@ def _create_backend_impl( backend_class = getattr(backend_module, backend_cfg["backend_class"]) scale = get_attention_scale(config.head_dim) - dtype = backend_cfg["dtype"] impl = backend_class.get_impl_cls()( num_heads=config.num_q_heads, @@ -227,7 
+231,7 @@ def _create_backend_impl( layer = MockLayer(device, kv_cache_spec=kv_cache_spec) - return backend_class, impl, layer, dtype + return backend_class, impl, layer def _create_metadata_builder( @@ -235,11 +239,44 @@ def _create_metadata_builder( kv_cache_spec: FullAttentionSpec, vllm_config: VllmConfig, device: torch.device, + backend_name: str = "", ): """Create metadata builder instance.""" - return backend_class.get_builder_cls()( + layer_names = ["layer_0"] + builder_cls = backend_class.get_builder_cls() + + # Flashinfer needs get_per_layer_parameters mocked since we don't have + # real model layers registered + if backend_name == "flashinfer": + import unittest.mock + + from vllm.v1.attention.backends.utils import PerLayerParameters + + def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls): + head_size = vllm_config.model_config.get_head_size() + return { + layer_name: PerLayerParameters( + window_left=-1, # No sliding window + logits_soft_cap=0.0, # No soft cap + sm_scale=1.0 / (head_size**0.5), # Standard scale + ) + for layer_name in layer_names + } + + with unittest.mock.patch( + "vllm.v1.attention.backends.flashinfer.get_per_layer_parameters", + mock_get_per_layer_parameters, + ): + return builder_cls( + kv_cache_spec=kv_cache_spec, + layer_names=layer_names, + vllm_config=vllm_config, + device=device, + ) + + return builder_cls( kv_cache_spec=kv_cache_spec, - layer_names=["layer_0"], + layer_names=layer_names, vllm_config=vllm_config, device=device, ) @@ -281,39 +318,44 @@ def _create_input_tensors( def _create_kv_cache( config: BenchmarkConfig, max_num_blocks: int, - cache_layout: str, + backend_class, device: torch.device, dtype: torch.dtype, ) -> list: - """Create KV cache tensors for all layers.""" - if cache_layout == "flashinfer": - # FlashInfer layout: [num_blocks, 2, block_size, num_kv_heads, head_dim] - cache_list = [ - torch.zeros( - max_num_blocks, - 2, - config.block_size, - config.num_kv_heads, - config.head_dim, - 
device=device, - dtype=dtype, - ) - for _ in range(config.num_layers) - ] - else: - # Standard layout: [2, num_blocks, block_size, num_kv_heads, head_dim] - cache_list = [ - torch.zeros( - 2, - max_num_blocks, - config.block_size, - config.num_kv_heads, - config.head_dim, - device=device, - dtype=dtype, - ) - for _ in range(config.num_layers) - ] + """Create KV cache tensors for all layers using the backend's methods. + + Uses the backend's get_kv_cache_shape() and get_kv_cache_stride_order() + to create the cache with the correct shape and memory layout. + """ + # Get the logical shape from the backend + cache_shape = backend_class.get_kv_cache_shape( + num_blocks=max_num_blocks, + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + ) + + # Get the stride order for custom memory layout + try: + stride_order = backend_class.get_kv_cache_stride_order() + assert len(stride_order) == len(cache_shape) + except (AttributeError, NotImplementedError): + stride_order = tuple(range(len(cache_shape))) + + # Permute shape to physical layout order + physical_shape = tuple(cache_shape[i] for i in stride_order) + + # Compute inverse permutation to get back to logical view + inv_order = [stride_order.index(i) for i in range(len(stride_order))] + + cache_list = [] + for _ in range(config.num_layers): + # Allocate in physical layout order (contiguous in memory) + cache = torch.zeros(*physical_shape, device=device, dtype=dtype) + # Permute to logical view + cache = cache.permute(*inv_order) + cache_list.append(cache) + return cache_list @@ -418,53 +460,72 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: kv_lens = [r.kv_len for r in requests] total_q = sum(q_lens) max_kv = max(kv_lens) + batch_size = len(q_lens) - max_num_blocks = (max_kv + config.block_size - 1) // config.block_size - - backend_class, impl, layer, dtype = _create_backend_impl( - backend_cfg, config, device - ) + # Calculate total blocks needed: 
batch_size * max_blocks_per_request + max_blocks_per_request = (max_kv + config.block_size - 1) // config.block_size + max_num_blocks = batch_size * max_blocks_per_request + + # Suppress vLLM logs during setup to reduce spam + with log_warnings_and_errors_only(): + # Create vllm_config first - uses model's native dtype via "auto" + vllm_config = _create_vllm_config(config, max_num_blocks) + dtype = vllm_config.model_config.dtype + + # Wrap everything in set_current_vllm_config context + # This is required for backends like flashinfer that need global config + with set_current_vllm_config(vllm_config): + backend_class, impl, layer = _create_backend_impl( + backend_cfg, config, device, dtype + ) - common_metadata = _build_common_attn_metadata( - q_lens, kv_lens, config.block_size, device - ) + # Set KV cache layout if the backend requires a specific one + # (e.g., FlashInfer requires HND on SM100/Blackwell for TRTLLM attention) + required_layout = backend_class.get_required_kv_cache_layout() + if required_layout is not None: + set_kv_cache_layout(required_layout) + get_kv_cache_layout.cache_clear() - kv_cache_spec = FullAttentionSpec( - block_size=config.block_size, - num_kv_heads=config.num_kv_heads, - head_size=config.head_dim, - dtype=dtype, - ) + common_metadata = _build_common_attn_metadata( + q_lens, kv_lens, config.block_size, device + ) - vllm_config = _create_vllm_config(config, dtype, max_num_blocks) + kv_cache_spec = FullAttentionSpec( + block_size=config.block_size, + num_kv_heads=config.num_kv_heads, + head_size=config.head_dim, + dtype=dtype, + ) - builder = _create_metadata_builder( - backend_class, kv_cache_spec, vllm_config, device - ) + builder = _create_metadata_builder( + backend_class, kv_cache_spec, vllm_config, device, config.backend + ) - attn_metadata = builder.build( - common_prefix_len=0, - common_attn_metadata=common_metadata, - ) + attn_metadata = builder.build( + common_prefix_len=0, + common_attn_metadata=common_metadata, + ) - q_list, 
k_list, v_list = _create_input_tensors(config, total_q, device, dtype) + q_list, k_list, v_list = _create_input_tensors( + config, total_q, device, dtype + ) - cache_list = _create_kv_cache( - config, max_num_blocks, backend_cfg["cache_layout"], device, dtype - ) + cache_list = _create_kv_cache( + config, max_num_blocks, backend_class, device, dtype + ) - times, mem_stats = _run_single_benchmark( - config, - impl, - layer, - q_list, - k_list, - v_list, - cache_list, - attn_metadata, - device, - dtype, - ) + times, mem_stats = _run_single_benchmark( + config, + impl, + layer, + q_list, + k_list, + v_list, + cache_list, + attn_metadata, + device, + dtype, + ) mean_time = np.mean(times) throughput = total_q / mean_time if mean_time > 0 else 0 -- GitLab From 64a9c2528b1487fbfefa333cb1b246a57cddd4b2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 9 Feb 2026 06:57:33 -0800 Subject: [PATCH 0010/1166] [UX] Add `--language-model-only` for hybrid models (#34120) Signed-off-by: Roger Wang --- vllm/config/model.py | 3 +++ vllm/config/multimodal.py | 14 +++++++++++--- vllm/engine/arg_utils.py | 5 +++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index b76d51868..96dbf9725 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -297,6 +297,7 @@ class ModelConfig: multimodal_config: MultiModalConfig | None = None """Configuration for multimodal model. 
If `None`, this will be inferred from the architecture of `self.model`.""" + language_model_only: InitVar[bool] = False limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None @@ -411,6 +412,7 @@ class ModelConfig: def __post_init__( self, # Multimodal config init vars + language_model_only: bool, limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, media_io_kwargs: dict[str, dict[str, Any]] | None, @@ -576,6 +578,7 @@ class ModelConfig: mm_encoder_tp_mode = "weights" mm_config_kwargs = dict( + language_model_only=language_model_only, limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, media_io_kwargs=media_io_kwargs, diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 30305e4be..68244ba2f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -54,8 +54,12 @@ DummyOptions: TypeAlias = ( class MultiModalConfig: """Controls the behavior of multimodal models.""" + language_model_only: bool = False + """If True, disables all multimodal inputs by setting all modality limits + to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every + modality.""" limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) - """The maximum number of input items and options allowed per + """The maximum number of input items and options allowed per prompt for each modality. Defaults to 999 for each modality. 
@@ -63,11 +67,11 @@ class MultiModalConfig: {"image": 16, "video": 2} Configurable format (with options): - {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, + {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, "image": {"count": 5, "width": 512, "height": 512}} Mixed format (combining both): - {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, + {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}} """ enable_mm_embeds: bool = False @@ -215,6 +219,7 @@ class MultiModalConfig: the final hidden states. """ factors: list[Any] = [ + self.language_model_only, self.mm_encoder_attn_backend.name if self.mm_encoder_attn_backend is not None else None, @@ -228,6 +233,9 @@ class MultiModalConfig: Get the maximum number of input items allowed per prompt for the given modality (backward compatible). """ + if self.language_model_only: + return 0 + limit_data = self.limit_per_prompt.get(modality) if limit_data is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cf05c8e87..c7c78ffd8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -454,6 +454,7 @@ class EngineArgs: allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce + language_model_only: bool = MultiModalConfig.language_model_only limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( MultiModalConfig, "limit_per_prompt" ) @@ -975,6 +976,9 @@ class EngineArgs: title="MultiModalConfig", description=MultiModalConfig.__doc__, ) + multimodal_group.add_argument( + "--language-model-only", **multimodal_kwargs["language_model_only"] + ) multimodal_group.add_argument( "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"] ) @@ -1291,6 +1295,7 @@ class EngineArgs: skip_tokenizer_init=self.skip_tokenizer_init, 
enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, + language_model_only=self.language_model_only, limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings, -- GitLab From 781ddf786861f40de6d94d45d7b149d0f8d58c11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Mon, 9 Feb 2026 10:05:14 -0500 Subject: [PATCH 0011/1166] [CI][torch.compile] Fix incorrect filtering for E2E fusion tests on B200 (#34031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič --- .buildkite/test_areas/compile.yaml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 56fc011c7..51b9fdc8b 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -121,13 +121,10 @@ steps: optional: true commands: - nvidia-smi - # Run all models and attn backends but only Inductor partition and native custom ops - # -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Run all models but only FLASHINFER, Inductor partition and native custom ops # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" - # Run just llama3 (fp8 & fp4) for all config combinations - # -k "llama-3" - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition) + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)" - label: Fusion E2E TP2 Quick (H100) 
timeout_in_minutes: 20 @@ -162,7 +159,7 @@ steps: - tests/compile/fusions_e2e/ commands: - nvidia-smi - # Run just llama3 (fp4 & fp8 & bf16) for all config combinations + # Run just llama3 (fp8 & bf16) for all config combinations - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) @@ -197,7 +194,8 @@ steps: - tests/compile/fusions_e2e/ commands: - nvidia-smi - # Run all models and attn backends but only Inductor partition and native custom ops + # Run all models but only FLASHINFER, Inductor partition and native custom ops + # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # for ar-rms-quant-fp4, also sweep llama3 - - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4" - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)" -- GitLab From cb62e86f83bf859fb25936a0c39709a31515fddc Mon Sep 17 00:00:00 2001 From: ZhengHongming888 Date: Mon, 9 Feb 2026 07:39:12 -0800 Subject: [PATCH 0012/1166] Add NUMA Core binding in nixl_connector for CPU xPyD (#32365) Signed-off-by: Hongming Zheng Signed-off-by: ZhengHongming888 Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../kv_connector/v1/nixl_connector.py | 11 ++++ vllm/platforms/cpu.py | 61 +++++++++++++++++++ vllm/v1/worker/cpu_worker.py | 1 + 3 files changed, 73 insertions(+) diff --git 
a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index c2777b393..245ac7daf 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -926,6 +926,17 @@ class NixlConnectorWorker: else: self.use_host_buffer = self.kv_buffer_device == "cpu" + # reserve different cores for start_load_kv() from model_forward() + if self.device_type == "cpu": + numa_core_list = current_platform.discover_numa_topology() + # setup one last core in each numa for kv transfer. + rsv_cores_for_kv = [ + max(each_numa_core_list) for each_numa_core_list in numa_core_list + ] + + if rsv_cores_for_kv: + os.sched_setaffinity(0, rsv_cores_for_kv) + # support for oot platform which can't register nixl memory # type based on kv_buffer_device nixl_memory_type = current_platform.get_nixl_memory_type() diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 46465a482..60180b272 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -213,6 +213,13 @@ class CpuPlatform(Platform): cache_config.cpu_kvcache_space_bytes = CpuPlatform.get_device_total_memory() + # reserve at least one core for nixl_connector under p/d case + if vllm_config.kv_transfer_config and ( + envs.VLLM_CPU_NUM_OF_RESERVED_CPU == 0 + or envs.VLLM_CPU_NUM_OF_RESERVED_CPU is None + ): + os.environ["VLLM_CPU_NUM_OF_RESERVED_CPU"] = "1" + parallel_config = vllm_config.parallel_config if ( parallel_config.world_size > 1 @@ -396,6 +403,60 @@ class CpuPlatform(Platform): return allowed_numa_nodes_list, logical_cpu_list + @classmethod + def discover_numa_topology(cls) -> list[list[int]]: + """ + Discover NUMA topology and keep the last physical core of each numa + into one core group list for nixl start_kv_load() + """ + SYS_NODE = "/sys/devices/system/node" + SYS_CPU = "/sys/devices/system/cpu" + + if not (os.path.exists(SYS_NODE) and 
os.path.exists(SYS_CPU)): + return [] + + core_rsv_for_kv = [] + for node in os.listdir(SYS_NODE): + if not node.startswith("node") or not node[4:].isdigit(): + continue + node_path = f"{SYS_NODE}/{node}" + + seen_phys = set() + for cpu in os.listdir(node_path): + if not cpu.startswith("cpu") or not cpu[3:].isdigit(): + continue + + cpu_id = int(cpu[3:]) + # thread_siblings based on cpu_id + path = f"{SYS_CPU}/cpu{cpu_id}/topology/thread_siblings_list" + + if os.path.exists(path): + try: + with open(path) as f: + s = f.read() + cpus: list[int] = [] + for part in s.strip().split(","): + if "-" in part: + a, b = map(int, part.split("-")) + cpus.extend(range(a, b + 1)) + else: + cpus.append(int(part)) + siblings = cpus if cpus else [cpu_id] + except (OSError, ValueError): + siblings = [cpu_id] + else: + siblings = [cpu_id] + + phys = min(siblings) + + if phys not in seen_phys: + seen_phys.add(phys) + + if len(seen_phys) > 0: + core_rsv_for_kv.append(list(seen_phys)) + + return core_rsv_for_kv + @classmethod def is_pin_memory_available(cls) -> bool: return False diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 169696ca1..8ccd45bb0 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -54,6 +54,7 @@ class CPUWorker(Worker): def init_device(self): # Setup OpenMP threads affinity. 
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + # Under numa binding some cores reserved for kv transfer in nixl_connector.py if omp_cpuids == "auto" and platform.system() == "Linux": cpu_arch = current_platform.get_cpu_architecture() if cpu_arch in (CpuArchEnum.POWERPC, CpuArchEnum.S390X): -- GitLab From d4f123cc48c374f7aad48cd808d797c71711ebc7 Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> Date: Mon, 9 Feb 2026 23:43:24 +0800 Subject: [PATCH 0013/1166] [Kernel] FlashInfer: switch allreduce fusion to unified API (#33985) Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> --- .../kernels/benchmark_fused_collective.py | 123 ++++++++---------- .../distributed/test_fusion_all_reduce.py | 5 +- .../passes/fusion/allreduce_rms_fusion.py | 66 ++++------ 3 files changed, 80 insertions(+), 114 deletions(-) diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index 38e7fdcf5..3cd52160d 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -5,7 +5,7 @@ Benchmark for FlashInfer fused collective operations vs standard operations. This benchmark compares: -1. FlashInfer's trtllm_allreduce_fusion (fused allreduce + rmsnorm + optional quant) +1. FlashInfer's allreduce_fusion (fused allreduce + rmsnorm + optional quant) 2. 
Standard tensor_model_parallel_all_reduce + separate rmsnorm/quant operations Usage with torchrun: @@ -24,7 +24,6 @@ import torch.distributed as dist # type: ignore from vllm.config.vllm import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.distributed import ( - get_tp_group, tensor_model_parallel_all_reduce, ) from vllm.distributed.parallel_state import ( @@ -52,11 +51,12 @@ logger = init_logger(__name__) try: import flashinfer.comm as flashinfer_comm # type: ignore - if not hasattr(flashinfer_comm, "trtllm_allreduce_fusion"): + if not ( + hasattr(flashinfer_comm, "allreduce_fusion") + and hasattr(flashinfer_comm, "create_allreduce_fusion_workspace") + ): flashinfer_comm = None - logger.warning( - "FlashInfer comm module found but missing trtllm_allreduce_fusion" - ) + logger.warning("FlashInfer comm module found but missing allreduce_fusion API") except ImportError: flashinfer_comm = None logger.warning("FlashInfer not found, only benchmarking standard operations") @@ -75,7 +75,7 @@ _FI_MAX_SIZES = { } # Global workspace tensor for FlashInfer -_FI_WORKSPACE_TENSOR = None +_FI_WORKSPACE = None def setup_flashinfer_workspace( @@ -83,10 +83,10 @@ def setup_flashinfer_workspace( rank: int, hidden_dim: int, max_token_num: int, - use_fp32_lamport: bool = False, + dtype: torch.dtype, ): """Setup FlashInfer workspace for fused allreduce operations.""" - global _FI_WORKSPACE_TENSOR + global _FI_WORKSPACE if flashinfer_comm is None: return None, None @@ -96,33 +96,29 @@ def setup_flashinfer_workspace( return None, None try: - # Create IPC workspace - ipc_handles, workspace_tensor = ( - flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( - tp_rank=rank, - tp_size=world_size, - max_token_num=max_token_num, - hidden_dim=hidden_dim, - group=get_tp_group().device_group, - use_fp32_lamport=use_fp32_lamport, - ) + workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend="trtllm", + world_size=world_size, + rank=rank, + 
max_token_num=max_token_num, + hidden_dim=hidden_dim, + dtype=dtype, ) - _FI_WORKSPACE_TENSOR = workspace_tensor - return ipc_handles, workspace_tensor + _FI_WORKSPACE = workspace + return workspace except Exception as e: logger.error("Failed to setup FlashInfer workspace: %s", e) - return None, None + return None -def cleanup_flashinfer_workspace(ipc_handles): +def cleanup_flashinfer_workspace(workspace): """Cleanup FlashInfer workspace.""" - if flashinfer_comm is None or ipc_handles is None: + if flashinfer_comm is None or workspace is None: return try: - group = get_tp_group().device_group - flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group) + workspace.destroy() except Exception as e: logger.error("Failed to cleanup FlashInfer workspace: %s", e) @@ -132,25 +128,15 @@ class FlashInferFusedAllReduceParams: def __init__( self, - rank: int, - world_size: int, - use_fp32_lamport: bool = False, max_token_num: int = 1024, ): - self.rank = rank - self.world_size = world_size - self.use_fp32_lamport = use_fp32_lamport - self.trigger_completion_at_end = True self.launch_with_pdl = True self.fp32_acc = True self.max_token_num = max_token_num def get_trtllm_fused_allreduce_kwargs(self): return { - "world_rank": self.rank, - "world_size": self.world_size, "launch_with_pdl": self.launch_with_pdl, - "trigger_completion_at_end": self.trigger_completion_at_end, "fp32_acc": self.fp32_acc, } @@ -165,7 +151,7 @@ def flashinfer_fused_allreduce_rmsnorm( norm_out: torch.Tensor | None = None, ): """FlashInfer fused allreduce + rmsnorm operation.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + if flashinfer_comm is None or _FI_WORKSPACE is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -174,18 +160,15 @@ def flashinfer_fused_allreduce_rmsnorm( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - 
token_num=input_tensor.shape[0], + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=_FI_WORKSPACE, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm, - allreduce_out=None, quant_out=None, scale_out=None, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, @@ -207,7 +190,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( quant_out: torch.Tensor | None = None, ): """FlashInfer fused allreduce + rmsnorm + FP8 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + if flashinfer_comm is None or _FI_WORKSPACE is None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -216,18 +199,15 @@ def flashinfer_fused_allreduce_rmsnorm_fp8_quant( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - token_num=input_tensor.shape[0], + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=_FI_WORKSPACE, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP8Quant, - allreduce_out=None, quant_out=quant_out, scale_out=None, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, @@ -250,7 +230,7 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( norm_out: torch.Tensor | None = None, ): """FlashInfer fused allreduce + rmsnorm + FP4 quantization.""" - if flashinfer_comm is None or _FI_WORKSPACE_TENSOR is None: + if flashinfer_comm is None or _FI_WORKSPACE is 
None: raise RuntimeError("FlashInfer not available or workspace not initialized") if norm_out is None: @@ -259,18 +239,15 @@ def flashinfer_fused_allreduce_rmsnorm_fp4_quant( else: residual_out = input_tensor - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=input_tensor, - token_num=input_tensor.shape[0], + flashinfer_comm.allreduce_fusion( + input=input_tensor, + workspace=_FI_WORKSPACE, + pattern=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - hidden_dim=input_tensor.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, - pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNormFP4Quant, - allreduce_out=None, quant_out=quant_out, scale_out=output_scale, layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4, @@ -1040,23 +1017,31 @@ def main(): configs = list(itertools.product(args.num_tokens, dtypes, residual_options)) # Setup FlashInfer workspace if available - ipc_handles = None + workspace = None allreduce_params = None if flashinfer_comm is not None: # Use the largest hidden dimension for workspace setup + max_element_size = max(torch.finfo(dt).bits // 8 for dt in dtypes) + workspace_dtype = ( + torch.float32 + if max_element_size == 4 + else (torch.bfloat16 if torch.bfloat16 in dtypes else torch.float16) + ) max_num_token = _FI_MAX_SIZES.get(world_size) // ( - args.hidden_dim * world_size * 2 + args.hidden_dim * max_element_size ) - ipc_handles, workspace_tensor = setup_flashinfer_workspace( - world_size, rank, args.hidden_dim, max_num_token + workspace = setup_flashinfer_workspace( + world_size, + rank, + args.hidden_dim, + max_num_token, + dtype=workspace_dtype, ) - if workspace_tensor is not None: + if workspace is not None: allreduce_params = FlashInferFusedAllReduceParams( - rank=rank, - world_size=world_size, max_token_num=max_num_token, ) @@ -1119,8 +1104,8 @@ def main(): finally: # Cleanup - if 
ipc_handles is not None: - cleanup_flashinfer_workspace(ipc_handles) + if workspace is not None: + cleanup_flashinfer_workspace(workspace) dist.barrier() diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py index f13f49b67..d48f22970 100644 --- a/tests/compile/passes/distributed/test_fusion_all_reduce.py +++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py @@ -202,9 +202,10 @@ class TestAllReduceFusedAddRMSNormStaticQuantFP4Model(torch.nn.Module): @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") @pytest.mark.skipif( not find_spec("flashinfer") - or not has_module_attribute("flashinfer.comm", "trtllm_allreduce_fusion"), + or not has_module_attribute("flashinfer.comm", "allreduce_fusion") + or not has_module_attribute("flashinfer.comm", "create_allreduce_fusion_workspace"), reason="flashinfer is not found or flashinfer " - "is not compiled with trtllm_allreduce_fusion", + "is not compiled with allreduce_fusion", ) def test_all_reduce_fusion_pass_replace( test_model: torch.nn.Module, diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index 0b343fd16..b613d4424 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib from importlib.util import find_spec from types import ModuleType @@ -36,7 +37,9 @@ if find_spec("flashinfer"): try: import flashinfer.comm as _flashinfer_comm - if hasattr(_flashinfer_comm, "trtllm_allreduce_fusion"): + if hasattr(_flashinfer_comm, "allreduce_fusion") and hasattr( + _flashinfer_comm, "create_allreduce_fusion_workspace" + ): flashinfer_comm = _flashinfer_comm except ImportError: pass @@ -79,7 +82,7 @@ 
_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = { if flashinfer_comm is not None: - _FI_WORKSPACE_TENSOR = None + _FI_WORKSPACE = None MiB = 1024 * 1024 def call_trtllm_fused_allreduce_norm( @@ -87,10 +90,8 @@ if flashinfer_comm is not None: residual: torch.Tensor, rms_gamma: torch.Tensor, rms_eps: float, - world_rank: int, world_size: int, launch_with_pdl: bool, - trigger_completion_at_end: bool, fp32_acc: bool, max_token_num: int, pattern_code: int, @@ -121,7 +122,7 @@ if flashinfer_comm is not None: max_one_shot_size is None or current_tensor_size <= max_one_shot_size * MiB ) - assert _FI_WORKSPACE_TENSOR is not None, ( + assert _FI_WORKSPACE is not None, ( "Flashinfer must be enabled when using flashinfer" ) if norm_out is None: @@ -134,24 +135,18 @@ if flashinfer_comm is not None: residual_out = allreduce_in # For the sizes that are smaller than the max size, # we only use flashinfer one shot allreduce - flashinfer_comm.trtllm_allreduce_fusion( - allreduce_in=allreduce_in, - token_num=allreduce_in.shape[0], + flashinfer_comm.allreduce_fusion( + input=allreduce_in, + workspace=_FI_WORKSPACE, + pattern=pattern_code, residual_in=residual, residual_out=residual_out, norm_out=norm_out, rms_gamma=rms_gamma, rms_eps=rms_eps, - world_rank=world_rank, - world_size=world_size, - hidden_dim=allreduce_in.shape[-1], - workspace_ptrs=_FI_WORKSPACE_TENSOR, launch_with_pdl=launch_with_pdl, use_oneshot=use_oneshot, - trigger_completion_at_end=trigger_completion_at_end, fp32_acc=fp32_acc, - pattern_code=pattern_code, - allreduce_out=None, quant_out=quant_out, scale_out=scale_out, # in vllm we only support swizzled layout @@ -164,10 +159,8 @@ if flashinfer_comm is not None: residual: torch.Tensor, rms_gamma: torch.Tensor, rms_eps: float, - world_rank: int, world_size: int, launch_with_pdl: bool, - trigger_completion_at_end: bool, fp32_acc: bool, max_token_num: int, pattern_code: int, @@ -200,25 +193,18 @@ class FlashInferFusedAllReduceParams: def __init__( 
self, - rank: int, world_size: int, - use_fp32_lamport: bool = False, max_token_num: int = 1024, ) -> None: - self.rank = rank self.world_size = world_size - self.use_fp32_lamport = use_fp32_lamport - self.trigger_completion_at_end = True self.launch_with_pdl = True self.fp32_acc = True self.max_token_num = max_token_num def get_trtllm_fused_allreduce_kwargs(self) -> dict[str, bool | int]: return { - "world_rank": self.rank, "world_size": self.world_size, "launch_with_pdl": self.launch_with_pdl, - "trigger_completion_at_end": self.trigger_completion_at_end, "fp32_acc": self.fp32_acc, "max_token_num": self.max_token_num, } @@ -712,7 +698,6 @@ class AllReduceFusionPass(VllmPatternMatcherPass): self.hidden_dim = config.model_config.get_hidden_size() self.group = get_tp_group().device_group rank = get_tensor_model_parallel_rank() - use_fp32_lamport = self.model_dtype == torch.float32 if flashinfer_comm is None: logger.warning( "Flashinfer is not installed or comm module not found, " @@ -730,7 +715,7 @@ class AllReduceFusionPass(VllmPatternMatcherPass): self.tp_size, ) return - element_size = 4 if use_fp32_lamport else 2 + element_size = torch.tensor([], dtype=self.model_dtype).element_size() self.max_token_num = max_size // (self.hidden_dim * element_size) # take the min to save workspace size and we'll never use more # than max_num_batched_tokens anyways @@ -744,23 +729,19 @@ class AllReduceFusionPass(VllmPatternMatcherPass): scope="global", ) - self.ipc_handles, workspace_tensor = ( - flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion( - tp_rank=rank, - tp_size=self.tp_size, - max_token_num=self.max_token_num, - hidden_dim=self.hidden_dim, - group=self.group, - use_fp32_lamport=use_fp32_lamport, - ) + self.workspace = flashinfer_comm.create_allreduce_fusion_workspace( + backend="trtllm", + world_size=self.tp_size, + rank=rank, + max_token_num=self.max_token_num, + hidden_dim=self.hidden_dim, + dtype=self.model_dtype, ) - global _FI_WORKSPACE_TENSOR - 
_FI_WORKSPACE_TENSOR = workspace_tensor + global _FI_WORKSPACE + _FI_WORKSPACE = self.workspace self.allreduce_params = FlashInferFusedAllReduceParams( - rank=rank, world_size=self.tp_size, - use_fp32_lamport=use_fp32_lamport, max_token_num=self.max_token_num, ) @@ -832,7 +813,6 @@ class AllReduceFusionPass(VllmPatternMatcherPass): def __del__(self) -> None: if getattr(self, "disabled", True): return - if flashinfer_comm is not None: - flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( - self.ipc_handles, self.group - ) + if getattr(self, "workspace", None) is not None: + with contextlib.suppress(Exception): + self.workspace.destroy() -- GitLab From 995bbf38f114a0e1bd7e34d6fd92d255ac2efca7 Mon Sep 17 00:00:00 2001 From: TomerBN-Nvidia Date: Mon, 9 Feb 2026 18:44:18 +0200 Subject: [PATCH 0014/1166] [Bugfix] Fix shared expert input for latent MoE in EP+DP (Nemotron-H) (#34087) Signed-off-by: Tomer Natan Co-authored-by: Cursor --- .../fused_moe/flashinfer_cutlass_moe.py | 2 +- .../fused_moe/fused_moe_modular_method.py | 1 + .../layers/fused_moe/modular_kernel.py | 24 +++++++++++++++++-- .../compressed_tensors_moe.py | 3 +++ .../model_executor/layers/quantization/fp8.py | 1 + .../layers/quantization/modelopt.py | 2 ++ 6 files changed, 30 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 7c27da46f..85df6cb66 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -139,7 +139,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): # work with SP. This will be removed in follow up after we get # rid of the FlashInfer specific P/F function. # TODO: the per-tensor fp8 kernels don't work with MNNVL FI A2As. 
- return not moe_parallel_config.is_sequence_parallel + return True @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index c30eeb6dc..69a6e70fc 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -101,4 +101,5 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=None if self.disable_expert_map else layer.expert_map, + shared_experts_input=layer._get_shared_experts_input(x), ) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 598374af2..8a670216b 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1228,13 +1228,28 @@ class FusedMoEModularKernel(torch.nn.Module): topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, + shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize that handles DBO, async and shared expert overlap. + + Args: + shared_experts_input: Optional separate input for shared experts. + When latent MoE is used, hidden_states is the latent-projected + tensor (smaller dimension) used by routed experts, while + shared_experts_input is the original hidden_states (full + dimension) needed by the shared expert MLP. """ shared_output: torch.Tensor | None = None + # For latent MoE: shared experts need the original hidden_states + # (full hidden_size), not the latent-projected version used by + # routed experts. 
+ se_hidden_states = ( + shared_experts_input if shared_experts_input is not None else hidden_states + ) + if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1247,7 +1262,7 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.finalize_weight_and_reduce_impl(), ) if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = self.shared_experts(se_hidden_states) else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1258,7 +1273,7 @@ class FusedMoEModularKernel(torch.nn.Module): self.fused_experts.finalize_weight_and_reduce_impl(), ) if self.shared_experts is not None: - shared_output = self.shared_experts(hidden_states) + shared_output = self.shared_experts(se_hidden_states) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1298,6 +1313,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, + shared_experts_input: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ This function computes a Mixture of Experts (MoE) layer using two sets @@ -1320,6 +1336,9 @@ class FusedMoEModularKernel(torch.nn.Module): - apply_router_weight_on_input (bool): When true, the topk weights are applied directly on the inputs. This is only applicable when topk is 1. + - shared_experts_input (Optional[torch.Tensor]): Optional separate + input for shared experts. For latent MoE, this is the original + hidden_states before latent projection. Returns: - torch.Tensor: The output tensor after applying the MoE layer. 
@@ -1368,4 +1387,5 @@ class FusedMoEModularKernel(torch.nn.Module): topk_weights, topk_ids, apply_router_weight_on_input, + shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index e25a415a5..604373c0a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -361,6 +361,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts_input=layer._get_shared_experts_input(x), ) @@ -672,6 +673,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts_input=layer._get_shared_experts_input(x), ) @@ -1077,6 +1079,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): # https://github.com/vllm-project/vllm/commit/84166fee9770e6fba71a96978b3e7d149392fb28 # noqa: E501 expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts_input=layer._get_shared_experts_input(x), ) @property diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 80348edcc..b8040e894 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1023,6 +1023,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + 
shared_experts_input=layer._get_shared_experts_input(x), ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8e306470c..8b151133b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -980,6 +980,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts_input=layer._get_shared_experts_input(x), ) @@ -1550,6 +1551,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, + shared_experts_input=layer._get_shared_experts_input(x), ) -- GitLab From 285bab47526cbc4d4e26c61d831eaeb17b253d0f Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Tue, 10 Feb 2026 01:17:25 +0800 Subject: [PATCH 0015/1166] [Kernel] use flashinfer for gdn prefill (#32846) Signed-off-by: zjy0516 --- vllm/model_executor/models/qwen3_next.py | 117 ++++++++++++++++++++++- 1 file changed, 115 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 3bcfbacbb..de97daccf 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -28,11 +28,15 @@ from vllm.distributed import ( ) from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger +from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.fla.ops import ( - chunk_gated_delta_rule, + chunk_gated_delta_rule as fla_chunk_gated_delta_rule, +) +from vllm.model_executor.layers.fla.ops import ( fused_recurrent_gated_delta_rule, ) +from vllm.model_executor.layers.fla.ops.chunk import 
l2norm_fwd from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.layernorm import ( GemmaRMSNorm as Qwen3NextRMSNorm, @@ -101,6 +105,113 @@ logger = init_logger(__name__) KVCache = tuple[torch.Tensor, torch.Tensor] +def fi_chunk_gated_delta_rule( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = True, +): + from flashinfer.gdn_prefill import ( + chunk_gated_delta_rule as chunk_gated_delta_rule_fi, + ) + + if use_qk_l2norm_in_kernel: + q = l2norm_fwd(q) + k = l2norm_fwd(k) + + # use flashinfer implementation + q = q.squeeze(0).contiguous() + k = k.squeeze(0).contiguous() + v = v.squeeze(0).contiguous() + + g = g.squeeze(0).contiguous() + beta = beta.squeeze(0).contiguous() + fi_state = initial_state.to(torch.float32) + fi_g = g.to(torch.float32) + fi_beta = beta.to(torch.float32) + return chunk_gated_delta_rule_fi( + q=q, + k=k, + v=v, + g=torch.exp(fi_g), + beta=fi_beta, + initial_state=fi_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + ) + + +@CustomOp.register("chunk_gated_delta_rule") +class ChunkGatedDeltaRule(CustomOp): + def __init__(self) -> None: + super().__init__() + if current_platform.is_cuda() and current_platform.is_device_capability(90): + logger.info_once( + "Using FlashInfer GDN prefill kernel on CUDA compute capability 90" + ) + self._forward_method = self.forward_cuda + else: + self._forward_method = self.forward_native + + def forward_cuda( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = True, + ): + return fi_chunk_gated_delta_rule( + q=q, + 
k=k, + v=v, + g=g, + beta=beta, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + head_first=head_first, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + ) + + def forward_native( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + initial_state: torch.Tensor, + output_final_state: bool, + cu_seqlens: torch.LongTensor | None = None, + head_first: bool = False, + use_qk_l2norm_in_kernel: bool = True, + ): + return fla_chunk_gated_delta_rule( + q=q, + k=k, + v=v, + g=g, + beta=beta, + initial_state=initial_state, + output_final_state=output_final_state, + cu_seqlens=cu_seqlens, + head_first=head_first, + use_qk_l2norm_in_kernel=use_qk_l2norm_in_kernel, + ) + + class Qwen3NextSparseMoeBlock(nn.Module): def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -362,6 +473,8 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): prefix=f"{prefix}.out_proj", ) + self.chunk_gated_delta_rule = ChunkGatedDeltaRule() + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -647,7 +760,7 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): ( core_attn_out_non_spec, last_recurrent_state, - ) = chunk_gated_delta_rule( + ) = self.chunk_gated_delta_rule( q=query_non_spec, k=key_non_spec, v=value_non_spec, -- GitLab From eadb4e868bae8acd2e9b764f5827c0500ec44c34 Mon Sep 17 00:00:00 2001 From: Artus Krohn-Grimberghe Date: Mon, 9 Feb 2026 20:17:44 +0100 Subject: [PATCH 0016/1166] [Bugfix] Avoid duplicate k-proj weight emission in helper (#34142) Signed-off-by: Artus KG --- vllm/model_executor/models/whisper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 0c777e4a5..7462d9f6e 100644 --- 
a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -958,8 +958,8 @@ def _create_fake_bias_for_k_proj( So that the bias for k_proj in qkv_proj can be initialized with zeros. """ for name, weight in weights: + yield name, weight if name.endswith(fake_bias_key_name): bias = torch.zeros(weight.size(0)) bias_name = name.replace("weight", "bias") - yield from [(name, weight), (bias_name, bias)] - yield name, weight + yield bias_name, bias -- GitLab From 8fd31f62452960efdd6dd7b912c388f487536b3c Mon Sep 17 00:00:00 2001 From: Artus Krohn-Grimberghe Date: Mon, 9 Feb 2026 20:30:38 +0100 Subject: [PATCH 0017/1166] [Bugfix] Voxtral prompt/audio placeholder alignment (#34140) Signed-off-by: Artus KG --- vllm/model_executor/models/voxtral.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 942d91e44..a33454005 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -187,6 +187,7 @@ class VoxtralProcessingInfo(BaseProcessingInfo): def get_data_parser(self): return MultiModalDataParser( target_sr=self.get_hf_processor().sampling_rate, + target_channels=1, expected_hidden_size=self._get_expected_hidden_size(), ) @@ -289,10 +290,24 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) audio_id = processor.audio_token_id + out_mm_data = out_mm_kwargs.require_data() + out_audio_items = out_mm_data.get("audio", []) def get_replacement(item_idx: int): - audios = mm_items.get_items("audio", AudioProcessorItems) - audio_len = audios.get_audio_length(item_idx) + if item_idx < len(out_audio_items): + out_audio_data = out_audio_items[item_idx].get_data() + audio_arr = out_audio_data["audio_arrays"] + if isinstance(audio_arr, (torch.Tensor, np.ndarray)): + audio_len = len(audio_arr) + else: + raise 
TypeError( + "Unexpected type for audio_arrays in out_mm_kwargs: " + f"{type(audio_arr)}" + ) + else: + # Fallback for unexpected processor outputs. + audios = mm_items.get_items("audio", AudioProcessorItems) + audio_len = audios.get_audio_length(item_idx) nb_audio_tokens = processor.get_num_audio_tokens(audio_len) @@ -495,7 +510,10 @@ class VoxtralForConditionalGeneration( return TokensPrompt( prompt_token_ids=tokenized.tokens, multi_modal_data={ - "audio": (tokenized.audios[0].audio_array, stt_config.sample_rate) + "audio": [ + (audio.audio_array, stt_config.sample_rate) + for audio in tokenized.audios + ], }, ) -- GitLab From 4d3965096164328451538988824d72ab03593c04 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:36:30 -0500 Subject: [PATCH 0018/1166] [ROCm] update triton branch to support gpt-oss models for gfx11xx devices (#34032) Signed-off-by: Hongxia Yang --- docker/Dockerfile.rocm_base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 6f8c7222f..948f8dc56 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,5 +1,5 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete -ARG TRITON_BRANCH="57c693b6" +ARG TRITON_BRANCH="f332c492" ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" -- GitLab From bb9f97308d0b88c5ad2d64c217b22866e40c79df Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Mon, 9 Feb 2026 15:15:43 -0600 Subject: [PATCH 0019/1166] [torch.compile][Fusion] Fix attention fusion pass removing kv_udpate op. 
(#33945) Signed-off-by: charlifu --- tests/compile/passes/test_fusion_attn.py | 13 ++++++++++++- vllm/compilation/passes/fusion/attn_quant_fusion.py | 10 ++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 75d5c42f0..2b29cf605 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -267,7 +267,7 @@ elif current_platform.is_rocm(): PATTERN_TEST_MODELS_FP8 = [ ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel) ] - BACKENDS = [ + BACKENDS_FP8 = [ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, AttentionBackendEnum.ROCM_ATTN, AttentionBackendEnum.TRITON_ATTN, @@ -474,6 +474,17 @@ def test_attention_quant_pattern( assert attn_nodes_pre[0].kwargs.get("output_block_scale") is None, ( "Attention should not have output_block_scale before fusion" ) + + kv_cache_dummy_dep_pre_is_none = ( + attn_nodes_pre[0].kwargs.get("kv_cache_dummy_dep") is None + ) + kv_cache_dummy_dep_post_is_none = ( + attn_nodes_post[0].kwargs.get("kv_cache_dummy_dep") is None + ) + assert not (kv_cache_dummy_dep_pre_is_none ^ kv_cache_dummy_dep_post_is_none), ( + "The kv_cache_dummy_dep should be consistent before and after fusion" + ) + if quant_key.dtype == FP8_DTYPE: assert attn_nodes_post[0].kwargs.get("output_block_scale") is None, ( "Attention should not have output_block_scale after FP8 fusion" diff --git a/vllm/compilation/passes/fusion/attn_quant_fusion.py b/vllm/compilation/passes/fusion/attn_quant_fusion.py index a104aab6c..bb064f58c 100644 --- a/vllm/compilation/passes/fusion/attn_quant_fusion.py +++ b/vllm/compilation/passes/fusion/attn_quant_fusion.py @@ -142,6 +142,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): v: torch.Tensor, output_attn: torch.Tensor, scale: torch.Tensor, + kv_cache_dummy_dep: torch.Tensor, ) -> torch.Tensor: at1 = auto_functionalized( ATTN_OP, @@ -152,6 +153,7 @@ 
class AttentionFp8StaticQuantPattern(AttentionQuantPattern): layer_name=self.layer_name, output_scale=None, output_block_scale=None, + kv_cache_dummy_dep=kv_cache_dummy_dep, ) attn_out_view = RESHAPE_OP( at1[1], [q.shape[0], self.num_heads * self.head_size] @@ -165,6 +167,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): v: torch.Tensor, output_attn: torch.Tensor, scale: torch.Tensor, + kv_cache_dummy_dep: torch.Tensor, ) -> torch.Tensor: # attn output in quant_dtype output_attn = torch.ops.aten.full.default( @@ -182,6 +185,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): layer_name=self.layer_name, output_scale=scale, output_block_scale=None, + kv_cache_dummy_dep=kv_cache_dummy_dep, ) return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size]) @@ -191,6 +195,7 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern): self.empty(5, self.num_heads, self.head_size), # v self.empty(5, self.num_heads, self.head_size), # attn_output empty_fp32(1, 1), # scale + self.empty(0), # kv_cache_dummy_dep ] pm.register_replacement( @@ -228,6 +233,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): output_quant: torch.Tensor, output_scale: torch.Tensor, input_scale: torch.Tensor, + kv_cache_dummy_dep: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = auto_functionalized( ATTN_OP, @@ -238,6 +244,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): layer_name=self.layer_name, output_scale=None, output_block_scale=None, + kv_cache_dummy_dep=kv_cache_dummy_dep, ) attn_out_view = RESHAPE_OP( at1[1], [q.shape[0], self.num_heads * self.head_size] @@ -261,6 +268,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): output_quant: torch.Tensor, output_scale: torch.Tensor, input_scale: torch.Tensor, + kv_cache_dummy_dep: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: # attention output in quant_dtype output_attn = torch.ops.aten.full.default( @@ -280,6 +288,7 @@ class 
AttentionNvfp4QuantPattern(AttentionQuantPattern): layer_name=self.layer_name, output_scale=input_scale, output_block_scale=output_scale_view, + kv_cache_dummy_dep=kv_cache_dummy_dep, ) output = RESHAPE_OP(at2[1], [-1, self.num_heads * self.head_size // 2]) return output, at2[2] @@ -294,6 +303,7 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern): 128, round_up(self.num_heads * self.head_size // 16, 4) ), # output_scale empty_fp32(1, 1), # input_scale + self.empty(0), # kv_cache_dummy_dep ] pm.register_replacement( -- GitLab From e7e52781ff636bf772301c9282a7601c73b8b905 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Mon, 9 Feb 2026 13:47:17 -0800 Subject: [PATCH 0020/1166] [ModelRunner V2][BugFix] Fix `max_query_len` calculation (#34167) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu/attn_utils.py | 2 +- vllm/v1/worker/gpu/cudagraph_utils.py | 1 + vllm/v1/worker/gpu/model_runner.py | 3 +++ vllm/v1/worker/gpu/spec_decode/eagle.py | 1 + 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index d45867b4e..8a08fba1e 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -149,13 +149,13 @@ def build_attn_metadata( num_tokens: int, query_start_loc_gpu: torch.Tensor, query_start_loc_cpu: torch.Tensor, + max_query_len: int, seq_lens: torch.Tensor, max_seq_len: int, block_tables: Sequence[torch.Tensor], slot_mappings: torch.Tensor, kv_cache_config: KVCacheConfig, ) -> dict[str, Any]: - max_query_len = int(query_start_loc_cpu.max()) seq_lens = seq_lens[:num_reqs] attn_metadata: dict[str, Any] = {} diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index a855074cd..bf55b99af 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -267,6 +267,7 @@ def prepare_inputs_to_capture( num_tokens=num_tokens, query_start_loc_gpu=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, + 
max_query_len=num_tokens_per_req, seq_lens=input_buffers.seq_lens, max_seq_len=max_model_len, block_tables=input_block_tables, diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 416eaa011..d6b87bd71 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -274,6 +274,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_tokens=input_batch.num_tokens, query_start_loc_gpu=input_batch.query_start_loc, query_start_loc_cpu=torch.from_numpy(input_batch.query_start_loc_np), + max_query_len=input_batch.num_scheduled_tokens.max().item(), seq_lens=input_batch.seq_lens, max_seq_len=self.max_model_len, block_tables=block_tables, @@ -561,6 +562,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): query_start_loc_np = query_start_loc_np[: num_reqs + 1] query_start_loc_cpu = torch.from_numpy(query_start_loc_np) query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] + max_query_len = num_scheduled_tokens.max().item() # Get prefill tokens. 
prepare_prefill_inputs( @@ -624,6 +626,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_tokens=num_tokens, query_start_loc_gpu=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, + max_query_len=max_query_len, seq_lens=self.input_buffers.seq_lens, max_seq_len=self.max_model_len, block_tables=block_tables, diff --git a/vllm/v1/worker/gpu/spec_decode/eagle.py b/vllm/v1/worker/gpu/spec_decode/eagle.py index b4cf9a1b4..af56c23bf 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle.py @@ -301,6 +301,7 @@ class EagleSpeculator: num_tokens=num_reqs, query_start_loc_gpu=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, + max_query_len=1, seq_lens=self.input_buffers.seq_lens[:num_reqs], max_seq_len=self.max_model_len, block_tables=block_tables, -- GitLab From 5e75a14a667dccf7f48781568f19f1a6b9c8014a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 9 Feb 2026 18:33:43 -0500 Subject: [PATCH 0021/1166] [Doc] Add DCP support to attention backend doc (#33936) --- docs/design/attention_backends.md | 51 +- .../generate_attention_backend_docs.py | 1362 +++++++++-------- 2 files changed, 769 insertions(+), 644 deletions(-) diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md index 6e84dde92..b551e31db 100644 --- a/docs/design/attention_backends.md +++ b/docs/design/attention_backends.md @@ -152,6 +152,7 @@ Priority is **1 = highest** (tried first). | **Sink** | Attention sink support (for StreamingLLM) | | **Sparse** | Sparse attention support (MLA only) | | **MM Prefix** | Multimodal prefix full attention support | +| **DCP** | Decode Context Parallelism support (`--decode-context-parallel-size`) | | **Attention Types** | Supported attention patterns (Decoder, Encoder, Enc-Dec) | | **Compute Cap.** | Required CUDA compute capability (N/A for non-CUDA backends) | @@ -159,20 +160,20 @@ Priority is **1 = highest** (tried first). 
## Standard Attention (MHA, MQA, GQA) Backends -| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | Attention Types | Compute Cap. | -|---------|---------|--------|-----------|-------------|------------|------|-----------|-----------------|--------------| -| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | All | N/A | -| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | Decoder | 7.x-9.x | -| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | Decoder | 10.x | -| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | All | ≥8.0 | -| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | All | 9.x | -| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | Any | -| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | Decoder, Encoder Only | Any | -| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | Decoder | N/A | -| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | Decoder | N/A | -| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | N/A | -| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | Decoder | Any | -| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | All | Any | +| Backend | Version | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | MM Prefix | DCP | Attention Types | Compute Cap. 
| +|---------|---------|--------|-----------|-------------|------------|------|-----------|-----|-----------------|--------------| +| `CPU_ATTN` | | fp16, bf16, fp32 | `auto` | Any | 32, 64, 80, 96, 112, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | All | N/A | +| `FLASHINFER` | Native† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ❌ | ❌ | ✅ | Decoder | 7.x-9.x | +| `FLASHINFER` | TRTLLM† | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | 16, 32, 64 | 64, 128, 256 | ✅ | ❌ | ✅ | Decoder | 10.x | +| `FLASH_ATTN` | FA2* | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ✅ | All | ≥8.0 | +| `FLASH_ATTN` | FA3* | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ❌ | ✅ | All | 9.x | +| `FLASH_ATTN_DIFFKV` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ✅ | Decoder | Any | +| `FLEX_ATTENTION` | | fp16, bf16, fp32 | `auto`, `bfloat16` | Any | Any | ❌ | ✅ | ❌ | Decoder, Encoder Only | Any | +| `ROCM_AITER_FA` | | fp16, bf16 | `auto` | 16, 32 | 64, 128, 256 | ❌ | ❌ | ❌ | Decoder | N/A | +| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | Decoder | N/A | +| `ROCM_ATTN` | | fp16, bf16, fp32 | `auto` | 16, 32, 544 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | N/A | +| `TREE_ATTN` | | fp16, bf16 | `auto` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any | +| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | Any | ✅ | ✅ | ❌ | All | Any | > **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`. > @@ -199,14 +200,14 @@ configuration. ### Decode Backends -| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | Attention Types | Compute Cap. 
| -|---------|--------|-----------|-------------|------------|------|--------|-----------|-----------------|--------------| -| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | Decoder | 10.x | -| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | Decoder | 10.x | -| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | Decoder | 9.x-10.x | -| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | Decoder | 9.x-10.x | -| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | Decoder | 9.x | -| `ROCM_AITER_MLA` | fp16, bf16 | `auto` | 1 | Any | ❌ | ❌ | ❌ | Decoder | N/A | -| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto` | Any | 576 | ❌ | ❌ | ❌ | Decoder | N/A | -| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | Decoder | N/A | -| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | Any | Any | ❌ | ❌ | ❌ | Decoder | Any | +| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes | Sink | Sparse | MM Prefix | DCP | Attention Types | Compute Cap. 
| +|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------| +| `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x | +| `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x | +| `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x | +| `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x | +| `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x | +| `ROCM_AITER_MLA` | fp16, bf16 | `auto` | 1 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A | +| `ROCM_AITER_MLA_SPARSE` | fp16, bf16 | `auto` | Any | 576 | ❌ | ❌ | ❌ | ❌ | Decoder | N/A | +| `ROCM_AITER_TRITON_MLA` | fp16, bf16 | `auto` | Any | Any | ❌ | ❌ | ❌ | ❌ | Decoder | N/A | +| `TRITON_MLA` | fp16, bf16 | `auto`, `bfloat16` | Any | Any | ❌ | ❌ | ❌ | ✅ | Decoder | Any | diff --git a/tools/pre_commit/generate_attention_backend_docs.py b/tools/pre_commit/generate_attention_backend_docs.py index 3cca4959d..eb68deb1b 100644 --- a/tools/pre_commit/generate_attention_backend_docs.py +++ b/tools/pre_commit/generate_attention_backend_docs.py @@ -17,9 +17,14 @@ import argparse import ast import fnmatch import sys +from collections.abc import Callable from pathlib import Path from typing import Any +# --------------------------------------------------------------------------- +# Constants and file paths +# --------------------------------------------------------------------------- + REPO_ROOT = Path(__file__).parent.parent.parent RELEVANT_PATTERNS = [ @@ -32,6 +37,18 @@ RELEVANT_PATTERNS = [ "docs/design/attention_backends.md", ] +BACKENDS_DIR = REPO_ROOT / "vllm" / "v1" / "attention" / "backends" +REGISTRY_FILE = BACKENDS_DIR / "registry.py" +CUDA_PLATFORM_FILE = REPO_ROOT / "vllm" / 
"platforms" / "cuda.py" +FA_UTILS_FILE = BACKENDS_DIR / "fa_utils.py" +FLASHINFER_UTILS_FILE = REPO_ROOT / "vllm" / "utils" / "flashinfer.py" +MLA_ATTENTION_FILE = ( + REPO_ROOT / "vllm" / "model_executor" / "layers" / "attention" / "mla_attention.py" +) + +# Backends to skip during doc generation +SKIP_BACKENDS = {"CUSTOM", "TORCH_SDPA"} + def is_relevant_file(filepath: str) -> bool: """Check if a file matches any of the relevant patterns.""" @@ -46,351 +63,234 @@ def is_relevant_file(filepath: str) -> bool: return any(fnmatch.fnmatch(path_str, pattern) for pattern in RELEVANT_PATTERNS) -BACKENDS_DIR = REPO_ROOT / "vllm" / "v1" / "attention" / "backends" -REGISTRY_FILE = BACKENDS_DIR / "registry.py" -CUDA_PLATFORM_FILE = REPO_ROOT / "vllm" / "platforms" / "cuda.py" -FA_UTILS_FILE = BACKENDS_DIR / "fa_utils.py" -FLASHINFER_UTILS_FILE = REPO_ROOT / "vllm" / "utils" / "flashinfer.py" -MLA_ATTENTION_FILE = ( - REPO_ROOT / "vllm" / "model_executor" / "layers" / "attention" / "mla_attention.py" -) +# --------------------------------------------------------------------------- +# AST utility helpers +# --------------------------------------------------------------------------- -def parse_registry() -> dict[str, str]: - """Parse the registry.py file to get backend names and their class paths.""" - tree = ast.parse(REGISTRY_FILE.read_text()) +def find_class_in_ast(tree: ast.AST, class_name: str) -> ast.ClassDef | None: + """Find a class definition in an AST.""" for node in ast.walk(tree): - if isinstance(node, ast.ClassDef) and node.name == "AttentionBackendEnum": - return _extract_enum_values(node) - return {} + if isinstance(node, ast.ClassDef) and node.name == class_name: + return node + return None -def _extract_enum_values(node: ast.ClassDef) -> dict[str, str]: - """Extract enum name -> value mapping from a class definition.""" - result: dict[str, str] = {} +def find_method(node: ast.ClassDef, method_name: str) -> ast.FunctionDef | None: + """Find a method in a class 
definition.""" for item in node.body: - if not isinstance(item, ast.Assign): - continue - for target in item.targets: - if not isinstance(target, ast.Name): - continue - if isinstance(item.value, ast.Constant) and item.value.value: - result[target.id] = item.value.value - return result - - -def get_file_from_class_path(class_path: str) -> Path | None: - """Convert a class path to a file path.""" - if not class_path: - return None - module_path = class_path.rsplit(".", 1)[0].replace(".", "/") - py_file = REPO_ROOT / f"{module_path}.py" - return py_file if py_file.exists() else None - - -def parse_flash_attn_features() -> dict[str, dict[str, Any]]: - """Parse fa_utils.py to detect FA2 vs FA3 feature differences. + if isinstance(item, ast.FunctionDef) and item.name == method_name: + return item + return None - Returns a dict with 'fa2' and 'fa3' keys containing their respective - feature overrides for compute capability, KV cache dtypes, and sink support. - """ - if not FA_UTILS_FILE.exists(): - return {} - try: - tree = ast.parse(FA_UTILS_FILE.read_text()) - except Exception: - return {} +def method_returns_true(method: ast.FunctionDef | None) -> bool: + """Check if a method simply returns True.""" + if method is None: + return False + for node in ast.walk(method): + if ( + isinstance(node, ast.Return) + and isinstance(node.value, ast.Constant) + and node.value.value is True + ): + return True + return False - # Analyze the functions to determine FA3-specific features - fa3_supports_fp8 = False - fa3_supports_sinks = False - fa3_compute_cap: str | None = None - for node in ast.walk(tree): - if not isinstance(node, ast.FunctionDef): - continue +def check_method_overrides(node: ast.ClassDef, method_name: str) -> bool: + """Check if a method is overridden and returns True.""" + return method_returns_true(find_method(node, method_name)) - # Check flash_attn_supports_fp8 - looks for `get_flash_attn_version() == 3` - if node.name == "flash_attn_supports_fp8": - for n in 
ast.walk(node): - if ( - isinstance(n, ast.Compare) - and isinstance(n.left, ast.Call) - and isinstance(n.left.func, ast.Name) - and n.left.func.id == "get_flash_attn_version" - ): - fa3_supports_fp8 = True - break - # Check flash_attn_supports_sinks - looks for `get_flash_attn_version() == 3` - if node.name == "flash_attn_supports_sinks": - for n in ast.walk(node): +def _find_bool_class_var(class_node: ast.ClassDef, var_name: str) -> bool | None: + """Find a bool class variable in a class definition. Returns None if not found.""" + for item in class_node.body: + # Check for annotated assignment: attr: bool = True/False + if ( + isinstance(item, ast.AnnAssign) + and isinstance(item.target, ast.Name) + and item.target.id == var_name + and isinstance(item.value, ast.Constant) + and isinstance(item.value.value, bool) + ): + return item.value.value + # Check for plain assignment: attr = True/False + if isinstance(item, ast.Assign): + for target in item.targets: if ( - isinstance(n, ast.Compare) - and isinstance(n.left, ast.Call) - and isinstance(n.left.func, ast.Name) - and n.left.func.id == "get_flash_attn_version" + isinstance(target, ast.Name) + and target.id == var_name + and isinstance(item.value, ast.Constant) + and isinstance(item.value.value, bool) ): - fa3_supports_sinks = True - break - - # Check get_flash_attn_version for FA3 compute capability - # Look for the ternary: 3 if (device_capability.major == 9 ...) 
else 2 - if node.name == "get_flash_attn_version": - for n in ast.walk(node): - # Look for IfExp (ternary) with `device_capability.major == 9` - if isinstance(n, ast.IfExp): - test = n.test - # Check if test is a BoolOp (and) containing the major check - if isinstance(test, ast.BoolOp): - for val in test.values: - if ( - isinstance(val, ast.Compare) - and isinstance(val.left, ast.Attribute) - and val.left.attr == "major" - and val.comparators - and isinstance(val.comparators[0], ast.Constant) - ): - fa3_compute_cap = f"{val.comparators[0].value}.x" - break - - return { - "fa2": { - "supports_fp8": False, - "supports_sink": False, - }, - "fa3": { - "compute_capability": fa3_compute_cap, - "supports_fp8": fa3_supports_fp8, - "supports_sink": fa3_supports_sinks, - }, - } - - -def parse_flashinfer_trtllm_features() -> dict[str, dict[str, Any]]: - """Parse flashinfer.py to detect TRTLLM-specific features. - - FLASHINFER uses TRTLLM attention on SM100 (Blackwell), which has different - capabilities (e.g., sink support) than native FlashInfer on earlier GPUs. 
- """ - if not FLASHINFER_UTILS_FILE.exists(): - return {} - - try: - tree = ast.parse(FLASHINFER_UTILS_FILE.read_text()) - except Exception: - return {} + return item.value.value + return None - trtllm_compute_cap: str | None = None - for node in ast.walk(tree): - if not isinstance(node, ast.FunctionDef): +def _parse_list_class_var(node: ast.ClassDef, var_name: str) -> list[str] | None: + """Parse a list-type class variable, returning None if not found.""" + for item in node.body: + if not isinstance(item, ast.AnnAssign): continue + if not isinstance(item.target, ast.Name): + continue + if item.target.id != var_name: + continue + if not (item.value and isinstance(item.value, ast.List)): + continue + result = [] + for elt in item.value.elts: + if isinstance(elt, ast.Attribute): + result.append(elt.attr) + elif isinstance(elt, ast.Constant): + result.append(str(elt.value)) + return result + return None - # Parse supports_trtllm_attention for compute capability - # Look for: current_platform.is_device_capability_family(100) - if node.name == "supports_trtllm_attention": - for n in ast.walk(node): - if ( - isinstance(n, ast.Call) - and isinstance(n.func, ast.Attribute) - and n.func.attr == "is_device_capability_family" - and n.args - and isinstance(n.args[0], ast.Constant) - and isinstance(n.args[0].value, int) - ): - cap = n.args[0].value - # Convert 100 -> "10.x" - trtllm_compute_cap = f"{cap // 10}.x" - break - - if not trtllm_compute_cap: - return {} - - return { - "native": { - # Native FlashInfer: everything except SM100 - "supports_sink": False, - }, - "trtllm": { - # TRTLLM pathway on Blackwell - "compute_capability": trtllm_compute_cap, - "supports_sink": True, - }, - } +def _parse_return_list( + method: ast.FunctionDef | None, handle_multiple_of: bool = False +) -> list[str]: + """Extract list items from a method's return statement.""" + if method is None: + return [] + for stmt in ast.walk(method): + if not isinstance(stmt, ast.Return): + continue + if not 
isinstance(stmt.value, ast.List): + continue + sizes = [] + for elt in stmt.value.elts: + if isinstance(elt, ast.Constant): + sizes.append(str(elt.value)) + elif ( + handle_multiple_of + and isinstance(elt, ast.Call) + and isinstance(elt.func, ast.Name) + and elt.func.id == "MultipleOf" + and elt.args + and isinstance(elt.args[0], ast.Constant) + ): + sizes.append(f"%{elt.args[0].value}") + if sizes: + return sizes + return [] -def parse_mla_prefill_backends() -> list[dict[str, Any]]: - """Parse MLA prefill backend options from mla_attention.py. - MLA uses different backends for prefill vs decode. The decode backends are - registered in the registry, but prefill backends are selected at runtime - based on conditions in MLACommonImpl.__init__. +def _get_parent_class_name(class_node: ast.ClassDef) -> str | None: + """Get the first parent class name (simple name only). - Returns a list of prefill backend info dicts with their requirements. + Handles both simple inheritance (class Foo(Bar)) and generic + inheritance (class Foo(Bar[T])). """ - if not MLA_ATTENTION_FILE.exists(): - return [] + if not class_node.bases: + return None + base = class_node.bases[0] + if isinstance(base, ast.Name): + return base.id + if isinstance(base, ast.Subscript) and isinstance(base.value, ast.Name): + return base.value.id + return None - try: - tree = ast.parse(MLA_ATTENTION_FILE.read_text()) - except Exception: - return [] - # Find compute capability requirements by parsing use_* functions - flashinfer_cc: str | None = None - cudnn_cc: str | None = None - trtllm_cc: str | None = None +def _resolve_import_to_file( + tree: ast.AST, class_name: str, source_file: Path | None = None +) -> Path | None: + """Try to resolve a class name to its source file via imports in the AST. + Handles both absolute imports (from vllm.foo import Bar) and relative + imports (from .foo import Bar) when source_file is provided. 
+ """ for node in ast.walk(tree): - if not isinstance(node, ast.FunctionDef): + if not isinstance(node, ast.ImportFrom): continue + for alias in node.names: + actual_name = alias.asname or alias.name + if actual_name != class_name: + continue + if not node.module: + continue - # Parse use_flashinfer_prefill for compute capability (SM100) - if node.name == "use_flashinfer_prefill": - for n in ast.walk(node): - if ( - isinstance(n, ast.Call) - and isinstance(n.func, ast.Attribute) - and n.func.attr == "is_device_capability_family" - and n.args - and isinstance(n.args[0], ast.Constant) - and isinstance(n.args[0].value, int) - ): - flashinfer_cc = f"{n.args[0].value // 10}.x" - - # Parse use_cudnn_prefill for compute capability (SM100) - if node.name == "use_cudnn_prefill": - for n in ast.walk(node): - if ( - isinstance(n, ast.Call) - and isinstance(n.func, ast.Attribute) - and n.func.attr == "is_device_capability_family" - and n.args - and isinstance(n.args[0], ast.Constant) - and isinstance(n.args[0].value, int) - ): - cudnn_cc = f"{n.args[0].value // 10}.x" - - # Parse use_trtllm_ragged_deepseek_prefill for compute capability - if node.name == "use_trtllm_ragged_deepseek_prefill": - for n in ast.walk(node): - if ( - isinstance(n, ast.Call) - and isinstance(n.func, ast.Attribute) - and n.func.attr == "is_device_capability_family" - and n.args - and isinstance(n.args[0], ast.Constant) - and isinstance(n.args[0].value, int) - ): - trtllm_cc = f"{n.args[0].value // 10}.x" - - # Build prefill backend list based on what we found - # Order matches the priority in MLACommonImpl.__init__ - prefill_backends: list[dict[str, Any]] = [] - - # TRT-LLM Ragged (highest priority if available) - if trtllm_cc: - prefill_backends.append( - { - "name": "TRT-LLM Ragged‡", - "description": "TensorRT-LLM ragged attention", - "compute_capability": trtllm_cc, - "enable": "Default on SM100", - "disable": "`-ac.use_trtllm_ragged_deepseek_prefill=0`", - "notes": "DeepSeek R1 dims only", - } - ) 
- - # FlashInfer prefill - if flashinfer_cc: - prefill_backends.append( - { - "name": "FlashInfer", - "description": "FlashInfer CUTLASS backend", - "compute_capability": flashinfer_cc, - "enable": "`-ac.disable_flashinfer_prefill=0`", - "disable": "`-ac.disable_flashinfer_prefill=1`", - "notes": "DeepSeek R1 dims only", - } - ) - - # cuDNN prefill - if cudnn_cc: - prefill_backends.append( - { - "name": "cuDNN", - "description": "cuDNN-based attention", - "compute_capability": cudnn_cc, - "enable": "`-ac.use_cudnn_prefill=1`", - "disable": "`-ac.use_cudnn_prefill=0`", - "notes": "", - } - ) + if node.level and node.level > 0 and source_file: + # Relative import: resolve from the source file's directory + base_dir = source_file.parent + for _ in range(node.level - 1): + base_dir = base_dir.parent + module_path = node.module.replace(".", "/") + py_file = base_dir / f"{module_path}.py" + else: + # Absolute import + module_path = node.module.replace(".", "/") + py_file = REPO_ROOT / f"{module_path}.py" - # FlashAttention is always available as fallback - prefill_backends.append( - { - "name": "FlashAttention", - "description": "FlashAttention varlen (FA2/FA3)", - "compute_capability": "Any", - "enable": "Default fallback", - "disable": "Use other backends", - "notes": "FA3 on SM90, FA2 otherwise", - } - ) + if py_file.exists(): + return py_file + return None - return prefill_backends +def _find_cc_in_function(tree: ast.AST, func_name: str) -> str | None: + """Find a compute capability from is_device_capability_family() calls in a function. -def find_class_in_ast(tree: ast.AST, class_name: str) -> ast.ClassDef | None: - """Find a class definition in an AST.""" + Looks for the pattern: current_platform.is_device_capability_family(N) + and converts N (e.g. 100) to a CC string (e.g. "10.x"). 
+ """ for node in ast.walk(tree): - if isinstance(node, ast.ClassDef) and node.name == class_name: - return node + if not isinstance(node, ast.FunctionDef) or node.name != func_name: + continue + for n in ast.walk(node): + if ( + isinstance(n, ast.Call) + and isinstance(n.func, ast.Attribute) + and n.func.attr == "is_device_capability_family" + and n.args + and isinstance(n.args[0], ast.Constant) + and isinstance(n.args[0].value, int) + ): + return f"{n.args[0].value // 10}.x" return None -def find_method(node: ast.ClassDef, method_name: str) -> ast.FunctionDef | None: - """Find a method in a class definition.""" - for item in node.body: - if isinstance(item, ast.FunctionDef) and item.name == method_name: - return item - return None +# --------------------------------------------------------------------------- +# Registry and file resolution +# --------------------------------------------------------------------------- -def method_returns_true(method: ast.FunctionDef | None) -> bool: - """Check if a method simply returns True.""" - if method is None: - return False - for node in ast.walk(method): - if not isinstance(node, ast.Return): - continue - if isinstance(node.value, ast.Constant) and node.value.value is True: - return True - return False +def parse_registry() -> dict[str, str]: + """Parse the registry.py file to get backend names and their class paths.""" + tree = ast.parse(REGISTRY_FILE.read_text()) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef) and node.name == "AttentionBackendEnum": + return _extract_enum_values(node) + return {} -def _parse_list_class_var(node: ast.ClassDef, var_name: str) -> list[str] | None: - """Parse a list-type class variable, returning None if not found.""" +def _extract_enum_values(node: ast.ClassDef) -> dict[str, str]: + """Extract enum name -> value mapping from a class definition.""" + result: dict[str, str] = {} for item in node.body: - if not isinstance(item, ast.AnnAssign): - continue - if not 
isinstance(item.target, ast.Name): - continue - if item.target.id != var_name: - continue - if not (item.value and isinstance(item.value, ast.List)): + if not isinstance(item, ast.Assign): continue - result = [] - for elt in item.value.elts: - if isinstance(elt, ast.Attribute): - result.append(elt.attr) - elif isinstance(elt, ast.Constant): - result.append(str(elt.value)) - return result - return None + for target in item.targets: + if not isinstance(target, ast.Name): + continue + if isinstance(item.value, ast.Constant) and item.value.value: + result[target.id] = item.value.value + return result + + +def get_file_from_class_path(class_path: str) -> Path | None: + """Convert a class path to a file path.""" + if not class_path: + return None + module_path = class_path.rsplit(".", 1)[0].replace(".", "/") + py_file = REPO_ROOT / f"{module_path}.py" + return py_file if py_file.exists() else None + + +# --------------------------------------------------------------------------- +# Backend feature extraction from AST +# --------------------------------------------------------------------------- def parse_supported_dtypes(node: ast.ClassDef) -> str: @@ -432,35 +332,6 @@ def parse_kv_cache_dtypes(node: ast.ClassDef) -> str: return "auto" -def _parse_return_list( - method: ast.FunctionDef | None, handle_multiple_of: bool = False -) -> list[str]: - """Extract list items from a method's return statement.""" - if method is None: - return [] - for stmt in ast.walk(method): - if not isinstance(stmt, ast.Return): - continue - if not isinstance(stmt.value, ast.List): - continue - sizes = [] - for elt in stmt.value.elts: - if isinstance(elt, ast.Constant): - sizes.append(str(elt.value)) - elif ( - handle_multiple_of - and isinstance(elt, ast.Call) - and isinstance(elt.func, ast.Name) - and elt.func.id == "MultipleOf" - and elt.args - and isinstance(elt.args[0], ast.Constant) - ): - sizes.append(f"%{elt.args[0].value}") - if sizes: - return sizes - return [] - - def 
parse_block_sizes(node: ast.ClassDef) -> str: """Parse get_supported_kernel_block_sizes method.""" method = find_method(node, "get_supported_kernel_block_sizes") @@ -536,202 +407,444 @@ def parse_compute_capability(node: ast.ClassDef) -> str: return f"{min_cap[0]}.x-{max_cap[0]}.x" return f"≥{min_cap[0]}.{min_cap[1]}" - return "Any" + return "Any" + + +def parse_attention_types(node: ast.ClassDef) -> str: + """Parse supports_attn_type method.""" + method = find_method(node, "supports_attn_type") + if method is None: + return "Decoder" + + type_map = { + "DECODER": "Decoder", + "ENCODER": "Encoder", + "ENCODER_ONLY": "Encoder Only", + "ENCODER_DECODER": "Enc-Dec", + } + types: set[str] = set() + + for n in ast.walk(method): + # Handle `attn_type in (AttentionType.DECODER, ...)` + if not ( + isinstance(n, ast.Compare) + and len(n.ops) == 1 + and isinstance(n.ops[0], ast.In) + and len(n.comparators) == 1 + and isinstance(n.comparators[0], ast.Tuple | ast.Set) + ): + continue + + for elt in n.comparators[0].elts: + if isinstance(elt, ast.Attribute) and elt.attr in type_map: + types.add(type_map[elt.attr]) + + if not types: + return "Decoder" + return "All" if len(types) >= 3 else ", ".join(sorted(types)) + + +def parse_impl_bool_attr( + tree: ast.AST, + class_name: str, + attr_name: str, + default: bool = False, + source_file: Path | None = None, + _visited: set[str] | None = None, +) -> bool: + """Parse a boolean class attribute from an impl class, following inheritance. + + Walks up the inheritance chain within the same file and across files + (by resolving imports) to find the attribute value. 
+ """ + if _visited is None: + _visited = set() + if class_name in _visited: + return default + _visited.add(class_name) + + class_node = find_class_in_ast(tree, class_name) + if class_node is None: + return default + + # Check directly on this class + value = _find_bool_class_var(class_node, attr_name) + if value is not None: + return value + + # Check parent class + parent_name = _get_parent_class_name(class_node) + if parent_name: + # Try parent in same file first + parent_node = find_class_in_ast(tree, parent_name) + if parent_node is not None: + return parse_impl_bool_attr( + tree, parent_name, attr_name, default, source_file, _visited + ) + + # Try resolving cross-file import + parent_file = _resolve_import_to_file(tree, parent_name, source_file) + if parent_file: + try: + parent_tree = ast.parse(parent_file.read_text()) + return parse_impl_bool_attr( + parent_tree, + parent_name, + attr_name, + default, + parent_file, + _visited, + ) + except Exception: + pass + + return default + + +def analyze_backend(backend_name: str, class_path: str) -> dict[str, Any] | None: + """Analyze a backend class and extract feature information.""" + file_path = get_file_from_class_path(class_path) + if file_path is None: + return None + + try: + tree = ast.parse(file_path.read_text()) + except Exception as e: + print(f" Warning: Could not parse {file_path}: {e}", file=sys.stderr) + return None + + class_name = class_path.rsplit(".", 1)[1] + class_node = find_class_in_ast(tree, class_name) + if class_node is None: + return None + + # Check if this is an MLA backend by parent class or naming + parent = _get_parent_class_name(class_node) + mla_parents = {"MLACommonBackend", "FlashMLABackend", "FlashMLASparseBackend"} + is_mla_backend = ( + parent in mla_parents + or ".mla." 
in class_path.lower() + or "_mla" in backend_name.lower() + ) + + # Determine compute capability - use N/A for non-CUDA backends + is_non_cuda = backend_name.startswith(("CPU_", "ROCM_")) + compute_cap = "N/A" if is_non_cuda else parse_compute_capability(class_node) + + # Parse impl class features (DCP support) + impl_method = find_method(class_node, "get_impl_cls") + impl_class_name = None + if impl_method: + for stmt in ast.walk(impl_method): + if isinstance(stmt, ast.Return) and isinstance(stmt.value, ast.Name): + impl_class_name = stmt.value.id + break + + supports_dcp = False + if impl_class_name: + supports_dcp = parse_impl_bool_attr( + tree, impl_class_name, "can_return_lse_for_decode", False, file_path + ) + + return { + "name": backend_name, + "dtypes": parse_supported_dtypes(class_node), + "kv_cache_dtypes": parse_kv_cache_dtypes(class_node), + "block_sizes": parse_block_sizes(class_node), + "head_sizes": parse_head_sizes(class_node), + "attn_types": parse_attention_types(class_node), + "compute_capability": compute_cap, + "is_mla": is_mla_backend or check_method_overrides(class_node, "is_mla"), + "supports_sink": check_method_overrides(class_node, "supports_sink"), + "is_sparse": check_method_overrides(class_node, "is_sparse"), + "supports_mm_prefix": check_method_overrides(class_node, "supports_mm_prefix"), + "supports_dcp": supports_dcp, + } + + +# --------------------------------------------------------------------------- +# Special backend variant parsers (FA2/FA3, FlashInfer TRTLLM, MLA prefill) +# --------------------------------------------------------------------------- + + +def parse_flash_attn_features() -> dict[str, dict[str, Any]]: + """Parse fa_utils.py to detect FA2 vs FA3 feature differences. + + Returns a dict with 'fa2' and 'fa3' keys containing their respective + feature overrides for compute capability, KV cache dtypes, and sink support. 
+ """ + if not FA_UTILS_FILE.exists(): + return {} + + try: + tree = ast.parse(FA_UTILS_FILE.read_text()) + except Exception: + return {} + + # Analyze the functions to determine FA3-specific features + fa3_supports_fp8 = False + fa3_supports_sinks = False + fa3_compute_cap: str | None = None + + for node in ast.walk(tree): + if not isinstance(node, ast.FunctionDef): + continue + + # Check flash_attn_supports_fp8 - looks for `get_flash_attn_version() == 3` + if node.name == "flash_attn_supports_fp8": + for n in ast.walk(node): + if ( + isinstance(n, ast.Compare) + and isinstance(n.left, ast.Call) + and isinstance(n.left.func, ast.Name) + and n.left.func.id == "get_flash_attn_version" + ): + fa3_supports_fp8 = True + break + + # Check flash_attn_supports_sinks - looks for `get_flash_attn_version() == 3` + if node.name == "flash_attn_supports_sinks": + for n in ast.walk(node): + if ( + isinstance(n, ast.Compare) + and isinstance(n.left, ast.Call) + and isinstance(n.left.func, ast.Name) + and n.left.func.id == "get_flash_attn_version" + ): + fa3_supports_sinks = True + break + + # Check get_flash_attn_version for FA3 compute capability + # Look for the ternary: 3 if (device_capability.major == 9 ...) 
else 2 + if node.name == "get_flash_attn_version": + for n in ast.walk(node): + # Look for IfExp (ternary) with `device_capability.major == 9` + if isinstance(n, ast.IfExp): + test = n.test + # Check if test is a BoolOp (and) containing the major check + if isinstance(test, ast.BoolOp): + for val in test.values: + if ( + isinstance(val, ast.Compare) + and isinstance(val.left, ast.Attribute) + and val.left.attr == "major" + and val.comparators + and isinstance(val.comparators[0], ast.Constant) + ): + fa3_compute_cap = f"{val.comparators[0].value}.x" + break + + return { + "fa2": { + "supports_fp8": False, + "supports_sink": False, + }, + "fa3": { + "compute_capability": fa3_compute_cap, + "supports_fp8": fa3_supports_fp8, + "supports_sink": fa3_supports_sinks, + }, + } + + +def parse_flashinfer_trtllm_features() -> dict[str, dict[str, Any]]: + """Parse flashinfer.py to detect TRTLLM-specific features. + + FLASHINFER uses TRTLLM attention on SM100 (Blackwell), which has different + capabilities (e.g., sink support) than native FlashInfer on earlier GPUs. 
+ """ + if not FLASHINFER_UTILS_FILE.exists(): + return {} + + try: + tree = ast.parse(FLASHINFER_UTILS_FILE.read_text()) + except Exception: + return {} + trtllm_compute_cap = _find_cc_in_function(tree, "supports_trtllm_attention") -def parse_attention_types(node: ast.ClassDef) -> str: - """Parse supports_attn_type method.""" - method = find_method(node, "supports_attn_type") - if method is None: - return "Decoder" + if not trtllm_compute_cap: + return {} - type_map = { - "DECODER": "Decoder", - "ENCODER": "Encoder", - "ENCODER_ONLY": "Encoder Only", - "ENCODER_DECODER": "Enc-Dec", + return { + "native": { + # Native FlashInfer: everything except SM100 + "supports_sink": False, + }, + "trtllm": { + # TRTLLM pathway on Blackwell + "compute_capability": trtllm_compute_cap, + "supports_sink": True, + }, } - types: set[str] = set() - for n in ast.walk(method): - # Handle `attn_type in (AttentionType.DECODER, ...)` - if not ( - isinstance(n, ast.Compare) - and len(n.ops) == 1 - and isinstance(n.ops[0], ast.In) - and len(n.comparators) == 1 - and isinstance(n.comparators[0], ast.Tuple | ast.Set) - ): - continue - for elt in n.comparators[0].elts: - if isinstance(elt, ast.Attribute) and elt.attr in type_map: - types.add(type_map[elt.attr]) +def parse_mla_prefill_backends() -> list[dict[str, Any]]: + """Parse MLA prefill backend options from mla_attention.py. - if not types: - return "Decoder" - return "All" if len(types) >= 3 else ", ".join(sorted(types)) + MLA uses different backends for prefill vs decode. The decode backends are + registered in the registry, but prefill backends are selected at runtime + based on conditions in MLACommonImpl.__init__. + Returns a list of prefill backend info dicts with their requirements. 
+ """ + if not MLA_ATTENTION_FILE.exists(): + return [] -def check_method_overrides(node: ast.ClassDef, method_name: str) -> bool: - """Check if a method is overridden and returns True.""" - method = find_method(node, method_name) - return method_returns_true(method) + try: + tree = ast.parse(MLA_ATTENTION_FILE.read_text()) + except Exception: + return [] + # Find compute capability requirements by parsing use_* functions + trtllm_cc = _find_cc_in_function(tree, "use_trtllm_ragged_deepseek_prefill") + flashinfer_cc = _find_cc_in_function(tree, "use_flashinfer_prefill") + cudnn_cc = _find_cc_in_function(tree, "use_cudnn_prefill") -def analyze_backend(backend_name: str, class_path: str) -> dict[str, Any] | None: - """Analyze a backend class and extract feature information.""" - file_path = get_file_from_class_path(class_path) - if file_path is None: - return None + # Build prefill backend list based on what we found + # Order matches the priority in MLACommonImpl.__init__ + prefill_backends: list[dict[str, Any]] = [] - try: - tree = ast.parse(file_path.read_text()) - except Exception as e: - print(f" Warning: Could not parse {file_path}: {e}", file=sys.stderr) - return None + # TRT-LLM Ragged (highest priority if available) + if trtllm_cc: + prefill_backends.append( + { + "name": "TRT-LLM Ragged‡", + "description": "TensorRT-LLM ragged attention", + "compute_capability": trtllm_cc, + "enable": "Default on SM100", + "disable": "`-ac.use_trtllm_ragged_deepseek_prefill=0`", + "notes": "DeepSeek R1 dims only", + } + ) - class_name = class_path.rsplit(".", 1)[1] - class_node = find_class_in_ast(tree, class_name) - if class_node is None: - return None + # FlashInfer prefill + if flashinfer_cc: + prefill_backends.append( + { + "name": "FlashInfer", + "description": "FlashInfer CUTLASS backend", + "compute_capability": flashinfer_cc, + "enable": "`-ac.disable_flashinfer_prefill=0`", + "disable": "`-ac.disable_flashinfer_prefill=1`", + "notes": "DeepSeek R1 dims only", + } + 
) - # Check if this is an MLA backend by parent class or naming - parent = None - if class_node.bases: - base = class_node.bases[0] - parent = base.id if isinstance(base, ast.Name) else None - mla_parents = {"MLACommonBackend", "FlashMLABackend", "FlashMLASparseBackend"} - is_mla_backend = ( - parent in mla_parents - or ".mla." in class_path.lower() - or "_mla" in backend_name.lower() + # cuDNN prefill + if cudnn_cc: + prefill_backends.append( + { + "name": "cuDNN", + "description": "cuDNN-based attention", + "compute_capability": cudnn_cc, + "enable": "`-ac.use_cudnn_prefill=1`", + "disable": "`-ac.use_cudnn_prefill=0`", + "notes": "", + } + ) + + # FlashAttention is always available as fallback + prefill_backends.append( + { + "name": "FlashAttention", + "description": "FlashAttention varlen (FA2/FA3)", + "compute_capability": "Any", + "enable": "Default fallback", + "disable": "Use other backends", + "notes": "FA3 on SM90, FA2 otherwise", + } ) - # Determine compute capability - use N/A for non-CUDA backends - is_non_cuda = backend_name.startswith(("CPU_", "ROCM_")) - compute_cap = "N/A" if is_non_cuda else parse_compute_capability(class_node) + return prefill_backends - return { - "name": backend_name, - "dtypes": parse_supported_dtypes(class_node), - "kv_cache_dtypes": parse_kv_cache_dtypes(class_node), - "block_sizes": parse_block_sizes(class_node), - "head_sizes": parse_head_sizes(class_node), - "attn_types": parse_attention_types(class_node), - "compute_capability": compute_cap, - "is_mla": is_mla_backend or check_method_overrides(class_node, "is_mla"), - "supports_sink": check_method_overrides(class_node, "supports_sink"), - "is_sparse": check_method_overrides(class_node, "is_sparse"), - "supports_mm_prefix": check_method_overrides(class_node, "supports_mm_prefix"), - } +# --------------------------------------------------------------------------- +# Backend variant expansion (FA2/FA3, FlashInfer native/TRTLLM) +# 
--------------------------------------------------------------------------- -def add_literal_quotes(value: str) -> str: - """Add literal backticks around all comma-separated items in a string.""" - items = [item.strip() for item in value.split(",")] - quoted_items = [f"`{item}`" for item in items] - return ", ".join(quoted_items) +def _expand_flash_attn_variants( + all_backends: list[dict[str, Any]], + fa_features: dict[str, dict[str, Any]], +) -> list[dict[str, Any]]: + """Expand FLASH_ATTN into FA2 and FA3 variants with different capabilities.""" + expanded = [] + for backend in all_backends: + if backend["name"] != "FLASH_ATTN": + backend.setdefault("_sort_key", backend["name"]) + backend.setdefault("_sort_order", 0) + backend.setdefault("version", "") + expanded.append(backend) + continue -def bool_to_emoji(value: bool) -> str: - """Convert a boolean to a checkmark or X emoji.""" - return "✅" if value else "❌" + # Create FA2 entry (keeps base backend's compute_capability) + fa2 = backend.copy() + fa2["version"] = "FA2*" + fa2["_sort_key"] = "FLASH_ATTN" + fa2["_sort_order"] = 0 + fa2["supports_sink"] = fa_features["fa2"]["supports_sink"] + + # Create FA3 entry (uses parsed compute_capability from fa_utils) + fa3 = backend.copy() + fa3["version"] = "FA3*" + fa3["_sort_key"] = "FLASH_ATTN" + fa3["_sort_order"] = 1 + if fa_features["fa3"]["compute_capability"]: + fa3["compute_capability"] = fa_features["fa3"]["compute_capability"] + fa3["supports_sink"] = fa_features["fa3"]["supports_sink"] + if fa_features["fa3"]["supports_fp8"]: + base_dtypes = backend["kv_cache_dtypes"].split(", ") + fp8_dtypes = ["fp8", "fp8_e4m3", "fp8_e5m2"] + new_dtypes = [d for d in fp8_dtypes if d not in base_dtypes] + fa3["kv_cache_dtypes"] = ", ".join(base_dtypes + new_dtypes) + + expanded.append(fa2) + expanded.append(fa3) + return expanded + + +def _expand_flashinfer_variants( + all_backends: list[dict[str, Any]], + fi_features: dict[str, dict[str, Any]], +) -> list[dict[str, Any]]: + 
"""Expand FLASHINFER into native and TRTLLM variants.""" + expanded = [] + for backend in all_backends: + if backend["name"] != "FLASHINFER": + expanded.append(backend) + continue + # Parse original compute capability to get min CC + orig_cap = backend["compute_capability"] + parts = orig_cap.replace(".x", "").split("-") + min_cc = parts[0] if parts else "7" + trtllm_cc = fi_features["trtllm"]["compute_capability"] -def generate_markdown_table( - backends: list[dict[str, Any]], title: str, is_mla_table: bool = False -) -> str: - """Generate a markdown table from backend info. + # Create native entry (pre-Blackwell GPUs) + native = backend.copy() + native["version"] = "Native†" + native["_sort_key"] = "FLASHINFER" + native["_sort_order"] = 0 + native["supports_sink"] = fi_features["native"]["supports_sink"] + native["compute_capability"] = f"{min_cc}.x-9.x" - Args: - backends: List of backend info dictionaries. - title: Table title. - is_mla_table: If True, include MLA and Sparse columns (for MLA table). - If False, exclude them (for standard attention table). - """ - if not backends: - return f"## {title}\n\nNo backends found.\n" + # Create TRTLLM entry + trtllm = backend.copy() + trtllm["version"] = "TRTLLM†" + trtllm["_sort_key"] = "FLASHINFER" + trtllm["_sort_order"] = 1 + trtllm["compute_capability"] = trtllm_cc + trtllm["supports_sink"] = fi_features["trtllm"]["supports_sink"] - # Check if any backend has a version (for FA2/FA3 split) - has_versions = any(b.get("version") for b in backends) + expanded.append(native) + expanded.append(trtllm) + return expanded - if is_mla_table: - header = ( - "| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes " - "| Sink | Sparse | MM Prefix | Attention Types | Compute Cap. 
|" - ) - separator = ( - "|---------|--------|-----------|-------------|------------" - "|------|--------|-----------|-----------------|--------------|" - ) - elif has_versions: - header = ( - "| Backend | Version | Dtypes | KV Dtypes | Block Sizes " - "| Head Sizes | Sink | MM Prefix | Attention Types | Compute Cap. |" - ) - separator = ( - "|---------|---------|--------|-----------|-------------" - "|------------|------|-----------|-----------------|--------------|" - ) - else: - header = ( - "| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes " - "| Sink | MM Prefix | Attention Types | Compute Cap. |" - ) - separator = ( - "|---------|--------|-----------|-------------|------------" - "|------|-----------|-----------------|--------------|" - ) - lines = [f"## {title}", "", header, separator] - - def sort_key(x: dict[str, Any]) -> tuple[str, int]: - """Sort key that keeps parent/child rows together in order.""" - return (x.get("_sort_key", x["name"]), x.get("_sort_order", 0)) - - for info in sorted(backends, key=sort_key): - if is_mla_table: - row = "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format( - info["name"], - info["dtypes"], - add_literal_quotes(info["kv_cache_dtypes"]), - info["block_sizes"], - info["head_sizes"], - bool_to_emoji(info["supports_sink"]), - bool_to_emoji(info["is_sparse"]), - bool_to_emoji(info["supports_mm_prefix"]), - info["attn_types"], - info["compute_capability"], - ) - elif has_versions: - row = "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format( - info["name"], - info.get("version", ""), - info["dtypes"], - add_literal_quotes(info["kv_cache_dtypes"]), - info["block_sizes"], - info["head_sizes"], - bool_to_emoji(info["supports_sink"]), - bool_to_emoji(info["supports_mm_prefix"]), - info["attn_types"], - info["compute_capability"], - ) - else: - row = "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} |".format( - info["name"], - info["dtypes"], - add_literal_quotes(info["kv_cache_dtypes"]), - 
info["block_sizes"], - info["head_sizes"], - bool_to_emoji(info["supports_sink"]), - bool_to_emoji(info["supports_mm_prefix"]), - info["attn_types"], - info["compute_capability"], - ) - lines.append(row) - lines.append("") - return "\n".join(lines) +# --------------------------------------------------------------------------- +# CUDA priority list parsing +# --------------------------------------------------------------------------- def parse_cuda_priority_lists() -> dict[str, list[str]]: @@ -827,6 +940,105 @@ def _extract_priorities(body: list, priorities: dict[str, list[str]], prefix: st priorities[f"{prefix}_default"] = backends +# --------------------------------------------------------------------------- +# Data-driven table rendering +# +# Each column is a (header, formatter) pair. The formatter takes a backend +# info dict and returns the cell string. Tables are assembled by selecting +# which columns to include, then calling _render_table(). +# --------------------------------------------------------------------------- + +# Column type alias for readability +TableColumn = tuple[str, Callable[[dict[str, Any]], str]] + +# Shared column definitions -- order here matches the output table order +_COL_BACKEND: TableColumn = ("Backend", lambda b: f"`{b['name']}`") +_COL_VERSION: TableColumn = ("Version", lambda b: b.get("version", "")) +_COL_DTYPES: TableColumn = ("Dtypes", lambda b: b["dtypes"]) +_COL_KV_DTYPES: TableColumn = ( + "KV Dtypes", + lambda b: add_literal_quotes(b["kv_cache_dtypes"]), +) +_COL_BLOCK_SIZES: TableColumn = ("Block Sizes", lambda b: b["block_sizes"]) +_COL_HEAD_SIZES: TableColumn = ("Head Sizes", lambda b: b["head_sizes"]) +_COL_SINK: TableColumn = ("Sink", lambda b: bool_to_emoji(b["supports_sink"])) +_COL_SPARSE: TableColumn = ("Sparse", lambda b: bool_to_emoji(b["is_sparse"])) +_COL_MM_PREFIX: TableColumn = ( + "MM Prefix", + lambda b: bool_to_emoji(b["supports_mm_prefix"]), +) +_COL_DCP: TableColumn = ("DCP", lambda b: 
bool_to_emoji(b["supports_dcp"])) +_COL_ATTN_TYPES: TableColumn = ("Attention Types", lambda b: b["attn_types"]) +_COL_COMPUTE_CAP: TableColumn = ("Compute Cap.", lambda b: b["compute_capability"]) + + +def add_literal_quotes(value: str) -> str: + """Add literal backticks around all comma-separated items in a string.""" + items = [item.strip() for item in value.split(",")] + return ", ".join(f"`{item}`" for item in items) + + +def bool_to_emoji(value: bool) -> str: + """Convert a boolean to a checkmark or X emoji.""" + return "✅" if value else "❌" + + +def _build_columns(is_mla: bool, has_versions: bool) -> list[TableColumn]: + """Build the column list for a backend feature table. + + The column selection depends on whether it's an MLA table (includes + Sparse column) and whether any backend has version variants (includes + Version column). + """ + cols: list[TableColumn] = [_COL_BACKEND] + if has_versions: + cols.append(_COL_VERSION) + cols.extend([_COL_DTYPES, _COL_KV_DTYPES, _COL_BLOCK_SIZES, _COL_HEAD_SIZES]) + cols.append(_COL_SINK) + if is_mla: + cols.append(_COL_SPARSE) + cols.extend([_COL_MM_PREFIX, _COL_DCP, _COL_ATTN_TYPES, _COL_COMPUTE_CAP]) + return cols + + +def _sort_key(x: dict[str, Any]) -> tuple[str, int]: + """Sort key that keeps parent/child rows together in order.""" + return (x.get("_sort_key", x["name"]), x.get("_sort_order", 0)) + + +def _render_table( + columns: list[TableColumn], + backends: list[dict[str, Any]], +) -> list[str]: + """Render a markdown table from column specs and backend data.""" + header = "| " + " | ".join(name for name, _ in columns) + " |" + sep = "|" + "|".join("-" * (len(name) + 2) for name, _ in columns) + "|" + lines = [header, sep] + for info in sorted(backends, key=_sort_key): + row = "| " + " | ".join(fmt(info) for _, fmt in columns) + " |" + lines.append(row) + return lines + + +def generate_markdown_table( + backends: list[dict[str, Any]], title: str, is_mla_table: bool = False +) -> str: + """Generate a titled 
markdown table from backend info.""" + if not backends: + return f"## {title}\n\nNo backends found.\n" + has_versions = any(b.get("version") for b in backends) + columns = _build_columns(is_mla_table, has_versions) + lines = [f"## {title}", ""] + lines.extend(_render_table(columns, backends)) + lines.append("") + return "\n".join(lines) + + +# --------------------------------------------------------------------------- +# Markdown section generators (usage, priority, legend, MLA) +# --------------------------------------------------------------------------- + + def generate_usage_section() -> str: """Generate the usage documentation section.""" return """## Setting the Attention Backend @@ -959,6 +1171,27 @@ def generate_priority_section(priorities: dict[str, list[str]]) -> str: return "\n".join(lines) +def generate_legend() -> str: + """Generate a legend explaining the table columns.""" + return """## Legend + +| Column | Description | +|--------|-------------| +| **Dtypes** | Supported model data types (fp16, bf16, fp32) | +| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) 
| +| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) | +| **Head Sizes** | Supported attention head sizes | +| **Sink** | Attention sink support (for StreamingLLM) | +| **Sparse** | Sparse attention support (MLA only) | +| **MM Prefix** | Multimodal prefix full attention support | +| **DCP** | Decode Context Parallelism support (`--decode-context-parallel-size`) | +| **Attention Types** | Supported attention patterns (Decoder, Encoder, Enc-Dec) | +| **Compute Cap.** | Required CUDA compute capability (N/A for non-CUDA backends) | + +**Symbols:** ✅ = Supported, ❌ = Not supported +""" + + def generate_mla_section( prefill_backends: list[dict[str, Any]], decode_backends: list[dict[str, Any]] ) -> str: @@ -999,57 +1232,17 @@ def generate_mla_section( ] ) - # Generate decode backends table - header = ( - "| Backend | Dtypes | KV Dtypes | Block Sizes | Head Sizes " - "| Sink | Sparse | MM Prefix | Attention Types | Compute Cap. |" - ) - separator = ( - "|---------|--------|-----------|-------------|------------" - "|------|--------|-----------|-----------------|--------------|" - ) - lines.extend([header, separator]) - - def sort_key(x: dict[str, Any]) -> tuple[str, int]: - return (x.get("_sort_key", x["name"]), x.get("_sort_order", 0)) - - for info in sorted(decode_backends, key=sort_key): - row = "| `{}` | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format( - info["name"], - info["dtypes"], - add_literal_quotes(info["kv_cache_dtypes"]), - info["block_sizes"], - info["head_sizes"], - bool_to_emoji(info["supports_sink"]), - bool_to_emoji(info["is_sparse"]), - bool_to_emoji(info["supports_mm_prefix"]), - info["attn_types"], - info["compute_capability"], - ) - lines.append(row) + # Reuse data-driven table rendering for decode backends + columns = _build_columns(is_mla=True, has_versions=False) + lines.extend(_render_table(columns, decode_backends)) lines.append("") return "\n".join(lines) -def generate_legend() -> str: - """Generate a legend 
explaining the table columns.""" - return """## Legend - -| Column | Description | -|--------|-------------| -| **Dtypes** | Supported model data types (fp16, bf16, fp32) | -| **KV Dtypes** | Supported KV cache data types (`auto`, `fp8`, `fp8_e4m3`, etc.) | -| **Block Sizes** | Supported KV cache block sizes (%N means multiples of N) | -| **Head Sizes** | Supported attention head sizes | -| **Sink** | Attention sink support (for StreamingLLM) | -| **Sparse** | Sparse attention support (MLA only) | -| **MM Prefix** | Multimodal prefix full attention support | -| **Attention Types** | Supported attention patterns (Decoder, Encoder, Enc-Dec) | -| **Compute Cap.** | Required CUDA compute capability (N/A for non-CUDA backends) | - -**Symbols:** ✅ = Supported, ❌ = Not supported -""" +# --------------------------------------------------------------------------- +# Top-level orchestration +# --------------------------------------------------------------------------- def generate_docs() -> str: @@ -1071,86 +1264,17 @@ def generate_docs() -> str: # Collect backend info all_backends = [] for backend_name, class_path in attention_backends_map.items(): - if backend_name in ("CUSTOM", "TORCH_SDPA"): + if backend_name in SKIP_BACKENDS: continue info = analyze_backend(backend_name, class_path) if info: all_backends.append(info) - # Expand FLASH_ATTN into FA2 and FA3 variants with different capabilities + # Expand backends into version variants if fa_features: - expanded_backends = [] - for backend in all_backends: - if backend["name"] == "FLASH_ATTN": - # Create FA2 entry (keeps base backend's compute_capability) - fa2 = backend.copy() - fa2["name"] = "FLASH_ATTN" - fa2["version"] = "FA2*" - fa2["_sort_key"] = "FLASH_ATTN" - fa2["_sort_order"] = 0 - fa2["supports_sink"] = fa_features["fa2"]["supports_sink"] - - # Create FA3 entry (uses parsed compute_capability from fa_utils) - fa3 = backend.copy() - fa3["name"] = "FLASH_ATTN" - fa3["version"] = "FA3*" - fa3["_sort_key"] = 
"FLASH_ATTN" - fa3["_sort_order"] = 1 - if fa_features["fa3"]["compute_capability"]: - fa3["compute_capability"] = fa_features["fa3"]["compute_capability"] - fa3["supports_sink"] = fa_features["fa3"]["supports_sink"] - if fa_features["fa3"]["supports_fp8"]: - # Add fp8 dtypes to the base backend's kv_cache_dtypes - base_dtypes = backend["kv_cache_dtypes"].split(", ") - fp8_dtypes = ["fp8", "fp8_e4m3", "fp8_e5m2"] - new_dtypes = [d for d in fp8_dtypes if d not in base_dtypes] - fa3["kv_cache_dtypes"] = ", ".join(base_dtypes + new_dtypes) - - # Add FA2 first, then FA3 - expanded_backends.append(fa2) - expanded_backends.append(fa3) - else: - backend["_sort_key"] = backend["name"] - backend["_sort_order"] = 0 - backend["version"] = "" # No version for other backends - expanded_backends.append(backend) - all_backends = expanded_backends - - # Expand FLASHINFER into native and TRTLLM variants + all_backends = _expand_flash_attn_variants(all_backends, fa_features) if fi_features: - expanded_backends = [] - for backend in all_backends: - if backend["name"] == "FLASHINFER": - # Parse original compute capability to get min CC - orig_cap = backend["compute_capability"] - parts = orig_cap.replace(".x", "").split("-") - min_cc = parts[0] if parts else "7" - trtllm_cc = fi_features["trtllm"]["compute_capability"] - - # Create native entry (pre-Blackwell GPUs) - native = backend.copy() - native["name"] = "FLASHINFER" - native["version"] = "Native†" - native["_sort_key"] = "FLASHINFER" - native["_sort_order"] = 0 - native["supports_sink"] = fi_features["native"]["supports_sink"] - # Native FlashInfer is used on GPUs before SM100 (Blackwell) - native["compute_capability"] = f"{min_cc}.x-9.x" - - # Create TRTLLM entry - trtllm = backend.copy() - trtllm["name"] = "FLASHINFER" - trtllm["version"] = "TRTLLM†" - trtllm["_sort_key"] = "FLASHINFER" - trtllm["_sort_order"] = 1 - trtllm["compute_capability"] = trtllm_cc - trtllm["supports_sink"] = fi_features["trtllm"]["supports_sink"] - - 
expanded_backends.append(native) - expanded_backends.append(trtllm) - else: - expanded_backends.append(backend) - all_backends = expanded_backends + all_backends = _expand_flashinfer_variants(all_backends, fi_features) # Split into MLA and non-MLA mla_backends = [b for b in all_backends if b["is_mla"]] -- GitLab From c60f8e3b49eced1a17ba0e11da3f8c107b309df9 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 9 Feb 2026 17:38:54 -0600 Subject: [PATCH 0022/1166] [Bugfix][ROCm][GPT-OSS] Use old triton_kernels implementation on ROCm if the new API is not available (#34153) Signed-off-by: Gregory Shtrasberg --- .../fused_moe/gpt_oss_triton_kernels_moe.py | 61 ++++++++++++++++--- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 3801814d9..eafdf97a9 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -19,11 +19,14 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, ) +from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.import_utils import has_triton_kernels logger = init_logger(__name__) +use_legacy_triton_kernels = False + if has_triton_kernels(): try: import triton_kernels.swiglu @@ -38,10 +41,20 @@ if has_triton_kernels(): from triton_kernels.tensor import ( BIT, Bitmatrix, - SparseMatrix, - make_ragged_tensor_metadata, ) from triton_kernels.topk import topk + + try: + from triton_kernels.tensor import ( + SparseMatrix, + make_ragged_tensor_metadata, + ) + except ImportError: + if current_platform.is_rocm(): + logger.warning_once("Using legacy triton_kernels on ROCm") + use_legacy_triton_kernels = True + else: + 
raise except (AttributeError, ImportError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " @@ -101,6 +114,12 @@ def legacy_routing_from_bitmatrix( Replacement for the removed triton_kernels.routing.routing_from_bitmatrix. Creates routing data from a bitmatrix representation. """ + if use_legacy_triton_kernels: + from triton_kernels.routing import routing_from_bitmatrix + + return routing_from_bitmatrix( + bitmatrix, expt_scal, expt_indx, n_expts_tot, n_expts_act + ) sparse_logits = SparseMatrix(indx=expt_indx, vals=expt_scal, mask=bitmatrix) dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx combine_indx = sparse_logits.mask_metadata.col_sorted_indx @@ -130,6 +149,10 @@ def legacy_routing( Replacement for the removed triton_kernels.routing.routing function. Computes routing data from gating logits. """ + if use_legacy_triton_kernels: + from triton_kernels.routing import routing + + return routing(logits, n_expts_act, sm_first=sm_first) if sm_first: logits = torch.softmax(logits, dim=-1) sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first) @@ -231,11 +254,22 @@ def triton_kernel_fused_experts( ) output_tensor = _resize_cache(output_tensor, (batch_dim, M, K)) - act = FusedActivation( - FnSpecs( - "swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit"), reduction_n=2 - ), - (swiglu_alpha, swiglu_limit), + act = ( + FusedActivation( + FnSpecs( + "swiglu", + triton_kernels.swiglu.swiglu_fn, + ("alpha", "limit"), + reduction_n=2, + ), + (swiglu_alpha, swiglu_limit), + ) + if not use_legacy_triton_kernels + else FusedActivation( + FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), + (swiglu_alpha, swiglu_limit), + 2, + ) ) gammas = routing_data.gate_scal if routing_data else None @@ -296,8 +330,17 @@ def make_routing_data( bitmatrix_shape = [n_rows, bm_cols * 32] bitmatrix_shape_max = [n_rows, None] - bitmatrix = Bitmatrix( - bitmatrix, dtype=BIT, shape=bitmatrix_shape, 
shape_max=bitmatrix_shape_max + bitmatrix = ( + Bitmatrix( + bitmatrix, dtype=BIT, shape=bitmatrix_shape, shape_max=bitmatrix_shape_max + ) + if not use_legacy_triton_kernels + else Bitmatrix( + bitmatrix, + shape=bitmatrix_shape, + shape_max=bitmatrix_shape_max, + scratchpad=None, + ) ) # matmul_ogs expects invalid topk_weights to be -1s -- GitLab From 13397841ab469cecf1ed425c3f52a9ffc38139b5 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Tue, 10 Feb 2026 07:49:09 +0800 Subject: [PATCH 0023/1166] [structured output] validate unsupported json features first (#33233) Signed-off-by: Andy Xie Co-authored-by: Chauncey Co-authored-by: Russell Bryant --- vllm/v1/structured_output/backend_xgrammar.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 812c262a2..1ad43d218 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -304,17 +304,17 @@ def validate_xgrammar_grammar(sampling_params: SamplingParams) -> None: else: schema = so_params.json + if has_xgrammar_unsupported_json_features(schema): + raise ValueError( + "The provided JSON schema contains features not supported by xgrammar." + ) + try: xgr.Grammar.from_json_schema(schema) except Exception as err: raise ValueError( f"Failed to transform json schema into a grammar: {err}" ) from err - - if has_xgrammar_unsupported_json_features(schema): - raise ValueError( - "The provided JSON schema contains features not supported by xgrammar." 
- ) return if so_params.grammar: -- GitLab From e94ec597334d9a3e9b0d04bc17152e2747c83d51 Mon Sep 17 00:00:00 2001 From: Yuwei An Date: Mon, 9 Feb 2026 17:18:42 -0800 Subject: [PATCH 0024/1166] [LMCache] Token Base IPC API (#34175) Signed-off-by: Oasis-Git --- .../multi_process_adapter.py | 417 +++++++++++++++--- .../kv_connector/v1/lmcache_mp_connector.py | 49 +- 2 files changed, 376 insertions(+), 90 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py index d865f70bd..e476cba7c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -20,16 +20,42 @@ from lmcache.v1.multiprocess.protocol import RequestType, get_response_class logger = init_logger(__name__) -def wrap_kv_caches(kv_caches: dict[str, KVCache]) -> KVCache: +def wrap_kv_caches(kv_caches: dict[str, torch.Tensor]) -> KVCache: logger.info("KV caches keys are %s", list(kv_caches.keys())) return [CudaIPCWrapper(tensor) for tensor in kv_caches.values()] +def striding_block_hashes( + block_hashes: list[bytes], blocks_in_chunk: int +) -> Iterable[bytes]: + """Extract chunk-level hashes from block hashes by striding. + + In hash-based vLLM, each vLLM block has its own hash. LMCache chunks + span ``blocks_in_chunk`` consecutive blocks. The representative hash + for a chunk is the hash of the **last** block in that chunk (because + each block hash already encodes its prefix). So we start at index + ``blocks_in_chunk - 1`` and stride by ``blocks_in_chunk``. 
+ """ + return islice(block_hashes, blocks_in_chunk - 1, None, blocks_in_chunk) + + def send_lmcache_request( mq_client: MessageQueueClient, request_type: RequestType, payloads: list[Any], ) -> MessagingFuture[Any]: + """ + Helper function to send the request to the LMCache multiprocess server + + Args: + mq_client: The LMCache multiprocess mode message queue client + request_type: The request type + payloads: The request payloads + + Returns: + A messaging future for the request + """ + future = mq_client.submit_request( request_type, payloads, get_response_class(request_type) ) @@ -39,40 +65,44 @@ def send_lmcache_request( def get_lmcache_chunk_size( mq_client: MessageQueueClient, ) -> int: - future = send_lmcache_request(mq_client, RequestType.GET_CHUNK_SIZE, []) - chunk_size = future.result() - return chunk_size + """ + Helper function to get the LMCache chunk size from the server + Args: + mq_client: The LMCache multiprocess mode message queue client -def striding_block_hashes( - block_hashes: list[bytes], - blocks_in_chunk, -) -> Iterable[bytes]: - """Striding the block hashes to get the block hashes for each chunk. - For example, if blocks_in_chunk is 16, then we will get the block hashes - for the 16th, 32nd, 48th, ... blocks. 
+ Returns: + An integer representing the LMCache chunk size """ - return islice(block_hashes, blocks_in_chunk - 1, None, blocks_in_chunk) + future = send_lmcache_request(mq_client, RequestType.GET_CHUNK_SIZE, []) + chunk_size = future.result() + return chunk_size @dataclass class LoadStoreOp: - block_hashes: list[bytes] block_ids: list[int] + """Block ids for the load/store operation""" - def __len__(self) -> int: - return len(self.block_hashes) + token_ids: list[int] | None = None + """Token IDs for the load/store operation (token mode)""" - def __post_init__(self): - assert len(self.block_hashes) == len(self.block_ids), ( - "The number of block hashes should be equal to the number of block ids " - f"But got {len(self.block_hashes)} and {len(self.block_ids)}" - ) + block_hashes: list[bytes] | None = None + """Block hashes for the load/store operation (hash mode)""" + + start: int = 0 + """Start token index (token mode only)""" + + end: int = 0 + """End token index (token mode only)""" + + def __len__(self) -> int: + return len(self.block_ids) StoreResult = bool RetrieveResult = list[bool] -LookupResult = list[bool] +LookupResult = int class LMCacheMPSchedulerAdapter: @@ -95,10 +125,6 @@ class LMCacheMPSchedulerAdapter: kv_rank: The kv rank used for LMCache keys vllm_block_size: The block size used in vLLM """ - logger.warning( - "Importing LMCacheMPSchedulerAdapter is deprecated. " - "Please update your LMCache to the latest version." - ) self.mq_client = MessageQueueClient(server_url, context) # Request futures @@ -116,22 +142,89 @@ class LMCacheMPSchedulerAdapter: self.blocks_in_chunk = self.chunk_size // vllm_block_size @_lmcache_nvtx_annotate - def maybe_submit_lookup_request(self, request_id: str, block_hashes: list[bytes]): + def maybe_submit_lookup_request( + self, + request_id: str, + block_hashes: list[bytes] | None = None, + token_ids: list[int] | None = None, + ) -> None: + """ + Submit a new lookup request to LMCache if there is no ongoing request. 
+ + Supports both token-based and hash-based vLLM: + - token_ids: token IDs (token-based vLLM) -> single token-mode key + - block_hashes: block hashes (hash-based vLLM) -> strided hash-mode keys + + Exactly one of block_hashes or token_ids must be provided. + + Args: + request_id: The ID of the lookup request. The same ID indicates it's + from the same request + block_hashes: Block hashes to lookup from LMCache (hash mode) + token_ids: Token IDs to lookup from LMCache (token mode) + + Returns: + None + + Notes: + This function will have a side-effect: submitting a look up request to + LMCache, which will essentially 'lock' the KV cache chunks in the LMCache + for later retrieve operations. + In the meantime, this function will record the lookup request, and the + status of the look up request can be checked by `check_lookup_result`. + """ if request_id in self.lookup_futures: # Skip if there is already a lookup request return - s = striding_block_hashes(block_hashes, self.blocks_in_chunk) - keys = [self._create_key(block_hash) for block_hash in s] + assert (block_hashes is None) != (token_ids is None), ( + "Exactly one of block_hashes or token_ids must be provided" + ) + + if block_hashes is not None: + # Hash mode: stride block hashes -> N hash-mode keys + chunk_hashes = list( + striding_block_hashes(block_hashes, self.blocks_in_chunk) + ) + keys = [ + self._create_hash_key(ch, request_id=request_id) for ch in chunk_hashes + ] + else: + # Token mode: truncate to chunk-aligned length + assert token_ids is not None + aligned_end = (len(token_ids) // self.chunk_size) * self.chunk_size + if aligned_end == 0: + return + keys = [ + self._create_key( + token_ids, + start=0, + end=aligned_end, + request_id=request_id, + ).no_worker_id_version() + ] + future = send_lmcache_request( self.mq_client, RequestType.LOOKUP, - [keys, True], + [keys], ) self.lookup_futures[request_id] = future @_lmcache_nvtx_annotate def check_lookup_result(self, request_id: str) -> int | None: + 
""" + Check the result of a previously submitted lookup request. + + Args: + request_id: The ID of the lookup request submitted in + `maybe_submit_lookup_request` + + Returns: + An integer representing the total number of tokens matched + in LMCache (prefix matching), or + None if the lookup request is not finished yet. + """ assert request_id in self.lookup_futures, ( f"Lookup request for request_id={request_id} has not been submitted" ) @@ -141,7 +234,7 @@ class LMCacheMPSchedulerAdapter: return None result = future.result() - num_chunks = sum(result) + num_chunks = result return num_chunks * self.chunk_size def num_blocks_per_chunk(self) -> int: @@ -159,14 +252,47 @@ class LMCacheMPSchedulerAdapter: """ self.lookup_futures.pop(request_id, None) + def end_session(self, request_id: str) -> None: + """ + Notify LMCache server to remove the session for a finished request. + Args: + request_id: The ID of the finished request. + """ + send_lmcache_request( + self.mq_client, + RequestType.END_SESSION, + [request_id], + ) + # Helper functions - def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey: - """Convert a block hash to an IPC cache engine key""" + def _create_key( + self, + token_ids: list[int], + start: int = 0, + end: int = 0, + request_id: str | None = None, + ) -> IPCCacheEngineKey: + """Convert token IDs to an IPC cache engine key""" return IPCCacheEngineKey( model_name=self.model_name, world_size=self.world_size, worker_id=self.worker_id, - chunk_hash=block_hash, + token_ids=tuple(token_ids), + start=start, + end=end, + request_id=request_id, + ) + + def _create_hash_key( + self, chunk_hash: bytes, request_id: str | None = None + ) -> IPCCacheEngineKey: + """Create a hash-mode IPC cache engine key""" + return IPCCacheEngineKey( + model_name=self.model_name, + world_size=self.world_size, + worker_id=None, + chunk_hash=chunk_hash, + request_id=request_id, ) @@ -180,10 +306,6 @@ class LMCacheMPWorkerAdapter: kv_rank: int, vllm_block_size: int, ): - 
logger.warning( - "Importing LMCacheMPWorkerAdapter is deprecated. " - "Please update your LMCache to the latest version." - ) self.mq_client = MessageQueueClient(server_url, context) # Instance id for GPU worker @@ -201,7 +323,10 @@ class LMCacheMPWorkerAdapter: str, tuple[MessagingFuture[RetrieveResult], list[str]] ] = {} + # The store requests that have finished execution in LMCache self.finished_stores: set[str] = set() + # The finished request ids that are passed via vLLM and also + # have corresponding store requests submitted to LMCache before self.previously_finished: set[str] = set() self.model_name = model_name @@ -215,7 +340,14 @@ class LMCacheMPWorkerAdapter: ) self.blocks_in_chunk = chunk_size // vllm_block_size - def register_kv_caches(self, kv_caches: dict[str, KVCache]): + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + """ + Register the kv caches with LMCache server + + Args: + kv_caches: A dict of kv caches to register. The keys are the + layer names and the values are the corresponding tensors. + """ # Register kv cache and send the request self.kv_caches = kv_caches logger.info("Registering kv caches") @@ -230,7 +362,29 @@ class LMCacheMPWorkerAdapter: def submit_store_request( self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event ): - keys = self._block_hashes_to_keys(op.block_hashes) + """ + Submit a KV cache store request to LMCache + + Args: + request_id: The ID of the request + op: The LoadStoreOp describing the store operation. 
+ event: The CUDA event that is recorded after the current + model inference step + """ + if op.block_hashes is not None: + # Hash mode + chunk_hashes = list( + striding_block_hashes(op.block_hashes, self.blocks_in_chunk) + ) + keys = [ + self._create_hash_key(ch, request_id=request_id) for ch in chunk_hashes + ] + else: + # Token mode + assert op.token_ids is not None + keys = [ + self._create_key(op.token_ids, op.start, op.end, request_id=request_id) + ] future = send_lmcache_request( self.mq_client, RequestType.STORE, @@ -242,7 +396,29 @@ class LMCacheMPWorkerAdapter: def submit_retrieve_request( self, request_id: str, op: LoadStoreOp, event: torch.cuda.Event ): - keys = self._block_hashes_to_keys(op.block_hashes) + """ + Submit a KV cache retrieve request to LMCache + + Args: + request_id: The ID of the request + op: The LoadStoreOp describing the retrieve operation. + event: The CUDA event that is recorded after the current + model inference step + """ + if op.block_hashes is not None: + # Hash mode + chunk_hashes = list( + striding_block_hashes(op.block_hashes, self.blocks_in_chunk) + ) + keys = [ + self._create_hash_key(ch, request_id=request_id) for ch in chunk_hashes + ] + else: + # Token mode + assert op.token_ids is not None + keys = [ + self._create_key(op.token_ids, op.start, op.end, request_id=request_id) + ] future = send_lmcache_request( self.mq_client, RequestType.RETRIEVE, @@ -257,17 +433,47 @@ class LMCacheMPWorkerAdapter: ops: list[LoadStoreOp], event: torch.cuda.Event, ): - keys = [] - block_ids = [] - for op in ops: - keys.extend(self._block_hashes_to_keys(op.block_hashes)) + """ + Submit a batched store request to LMCache + + Args: + request_ids: The IDs of the requests + ops: The LoadStoreOps describing the store operations. 
Should have + the same length as request_ids + event: The CUDA event that is recorded after the current + model inference step + """ + all_keys: list[IPCCacheEngineKey] = [] + block_ids: list[int] = [] + for request_id, op in zip(request_ids, ops, strict=False): + if op.block_hashes is not None: + chunk_hashes = list( + striding_block_hashes(op.block_hashes, self.blocks_in_chunk) + ) + keys = [ + self._create_hash_key(ch, request_id=request_id) + for ch in chunk_hashes + ] + all_keys.extend(keys) + else: + assert op.token_ids is not None + all_keys.append( + self._create_key( + op.token_ids, op.start, op.end, request_id=request_id + ) + ) block_ids.extend(op.block_ids) future = send_lmcache_request( self.mq_client, RequestType.STORE, - [keys, self.instance_id, block_ids, event.ipc_handle()], + [ + all_keys, + self.instance_id, + block_ids, + event.ipc_handle(), + ], ).to_cuda_future() - self.store_futures[request_ids[0]] = (future, request_ids[1:]) + self.store_futures[request_ids[0]] = (future, list(request_ids[1:])) @_lmcache_nvtx_annotate def batched_submit_retrieve_requests( @@ -276,34 +482,83 @@ class LMCacheMPWorkerAdapter: ops: list[LoadStoreOp], event: torch.cuda.Event, ): - keys = [] - block_ids = [] + """ + Submit a batched retrieve request to LMCache - for op in ops: - keys.extend(self._block_hashes_to_keys(op.block_hashes)) + Args: + request_ids: The IDs of the requests + ops: The LoadStoreOps describing the retrieve operations. 
Should have + the same length as request_ids + event: The CUDA event that is recorded after the current + model inference step + """ + all_keys: list[IPCCacheEngineKey] = [] + block_ids: list[int] = [] + for request_id, op in zip(request_ids, ops, strict=False): + if op.block_hashes is not None: + chunk_hashes = list( + striding_block_hashes(op.block_hashes, self.blocks_in_chunk) + ) + keys = [ + self._create_hash_key(ch, request_id=request_id) + for ch in chunk_hashes + ] + all_keys.extend(keys) + else: + assert op.token_ids is not None + all_keys.append( + self._create_key( + op.token_ids, op.start, op.end, request_id=request_id + ) + ) block_ids.extend(op.block_ids) future = send_lmcache_request( self.mq_client, RequestType.RETRIEVE, - [keys, self.instance_id, block_ids, event.ipc_handle()], + [ + all_keys, + self.instance_id, + block_ids, + event.ipc_handle(), + ], ).to_cuda_future() - self.retrieve_futures[request_ids[0]] = (future, request_ids[1:]) + self.retrieve_futures[request_ids[0]] = (future, list(request_ids[1:])) @_lmcache_nvtx_annotate def get_finished( - self, finished_req_ids: set[str] + self, finished_req_ids_from_engine: set[str] ) -> tuple[set[str] | None, set[str] | None]: + """ + Check and get the finished store and retrieve requests. + + Args: + finished_req_ids_from_engine: the set of request ids that are + reported as finished from the vLLM engine side. + + Returns: + A tuple of two sets: + - The first set contains the finished store request ids. The returned + store request ids MUST be seen before in the + `finished_req_ids_from_engine`. + - The second set contains the finished retrieve request ids. + + Notes: + When enabling async scheduling in vLLM, the same request ID may appear + multiple times in `finished_req_ids_from_engine`. The adapter should + take care of deduplicating the request IDs and only return the request + IDs that have not been returned before. 
+ """ finished_stores = set() finished_retrieves = set() - for request_id, (future, other_reqs) in self.store_futures.items(): - if not future.query(): + for request_id, (s_future, other_reqs) in self.store_futures.items(): + if not s_future.query(): continue - result = future.result() + s_result = s_future.result() finished_stores.add(request_id) finished_stores.update(other_reqs) - if not result: + if not s_result: # TODO: add error handling here logger.error( "Something went wrong when processing the " @@ -311,21 +566,21 @@ class LMCacheMPWorkerAdapter: request_id, ) - for request_id, (future, other_reqs) in self.retrieve_futures.items(): - if not future.query(): + for request_id, (r_future, other_reqs) in self.retrieve_futures.items(): + if not r_future.query(): continue - result = future.result() + r_result = r_future.result() finished_retrieves.add(request_id) finished_retrieves.update(other_reqs) - if not all(result): + if not all(r_result): # TODO: add error handing here logger.error( "Something went wrong when processing the " "retrieve request for request_id=%s, result=%s", request_id, - result, + r_result, ) # Remove the finished requests from the tracking dicts @@ -338,7 +593,7 @@ class LMCacheMPWorkerAdapter: self.finished_stores.update(finished_stores) ret_stores = set() - for req_id in finished_req_ids: + for req_id in finished_req_ids_from_engine: if req_id in self.finished_stores or req_id in self.store_futures: self.previously_finished.add(req_id) else: @@ -357,7 +612,9 @@ class LMCacheMPWorkerAdapter: return self.blocks_in_chunk def shutdown(self): - # Unregister kv cache + """ + Shutdown the LMCache MP worker adapter + """ logger.info("Unregistering kv caches") send_lmcache_request( self.mq_client, RequestType.UNREGISTER_KV_CACHE, [self.instance_id] @@ -378,18 +635,32 @@ class LMCacheMPWorkerAdapter: return safe_finished_s - def _create_key(self, block_hash: bytes) -> IPCCacheEngineKey: - """Convert a block hash to an IPC cache engine key""" + 
def _create_key( + self, + token_ids: list[int], + start: int = 0, + end: int = 0, + request_id: str | None = None, + ) -> IPCCacheEngineKey: + """Convert token IDs to an IPC cache engine key""" return IPCCacheEngineKey( model_name=self.model_name, world_size=self.world_size, worker_id=self.worker_id, - chunk_hash=block_hash, + token_ids=tuple(token_ids), + start=start, + end=end, + request_id=request_id, ) - def _block_hashes_to_keys( - self, block_hashes: list[bytes] - ) -> list[IPCCacheEngineKey]: - """Convert block hashes to IPC cache engine keys""" - s = striding_block_hashes(block_hashes, self.blocks_in_chunk) - return [self._create_key(block_hash) for block_hash in s] + def _create_hash_key( + self, chunk_hash: bytes, request_id: str | None = None + ) -> IPCCacheEngineKey: + """Create a hash-mode IPC cache engine key""" + return IPCCacheEngineKey( + model_name=self.model_name, + world_size=self.world_size, + worker_id=self.worker_id, + chunk_hash=chunk_hash, + request_id=request_id, + ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index b542265dd..0379011e7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -3,7 +3,7 @@ import enum from collections.abc import Iterable from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import TYPE_CHECKING, Any, Literal import torch import zmq @@ -130,12 +130,6 @@ def create_worker_adapter( ) -def convert_block_hashes_to_bytes( - block_hashes: list["BlockHash"], -) -> list[bytes]: - return cast(list[bytes], block_hashes) - - class LMCacheMPRequestState(enum.Enum): """ State machine: @@ -266,6 +260,7 @@ class LMCacheMPRequestMetadata: Args: tracker: The request tracker to generate the metadata from. 
blocks_in_chunk: the number of blocks in a LMCache data chunk + vllm_block_size: the block size used in vLLM """ # Store the blocks that has block hashes # NOTE: the invariant here is that `num_stored_blocks` should @@ -282,15 +277,21 @@ class LMCacheMPRequestMetadata: if num_chunks >= 1: start = tracker.num_stored_blocks end = start + num_chunks * blocks_in_chunk - block_hashes = convert_block_hashes_to_bytes( - tracker.block_hashes[start:end] - ) block_ids = tracker.allocated_block_ids[start:end] + start_token_idx = start * vllm_block_size + end_token_idx = end * vllm_block_size + token_ids = list(tracker.all_token_ids) + op = LoadStoreOp( + token_ids=token_ids, + block_ids=block_ids, + start=start_token_idx, + end=end_token_idx, + ) ret = LMCacheMPRequestMetadata( request_id=tracker.request_id, direction="STORE", - op=LoadStoreOp(block_hashes=block_hashes, block_ids=block_ids), + op=op, ) # Update the request tracker @@ -303,6 +304,7 @@ class LMCacheMPRequestMetadata: def GetRetrieveMetadata( tracker: LMCacheMPRequestTracker, blocks_in_chunk: int, + vllm_block_size: int, ) -> "LMCacheMPRequestMetadata | None": """ Generate the retrieve metadata for the current request tracker. @@ -310,6 +312,7 @@ class LMCacheMPRequestMetadata: Args: tracker: The request tracker to generate the metadata from. blocks_in_chunk: the number of blocks in a LMCache data chunk + vllm_block_size: the block size used in vLLM """ if not tracker.is_ready_for_retrieving(): return None @@ -330,15 +333,21 @@ class LMCacheMPRequestMetadata: "number of LMCache hit blocks. 
" ) if end > start: - block_hashes = convert_block_hashes_to_bytes( - tracker.block_hashes[start:end] - ) block_ids = tracker.allocated_block_ids[start:end] + start_token_idx = start * vllm_block_size + end_token_idx = end * vllm_block_size + token_ids = list(tracker.all_token_ids) + op = LoadStoreOp( + token_ids=token_ids, + block_ids=block_ids, + start=start_token_idx, + end=end_token_idx, + ) ret = LMCacheMPRequestMetadata( request_id=tracker.request_id, direction="RETRIEVE", - op=LoadStoreOp(block_hashes=block_hashes, block_ids=block_ids), + op=op, ) return ret @@ -643,7 +652,8 @@ class LMCacheMPConnector(KVConnectorBase_V1): return 0, False self.scheduler_adapter.maybe_submit_lookup_request( - request.request_id, convert_block_hashes_to_bytes(request.block_hashes) + request.request_id, + token_ids=list(request.all_token_ids), ) ret = self.scheduler_adapter.check_lookup_result(request.request_id) @@ -766,6 +776,9 @@ class LMCacheMPConnector(KVConnectorBase_V1): """ # Clean up request tracker to prevent memory leak self._cleanup_request_tracker(request.request_id) + # Notify LMCache to end the session for this request + self.scheduler_adapter.end_session(request.request_id) + return True, None def take_events(self) -> Iterable["KVCacheEvent"]: @@ -846,7 +859,9 @@ class LMCacheMPConnector(KVConnectorBase_V1): if request_tracker.state != LMCacheMPRequestState.WAITING_FOR_LOAD: continue r_metadata = LMCacheMPRequestMetadata.GetRetrieveMetadata( - request_tracker, blocks_per_chunk + request_tracker, + blocks_per_chunk, + vllm_block_size=self.vllm_block_size, ) if r_metadata is not None: metadata.add_request_metadata(r_metadata) -- GitLab From 047a457fa4af2010303ba775ae6f3ee9c1852c2c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 9 Feb 2026 19:47:54 -0800 Subject: [PATCH 0025/1166] [Bugfix] Adopt `ChunkGatedDeltaRule` for Qwen3.5 (#34198) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_5.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index d6df7523b..61ff6946c 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -99,6 +99,7 @@ from .interfaces import ( ) from .qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP from .qwen3_next import ( + ChunkGatedDeltaRule, Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextGatedDeltaNet, @@ -268,6 +269,8 @@ class Qwen3_5GatedDeltaNet(Qwen3NextGatedDeltaNet): prefix=f"{prefix}.out_proj", ) + self.chunk_gated_delta_rule = ChunkGatedDeltaRule() + compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") -- GitLab From 4cde2e015944495e6bd650a4415cfb342bd73cfb Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Mon, 9 Feb 2026 22:50:20 -0600 Subject: [PATCH 0026/1166] [ROCm][Bugfix] Resolve Dynamo tracing crash from amdsmi calls in on_gfx* arch detection (#34108) Signed-off-by: Andreas Karatzas --- vllm/platforms/rocm.py | 62 ++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 2545e4620..b463c80a1 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -101,12 +101,10 @@ def _query_gcn_arch_from_amdsmi() -> str: raise RuntimeError("amdsmi did not return valid GCN arch") -@cache -def _get_gcn_arch_via_amdsmi() -> str: +def _get_gcn_arch() -> str: """ - Get the GCN architecture name using amdsmi instead of torch.cuda. - This avoids initializing CUDA, which is important for Ray workers - that need to set CUDA_VISIBLE_DEVICES after importing vLLM. + Get GCN arch via amdsmi (no CUDA init), fallback to torch.cuda. + Called once at module level; result stored in _GCN_ARCH. 
""" try: return _query_gcn_arch_from_amdsmi() @@ -121,34 +119,36 @@ def _get_gcn_arch_via_amdsmi() -> str: return torch.cuda.get_device_properties("cuda").gcnArchName -@cache +# Resolve once at module load. Uses amdsmi (no CUDA init) so Ray workers +# can still set CUDA_VISIBLE_DEVICES after import. +# These are plain Python bools — fully torch.compile/Dynamo safe. +_GCN_ARCH = _get_gcn_arch() + +_ON_GFX1X = any(arch in _GCN_ARCH for arch in ["gfx11", "gfx12"]) +_ON_MI3XX = any(arch in _GCN_ARCH for arch in ["gfx942", "gfx950"]) +_ON_GFX9 = any(arch in _GCN_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) +_ON_GFX942 = "gfx942" in _GCN_ARCH +_ON_GFX950 = "gfx950" in _GCN_ARCH + + def on_gfx1x() -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) + return _ON_GFX1X -@cache def on_mi3xx() -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"]) + return _ON_MI3XX -@cache def on_gfx9() -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) + return _ON_GFX9 -@cache def on_gfx942() -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - return any(arch in GPU_ARCH for arch in ["gfx942"]) + return _ON_GFX942 -@cache def on_gfx950() -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - return any(arch in GPU_ARCH for arch in ["gfx950"]) + return _ON_GFX950 @cache @@ -163,13 +163,9 @@ def use_rocm_custom_paged_attention( alibi_slopes: torch.Tensor | None = None, sinks: torch.Tensor | None = None, ) -> bool: - GPU_ARCH = _get_gcn_arch_via_amdsmi() - ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) - ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) - # custom paged attn always supported on V0. On V1, requires sliding window # disabled due to observed numerical discrepancy. 
- if ON_GFX9: + if _ON_GFX9: return ( (sliding_window == 0 or sliding_window == (-1, -1)) and (qtype == torch.half or qtype == torch.bfloat16) @@ -183,7 +179,7 @@ def use_rocm_custom_paged_attention( else: return ( - ON_GFX11_GFX12 + _ON_GFX1X and (sliding_window == 0 or sliding_window == (-1, -1)) and (qtype == torch.half or qtype == torch.bfloat16) and head_size == 128 @@ -611,18 +607,16 @@ class RocmPlatform(Platform): @classmethod def supports_mx(cls) -> bool: - gcn_arch = torch.cuda.get_device_properties(0).gcnArchName - return any(gfx in gcn_arch for gfx in ["gfx95"]) + return any(gfx in _GCN_ARCH for gfx in ["gfx95"]) @classmethod def supports_fp8(cls) -> bool: - gcn_arch = torch.cuda.get_device_properties(0).gcnArchName - return any(gfx in gcn_arch for gfx in ["gfx94", "gfx95", "gfx12"]) + return any(gfx in _GCN_ARCH for gfx in ["gfx94", "gfx95", "gfx12"]) @classmethod def is_fp8_fnuz(cls) -> bool: # only device 0 is checked, this assumes MI300 platforms are homogeneous - return "gfx94" in torch.cuda.get_device_properties(0).gcnArchName + return "gfx94" in _GCN_ARCH @classmethod def fp8_dtype(cls) -> torch.dtype: @@ -634,9 +628,7 @@ class RocmPlatform(Platform): @classmethod def use_custom_allreduce(cls) -> bool: # We only enable custom allreduce for MI300 series - gcn_arch = torch.cuda.get_device_properties(0).gcnArchName - supported_archs = ["gfx94", "gfx95"] - return any(gfx in gcn_arch for gfx in supported_archs) + return any(gfx in _GCN_ARCH for gfx in ["gfx94", "gfx95"]) @classmethod def opaque_attention_op(cls) -> bool: @@ -644,7 +636,7 @@ class RocmPlatform(Platform): @classmethod def is_navi(cls) -> bool: - return "gfx1" in torch.cuda.get_device_properties(0).gcnArchName + return "gfx1" in _GCN_ARCH @classmethod def get_static_graph_wrapper_cls(cls) -> str: -- GitLab From 8a5e0e2b2bb925d162328927b7565514fa355da1 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 9 Feb 2026 21:03:32 -0800 Subject: [PATCH 0027/1166] [Bugfix][Core] Fix CPU memory 
leak from Request reference cycle in prefix caching (#34183) Signed-off-by: Roger Wang --- tests/v1/core/test_async_scheduler.py | 2 +- vllm/v1/core/sched/scheduler.py | 6 ++---- vllm/v1/request.py | 18 +++++++++++------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/v1/core/test_async_scheduler.py b/tests/v1/core/test_async_scheduler.py index e0645ed43..a77ae81ba 100644 --- a/tests/v1/core/test_async_scheduler.py +++ b/tests/v1/core/test_async_scheduler.py @@ -236,7 +236,7 @@ def test_prefix_caching_for_multi_turn(): req._all_token_ids = req.prompt_token_ids.copy() req.all_token_ids = ConstantList(req._all_token_ids) req.block_hashes = [] - req.block_hashes = req.get_hash_new_full_blocks() + req.update_block_hashes() # Schedule the next-turn requests. for req in next_turn_requests: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 90ca58441..aa3bc6e2c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -982,10 +982,8 @@ class Scheduler(SchedulerInterface): session._all_token_ids.extend(update.prompt_token_ids or ()) session.prompt_token_ids.extend(update.prompt_token_ids or ()) - # Update block hashes for the new tokens - # (mirrors Request.append_output_token_ids) - if session.get_hash_new_full_blocks is not None: - session.block_hashes.extend(session.get_hash_new_full_blocks()) + # Update block hashes for the new tokens. 
+ session.update_block_hashes() session.num_prompt_tokens = len(session.prompt_token_ids) session.arrival_time = update.arrival_time session.sampling_params = update.sampling_params diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 3b829875f..970b7e1eb 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,6 @@ import time from collections import deque from collections.abc import Callable, Mapping from dataclasses import dataclass -from functools import partial from typing import TYPE_CHECKING, Any import torch @@ -164,10 +163,11 @@ class Request: self.num_external_computed_tokens = 0 self.block_hashes: list[BlockHash] = [] - self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None - if block_hasher is not None: - self.get_hash_new_full_blocks = partial(block_hasher, self) - self.block_hashes = self.get_hash_new_full_blocks() + # Store the block hasher without binding self to avoid creating a + # reference cycle (Request -> partial -> Request) that prevents + # immediate garbage collection via reference counting. 
+ self._block_hasher: Callable[[Request], list[BlockHash]] | None = block_hasher + self.update_block_hashes() self.skip_reading_prefix_cache = self.get_skip_reading_prefix_cache() @@ -212,8 +212,12 @@ class Request: self._output_token_ids.extend(token_ids) self._all_token_ids.extend(token_ids) - if self.get_hash_new_full_blocks is not None: - self.block_hashes.extend(self.get_hash_new_full_blocks()) + self.update_block_hashes() + + def update_block_hashes(self) -> None: + """Compute block hashes for any new full blocks and append them.""" + if self._block_hasher is not None: + self.block_hashes.extend(self._block_hasher(self)) @property def use_structured_output(self) -> bool: -- GitLab From 25e48a3aae35849fd777f8a48c3c494337c11d83 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Feb 2026 13:12:13 +0800 Subject: [PATCH 0028/1166] [Doc] Update usage of `--limit-mm-per-prompt` (#34148) Signed-off-by: DarkLight1337 --- docs/features/multimodal_inputs.md | 2 +- docs/models/supported_models.md | 2 +- examples/offline_inference/mistral-small.py | 4 ++-- .../openai_chat_completion_client_for_multimodal.py | 2 +- examples/pooling/classify/vision_classification_online.py | 2 +- vllm/config/multimodal.py | 8 ++++---- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 3c1028929..5b4a81d4f 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -521,7 +521,7 @@ First, launch the OpenAI-compatible server: ```bash vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt.image 2 ``` Then, you can use the OpenAI client as follows: diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ac02e9bde..7ff9531c5 100644 --- a/docs/models/supported_models.md +++ 
b/docs/models/supported_models.md @@ -658,7 +658,7 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](../features/multimodal_inputs.md) on how to pass multi-modal inputs to the model. !!! tip - For hybrid-only models such as Llama-4, Step3 and Mistral-3, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (e.g, `--limit-mm-per-prompt '{"image":0}`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. + For hybrid-only models such as Llama-4, Step3, Mistral-3 and Qwen-3.5, a text-only mode can be enabled by setting all supported multimodal modalities to 0 (`--language-model-only`) so that their multimodal modules will not be loaded to free up more GPU memory for KV cache. !!! note vLLM currently supports adding LoRA adapters to the language backbone for most multimodal models. Additionally, vLLM now experimentally supports adding LoRA to the tower and connector modules for some multimodal models. See [this page](../features/lora.md). 
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 1f6e5ba14..0879b0dfa 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -18,11 +18,11 @@ from vllm.assets.image import ImageAsset # # Mistral format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \ -# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# --limit-mm-per-prompt.image 4 --max-model-len 16384 # # # HF format # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ -# --limit-mm-per-prompt '{"image":4}' --max-model-len 16384 +# --limit-mm-per-prompt.image 4 --max-model-len 16384 # ``` # # - Client: diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 198863ae4..37f46b369 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -10,7 +10,7 @@ vllm serve llava-hf/llava-1.5-7b-hf (multi-image inference with Phi-3.5-vision-instruct) vllm serve microsoft/Phi-3.5-vision-instruct --runner generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt.image 2 (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \ diff --git a/examples/pooling/classify/vision_classification_online.py b/examples/pooling/classify/vision_classification_online.py index 64dc5d4ae..021d3dfe5 100644 --- a/examples/pooling/classify/vision_classification_online.py +++ b/examples/pooling/classify/vision_classification_online.py @@ -7,7 +7,7 @@ NOTE: vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \ --runner pooling \ --max-model-len 5000 \ - --limit-mm-per-prompt '{"video": 1}' \ + 
--limit-mm-per-prompt.video 1 \ --hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}' """ diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 68244ba2f..7a10783e8 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -55,12 +55,12 @@ class MultiModalConfig: """Controls the behavior of multimodal models.""" language_model_only: bool = False - """If True, disables all multimodal inputs by setting all modality limits - to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every - modality.""" + """If True, disables all multimodal inputs by setting all modality limits to 0. + Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality.""" limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) """The maximum number of input items and options allowed per - prompt for each modality. + prompt for each modality. + Defaults to 999 for each modality. Legacy format (count only): -- GitLab From ab97bcf66295fca10a892bd14090e902b4b3c317 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Feb 2026 13:18:57 +0800 Subject: [PATCH 0029/1166] [CI/Build] Relax `test_mcp_tool_call` (#34204) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/responses/test_parsable_context.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/responses/test_parsable_context.py b/tests/entrypoints/openai/responses/test_parsable_context.py index 0d50f1251..48cb28a0f 100644 --- a/tests/entrypoints/openai/responses/test_parsable_context.py +++ b/tests/entrypoints/openai/responses/test_parsable_context.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib +import importlib.util import json import pytest @@ -179,12 +179,12 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str): assert response.output[2].type == "reasoning" # make sure the 
correct math is in the final output assert response.output[3].type == "message" - assert "56088" in response.output[3].content[0].text + assert any(s in response.output[3].content[0].text for s in ("56088", "56,088")) # test raw input_messages / output_messages assert len(response.input_messages) == 1 assert len(response.output_messages) == 3 - assert "56088" in response.output_messages[2]["message"] + assert any(s in response.output_messages[2]["message"] for s in ("56088", "56,088")) @pytest.mark.asyncio -- GitLab From 81e217fe6b5a3030aa5c4d859a2125b81979bee4 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 9 Feb 2026 21:29:39 -0800 Subject: [PATCH 0030/1166] [Bugfix] Fix DP Attention Padding in Dummy Run (#34187) Signed-off-by: Benjamin Chislett Signed-off-by: Lucas Wilkinson Co-authored-by: Benjamin Chislett --- vllm/v1/worker/gpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a7c2a8800..0e2e381f2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4787,6 +4787,7 @@ class GPUModelRunner( pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL attn_metadata, _ = self._build_attention_metadata( num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, num_reqs=num_reqs_padded, max_query_len=max_query_len, ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices, -- GitLab From f69b903b4c70716224b3936cb8503e562e25388e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Feb 2026 14:37:50 +0800 Subject: [PATCH 0031/1166] [Bugfix] Add `--trust-remote-code` to dataset bench args (#34208) Signed-off-by: DarkLight1337 --- vllm/benchmarks/datasets.py | 5 +++++ vllm/benchmarks/serve.py | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 7148d90dc..1fbf19add 100644 --- a/vllm/benchmarks/datasets.py 
+++ b/vllm/benchmarks/datasets.py @@ -1314,6 +1314,11 @@ class _ValidateDatasetArgs(argparse.Action): def add_dataset_parser(parser: FlexibleArgumentParser): + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( "--num-prompts", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dd853f15a..820427022 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1300,11 +1300,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "bursty requests. A higher burstiness value (burstiness > 1) " "results in a more uniform arrival of requests.", ) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) parser.add_argument( "--disable-tqdm", action="store_true", -- GitLab From 9608844f96e0e739bead72520b7710f1b6f82b65 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Mon, 9 Feb 2026 22:53:07 -0800 Subject: [PATCH 0032/1166] [responsesAPI] fix simpleContext streaming output_messages (#34188) Signed-off-by: Andrew Xia Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia --- tests/entrypoints/test_context.py | 246 +++++++++++++++++++ vllm/benchmarks/datasets.py | 5 + vllm/entrypoints/openai/responses/context.py | 19 +- 3 files changed, 265 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/test_context.py b/tests/entrypoints/test_context.py index f87683fc2..1ab2b5edb 100644 --- a/tests/entrypoints/test_context.py +++ b/tests/entrypoints/test_context.py @@ -8,6 +8,7 @@ from openai_harmony import Author, Message, Role, StreamState, TextContent from vllm.entrypoints.openai.responses.context import ( HarmonyContext, + SimpleContext, StreamingHarmonyContext, TurnMetrics, ) @@ -597,3 +598,248 @@ def test_turn_metrics_copy_and_reset(): assert copied_metrics.output_tokens == 20 assert copied_metrics.cached_input_tokens == 5 assert 
copied_metrics.tool_output_tokens == 3 + + +# ==================== SimpleContext Tests ==================== + + +def create_simple_context_output( + text="", + token_ids=None, + prompt="Test prompt", + prompt_token_ids=None, + num_cached_tokens=0, + logprobs=None, + finished=True, +): + """Helper to create a RequestOutput with customizable text for + SimpleContext tests.""" + if token_ids is None: + token_ids = [] + return RequestOutput( + request_id="test-id", + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + outputs=[ + CompletionOutput( + index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=0.0, + logprobs=logprobs, + finish_reason=None, + stop_reason=None, + ) + ], + finished=finished, + num_cached_tokens=num_cached_tokens, + ) + + +def test_simple_context_output_messages_empty(): + """output_messages should be empty before any output is appended.""" + context = SimpleContext() + assert context.output_messages == [] + + +def test_simple_context_output_messages_single_call(): + """Non-streaming: single append_output produces a single output message.""" + context = SimpleContext() + output = create_simple_context_output( + text="Hello world", + token_ids=[10, 20, 30], + prompt_token_ids=[1, 2, 3], + ) + context.append_output(output) + + messages = context.output_messages + assert len(messages) == 1 + assert messages[0].message == "Hello world" + assert messages[0].tokens == [10, 20, 30] + assert messages[0].type == "raw_message_tokens" + + +def test_simple_context_output_messages_streaming_consolidation(): + """Streaming: multiple append_output calls consolidate into one message.""" + context = SimpleContext() + + # Simulate 3 streaming deltas + context.append_output( + create_simple_context_output( + text="Hello", + token_ids=[10], + prompt_token_ids=[1, 2, 3], + ) + ) + context.append_output( + create_simple_context_output( + text=" world", + token_ids=[20], + prompt_token_ids=[1, 2, 3], + ) + ) + 
context.append_output( + create_simple_context_output( + text="!", + token_ids=[30], + prompt_token_ids=[1, 2, 3], + ) + ) + + messages = context.output_messages + assert len(messages) == 1 + assert messages[0].message == "Hello world!" + assert messages[0].tokens == [10, 20, 30] + + +def test_simple_context_output_messages_many_deltas(): + """Streaming with many small deltas still produces a single message.""" + context = SimpleContext() + + words = ["The", " quick", " brown", " fox", " jumps"] + for i, word in enumerate(words): + context.append_output( + create_simple_context_output( + text=word, + token_ids=[100 + i], + prompt_token_ids=[1, 2], + ) + ) + + messages = context.output_messages + assert len(messages) == 1 + assert messages[0].message == "The quick brown fox jumps" + assert messages[0].tokens == [100, 101, 102, 103, 104] + + +def test_simple_context_input_messages(): + """input_messages is populated on the first append_output call.""" + context = SimpleContext() + assert context.input_messages == [] + + context.append_output( + create_simple_context_output( + text="Hi", + token_ids=[10], + prompt="My prompt text", + prompt_token_ids=[1, 2, 3], + ) + ) + + assert len(context.input_messages) == 1 + assert context.input_messages[0].message == "My prompt text" + assert context.input_messages[0].tokens == [1, 2, 3] + + # Second call should not add another input message + context.append_output( + create_simple_context_output( + text=" there", + token_ids=[20], + prompt="My prompt text", + prompt_token_ids=[1, 2, 3], + ) + ) + + assert len(context.input_messages) == 1 + + +def test_simple_context_token_counting(): + """Token counting accumulates across streaming deltas.""" + context = SimpleContext() + + context.append_output( + create_simple_context_output( + text="a", + token_ids=[10, 11], + prompt_token_ids=[1, 2, 3, 4, 5], + num_cached_tokens=2, + ) + ) + context.append_output( + create_simple_context_output( + text="b", + token_ids=[12], + 
prompt_token_ids=[1, 2, 3, 4, 5], + num_cached_tokens=2, + ) + ) + + assert context.num_prompt_tokens == 5 + assert context.num_output_tokens == 3 # 2 + 1 + assert context.num_cached_tokens == 2 + + +def test_simple_context_final_output(): + """final_output reconstructs accumulated text and token_ids.""" + context = SimpleContext() + + context.append_output( + create_simple_context_output( + text="foo", + token_ids=[1, 2], + prompt_token_ids=[10], + ) + ) + context.append_output( + create_simple_context_output( + text="bar", + token_ids=[3], + prompt_token_ids=[10], + ) + ) + + final = context.final_output + assert final is not None + assert final.outputs[0].text == "foobar" + assert final.outputs[0].token_ids == (1, 2, 3) + + +def test_simple_context_output_messages_empty_text_with_tokens(): + """output_messages should be returned when tokens exist even if text is + empty (e.g. special tokens).""" + context = SimpleContext() + context.append_output( + create_simple_context_output( + text="", + token_ids=[99], + prompt_token_ids=[1], + ) + ) + + messages = context.output_messages + assert len(messages) == 1 + assert messages[0].message == "" + assert messages[0].tokens == [99] + + +def test_simple_context_output_messages_no_mutation(): + """Each call to output_messages returns a fresh list; callers can't + corrupt internal state.""" + context = SimpleContext() + context.append_output( + create_simple_context_output( + text="hello", + token_ids=[1], + prompt_token_ids=[10], + ) + ) + + msgs1 = context.output_messages + msgs2 = context.output_messages + assert msgs1 is not msgs2 + assert msgs1[0].message == msgs2[0].message + + # Appending more output updates the property + context.append_output( + create_simple_context_output( + text=" world", + token_ids=[2], + prompt_token_ids=[10], + ) + ) + + msgs3 = context.output_messages + assert len(msgs3) == 1 + assert msgs3[0].message == "hello world" + assert msgs3[0].tokens == [1, 2] diff --git 
a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1fbf19add..a91bc694b 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1379,6 +1379,11 @@ def add_dataset_parser(parser: FlexibleArgumentParser): action="store_true", help="Disable shuffling of dataset samples for deterministic ordering.", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from HuggingFace.", + ) # group for dataset specific arguments custom_group = parser.add_argument_group("custom dataset options") diff --git a/vllm/entrypoints/openai/responses/context.py b/vllm/entrypoints/openai/responses/context.py index a10567e40..b327c1e1b 100644 --- a/vllm/entrypoints/openai/responses/context.py +++ b/vllm/entrypoints/openai/responses/context.py @@ -182,7 +182,6 @@ class SimpleContext(ConversationContext): self.all_turn_metrics = [] self.input_messages: list[ResponseRawMessageAndToken] = [] - self.output_messages: list[ResponseRawMessageAndToken] = [] def append_output(self, output) -> None: self.last_output = output @@ -208,12 +207,22 @@ class SimpleContext(ConversationContext): tokens=output_prompt_token_ids, ) ) - self.output_messages.append( + + @property + def output_messages(self) -> list[ResponseRawMessageAndToken]: + """Return consolidated output as a single message. + + In streaming mode, text and tokens are accumulated across many deltas. + This property returns them as a single entry rather than one per delta. 
+ """ + if not self._accumulated_text and not self._accumulated_token_ids: + return [] + return [ ResponseRawMessageAndToken( - message=delta_output.text, - tokens=delta_output.token_ids, + message=self._accumulated_text, + tokens=list(self._accumulated_token_ids), ) - ) + ] @property def final_output(self) -> RequestOutput | None: -- GitLab From 8d48d0a9d9edfc2eb9cee6bb941be20211eb8282 Mon Sep 17 00:00:00 2001 From: Balaxxe <136368465+jaim12005@users.noreply.github.com> Date: Tue, 10 Feb 2026 00:06:30 -0700 Subject: [PATCH 0033/1166] [Bugfix] Sort hf_weights_files in fastsafetensors_weights_iterator to match #33491 (#34190) Signed-off-by: Balaxxe <136368465+jaim12005@users.noreply.github.com> --- vllm/model_executor/model_loader/weight_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index d43656c4f..7025efd1c 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -826,6 +826,7 @@ def fastsafetensors_weights_iterator( pg = SingleGroup() device = torch.device(f"cuda:{current_platform.current_device()}") + hf_weights_files = sorted(hf_weights_files, key=_natural_sort_key) weight_files_sub_lists = [ hf_weights_files[i : i + pg.size()] for i in range(0, len(hf_weights_files), pg.size()) -- GitLab From dab1de9f3895a153a7bc2ce7ef7782ba7818a146 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Tue, 10 Feb 2026 15:30:19 +0800 Subject: [PATCH 0034/1166] [Frontend][CI] Consolidate instrumentator entrypoints (#34123) Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 8 ++-- .buildkite/test-pipeline.yaml | 10 ++--- .buildkite/test_areas/entrypoints.yaml | 8 ++-- .../{openai => instrumentator}/test_basic.py | 0 .../test_optional_middleware.py | 0 .../test_orca_metrics.py | 0 .../{sleep => instrumentator}/test_sleep.py | 0 tests/entrypoints/sleep/__init__.py | 0 vllm/entrypoints/openai/api_server.py | 
22 ++++++++-- vllm/entrypoints/openai/basic/__init__.py | 0 vllm/entrypoints/sagemaker/api_router.py | 2 +- vllm/entrypoints/serve/__init__.py | 41 +------------------ .../serve/instrumentator/__init__.py | 29 +++++++++++++ .../instrumentator/basic.py} | 6 +-- .../serve/instrumentator/health.py | 4 -- .../serve/instrumentator/server_info.py | 8 +--- 16 files changed, 64 insertions(+), 74 deletions(-) rename tests/entrypoints/{openai => instrumentator}/test_basic.py (100%) rename tests/entrypoints/{openai => instrumentator}/test_optional_middleware.py (100%) rename tests/entrypoints/{openai => instrumentator}/test_orca_metrics.py (100%) rename tests/entrypoints/{sleep => instrumentator}/test_sleep.py (100%) delete mode 100644 tests/entrypoints/sleep/__init__.py delete mode 100644 vllm/entrypoints/openai/basic/__init__.py rename vllm/entrypoints/{openai/basic/api_router.py => serve/instrumentator/basic.py} (92%) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index e78cdd7f8..19fc79f61 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -132,7 +132,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -179,14 +179,14 @@ steps: torch_nightly: true source_file_dependencies: - vllm/ - - tests/entrypoints/sleep - tests/entrypoints/rpc + - tests/entrypoints/instrumentator - tests/tool_use commands: - export 
VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/sleep + - pytest -v -s entrypoints/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - label: Entrypoints Integration Test (Pooling) timeout_in_minutes: 50 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 73d4cf80c..74e0d19e0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -118,7 +118,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration Test (LLM) # 30min timeout_in_minutes: 40 @@ -148,7 +148,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/instrumentator --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ 
--ignore=entrypoints/openai/responses - pytest -v -s entrypoints/test_chat_utils.py - label: Entrypoints Integration Test (API Server 2) @@ -159,13 +159,13 @@ steps: torch_nightly: true source_file_dependencies: - vllm/ - - tests/entrypoints/sleep - tests/entrypoints/rpc + - tests/entrypoints/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/sleep - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc + - pytest -v -s entrypoints/instrumentator + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use - label: Entrypoints Integration Test (Pooling) diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 8e02d9f60..0c72e3d9b 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -42,15 +42,13 @@ steps: working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ - - tests/tool_use - - tests/entrypoints/sleep - - tests/entrypoints/instrumentator - tests/entrypoints/rpc + - tests/entrypoints/instrumentator + - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s entrypoints/instrumentator - - pytest -v -s entrypoints/sleep + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use - label: Entrypoints Integration (Pooling) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/instrumentator/test_basic.py similarity index 100% rename from tests/entrypoints/openai/test_basic.py rename to tests/entrypoints/instrumentator/test_basic.py diff --git a/tests/entrypoints/openai/test_optional_middleware.py b/tests/entrypoints/instrumentator/test_optional_middleware.py similarity index 100% rename from tests/entrypoints/openai/test_optional_middleware.py rename to tests/entrypoints/instrumentator/test_optional_middleware.py diff --git 
a/tests/entrypoints/openai/test_orca_metrics.py b/tests/entrypoints/instrumentator/test_orca_metrics.py similarity index 100% rename from tests/entrypoints/openai/test_orca_metrics.py rename to tests/entrypoints/instrumentator/test_orca_metrics.py diff --git a/tests/entrypoints/sleep/test_sleep.py b/tests/entrypoints/instrumentator/test_sleep.py similarity index 100% rename from tests/entrypoints/sleep/test_sleep.py rename to tests/entrypoints/instrumentator/test_sleep.py diff --git a/tests/entrypoints/sleep/__init__.py b/tests/entrypoints/sleep/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1ce706abc..d76a7446d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -178,10 +178,6 @@ def build_app( app = FastAPI(lifespan=lifespan) app.state.args = args - from vllm.entrypoints.openai.basic.api_router import register_basic_api_routers - - register_basic_api_routers(app) - from vllm.entrypoints.serve import register_vllm_serve_api_routers register_vllm_serve_api_routers(app) @@ -205,6 +201,24 @@ def build_app( register_generate_api_routers(app) + from vllm.entrypoints.serve.disagg.api_router import ( + attach_router as attach_disagg_router, + ) + + attach_disagg_router(app) + + from vllm.entrypoints.serve.rlhf.api_router import ( + attach_router as attach_rlhf_router, + ) + + attach_rlhf_router(app) + + from vllm.entrypoints.serve.elastic_ep.api_router import ( + attach_router as elastic_ep_attach_router, + ) + + elastic_ep_attach_router(app) + if "transcription" in supported_tasks: from vllm.entrypoints.openai.speech_to_text.api_router import ( attach_router as register_speech_to_text_api_router, diff --git a/vllm/entrypoints/openai/basic/__init__.py b/vllm/entrypoints/openai/basic/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/vllm/entrypoints/sagemaker/api_router.py 
b/vllm/entrypoints/sagemaker/api_router.py index 7c5bae5b5..1138225c3 100644 --- a/vllm/entrypoints/sagemaker/api_router.py +++ b/vllm/entrypoints/sagemaker/api_router.py @@ -10,10 +10,10 @@ import pydantic from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, Response -from vllm.entrypoints.openai.basic.api_router import base from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.serve.instrumentator.basic import base from vllm.entrypoints.serve.instrumentator.health import health from vllm.tasks import POOLING_TASKS, SupportedTask diff --git a/vllm/entrypoints/serve/__init__.py b/vllm/entrypoints/serve/__init__.py index f5c80f682..8233d3324 100644 --- a/vllm/entrypoints/serve/__init__.py +++ b/vllm/entrypoints/serve/__init__.py @@ -22,12 +22,6 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_lora_router(app) - from vllm.entrypoints.serve.elastic_ep.api_router import ( - attach_router as attach_elastic_ep_router, - ) - - attach_elastic_ep_router(app) - from vllm.entrypoints.serve.profile.api_router import ( attach_router as attach_profile_router, ) @@ -58,37 +52,6 @@ def register_vllm_serve_api_routers(app: FastAPI): attach_tokenize_router(app) - from vllm.entrypoints.serve.disagg.api_router import ( - attach_router as attach_disagg_router, - ) - - attach_disagg_router(app) - - from vllm.entrypoints.serve.rlhf.api_router import ( - attach_router as attach_rlhf_router, - ) - - attach_rlhf_router(app) - - from vllm.entrypoints.serve.instrumentator.metrics import ( - attach_router as attach_metrics_router, - ) - - attach_metrics_router(app) - - from vllm.entrypoints.serve.instrumentator.health import ( - attach_router as attach_health_router, - ) - - attach_health_router(app) - - from vllm.entrypoints.serve.instrumentator.offline_docs 
import ( - attach_router as attach_offline_docs_router, - ) - - attach_offline_docs_router(app) - from vllm.entrypoints.serve.instrumentator.server_info import ( - attach_router as attach_server_info_router, - ) + from .instrumentator import register_instrumentator_api_routers - attach_server_info_router(app) + register_instrumentator_api_routers(app) diff --git a/vllm/entrypoints/serve/instrumentator/__init__.py b/vllm/entrypoints/serve/instrumentator/__init__.py index e69de29bb..8abce0232 100644 --- a/vllm/entrypoints/serve/instrumentator/__init__.py +++ b/vllm/entrypoints/serve/instrumentator/__init__.py @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from fastapi import FastAPI + +from vllm import envs + + +def register_instrumentator_api_routers(app: FastAPI): + from .basic import router as basic_router + + app.include_router(basic_router) + + from .health import router as health_router + + app.include_router(health_router) + + from .metrics import attach_router as metrics_attach_router + + metrics_attach_router(app) + + from .offline_docs import attach_router as offline_docs_attach_router + + offline_docs_attach_router(app) + + if envs.VLLM_SERVER_DEV_MODE: + from .server_info import router as server_info_router + + app.include_router(server_info_router) diff --git a/vllm/entrypoints/openai/basic/api_router.py b/vllm/entrypoints/serve/instrumentator/basic.py similarity index 92% rename from vllm/entrypoints/openai/basic/api_router.py rename to vllm/entrypoints/serve/instrumentator/basic.py index 3378d914a..e6c96de0b 100644 --- a/vllm/entrypoints/openai/basic/api_router.py +++ b/vllm/entrypoints/serve/instrumentator/basic.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from fastapi import APIRouter, FastAPI, Request +from fastapi import APIRouter, Request from fastapi.responses import JSONResponse from 
vllm.engine.protocol import EngineClient @@ -55,7 +55,3 @@ async def get_server_load_metrics(request: Request): async def show_version(): ver = {"version": VLLM_VERSION} return JSONResponse(content=ver) - - -def register_basic_api_routers(app: FastAPI): - app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/health.py b/vllm/entrypoints/serve/instrumentator/health.py index 029ef677a..8b079ce31 100644 --- a/vllm/entrypoints/serve/instrumentator/health.py +++ b/vllm/entrypoints/serve/instrumentator/health.py @@ -27,7 +27,3 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) except EngineDeadError: return Response(status_code=503) - - -def attach_router(app): - app.include_router(router) diff --git a/vllm/entrypoints/serve/instrumentator/server_info.py b/vllm/entrypoints/serve/instrumentator/server_info.py index d6ef994f3..60967c5a6 100644 --- a/vllm/entrypoints/serve/instrumentator/server_info.py +++ b/vllm/entrypoints/serve/instrumentator/server_info.py @@ -7,7 +7,7 @@ import functools from typing import Annotated, Literal import pydantic -from fastapi import APIRouter, FastAPI, Query, Request +from fastapi import APIRouter, Query, Request from fastapi.responses import JSONResponse import vllm.envs as envs @@ -57,9 +57,3 @@ async def show_server_info( "system_env": await asyncio.to_thread(_get_system_env_info_cached), } return JSONResponse(content=server_info) - - -def attach_router(app: FastAPI): - if not envs.VLLM_SERVER_DEV_MODE: - return - app.include_router(router) -- GitLab From 97fa8f65909d4d8f2eb0edc2137fb22f576a5b25 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Mon, 9 Feb 2026 23:41:16 -0800 Subject: [PATCH 0035/1166] [BugFix] Avoid prefix cache hit in the same schedule step for mamba layers (#29387) Signed-off-by: Chen Zhang --- .../models/language/generation/test_hybrid.py | 28 +++++++++++++ tests/v1/core/test_prefix_caching.py | 2 + vllm/v1/core/kv_cache_coordinator.py | 5 +++ 
vllm/v1/core/kv_cache_manager.py | 4 ++ vllm/v1/core/sched/scheduler.py | 2 + vllm/v1/core/single_type_kv_cache_manager.py | 40 +++++++++++++++++-- 6 files changed, 78 insertions(+), 3 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 2724f612c..e853f65db 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -7,6 +7,7 @@ import pytest from tests.models.registry import HF_EXAMPLE_MODELS from tests.utils import multi_gpu_test +from vllm import LLM from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams @@ -769,3 +770,30 @@ def test_apc_multiple_prompts_partial_cached_outputs( name_0="vllm_no_cache", name_1=f"vllm_cache_it_{r_idx + 1}", ) + + +# we have to use a real large model to get reasonable results +# the model can't be a hybrid model as we need block_size 16 +@pytest.mark.parametrize("model", ["tiiuae/falcon-mamba-7b"]) +def test_apc_common_prefix_same_batch( + model: str, + monkeypatch, +) -> None: + # Required to put the two requests in the same batch + monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + llm = LLM( + model=model, + enforce_eager=True, + block_size=16, + mamba_block_size=16, + enable_prefix_caching=True, + seed=42, + ) + prompts = [ + "hello what is one plus one what is one plus one what is one plus one the answer is", # noqa: E501 + "hello what is one plus one what is one plus one what is one plus one the answer is", # noqa: E501 + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=20) + outputs = llm.generate(prompts, sampling_params) + for output in outputs: + assert "two" in output.outputs[0].text diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 287b8ad98..e2c924a61 100644 --- a/tests/v1/core/test_prefix_caching.py +++ 
b/tests/v1/core/test_prefix_caching.py @@ -857,6 +857,8 @@ def test_prefill_hybrid_model_combinations(spec_types: list[str]): # Should have blocks for all groups assert len(blocks.get_block_ids()) == num_groups + manager.new_step_starts() + # Second request: should hit cached blocks for common prefix req1 = make_request("1", common_token_ids + [4] * 5, block_size, hash_fn) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py index d8f9d69c7..eaa95dfe4 100644 --- a/vllm/v1/core/kv_cache_coordinator.py +++ b/vllm/v1/core/kv_cache_coordinator.py @@ -247,6 +247,11 @@ class KVCacheCoordinator(ABC): ) -> tuple[tuple[list[KVCacheBlock], ...], int]: pass + def new_step_starts(self) -> None: + """Called when a new step is started.""" + for manager in self.single_type_managers: + manager.new_step_starts() + class KVCacheCoordinatorNoPrefixCache(KVCacheCoordinator): """ diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 2caed0493..7f8d80475 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -488,3 +488,7 @@ class KVCacheManager: ) -> KVCacheBlocks: # Only create new KVCacheBlocks for non-empty blocks return KVCacheBlocks(blocks) if any(blocks) else self.empty_kv_cache_blocks + + def new_step_starts(self) -> None: + """Called when a new step is started.""" + self.coordinator.new_step_starts() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index aa3bc6e2c..cfd6baabb 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -347,6 +347,8 @@ class Scheduler(SchedulerInterface): # For logging. scheduled_timestamp = time.monotonic() + self.kv_cache_manager.new_step_starts() + # First, schedule the RUNNING requests. 
req_index = 0 while req_index < len(self.running) and token_budget > 0: diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 96660dc6f..0b6b7ed42 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -7,7 +7,11 @@ from collections.abc import Sequence from vllm.utils.math_utils import cdiv from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHashList, KVCacheBlock +from vllm.v1.core.kv_cache_utils import ( + BlockHashList, + BlockHashWithGroupId, + KVCacheBlock, +) from vllm.v1.kv_cache_interface import ( ChunkedLocalAttentionSpec, CrossAttentionSpec, @@ -396,6 +400,10 @@ class SingleTypeKVCacheManager(ABC): # The default behavior is to not skip any tokens. return 0 + def new_step_starts(self) -> None: + # do nothing by default + return None + class FullAttentionManager(SingleTypeKVCacheManager): @classmethod @@ -742,8 +750,11 @@ class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): class MambaManager(SingleTypeKVCacheManager): - def __init__(self, kv_cache_spec: MambaSpec, **kwargs) -> None: - super().__init__(kv_cache_spec, **kwargs) + def __init__( + self, kv_cache_spec: MambaSpec, block_pool: BlockPool, **kwargs + ) -> None: + super().__init__(kv_cache_spec, block_pool, **kwargs) + self.cached_blocks_this_step: set[BlockHashWithGroupId] = set() self.mamba_cache_mode = kv_cache_spec.mamba_cache_mode self.num_speculative_blocks: int = kv_cache_spec.num_speculative_blocks if self.mamba_cache_mode == "align": @@ -838,6 +849,15 @@ class MambaManager(SingleTypeKVCacheManager): num_tokens_main_model: int, ) -> int: assert isinstance(self.kv_cache_spec, MambaSpec) + if ( + len(new_computed_blocks) > 0 + and new_computed_blocks[-1].block_hash in self.cached_blocks_this_step + ): + # Mamba can't rely on blocks generated by other requests in the current step + # To put it in the next step, we return 
num_gpu_blocks + 1 so + # that kv_cache_manager will think there is no enough blocks to allocte now + # and don't schedule it in the current step. + return self.block_pool.num_gpu_blocks + 1 if self.mamba_cache_mode != "align": # Allocate extra `num_speculative_blocks` blocks for # speculative decoding (MTP/EAGLE) with linear attention. @@ -972,6 +992,20 @@ class MambaManager(SingleTypeKVCacheManager): """ return num_computed_tokens - 1 + def cache_blocks(self, request: Request, num_tokens: int) -> None: + num_cached_blocks_before = self.num_cached_block.get(request.request_id, 0) + super().cache_blocks(request, num_tokens) + num_cached_blocks_after = self.num_cached_block.get(request.request_id, 0) + if num_cached_blocks_after > num_cached_blocks_before: + for block in self.req_to_blocks[request.request_id][ + num_cached_blocks_before:num_cached_blocks_after + ]: + assert block.block_hash is not None + self.cached_blocks_this_step.add(block.block_hash) + + def new_step_starts(self) -> None: + self.cached_blocks_this_step.clear() + class CrossAttentionManager(SingleTypeKVCacheManager): """Manager for cross-attention KV cache in encoder-decoder models.""" -- GitLab From e1060a71a1bb96103ce9ca98345184dcdc982467 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Tue, 10 Feb 2026 02:54:41 -0500 Subject: [PATCH 0036/1166] [Perf] Optimize detokenizer python logic (#32975) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Nick Hill --- vllm/v1/engine/detokenizer.py | 12 ++++++++---- vllm/v1/engine/output_processor.py | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index e77a316b2..18e4c98f8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -35,6 +35,9 @@ class IncrementalDetokenizer: def output_token_ids(self) -> list[int]: return self.token_ids + 
def num_output_tokens(self) -> int: + return len(self.token_ids) + def update(self, new_token_ids: list[int], stop_terminated: bool) -> str | None: self.token_ids.extend(new_token_ids) return None @@ -112,14 +115,12 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): skipped_stop_token_id = None # 1) Detokenize the new token ids incrementally. - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. stop_check_offset = len(self.output_text) for new_token_id in new_token_ids: self.token_ids.append(new_token_id) self.output_text += self.decode_next(new_token_id) # Support min_tokens, see https://github.com/vllm-project/vllm/pull/22014 - if self.min_tokens and len(self.output_token_ids) <= self.min_tokens: + if self.min_tokens and self.num_output_tokens() <= self.min_tokens: stop_check_offset = len(self.output_text) if skipped_stop_token_id is not None: @@ -128,7 +129,7 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # 2) Evaluate stop strings. 
stop_string = None - if self.stop and len(self.output_token_ids) > self.min_tokens: + if self.stop and self.num_output_tokens() > self.min_tokens: stop = check_stop_strings( output_text=self.output_text, new_char_count=len(self.output_text) - stop_check_offset, @@ -295,6 +296,9 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): else (self.token_ids[self.prompt_len :]) ) + def num_output_tokens(self) -> int: + return len(self.token_ids) - self.prompt_len + def decode_next(self, next_token_id: int) -> str: new_tokens, decoded_text, prefix_offset, read_offset = detokenize_incrementally( tokenizer=self.tokenizer, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 00a5355e0..58c73fbc6 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -292,7 +292,7 @@ class RequestState: if not ( finished or self.sent_tokens_offset == 0 - or len(self.detokenizer.output_token_ids) - self.sent_tokens_offset + or self.detokenizer.num_output_tokens() - self.sent_tokens_offset >= self.stream_interval ): return None @@ -303,7 +303,7 @@ class RequestState: new_token_ids = self.detokenizer.output_token_ids[ self.sent_tokens_offset : ] - self.sent_tokens_offset = len(self.detokenizer.output_token_ids) + self.sent_tokens_offset = self.detokenizer.num_output_tokens() external_req_id = self.external_req_id -- GitLab From 998e2d91f84e2b30dc40c8543b879d4e412d6f14 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Feb 2026 15:59:04 +0800 Subject: [PATCH 0037/1166] Revert #34208 (#34216) --- vllm/benchmarks/datasets.py | 5 ----- vllm/benchmarks/serve.py | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index a91bc694b..86a5cec2f 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1314,11 +1314,6 @@ class _ValidateDatasetArgs(argparse.Action): def add_dataset_parser(parser: FlexibleArgumentParser): 
- parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( "--num-prompts", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 820427022..dd853f15a 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1300,6 +1300,11 @@ def add_cli_args(parser: argparse.ArgumentParser): "bursty requests. A higher burstiness value (burstiness > 1) " "results in a more uniform arrival of requests.", ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) parser.add_argument( "--disable-tqdm", action="store_true", -- GitLab From 5f970120f06daab162b8692cfce39b0f366b9b47 Mon Sep 17 00:00:00 2001 From: Zetong Li <48438720+slippersss@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:22:03 +0800 Subject: [PATCH 0038/1166] [Bugfix] Fix memory inconsistency in cross-process shared memory (#32022) Signed-off-by: Zetong Li --- vllm/distributed/device_communicators/shm_broadcast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 31c6084c9..ef5f74c1e 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -488,6 +488,12 @@ class MessageQueue: for i in range(1, self.buffer.n_reader + 1): # set read flag to 0, meaning it is not read yet metadata_buffer[i] = 0 + # Memory fence here ensures the order of the buffer and flag + # writes. This guarantees that when `metadata_buffer[0] = 1` is + # visible to readers, `buf` can be completely ready. Without + # this, some CPU architectures with weak ordering may incur + # memory inconsistency. 
+ memory_fence() # mark the block as written metadata_buffer[0] = 1 # Memory fence ensures the write is visible to readers on other cores -- GitLab From 2c32558a3c467253161e32203584c1ecb33bb584 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 10 Feb 2026 16:29:10 +0800 Subject: [PATCH 0039/1166] [Bugfix] Fix `--trust-remote-code` conflict (#34218) Signed-off-by: DarkLight1337 --- examples/offline_inference/spec_decode.py | 7 +------ vllm/benchmarks/datasets.py | 11 +---------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index d8c5ece4f..e60226ba6 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -5,14 +5,9 @@ from transformers import AutoTokenizer from vllm import LLM, SamplingParams from vllm.benchmarks.datasets import add_dataset_parser, get_samples +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.v1.metrics.reader import Counter, Vector -try: - from vllm.utils.argparse_utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - - QUESTION = "What is the content of each image?" 
IMAGE_URLS = [ "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg", diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 86a5cec2f..17cc2984f 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -39,6 +39,7 @@ from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict from vllm.multimodal.image import convert_image_mode from vllm.tokenizers import TokenizerLike +from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.import_utils import PlaceholderModule try: @@ -57,11 +58,6 @@ try: except ImportError: librosa = PlaceholderModule("librosa") -try: - from vllm.utils.argparse_utils import FlexibleArgumentParser -except ImportError: - from argparse import ArgumentParser as FlexibleArgumentParser - logger = logging.getLogger(__name__) # ----------------------------------------------------------------------------- @@ -1374,11 +1370,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser): action="store_true", help="Disable shuffling of dataset samples for deterministic ordering.", ) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from HuggingFace.", - ) # group for dataset specific arguments custom_group = parser.add_argument_group("custom dataset options") -- GitLab From cbea11c9f0ddeef8f5e31449b2e6a37d08e4e653 Mon Sep 17 00:00:00 2001 From: zzaebok <44357534+zzaebok@users.noreply.github.com> Date: Tue, 10 Feb 2026 18:16:26 +0800 Subject: [PATCH 0040/1166] [Docs] Fix format error in KV load failure recovery doc (#34137) Signed-off-by: Jaebok Lee --- examples/offline_inference/kv_load_failure_recovery/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/offline_inference/kv_load_failure_recovery/README.md b/examples/offline_inference/kv_load_failure_recovery/README.md index 1f29a6ff5..176141b5d 100644 --- 
a/examples/offline_inference/kv_load_failure_recovery/README.md +++ b/examples/offline_inference/kv_load_failure_recovery/README.md @@ -28,3 +28,4 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron ```bash ./run.sh +``` -- GitLab From ae4e280602f3c91d322a449f33f5aebbdd59ccc1 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 10 Feb 2026 02:41:24 -0800 Subject: [PATCH 0041/1166] [Bugfix] Fix FI kernel`chunk_gated_delta_rule` output shape for Qwen3.5 (#34219) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_next.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index de97daccf..d0c13dd49 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -135,7 +135,7 @@ def fi_chunk_gated_delta_rule( fi_state = initial_state.to(torch.float32) fi_g = g.to(torch.float32) fi_beta = beta.to(torch.float32) - return chunk_gated_delta_rule_fi( + output, final_state = chunk_gated_delta_rule_fi( q=q, k=k, v=v, @@ -145,6 +145,8 @@ def fi_chunk_gated_delta_rule( output_final_state=output_final_state, cu_seqlens=cu_seqlens, ) + # Unsqueeze back to 4D (1, L, H, D) to match fla output format + return output.unsqueeze(0), final_state @CustomOp.register("chunk_gated_delta_rule") -- GitLab From e042d7e685daacfa9d4df92cc7d330060327a32b Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Tue, 10 Feb 2026 18:51:48 +0800 Subject: [PATCH 0042/1166] Add flagos in MiniCPM-o (#34126) Signed-off-by: tc-mb Signed-off-by: Vincent-Xiao Co-authored-by: Vincent-Xiao --- vllm/model_executor/models/minicpmo.py | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 28978693c..39b79e4b1 100644 --- a/vllm/model_executor/models/minicpmo.py +++ 
b/vllm/model_executor/models/minicpmo.py @@ -24,6 +24,7 @@ # limitations under the License. """Inference-only MiniCPM-O model compatible with HuggingFace weights.""" +import os from collections.abc import Callable, Iterable, Mapping, Sequence from typing import Annotated, Any, Literal, TypeAlias @@ -75,6 +76,47 @@ from .utils import AutoWeightsLoader, cast_overflow_tensors, maybe_prefix CPU_DEVICE = torch.device("cpu") +if os.getenv("USE_FLAGOS") == "1": + import flag_gems + + FLAG_GEMS_CONFIG = [ + "sort", + "sort_stable", + "layer_norm", + "clamp_", + "cos", + "embedding", + "exp", + "exponential_", + "full", + "gather", + "gelu", + "index", + "le", + "lt", + "lt_scalar", + "masked_fill_", + "max", + "ones", + "pow_scalar", + "prod_dim", + "rand_like", + "reciprocal", + "repeat", + "scatter", + "scatter_", + "sin", + "sub", + "true_divide", + "true_divide_", + "uniform_", + "where_scalar_self", + "where_self_out", + "zeros", + "zeros_like", + ] + flag_gems.only_enable(record=False, include=FLAG_GEMS_CONFIG) + class MiniCPMOAudioFeatureInputs(TensorSchema): """ -- GitLab From 94de871546e8da687c08ed8a7e0a26531500d4bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ph=C3=BAc=20H=2E=20L=C3=AA=20Kh=E1=BA=AFc?= Date: Tue, 10 Feb 2026 18:16:21 +0700 Subject: [PATCH 0043/1166] [Misc] allow specify is_mm_prefix_lm in hf_config (#34215) --- vllm/config/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/config/model.py b/vllm/config/model.py index 96dbf9725..749af0d5d 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1119,6 +1119,9 @@ class ModelConfig: @cached_property def is_mm_prefix_lm(self) -> bool: """Whether to use bidirectional attention for mm positions.""" + if hasattr(self.hf_config, "is_mm_prefix_lm"): + return bool(self.hf_config.is_mm_prefix_lm) + # fallback to list of known models MM_PREFIX_LM_MODELS = ( "gemma3", "molmo2", -- GitLab From 61413973e83b9ca07f3c894a90ddecca0a39d2b6 Mon Sep 17 00:00:00 2001 From: Harry Mellor 
<19981378+hmellor@users.noreply.github.com> Date: Tue, 10 Feb 2026 13:08:20 +0100 Subject: [PATCH 0044/1166] Stop testing for slow tokenizers as they will not exist soon (#34235) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/tokenizers_/test_basic.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/tokenizers_/test_basic.py b/tests/tokenizers_/test_basic.py index 99f68ecd0..1c1dd3338 100644 --- a/tests/tokenizers_/test_basic.py +++ b/tests/tokenizers_/test_basic.py @@ -4,7 +4,6 @@ from typing import _get_protocol_attrs # type: ignore import pytest from transformers import ( - PreTrainedTokenizer, PreTrainedTokenizerBase, PreTrainedTokenizerFast, ) @@ -25,10 +24,6 @@ def _assert_tokenizer_like(tokenizer: object): def test_tokenizer_like_protocol(): - tokenizer = get_tokenizer("gpt2", use_fast=False) - assert isinstance(tokenizer, PreTrainedTokenizer) - _assert_tokenizer_like(tokenizer) - tokenizer = get_tokenizer("gpt2", use_fast=True) assert isinstance(tokenizer, PreTrainedTokenizerFast) _assert_tokenizer_like(tokenizer) -- GitLab From 748625cdafd7898b163115d8c33c7c5521a708e8 Mon Sep 17 00:00:00 2001 From: Krish Gupta Date: Tue, 10 Feb 2026 18:35:32 +0530 Subject: [PATCH 0045/1166] [V1][BugFix] Fix EAGLE3 encoder cache miss with disable_chunked_mm_input (#34220) Signed-off-by: KrxGu --- tests/v1/core/test_scheduler.py | 69 +++++++++++++++++++++++++++++++++ vllm/v1/core/sched/scheduler.py | 7 +++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index a1e3d09d2..376b06a5e 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -3675,3 +3675,72 @@ def test_abort_request_finished_recving(): # verify request is deleted assert request.request_id not in scheduler.requests assert not scheduler.finished_recving_kv_req_ids + + +def test_eagle3_mm_encoder_cache_with_shift(): + """Test EAGLE3 encoder scheduling accounts 
for shift_computed_tokens. + + Regression test for issue #32469: When EAGLE3 is enabled with + disable_chunked_mm_input=True, ensure encoder inputs are scheduled + when tokens overlap the MM range, properly accounting for + shift_computed_tokens in the boundary calculation. + + Without the fix, the scheduler would fail to schedule encoder inputs + at the boundary, causing "Encoder cache miss" errors. + """ + scheduler = create_scheduler( + model="llava-hf/llava-1.5-7b-hf", + max_num_batched_tokens=1024, + disable_chunked_mm_input=True, + max_model_len=2048, + num_speculative_tokens=4, # This enables EAGLE with shift=1 + ) + + mm_start_pos = 100 + mm_length = 576 + + mm_positions = [ + [PlaceholderRange(offset=mm_start_pos, length=mm_length)], + ] + + requests = create_requests( + num_requests=1, + num_tokens=mm_start_pos + mm_length + 100, + mm_positions=mm_positions, + ) + + # Start with some tokens already computed to simulate decoding + request = requests[0] + request.num_computed_tokens = 0 + + scheduler.add_request(request) + output = scheduler.schedule() + + assert output is not None + shift_computed_tokens = 1 + req_id = request.request_id + + assert req_id in output.num_scheduled_tokens + num_scheduled = output.num_scheduled_tokens[req_id] + + mm_feature = request.mm_features[0] + start_pos = mm_feature.mm_position.offset + tokens_end = request.num_computed_tokens + num_scheduled + scheduled_end_with_shift = tokens_end + shift_computed_tokens + + # Assert that we scheduled into the MM range (test setup verification) + assert scheduled_end_with_shift > start_pos, ( + f"Test setup error: expected to schedule into MM range. " + f"scheduled_end_with_shift={scheduled_end_with_shift}, " + f"start_pos={start_pos}" + ) + + # The key assertion: when scheduled tokens overlap MM range + # (accounting for EAGLE's shift), encoder MUST be scheduled. + # Without the fix, this would fail at the boundary case. 
+ assert req_id in output.scheduled_encoder_inputs, ( + f"Encoder input missing: scheduled {num_scheduled} tokens " + f"(computed={request.num_computed_tokens}, end={tokens_end}, " + f"shifted_end={scheduled_end_with_shift}) overlapping MM at " + f"{start_pos}. The fix must schedule encoder inputs." + ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index cfd6baabb..9546672de 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1155,7 +1155,12 @@ class Scheduler(SchedulerInterface): and (num_computed_tokens + num_new_tokens) < (start_pos + num_encoder_tokens) ): - num_new_tokens = start_pos - num_computed_tokens + # Account for EAGLE shift when rolling back to avoid + # encoder cache miss. This ensures the scheduled range + # stops before start_pos even with the shift. + num_new_tokens = max( + 0, start_pos - (num_computed_tokens + shift_computed_tokens) + ) break if not self.encoder_cache_manager.can_allocate( request, i, encoder_compute_budget, num_embeds_to_schedule -- GitLab From d0bc52056915e108c347aa4b5520e163e5c5b726 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 10 Feb 2026 14:46:01 +0100 Subject: [PATCH 0046/1166] Bump `mamba-ssm` version in CI for Transformers v5 compatibility (#34233) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .buildkite/test-pipeline.yaml | 4 ++-- .buildkite/test_areas/models_language.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 74e0d19e0..24bd1736a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -862,7 +862,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install 
--system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - pytest -v -s models/language/generation \ @@ -881,7 +881,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index f70192c4e..7a64604c3 100644 --- a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -40,7 +40,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB @@ -56,7 +56,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 
'git+https://github.com/state-spaces/mamba@v2.3.0' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' -- GitLab From a1946570d80c1bef78063e84b097951d8e8d4e6a Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Tue, 10 Feb 2026 06:23:52 -0800 Subject: [PATCH 0047/1166] add --insecure arg to the vllm bench to skip TLS (#34026) Signed-off-by: Fan Yang Co-authored-by: Fan Yang --- tests/benchmarks/test_serve_cli.py | 109 ++++++++++++++++++++++++++++- vllm/benchmarks/serve.py | 35 +++++++-- 2 files changed, 139 insertions(+), 5 deletions(-) diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index c579b3806..8aa17b7ef 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -1,15 +1,76 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess +import tempfile +import time +from pathlib import Path import pytest +import requests +import urllib3 from ..utils import RemoteOpenAIServer MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" -@pytest.fixture(scope="module") +def generate_self_signed_cert(cert_dir: Path) -> tuple[Path, Path]: + """Generate a self-signed certificate for testing.""" + cert_file = cert_dir / "cert.pem" + key_file = cert_dir / "key.pem" + + # Generate self-signed certificate using openssl + subprocess.run( + [ + "openssl", + "req", + "-x509", + "-newkey", + "rsa:2048", + "-keyout", + str(key_file), + "-out", + str(cert_file), + "-days", + "1", + "-nodes", + "-subj", + "/CN=localhost", + ], + check=True, + capture_output=True, + ) + return cert_file, key_file + + +class RemoteOpenAIServerSSL(RemoteOpenAIServer): + """RemoteOpenAIServer subclass that supports SSL with self-signed certs.""" + + @property + def url_root(self) -> str: + return f"https://{self.host}:{self.port}" + + def 
_wait_for_server(self, *, url: str, timeout: float): + """Override to use HTTPS with SSL verification disabled.""" + # Suppress InsecureRequestWarning for self-signed certs + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + start = time.time() + while True: + try: + if requests.get(url, verify=False).status_code == 200: + break + except Exception: + result = self._poll() + if result is not None and result != 0: + raise RuntimeError("Server exited unexpectedly.") from None + + time.sleep(0.5) + if time.time() - start > timeout: + raise RuntimeError("Server failed to start in time.") from None + + +@pytest.fixture(scope="function") def server(): args = ["--max-model-len", "1024", "--enforce-eager", "--load-format", "dummy"] @@ -17,6 +78,27 @@ def server(): yield remote_server +@pytest.fixture(scope="function") +def ssl_server(): + """Start a vLLM server with SSL enabled using a self-signed certificate.""" + with tempfile.TemporaryDirectory() as cert_dir: + cert_file, key_file = generate_self_signed_cert(Path(cert_dir)) + args = [ + "--max-model-len", + "1024", + "--enforce-eager", + "--load-format", + "dummy", + "--ssl-certfile", + str(cert_file), + "--ssl-keyfile", + str(key_file), + ] + + with RemoteOpenAIServerSSL(MODEL_NAME, args) as remote_server: + yield remote_server + + @pytest.mark.benchmark def test_bench_serve(server): # Test default model detection and input/output len @@ -42,6 +124,31 @@ def test_bench_serve(server): assert result.returncode == 0, f"Benchmark failed: {result.stderr}" +@pytest.mark.benchmark +def test_bench_serve_insecure(ssl_server): + """Test --insecure flag with an HTTPS server using a self-signed certificate.""" + base_url = f"https://{ssl_server.host}:{ssl_server.port}" + command = [ + "vllm", + "bench", + "serve", + "--base-url", + base_url, + "--input-len", + "32", + "--output-len", + "4", + "--num-prompts", + "5", + "--insecure", + ] + result = subprocess.run(command, capture_output=True, text=True) + 
print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" + + @pytest.mark.benchmark def test_bench_serve_chat(server): command = [ diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dd853f15a..a1361fb80 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -26,6 +26,7 @@ import json import os import random import shutil +import ssl import time import uuid import warnings @@ -60,11 +61,14 @@ TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) a async def get_first_model_from_server( - base_url: str, headers: dict | None = None + base_url: str, + headers: dict | None = None, + ssl_context: ssl.SSLContext | bool | None = None, ) -> tuple[str, str]: """Fetch the first model from the server's /v1/models endpoint.""" models_url = f"{base_url}/v1/models" - async with aiohttp.ClientSession() as session: + connector = aiohttp.TCPConnector(ssl=ssl_context) + async with aiohttp.ClientSession(connector=connector) as session: try: async with session.get(models_url, headers=headers) as response: response.raise_for_status() @@ -619,6 +623,7 @@ async def benchmark( ramp_up_start_rps: int | None = None, ramp_up_end_rps: int | None = None, ready_check_timeout_sec: int = 600, + ssl_context: ssl.SSLContext | bool | None = None, ): try: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] @@ -626,6 +631,8 @@ async def benchmark( raise ValueError(f"Unknown backend: {endpoint_type}") from None # Reuses connections across requests to reduce TLS handshake overhead. 
+ # Use ssl_context if provided, otherwise default to True for https URLs + ssl_setting = ssl_context if ssl_context is not None else ("https://" in api_url) connector = aiohttp.TCPConnector( limit=max_concurrency or 0, limit_per_host=max_concurrency or 0, @@ -634,7 +641,7 @@ async def benchmark( keepalive_timeout=60, enable_cleanup_closed=True, force_close=False, - ssl=("https://" in api_url), + ssl=ssl_setting, ) session = aiohttp.ClientSession( @@ -1513,6 +1520,14 @@ def add_cli_args(parser: argparse.ArgumentParser): default=None, ) + parser.add_argument( + "--insecure", + action="store_true", + default=False, + help="Disable SSL certificate verification. Use this option when " + "connecting to servers with self-signed certificates.", + ) + def main(args: argparse.Namespace) -> dict[str, Any]: return asyncio.run(main_async(args)) @@ -1564,10 +1579,21 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: else: raise ValueError("Invalid header format. Please use KEY=VALUE format.") + # SSL context configuration + ssl_context: ssl.SSLContext | bool | None = None + if args.insecure: + # Disable SSL certificate verification + ssl_context = False + elif "https://" in base_url: + # Use default SSL context for HTTPS + ssl_context = True + # Fetch model from server if not specified if args.model is None: print("Model not specified, fetching first model from server...") - model_name, model_id = await get_first_model_from_server(base_url, headers) + model_name, model_id = await get_first_model_from_server( + base_url, headers, ssl_context + ) print(f"First model name: {model_name}, first model id: {model_id}") else: model_name = args.served_model_name @@ -1691,6 +1717,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: ramp_up_start_rps=args.ramp_up_start_rps, ramp_up_end_rps=args.ramp_up_end_rps, ready_check_timeout_sec=args.ready_check_timeout_sec, + ssl_context=ssl_context, ) # Save config and results to json -- GitLab From 
599e4335a42bbb6f2cad75ac0b4be81272a77aa3 Mon Sep 17 00:00:00 2001 From: mgazz Date: Tue, 10 Feb 2026 15:04:16 +0000 Subject: [PATCH 0048/1166] Support benchmarking of Geospatial models (#33922) Signed-off-by: Michele Gazzetti --- vllm/benchmarks/datasets.py | 54 +++++++------ vllm/benchmarks/lib/endpoint_request_func.py | 32 ++++++++ vllm/benchmarks/serve.py | 80 ++++++++++++-------- 3 files changed, 110 insertions(+), 56 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 17cc2984f..f06f41a47 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -2072,32 +2072,38 @@ class CustomDataset(BenchmarkDataset): break prompt = item["prompt"] - new_output_len = output_len - if output_len is None or output_len == -1: - # check that the request has an 'output_tokens' field - if "output_tokens" not in item: - raise ValueError( - "If no output length is provided the " - "custom dataset must contain an 'output_tokens' field." + if tokenizer is None: + new_output_len = 1 + else: + new_output_len = output_len + if output_len is None or output_len == -1: + # check that the request has an 'output_tokens' field + if "output_tokens" not in item: + raise ValueError( + "If no output length is provided the " + "custom dataset must contain an 'output_tokens' field." + ) + # Use number of output tokens from the request data + try: + new_output_len = int(item["output_tokens"]) + except (ValueError, TypeError) as e: + raise ValueError( + f"Invalid value for 'output_tokens' in custom dataset: " + f"'{item['output_tokens']}'. Must be an integer." 
+ ) from e + + if tokenizer is None: + prompt_len = 1 + else: + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, ) - # Use number of output tokens from the request data - try: - new_output_len = int(item["output_tokens"]) - except (ValueError, TypeError) as e: - raise ValueError( - f"Invalid value for 'output_tokens' in custom dataset: " - f"'{item['output_tokens']}'. Must be an integer." - ) from e - # apply template - if not skip_chat_template: - prompt = tokenizer.apply_chat_template( - [{"role": "user", "content": prompt}], - add_generation_prompt=True, - tokenize=False, - ) - - prompt_len = len(tokenizer(prompt).input_ids) + prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( prompt=prompt, diff --git a/vllm/benchmarks/lib/endpoint_request_func.py b/vllm/benchmarks/lib/endpoint_request_func.py index cccbcdb83..e231ccf6e 100644 --- a/vllm/benchmarks/lib/endpoint_request_func.py +++ b/vllm/benchmarks/lib/endpoint_request_func.py @@ -746,6 +746,37 @@ async def async_request_infinity_embeddings_clip( ) +async def async_request_vllm_pooling( + request_func_input: RequestFuncInput, + session: aiohttp.ClientSession, + pbar: tqdm | None = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + _validate_api_url(api_url, "vLLM Pooling API", "pooling") + + payload = { + "model": request_func_input.model_name + if request_func_input.model_name + else request_func_input.model, + "truncate_prompt_tokens": -1, + } + + payload = payload | request_func_input.prompt + + _update_payload_common(payload, request_func_input) + + headers = _get_headers("application/json") + _update_headers_common(headers, request_func_input) + + return await _run_pooling_request( + session, + api_url, + payload=payload, + headers=headers, + pbar=pbar, + ) + + # TODO: Add more request functions for different API protocols. 
ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "vllm": async_request_openai_completions, @@ -760,6 +791,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = { "infinity-embeddings": async_request_infinity_embeddings, "infinity-embeddings-clip": async_request_infinity_embeddings_clip, # (Infinity embedding server does not support vlm2vec) + "vllm-pooling": async_request_vllm_pooling, "vllm-rerank": async_request_vllm_rerank, } diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index a1361fb80..534392883 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -423,16 +423,19 @@ def calculate_metrics( output_len = outputs[i].output_tokens if not output_len: - # We use the tokenizer to count the number of output tokens - # for some serving backends instead of looking at - # len(outputs[i].itl) since multiple output tokens may be - # bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer( - outputs[i].generated_text, add_special_tokens=False - ).input_ids - ) + if tokenizer is None: + output_len = 1 + else: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer( + outputs[i].generated_text, add_special_tokens=False + ).input_ids + ) actual_output_lens.append(output_len) total_input += input_requests[i].prompt_len tpot = 0 @@ -919,7 +922,7 @@ async def benchmark( print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - if isinstance(metrics, BenchmarkMetrics): + if isinstance(metrics, BenchmarkMetrics) and tokenizer: print("{:<40} {:<10}".format("Total generated tokens:", 
metrics.total_output)) print( "{:<40} {:<10.2f}".format( @@ -933,16 +936,18 @@ async def benchmark( ) ) if isinstance(metrics, BenchmarkMetrics): - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput + if tokenizer: + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) ) - ) - print( - "{:<40} {:<10.2f}".format( - "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s + print( + "{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", + metrics.max_output_tokens_per_s, + ) ) - ) print( "{:<40} {:<10.2f}".format( "Peak concurrent requests:", metrics.max_concurrent_requests @@ -954,11 +959,12 @@ async def benchmark( "RTFx (Inverse Real-Time Factor):", metrics.rtfx ) ) - print( - "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", metrics.total_token_throughput + if tokenizer: + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", metrics.total_token_throughput + ) ) - ) if isinstance(metrics, BenchmarkMetrics): result = { @@ -1047,7 +1053,7 @@ async def benchmark( print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value - if task_type == TaskType.GENERATION: + if task_type == TaskType.GENERATION and tokenizer: process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") @@ -1519,6 +1525,12 @@ def add_cli_args(parser: argparse.ArgumentParser): type=json.loads, default=None, ) + parser.add_argument( + "--skip-tokenizer-init", + action="store_true", + default=False, + help="Skip initialization of tokenizer and detokenizer", + ) parser.add_argument( "--insecure", @@ -1599,14 +1611,18 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]: model_name = args.served_model_name model_id = args.model - tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id - tokenizer_mode = args.tokenizer_mode - - tokenizer = get_tokenizer( - tokenizer_id, - tokenizer_mode=tokenizer_mode, - trust_remote_code=args.trust_remote_code, - ) + if args.skip_tokenizer_init: + tokenizer_id = None + tokenizer_mode = None + tokenizer = None + else: + tokenizer_id = args.tokenizer if args.tokenizer is not None else model_id + tokenizer_mode = args.tokenizer_mode + tokenizer = get_tokenizer( + tokenizer_id, + tokenizer_mode=tokenizer_mode, + trust_remote_code=args.trust_remote_code, + ) if args.dataset_name is None: raise ValueError( -- GitLab From b129136c7a7389133c923123a1ebd76c4401c94d Mon Sep 17 00:00:00 2001 From: xuebwang-amd Date: Tue, 10 Feb 2026 23:08:05 +0800 Subject: [PATCH 0049/1166] [ROCm][Quantization] GPT_OSS in amd-quark format model loading and emulations (#29008) Signed-off-by: xuebwang-amd Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- .../moe/test_gpt_oss_triton_kernels.py | 20 +- tests/models/quantization/test_gpt_oss.py | 110 ++++ .../test_gpt_oss_attn_quantization.py | 80 --- .../model_executor/layers/fused_moe/config.py | 36 ++ .../layers/fused_moe/fused_moe.py | 29 +- vllm/model_executor/layers/fused_moe/layer.py | 37 +- vllm/model_executor/layers/fused_moe/utils.py | 23 + .../layers/quantization/base_config.py | 16 + 
.../layers/quantization/mxfp4.py | 5 + .../layers/quantization/quark/quark.py | 70 ++- .../layers/quantization/quark/quark_moe.py | 352 ++++++++++-- .../layers/quantization/utils/ocp_mx_utils.py | 20 +- vllm/model_executor/models/gpt_oss.py | 509 +++++++++++++++++- 13 files changed, 1094 insertions(+), 213 deletions(-) create mode 100644 tests/models/quantization/test_gpt_oss.py delete mode 100644 tests/models/quantization/test_gpt_oss_attn_quantization.py diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 384f43db4..4900949ad 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -22,7 +22,7 @@ from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor from triton_kernels.tensor_details import layout from triton_kernels.testing import assert_close -from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( triton_kernel_moe_forward, ) @@ -298,12 +298,18 @@ def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init): pc2, ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8) - quant_config = FusedMoEQuantConfig.make( - w1_bias=w1_bias_tri, - w2_bias=w2_bias_tri, - w1_scale=pc1, - w2_scale=pc2, - ) + if a_dtype == "bf16" and w_dtype == "mx4": + quant_config = mxfp4_w4a16_moe_quant_config( + w1_scale=pc1, + w2_scale=pc2, + w1_bias=w1_bias_tri, + w2_bias=w2_bias_tri, + ) + else: + raise NotImplementedError( + f"Quantization configuration for activation={a_dtype} and weight={w_dtype} " + f"has not been implemented." 
+ ) out_triton_monolithic = triton_kernel_moe_forward( hidden_states=x_tri, diff --git a/tests/models/quantization/test_gpt_oss.py b/tests/models/quantization/test_gpt_oss.py new file mode 100644 index 000000000..e70ccaf88 --- /dev/null +++ b/tests/models/quantization/test_gpt_oss.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +End-to-end accuracy test for GPT-OSS model quantization. + +Config: + Task: gsm8k_platinum + Filter: flexible-extract + n-shot: 5 + Metric: exact_match + +Run: pytest tests/models/quantization/test_gpt_oss.py +""" + +import importlib +import importlib.metadata +from dataclasses import dataclass + +import huggingface_hub +import lm_eval +import pytest +from packaging import version + +MODEL_ACCURACIES = { + # Full quantization: attention linears and MoE linears + "amd/gpt-oss-20b-WFP8-AFP8-KVFP8": 0.89, + # MoE linears only quantization + "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8": 0.89, + # MoE linears only quantization + # "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-MXFP4-KV-FP8": 0.90, +} + +QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark") +) >= version.parse("0.9.0") + + +def has_huggingface_access(repo): + try: + huggingface_hub.list_repo_refs(repo) + return True + except huggingface_hub.errors.RepositoryNotFoundError: + return False + + +HF_HUB_AMD_ORG_ACCESS = all( + [has_huggingface_access(model_name) for model_name in MODEL_ACCURACIES] +) + + +@dataclass +class ModelCase: + model_id: str + tp: int + + +@dataclass +class EvaluationConfig: + model_name: str + + def get_model_args(self, tp_size: int): + return { + "pretrained": self.model_name, + "chat_template_args": {"reasoning_effort": "low"}, + "enable_thinking": True, + "think_end_token": "200008", + "tensor_parallel_size": tp_size, + "dtype": "auto", + "gpu_memory_utilization": 0.95, + "trust_remote_code": False, + 
"enable_prefix_caching": False, + "enforce_eager": False, + } + + +@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available") +@pytest.mark.skipif( + not HF_HUB_AMD_ORG_ACCESS, + reason="Read access to huggingface.co/amd is required for this test.", +) +@pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) +@pytest.mark.parametrize("model_name, expected_accuracy", MODEL_ACCURACIES.items()) +def test_gpt_oss_attention_quantization( + model_name: str, tp_size: int, expected_accuracy: float +): + model_args = EvaluationConfig(model_name).get_model_args(tp_size) + + extra_run_kwargs = { + "gen_kwargs": {"max_gen_toks": 8000}, + "apply_chat_template": True, + "fewshot_as_multiturn": True, + "num_fewshot": 5, + } + + lm_eval_out = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k_platinum", + batch_size="auto", + **extra_run_kwargs, + ) + measured_accuracy = float( + lm_eval_out["results"]["gsm8k_platinum"]["exact_match,flexible-extract"] + ) + + rtol = 0.02 + assert ( + measured_accuracy - rtol < expected_accuracy + and measured_accuracy + rtol > expected_accuracy + ), f"Expected: {expected_accuracy} | Measured: {measured_accuracy}" diff --git a/tests/models/quantization/test_gpt_oss_attn_quantization.py b/tests/models/quantization/test_gpt_oss_attn_quantization.py deleted file mode 100644 index 780165ea2..000000000 --- a/tests/models/quantization/test_gpt_oss_attn_quantization.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test attention quantization of gpt-oss model. -The qkv_proj and o_proj in self_attention can be either quantized or excluded. - -Run `pytest tests/models/quantization/test_gpt_oss_attn_quantization.py`. 
- -""" - -import importlib -import importlib.metadata -from dataclasses import dataclass - -import huggingface_hub -import lm_eval -import pytest -from packaging import version - -MODEL_NAMES = ["amd/gpt-oss-20b-customized-attention-quantization"] - -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec("quark") is not None and version.parse( - importlib.metadata.version("amd-quark") -) >= version.parse("0.8.99") - - -def has_huggingface_access(repo): - try: - huggingface_hub.list_repo_refs(repo) - return True - except huggingface_hub.errors.RepositoryNotFoundError: - return False - - -HF_HUB_AMD_ORG_ACCESS = all( - [has_huggingface_access(model_name) for model_name in MODEL_NAMES] -) - - -@dataclass -class ModelCase: - model_id: str - tp: int - - -@dataclass -class EvaluationConfig: - model_name: str - - def get_model_args(self) -> str: - return ( - f"pretrained={self.model_name}," - "tensor_parallel_size=4,dtype=auto,gpu_memory_utilization=0.9,trust_remote_code=False" - ) - - -EXPECTED_ACCURACIES = {"arc_challenge": 0.20} - - -@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available") -@pytest.mark.skipif( - not HF_HUB_AMD_ORG_ACCESS, - reason="Read access to huggingface.co/amd is required for this test.", -) -@pytest.mark.parametrize("model_name", MODEL_NAMES) -@pytest.mark.parametrize("task_name, expected_accuracy", EXPECTED_ACCURACIES.items()) -def test_gpt_oss_attention_quantization( - model_name: str, task_name: str, expected_accuracy: float -): - measured_accuracy = lm_eval.simple_evaluate( - model="vllm", - model_args=EvaluationConfig(model_name).get_model_args(), - tasks=task_name, - batch_size="auto", - )["results"][task_name]["acc,none"] - - rtol = 0.05 - assert ( - measured_accuracy - rtol < expected_accuracy - and measured_accuracy + rtol > expected_accuracy - ), f"Expected: {expected_accuracy} | Measured: {measured_accuracy}" diff --git a/vllm/model_executor/layers/fused_moe/config.py 
b/vllm/model_executor/layers/fused_moe/config.py index 828e9d0f3..b9fee1dd4 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -386,6 +386,10 @@ class FusedMoEQuantConfig: def use_nvfp4_w4a4(self) -> bool: return self.quant_dtype == "nvfp4" + @property + def use_mxfp4_w4a8(self) -> bool: + return self._a1.dtype == "fp8" and self._w1.dtype == "mxfp4" + def config_name(self, dtype: torch.dtype) -> str | None: """ Return a string used to construct the filename that contains the @@ -532,6 +536,8 @@ def fp8_w8a8_moe_quant_config( w2_scale: torch.Tensor, a1_scale: torch.Tensor | None = None, a2_scale: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, per_act_token_quant: bool = False, per_out_ch_quant: bool = False, block_shape: list[int] | None = None, @@ -549,6 +555,8 @@ def fp8_w8a8_moe_quant_config( g1_alphas=g1_alphas, w2_scale=w2_scale, g2_alphas=g2_alphas, + w1_bias=w1_bias, + w2_bias=w2_bias, a1_scale=a1_scale, a1_gscale=a1_gscale, a2_scale=a2_scale, @@ -564,6 +572,8 @@ def int8_w8a8_moe_quant_config( w2_scale: torch.Tensor, a1_scale: torch.Tensor | None, a2_scale: torch.Tensor | None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, per_act_token_quant: bool = False, ) -> FusedMoEQuantConfig: """ @@ -575,6 +585,8 @@ def int8_w8a8_moe_quant_config( w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, per_act_token_quant=per_act_token_quant, per_out_ch_quant=False, block_shape=None, @@ -654,6 +666,26 @@ def mxfp4_mxfp8_moe_quant_config( ) +def mxfp4_w4a8_moe_quant_config( + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + a1_scale: torch.Tensor | None = None, + a2_scale: torch.Tensor | None = None, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + block_shape: list[int] | None = None, +) -> 
FusedMoEQuantConfig: + """ + Construct a quant config for fp8 activations and mxfp4 weights. + """ + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc("fp8", None, a1_scale, None, None, None), + _a2=FusedMoEQuantDesc("fp8", None, a2_scale, None, None, None), + _w1=FusedMoEQuantDesc("mxfp4", None, w1_scale, None, None, w1_bias), + _w2=FusedMoEQuantDesc("mxfp4", None, w2_scale, None, None, w2_bias), + ) + + def ocp_mx_moe_quant_config( quant_dtype: str, w1_scale: Union[torch.Tensor, "PrecisionConfig"], @@ -691,6 +723,8 @@ def nvfp4_moe_quant_config( a2_gscale: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, ) -> FusedMoEQuantConfig: """ Construct a quant config for mxfp4 activations and nvp4 weights. @@ -699,6 +733,8 @@ def nvfp4_moe_quant_config( "nvfp4", w1_scale=w1_scale, w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, a1_gscale=a1_gscale, a2_gscale=a2_gscale, g1_alphas=g1_alphas, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index e0907368b..63aae43c3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -38,7 +38,6 @@ from vllm.model_executor.layers.fused_moe.utils import ( ) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import dequant_mxfp4 from vllm.model_executor.layers.quantization.utils.mxfp6_utils import dequant_mxfp6 -from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_Scheme from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8Dynamic128Sym, @@ -1583,6 +1582,11 @@ def _get_config_quant_dtype( return "mxfp6_e3m2" elif ocp_mx_scheme in {"w_mxfp4_a_mxfp6_e2m3", "w_mxfp6_e2m3_a_mxfp6_e2m3"}: return "mxfp6_e2m3" + elif ocp_mx_scheme in {"w_mxfp4", "w_mxfp6_e3m2", "w_mxfp6_e2m3"}: + return torch.bfloat16 + elif ocp_mx_scheme in {"w_mxfp4_a_fp8", 
"w_mxfp6_e3m2_a_fp8", "w_mxfp6_e2m3_a_fp8"}: + return torch.float8_e4m3fn + return None @@ -1617,17 +1621,10 @@ def fused_experts_impl( if use_int4_w4a16: assert hidden_states.size(1) // 2 == w1.size(2), "Hidden size mismatch" elif ocp_mx_scheme is not None: - if ocp_mx_scheme in { - "w_mxfp4_a_mxfp4", - "w_mxfp4_a_mxfp6_e3m2", - "w_mxfp4_a_mxfp6_e2m3", - }: + if ocp_mx_scheme.startswith("w_mxfp4"): # 16bit activation and fp4x2 packed weight assert hidden_states.size(1) == w1.size(2) * 2, "hidden size mismatch" - elif ocp_mx_scheme in { - "w_mxfp6_e3m2_a_mxfp6_e3m2", - "w_mxfp6_e2m3_a_mxfp6_e2m3", - }: + elif ocp_mx_scheme.startswith("w_mxfp6"): assert hidden_states.size(1) == (w1.size(2) * 4) // 3, ( "hidden size mismatch" ) @@ -1717,17 +1714,13 @@ def fused_experts_impl( # TODO: On platforms for which `current_platform.supports_mx()` is True # and for which we have a native OCP mx fused MOE kernel, # this dequantization step should not be done. - if ocp_mx_scheme in { - OCP_MX_Scheme.w_mxfp4_a_mxfp4, - OCP_MX_Scheme.w_mxfp4_a_mxfp6_e3m2, - OCP_MX_Scheme.w_mxfp4_a_mxfp6_e2m3, - }: + if ocp_mx_scheme.startswith("w_mxfp4"): # Weight has to be dequantized for mxfp4 emulation. 
w1 = dequant_mxfp4(w1, w1_scale, hidden_states.dtype) w1_scale = None w2 = dequant_mxfp4(w2, w2_scale, hidden_states.dtype) w2_scale = None - elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e3m2_a_mxfp6_e3m2: + elif ocp_mx_scheme.startswith("w_mxfp6_e3m2"): w1 = dequant_mxfp6( w1, w1_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype ) @@ -1736,7 +1729,7 @@ def fused_experts_impl( w2, w2_scale, quant_dtype="fp6_e3m2", float_dtype=hidden_states.dtype ) w2_scale = None - elif ocp_mx_scheme == OCP_MX_Scheme.w_mxfp6_e2m3_a_mxfp6_e2m3: + elif ocp_mx_scheme.startswith("w_mxfp6_e2m3"): w1 = dequant_mxfp6( w1, w1_scale, quant_dtype="fp6_e2m3", float_dtype=hidden_states.dtype ) @@ -1779,6 +1772,7 @@ def fused_experts_impl( quant_dtype=quant_dtype, per_act_token_quant=per_channel_quant, block_shape=block_shape, + ocp_mx_scheme=ocp_mx_scheme, ) # SPARSITY_FACTOR is a heuristic margin ensuring tokens_in_chunk * top_k @@ -1846,6 +1840,7 @@ def fused_experts_impl( quant_dtype=quant_dtype, per_act_token_quant=per_channel_quant, block_shape=block_shape, + ocp_mx_scheme=ocp_mx_scheme, ) if expert_map is not None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index c3be1be85..f35ec87aa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -221,12 +221,14 @@ def get_compressed_expert_map(expert_map: torch.Tensor) -> str: ) +# TODO(rob): move this down to the kernel. def maybe_roundup_hidden_size( hidden_size: int, act_dtype: torch.dtype, - quant_config: QuantizationConfig | None, moe_parallel_config: FusedMoEParallelConfig, is_lora_enabled: bool, + model_type: str | None, + is_mxfp4_quant: bool, ) -> int: """ Given layer hidden size and MoE configurations, round up hidden_size @@ -235,11 +237,12 @@ def maybe_roundup_hidden_size( Args: hidden_size: Layer hidden-size act_dtype: Data type of the layer activations. 
- quant_config: Fused MoE quantization configuration. moe_parallel_config: Fused MoE parallelization strategy configuration. is_lora_enabled: True if the engine is enabled with LoRA. This is used in the case of mxfp4 quantization in selecting the MxFP4Backend. + model_type: for checking if gpt-oss + is_mxfp4_quant: whether the layer is quantized with mxfp4 Return: Rounded up hidden_size if rounding up is required based on the configs. @@ -254,7 +257,7 @@ def maybe_roundup_hidden_size( ) # we are padding globally so EP buffer allocation works - if quant_config and quant_config.get_name() == "mxfp4": + if model_type == "gpt_oss" and is_mxfp4_quant: from vllm.model_executor.layers.quantization.mxfp4 import ( Mxfp4Backend, get_mxfp4_backend, @@ -398,15 +401,6 @@ class FusedMoE(CustomOp): # Expert mapping used in self.load_weights self.expert_mapping = expert_mapping - # Round up hidden size if needed. - hidden_size = maybe_roundup_hidden_size( - hidden_size, - moe_in_dtype, - quant_config, - self.moe_parallel_config, - is_lora_enabled=self.vllm_config.lora_config is not None, - ) - # For smuggling this layer into the fused moe custom op compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: @@ -508,7 +502,6 @@ class FusedMoE(CustomOp): ), "Aiter Fused MoE kernel only supports expert_map with 0 and 1s." assert intermediate_size % self.tp_size == 0 - self.hidden_size = hidden_size self.intermediate_size_per_partition = intermediate_size // self.tp_size self.reduce_results = reduce_results self.renormalize = renormalize @@ -548,6 +541,24 @@ class FusedMoE(CustomOp): ) self.routing_method_type: RoutingMethodType = self.router.routing_method_type + # Round up hidden size before creating moe_config. + # This way moe_config is created with the correct hidden_size from the start. 
+ hidden_size = maybe_roundup_hidden_size( + hidden_size=hidden_size, + act_dtype=moe_in_dtype, + moe_parallel_config=self.moe_parallel_config, + is_lora_enabled=vllm_config.lora_config is not None, + model_type=( + self.vllm_config.model_config.hf_config.model_type + if self.vllm_config.model_config is not None + else None + ), + is_mxfp4_quant=( + quant_config is not None and quant_config.is_mxfp4_quant(prefix, self) + ), + ) + self.hidden_size = hidden_size + self.moe_config: FusedMoEConfig = FusedMoEConfig( num_experts=self.global_num_experts, experts_per_token=top_k, diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 75873a92a..7d5ca876b 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -23,6 +23,9 @@ from vllm.model_executor.layers.quantization.utils.mxfp6_utils import ( from vllm.model_executor.layers.quantization.utils.mxfp8_utils import ( mxfp8_e4m3_quantize, ) +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + per_tensor_dequantize, +) from vllm.triton_utils import tl, triton from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -241,7 +244,27 @@ def moe_kernel_quantize_input( per_act_token_quant: bool, block_shape: list[int] | None = None, is_fp4_scale_swizzled: bool = True, + ocp_mx_scheme: str | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None]: + # Handle OCP MX scheme that requires QDQ (quantize-dequantize) for emulation + if ocp_mx_scheme is not None: + if ocp_mx_scheme in {"w_mxfp4", "w_mxfp4_a_mxfp4"}: + pass # No QDQ needed for these schemes + elif ocp_mx_scheme.endswith("a_fp8"): + # Perform QDQ (quantize and dequantize) on activation for emulation + # purpose, because there is no native kernel for weight in ocp_mx_scheme + # and activation in FP8. The implementation is based on existing + # non-emulation ops. 
+ qA, qA_scale = ops.scaled_fp8_quant( + A, A_scale, use_per_token_if_dynamic=False + ) + A = per_tensor_dequantize(qA, qA_scale).to(A.dtype) + # After QDQ, we don't need further quantization + return A, None + # else: For other schemes (e.g., *_a_mxfp6_e3m2, *_a_mxfp6_e2m3), + # weights are already dequantized, and we proceed with normal + # activation quantization below. + if quant_dtype == torch.float8_e4m3fn: return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape) elif quant_dtype == torch.int8: diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index c8a8424eb..a10264865 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -168,3 +168,19 @@ class QuantizationConfig(ABC): Interface to update values after config initialization. """ pass + + def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool: + """ + Determine if mxfp4 quantization will be used for this config. + + This allows hidden_size rounding to happen before moe_config creation + without needing to instantiate quant_method first. 
+ + Args: + prefix: The layer prefix/name in the model + layer: The layer module + + Returns: + True if this config uses MXFP4 quantization, False otherwise + """ + return False diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index b9dec4530..d1c9cb6bb 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -229,10 +229,15 @@ class Mxfp4Config(QuantizationConfig): ) return None + def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool: + """MXFP4 config always uses MXFP4 quantization.""" + return True + class Mxfp4MoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): super().__init__(moe) + self.weight_dtype = "mxfp4" self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) self.marlin_input_dtype = None diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index dd6db7193..2e75a3de5 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -320,38 +320,45 @@ class QuarkConfig(QuantizationConfig): # Only symmetric weight quantization supported. return is_int8_dtype and is_tensor and is_weight_symmetric and is_static - def _is_ocp_mx( - self, - weight_quant: dict[str, Any] | None, - input_quant: dict[str, Any] | None, + def _is_w_ocp_mx_a_x( + self, weight_quant: dict[str, Any] | None, input_quant: dict[str, Any] | None ) -> bool: - # Confirm weights and input quantized. - if weight_quant is None or input_quant is None: + """ + This check returns True only if it is an OCP-MX weight quantization. + The activation can be any data type (e.g., FP16/BF16, FP8, or OCP-MX format). + The rationale for checking only the weight type is that + the model loading concept and process primarily concerns the weights themselves. + """ + # Confirm weights quantized. 
+ if weight_quant is None: logger.debug( - "Quark model is not in OCP MX format: " - "weight_quant or input_quant not set" + "Quark model's weight quantization is incompatible with OCP_MX format: " + "weight_quant is not set." ) return False # Input and weight qscheme needs to be per group. - if ( - weight_quant.get("qscheme") != "per_group" - or input_quant.get("qscheme") != "per_group" - ): - logger.debug("Quark model is not in OCP MX format: not per_group") + if weight_quant.get("qscheme") != "per_group": + logger.debug( + "Quark model's weight quantization is incompatible with OCP MX format: " + "weight is not per_group." + ) return False # Input and weight group size needs to be 32. - if weight_quant.get("group_size") != 32 or input_quant.get("group_size") != 32: - logger.debug("Quark model is not in OCP MX format: not group_size=32") + if weight_quant.get("group_size") != 32: + logger.debug( + "Quark model's weight quantization is incompatible with OCP MX format: " + "group_size of weight is not 32." + ) return False # Activations and weight scales need to be in e8m0 format. - if ( - weight_quant.get("scale_format") != "e8m0" - or input_quant.get("scale_format") != "e8m0" - ): - logger.debug("Quark model is not in OCP MX format: not scale_format e8m0") + if weight_quant.get("scale_format") != "e8m0": + logger.debug( + "Quark model's weight quantization is incompatible with OCP MX format: " + "scale_format of weight is not e8m0." + ) return False # Input and weight dtypes need to be any of fp4, @@ -360,14 +367,31 @@ class QuarkConfig(QuantizationConfig): "fp4", "fp6_e3m2", "fp6_e2m3", - } or input_quant.get("dtype") not in {"fp4", "fp6_e3m2", "fp6_e2m3"}: + }: logger.debug( - "Quark model is not in OCP MX format: dtype not fp4, fp6_e3m2, fp6_e2m3" + "Quark model's weight quantization is incompatible with OCP MX format: " + "dtype is not in {fp4, fp6_e3m2, fp6_e2m3}." 
) return False return True + def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool: + """ + For Quark, determine if it's OCP MXFP4 by checking config directly. + This allows hidden_size rounding to happen before moe_config creation. + """ + layer_quant_config = self._find_matched_config(prefix, layer) + weight_config = layer_quant_config.get("weight") + input_config = layer_quant_config.get("input_tensors") + + return ( + self._is_w_ocp_mx_a_x(weight_config, input_config) + and weight_config is not None + and weight_config.get("dtype") == "fp4" + and getattr(torch, "float4_e2m1fn_x2", None) is not None + ) + def _find_matched_config( self, layer_name: str, module: torch.nn.Module ) -> dict[str, Any]: @@ -441,7 +465,7 @@ class QuarkConfig(QuantizationConfig): is_static_input_scheme=True, input_symmetric=input_config.get("symmetric"), ) - elif self._is_ocp_mx(weight_config, input_config): + elif self._is_w_ocp_mx_a_x(weight_config, input_config): return QuarkOCP_MX(weight_config, input_config) raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index fc836c56b..190890130 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -8,6 +8,7 @@ import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm._aiter_ops import rocm_aiter_ops +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import ( FusedMoE, @@ -18,9 +19,15 @@ from vllm.model_executor.layers.fused_moe import ( from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, + mxfp4_w4a8_moe_quant_config, + mxfp4_w4a16_moe_quant_config, ocp_mx_moe_quant_config, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe +from 
vllm.model_executor.layers.quantization.mxfp4 import ( + Mxfp4Backend, + get_mxfp4_backend, +) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_fp8_moe_layer_for_marlin, ) @@ -37,6 +44,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +from vllm.utils.math_utils import round_up logger = init_logger(__name__) @@ -46,6 +54,7 @@ __all__ = ["QuarkMoEMethod", "QuarkW8A8Fp8MoEMethod", "QuarkOCP_MX_MoEMethod"] class QuarkMoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): super().__init__(moe) + self.has_bias = self.moe.has_bias @staticmethod def get_moe_method( @@ -67,7 +76,7 @@ class QuarkMoEMethod(FusedMoEMethodBase): return QuarkW4A8Fp8MoEMethod(weight_config, input_config, module.moe_config) elif quant_config._is_fp8_w8a8(weight_config, input_config): return QuarkW8A8Fp8MoEMethod(weight_config, input_config, module.moe_config) - elif quant_config._is_ocp_mx(weight_config, input_config): + elif quant_config._is_w_ocp_mx_a_x(weight_config, input_config): return QuarkOCP_MX_MoEMethod(weight_config, input_config, module.moe_config) else: raise RuntimeError("Unsupported FusedMoe scheme") @@ -86,6 +95,10 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): self.weight_qscheme = self.weight_quant.get("qscheme") self.input_qscheme = self.input_quant.get("qscheme") + self.weight_dtype = self.weight_quant.get("dtype", "").replace( + "fp8_e4m3", "fp8" + ) + self.input_dtype = self.input_quant.get("dtype", "").replace("fp8_e4m3", "fp8") per_tensor = ( self.weight_qscheme == "per_tensor" and self.input_qscheme == "per_tensor" ) @@ -121,6 +134,10 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() + self.model_type = getattr( + get_current_vllm_config().model_config.hf_config, "model_type", None + ) + 
def create_weights( self, layer: torch.nn.Module, @@ -166,9 +183,16 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): if self.weight_qscheme == "per_tensor": # Allocate 2 scales for w1 and w3 respectively. # They are combined to a single scale after weight loading. - w13_weight_scale = torch.nn.Parameter( - torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False - ) + if self.model_type != "gpt_oss": + w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False + ) + else: + # For gpt_oss, the w1(gate) & w3(up) are fused as one. + # Therefore, only one weight scale for each expert. + w13_weight_scale = torch.nn.Parameter( + torch.ones(num_experts, 1, dtype=torch.float32), requires_grad=False + ) layer.register_parameter("w13_weight_scale", w13_weight_scale) w2_weight_scale = torch.nn.Parameter( torch.ones(num_experts, dtype=torch.float32), requires_grad=False @@ -220,6 +244,27 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): layer.w13_input_scale = None layer.w2_input_scale = None + if self.has_bias: + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.zeros(num_experts, hidden_size, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + else: + layer.w13_bias, layer.w2_bias = None, None + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # Fp8 moe kernels require a single activation scale. # We take the max of all the scales in case they differ. 
@@ -278,21 +323,40 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): assert layer.w13_weight_scale is not None shard_size = layer.intermediate_size_per_partition max_w13_scales = layer.w13_weight_scale.max(dim=1).values - for expert_id in range(layer.local_num_experts): - start = 0 - for shard_id in range(2): + + # For gpt_oss, w1 and w3 are fused into a single combined + # gate_up_proj tensor with size 2*intermediate_size_per_partition + # and only one scale per expert. + # Process the entire weight tensor as one shard. + if self.model_type == "gpt_oss": + for expert_id in range(layer.local_num_experts): + # Process all 2*intermediate_size_per_partition rows at once dq_weight = per_tensor_dequantize( - layer.w13_weight[expert_id][start : start + shard_size, :], - layer.w13_weight_scale[expert_id][shard_id], + layer.w13_weight[expert_id], + layer.w13_weight_scale[expert_id][0], ) - layer.w13_weight[expert_id][start : start + shard_size, :], _ = ( - ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + layer.w13_weight[expert_id], _ = ops.scaled_fp8_quant( + dq_weight, max_w13_scales[expert_id] ) - start += shard_size + else: + # For non-gpt_oss, process w1 and w3 shards separately + for expert_id in range(layer.local_num_experts): + start = 0 + for shard_id in range(2): + dq_weight = per_tensor_dequantize( + layer.w13_weight[expert_id][start : start + shard_size, :], + layer.w13_weight_scale[expert_id][shard_id], + ) + ( + layer.w13_weight[expert_id][start : start + shard_size, :], + _, + ) = ops.scaled_fp8_quant(dq_weight, max_w13_scales[expert_id]) + start += shard_size layer.w13_weight_scale = torch.nn.Parameter( max_w13_scales, requires_grad=False ) + # quark's scale is 1 dim. 
elif self.weight_qscheme == "per_channel": if self.act_quant_group_shape == GroupShape.PER_TOKEN: @@ -343,6 +407,8 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): w2_scale=layer.w2_weight_scale, a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, per_act_token_quant=self.input_qscheme == "per_channel", per_out_ch_quant=self.weight_qscheme == "per_channel", ) @@ -563,7 +629,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): def __init__( self, weight_config: dict[str, Any], - input_config: dict[str, Any], + input_config: dict[str, Any] | None, moe: FusedMoEConfig, ): super().__init__(moe) @@ -571,35 +637,79 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): self.input_quant = input_config weight_qscheme = self.weight_quant.get("qscheme") - input_qscheme = self.input_quant.get("qscheme") - if not (weight_qscheme == "per_group" and input_qscheme == "per_group"): + if not weight_qscheme == "per_group": raise ValueError( "For MX(FP4) Fused MoE layers, only per-group scales " - "for weights and activations are supported. Found " - f"{weight_qscheme}, {input_qscheme}" + f"for weights are supported. Found {weight_qscheme}." ) # noqa E501 - self.static_input_scales = not self.input_quant.get("is_dynamic") - self.weight_dtype = self.weight_quant["dtype"].replace("fp", "mxfp") - self.input_dtype = self.input_quant["dtype"].replace("fp", "mxfp") + if self.input_quant is not None: + input_quant = self.input_quant["dtype"] + if input_quant in ["fp4", "fp6_e3m2", "fp6_e2m3"]: + self.input_dtype = input_quant.replace("fp", "mxfp") + elif input_quant == "fp8_e4m3": + self.input_dtype = input_quant.replace("fp8_e4m3", "fp8") + else: + raise NotImplementedError( + f"Current input dtype {input_quant} is not compatible \ + with OCP MX (weight) MoE quantization. 
Please open an issue" + ) + else: + self.input_dtype = None + self.fp4_dtype = getattr(torch, "float4_e2m1fn_x2", None) self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype( self.input_dtype, self.weight_dtype ) - if self.static_input_scales: + if self.ocp_mx_scheme is None: + raise ValueError( + f"Unsupported OCP MX dtype combination for MoE: " + f"input_dtype={self.input_dtype}, weight_dtype={self.weight_dtype}. " + f"Please check that the combination is supported in OCP_MX_Scheme." + ) + + self.mxfp4_backend: Mxfp4Backend | None = None + if self.ocp_mx_scheme == "w_mxfp4": + self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) + + if self.input_quant is not None: + self.static_input_scales = not self.input_quant.get("is_dynamic") + else: + self.static_input_scales = False + + if any( + self.ocp_mx_scheme.endswith(a_scheme) + for a_scheme in ["a_mxfp4", "a_mxfp6_e3m2", "a_mxfp6_e2m3"] + ): + if self.static_input_scales: + raise NotImplementedError( + "QuarkOCP_MX_MoEMethod with static input scales is currently " + f"not implemented for OCP MX scheme {self.ocp_mx_scheme}. " + "Please open an issue." + ) + elif self.ocp_mx_scheme.endswith("a_fp8") and not self.static_input_scales: raise NotImplementedError( - "QuarkOCP_MX_MoEMethod with static input scales is currently " - "not implemented. Please open an issue." + "QuarkOCP_MX_MoEMethod with dynamic input scales is currently " + f"not implemented for OCP MX scheme {self.ocp_mx_scheme}. " + "Please open an issue." 
) self.use_rocm_aiter_moe = rocm_aiter_ops.is_fused_moe_enabled() - self.emulate = not current_platform.supports_mx() or not ( - self.use_rocm_aiter_moe and self.ocp_mx_scheme == "w_mxfp4_a_mxfp4" + self.model_type = getattr( + get_current_vllm_config().model_config.hf_config, "model_type", None ) + + self._emulate = ( + not current_platform.supports_mx() + or not self.ocp_mx_scheme.startswith("w_mxfp4") + ) and (self.mxfp4_backend is None or not self.use_rocm_aiter_moe) + + self.emulate = True if self.model_type == "gpt_oss" else self._emulate + if self.emulate: logger.warning_once( f"The current mode (supports_mx={current_platform.supports_mx()}, " @@ -640,12 +750,23 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): ) params_dtype = torch.uint8 + if self.model_type == "gpt_oss": + if current_platform.is_rocm(): + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 256 + ) + else: + intermediate_size_per_partition_after_pad = round_up( + intermediate_size_per_partition, 64 + ) + else: + intermediate_size_per_partition_after_pad = intermediate_size_per_partition # WEIGHTS w13_weight = torch.nn.Parameter( torch.empty( num_experts, - 2 * intermediate_size_per_partition, + 2 * intermediate_size_per_partition_after_pad, self.get_packed_dim(hidden_size, self.weight_dtype), dtype=params_dtype, ), @@ -659,7 +780,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): torch.empty( num_experts, hidden_size, - self.get_packed_dim(intermediate_size_per_partition, self.weight_dtype), + self.get_packed_dim( + intermediate_size_per_partition_after_pad, self.weight_dtype + ), dtype=params_dtype, ), requires_grad=False, @@ -672,7 +795,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): w13_weight_scale = torch.nn.Parameter( torch.ones( num_experts, - 2 * intermediate_size_per_partition, + 2 * intermediate_size_per_partition_after_pad, hidden_size // OCP_MX_BLOCK_SIZE, dtype=params_dtype, ), @@ -682,7 +805,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): 
torch.ones( num_experts, hidden_size, - intermediate_size_per_partition // OCP_MX_BLOCK_SIZE, + intermediate_size_per_partition_after_pad // OCP_MX_BLOCK_SIZE, dtype=params_dtype, ), requires_grad=False, @@ -693,8 +816,96 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer.register_parameter("w13_weight_scale", w13_weight_scale) layer.register_parameter("w2_weight_scale", w2_weight_scale) + if self.has_bias: + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.float32, + ), + requires_grad=False, + ) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) + + w2_bias = torch.nn.Parameter( + torch.zeros(num_experts, hidden_size, dtype=torch.float32), + requires_grad=False, + ) + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) + else: + layer.w13_bias, layer.w2_bias = None, None + + # INPUT_SCALES + if self.static_input_scales: + w13_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w13_input_scale", w13_input_scale) + set_weight_attrs(w13_input_scale, extra_weight_attrs) + + w2_input_scale = torch.nn.Parameter( + torch.ones(num_experts, dtype=torch.float32), requires_grad=False + ) + layer.register_parameter("w2_input_scale", w2_input_scale) + set_weight_attrs(w2_input_scale, extra_weight_attrs) + else: + layer.w13_input_scale = None + layer.w2_input_scale = None + def process_weights_after_loading(self, layer): + if self.static_input_scales: + # firstly, process activations if fp8 static input + if layer.w13_input_scale is None or layer.w2_input_scale is None: + raise ValueError( + "QuantConfig has static quantization, but found " + "activation scales are None." 
+ ) + if not all_close_1d(layer.w13_input_scale) or not all_close_1d( + layer.w2_input_scale + ): + logger.warning_once( + "Found input_scales that are not equal for " + "fp8 MoE layer. Using the maximum across experts " + "for each layer. " + ) + layer.w13_input_scale = torch.nn.Parameter( + layer.w13_input_scale.max(), requires_grad=False + ) + layer.w2_input_scale = torch.nn.Parameter( + layer.w2_input_scale.max(), requires_grad=False + ) + + if current_platform.is_fp8_fnuz(): + # Normalize the weights and scales + _, _, w13_input_scale = normalize_e4m3fn_to_e4m3fnuz( + torch.empty_like(layer.w13_weight, dtype=torch.float8_e4m3fnuz), + torch.empty_like( + layer.w13_weight_scale, dtype=layer.w13_weight_scale.dtype + ), + layer.w13_input_scale, + ) + _, _, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz( + torch.empty_like(layer.w2_weight, dtype=torch.float8_e4m3fnuz), + torch.empty_like( + layer.w2_weight_scale, dtype=layer.w13_weight_scale.dtype + ), + layer.w2_input_scale, + ) + # Reset the parameter + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False + ) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False + ) + + # secondly, process mxfp weights if self.emulate: + torch.cuda.empty_cache() return from aiter.utility.fp4_utils import e8m0_shuffle @@ -725,15 +936,40 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - return ocp_mx_moe_quant_config( - quant_dtype=self.input_dtype, - weight_dtype=self.weight_dtype, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - a1_scale=None, - a2_scale=None, - block_shape=None, - ) + if self.ocp_mx_scheme == "w_mxfp4": + return mxfp4_w4a16_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + ) + elif 
self.ocp_mx_scheme == "w_mxfp4_a_fp8": + return mxfp4_w4a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + block_shape=None, + ) + elif self.ocp_mx_scheme in ["w_mxfp6_e3m2_a_fp8", "w_mxfp6_e2m3_a_fp8"]: + raise NotImplementedError( + "Currently there is no corresponding fused moe quant config configured " + f"in vLLM for OCP MX scheme {self.ocp_mx_scheme}. Please open an issue." + ) + else: + return ocp_mx_moe_quant_config( + quant_dtype=self.input_dtype, + weight_dtype=self.weight_dtype, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + w1_bias=layer.w13_bias, + w2_bias=layer.w2_bias, + a1_scale=None, + a2_scale=None, + block_shape=None, + ) def apply( self, @@ -743,24 +979,34 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): topk_ids: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if not self.emulate: - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - rocm_aiter_fused_experts, - ) + if ( + self.model_type == "gpt_oss" + and self.mxfp4_backend == Mxfp4Backend.TRITON + ): + raise NotImplementedError( + "Triton kernel implemented fused MoE for GPT_OSS model " + "in Quark(MoE) format is not integrated or provided yet." 
+ ) - out = rocm_aiter_fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=layer.activation, - quant_config=self.moe_quant_config, - expert_map=layer.expert_map, - ) + else: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + rocm_aiter_fused_experts, + ) + + return rocm_aiter_fused_experts( + x, + layer.w13_weight, + layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + activation=layer.activation, + quant_config=self.moe_quant_config, + expert_map=layer.expert_map, + ) else: from vllm.model_executor.layers.fused_moe import fused_experts - out = fused_experts( + return fused_experts( x, layer.w13_weight, layer.w2_weight, @@ -773,5 +1019,3 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) - - return out diff --git a/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py index 7752324f4..a9157cbfb 100644 --- a/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py +++ b/vllm/model_executor/layers/quantization/utils/ocp_mx_utils.py @@ -20,26 +20,44 @@ SUPPORTED_OCP_MX_DTYPES = {"mxfp4", "mxfp6_e3m2", "mxfp6_e2m3"} class OCP_MX_Scheme(str, Enum): + w_mxfp4 = "w_mxfp4" w_mxfp4_a_mxfp4 = "w_mxfp4_a_mxfp4" w_mxfp4_a_mxfp6_e3m2 = "w_mxfp4_a_mxfp6_e3m2" w_mxfp4_a_mxfp6_e2m3 = "w_mxfp4_a_mxfp6_e2m3" + w_mxfp4_a_fp8 = "w_mxfp4_a_fp8" + w_mxfp6_e3m2 = "w_mxfp6_e3m2" w_mxfp6_e3m2_a_mxfp6_e3m2 = "w_mxfp6_e3m2_a_mxfp6_e3m2" + w_mxfp6_e3m2_a_fp8 = "w_mxfp6_e3m2_a_fp8" + w_mxfp6_e2m3 = "w_mxfp6_e2m3" w_mxfp6_e2m3_a_mxfp6_e2m3 = "w_mxfp6_e2m3_a_mxfp6_e2m3" + w_mxfp6_e2m3_a_fp8 = "w_mxfp6_e2m3_a_fp8" @classmethod def from_quant_dtype(cls, input_dtype: str | None, weight_dtype: str | None): - if input_dtype not in OCP_MX_DTYPES or weight_dtype not in OCP_MX_DTYPES: + if input_dtype not in OCP_MX_DTYPES and weight_dtype not in 
OCP_MX_DTYPES: return None + elif input_dtype is None and weight_dtype == "mxfp4": + return cls.w_mxfp4 + elif input_dtype is None and weight_dtype == "mxfp6_e3m2": + return cls.w_mxfp6_e3m2 + elif input_dtype is None and weight_dtype == "mxfp6_e2m3": + return cls.w_mxfp6_e2m3 elif input_dtype == "mxfp4" and weight_dtype == "mxfp4": return cls.w_mxfp4_a_mxfp4 elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp4": return cls.w_mxfp4_a_mxfp6_e3m2 elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp4": return cls.w_mxfp4_a_mxfp6_e2m3 + elif input_dtype == "fp8" and weight_dtype == "mxfp4": + return cls.w_mxfp4_a_fp8 elif input_dtype == "mxfp6_e3m2" and weight_dtype == "mxfp6_e3m2": return cls.w_mxfp6_e3m2_a_mxfp6_e3m2 + elif input_dtype == "fp8" and weight_dtype == "mxfp6_e3m2": + return cls.w_mxfp6_e3m2_a_fp8 elif input_dtype == "mxfp6_e2m3" and weight_dtype == "mxfp6_e2m3": return cls.w_mxfp6_e2m3_a_mxfp6_e2m3 + elif input_dtype == "fp8" and weight_dtype == "mxfp6_e2m3": + return cls.w_mxfp6_e2m3_a_fp8 else: logger.warning( "input_dtype='%s' and" diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index f62771c36..28c37c64b 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +import typing +from collections.abc import Callable, Iterable import torch import torch.distributed as dist @@ -25,13 +26,17 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import OCP_MX_BLOCK_SIZE from vllm.model_executor.layers.rotary_embedding 
import get_rope from vllm.model_executor.layers.utils import rocm_unquantized_gemm from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, ) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) from vllm.model_executor.models.utils import sequence_parallel_chunk from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -98,6 +103,7 @@ class OAIAttention(nn.Module): head_size=self.head_dim, total_num_heads=self.num_attention_heads, total_num_kv_heads=self.num_key_value_heads, + bias=True, quant_config=quant_config, prefix=f"{prefix}.qkv_proj", ) @@ -105,6 +111,7 @@ class OAIAttention(nn.Module): self.o_proj = RowParallelLinear( input_size=self.num_attention_heads * self.head_dim, output_size=self.hidden_size, + bias=True, quant_config=quant_config, prefix=f"{prefix}.o_proj", ) @@ -306,6 +313,19 @@ class GptOssModel(nn.Module): return x, aux_hidden_states return x + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Params for weights, weight scales, activation scales + # (param_name, weight_name, expert_id, shard_id) + # NOTE: this is only used for quark. 
+ return FusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="w1", + ckpt_down_proj_name="w2", + ckpt_up_proj_name="w3", + num_experts=self.config.num_local_experts, + num_redundant_experts=0, + ) + def _load_weights_mxfp4( self, ep_rank_end: int, @@ -318,7 +338,6 @@ class GptOssModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() - mxfp4_block = 32 use_ep = self.parallel_config.enable_expert_parallel num_experts = self.config.num_local_experts @@ -333,9 +352,11 @@ class GptOssModel(nn.Module): ) intermediate_size = self.config.intermediate_size - intermediate_size_block = intermediate_size // mxfp4_block + intermediate_size_block = intermediate_size // OCP_MX_BLOCK_SIZE per_rank_intermediate_size_block = cdiv(intermediate_size_block, tp_size) - per_rank_intermediate_size = per_rank_intermediate_size_block * mxfp4_block + per_rank_intermediate_size = ( + per_rank_intermediate_size_block * OCP_MX_BLOCK_SIZE + ) # Calculate common slicing bounds for current rank tp_rank_start = tp_rank * per_rank_intermediate_size @@ -370,7 +391,9 @@ class GptOssModel(nn.Module): narrow_weight = weight[ep_rank_start:ep_rank_end, ...] 
else: narrow_weight = weight[ - ..., tp_rank_start // mxfp4_block : tp_rank_end // mxfp4_block + ..., + tp_rank_start // OCP_MX_BLOCK_SIZE : tp_rank_end + // OCP_MX_BLOCK_SIZE, ] param = params_dict[name] @@ -495,6 +518,449 @@ class GptOssModel(nn.Module): loaded_params.add(name) return loaded_params + def _load_weights_quark( + self, + ep_rank_end: int, + ep_rank_start: int, + heads_per_rank: int, + head_start: int, + weights: Iterable[tuple[str, torch.Tensor]], + stacked_params_mapping: list[tuple[str, ...]], + ) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + use_ep = self.parallel_config.enable_expert_parallel + num_experts = self.config.num_local_experts + + if use_ep: + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + else: + tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp( + tp_size=get_tensor_model_parallel_world_size(), + dp_size=get_dp_group().world_size, + dp_rank=get_dp_group().rank_in_group, + pcp_size=get_pcp_group().world_size, + pcp_rank=get_pcp_group().rank_in_group, + ) + + def _get_moe_weight_dtype(layer_id: int = 0) -> str | None: + """Helper function to get MoE quantization weight dtype. 
+ + Args: + layer_id: Layer index to check (default 0, as all layers should + have the same quantization method) + + Returns: + Weight dtype string (e.g., "mxfp4", "fp8") or None if not available + """ + if hasattr(self.layers[layer_id].mlp.experts.quant_method, "weight_dtype"): + return self.layers[layer_id].mlp.experts.quant_method.weight_dtype + return None + + intermediate_size = self.config.intermediate_size + + moe_weight_dtype = _get_moe_weight_dtype(layer_id=0) + + if moe_weight_dtype == "mxfp4": + # MXFP4 requires OCP_MX_BLOCK_SIZE alignment + intermediate_size_block = intermediate_size // OCP_MX_BLOCK_SIZE + per_rank_intermediate_size_block = cdiv(intermediate_size_block, tp_size) + per_rank_intermediate_size = ( + per_rank_intermediate_size_block * OCP_MX_BLOCK_SIZE + ) + else: + # FP8 and other formats don't need alignment + per_rank_intermediate_size = cdiv(intermediate_size, tp_size) + + tp_rank_start = tp_rank * per_rank_intermediate_size + tp_rank_end = min((tp_rank + 1) * per_rank_intermediate_size, intermediate_size) + expert_params_mapping = self.get_expert_mapping() + for name, loaded_weight in weights: + if is_pp_missing_parameter(name, self): + continue + + layer_id, expert_id, fused_name = None, None, None + moe_quant_method = None + if "experts" in name: + parts = name.split(".") + ids = [s for s in parts if s.isdigit()] + + # for amd-quark format that each expert is seperated + # need to extract the parameter name with experts fused. + # example model: amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8 + if len(ids) == 2: + layer_id, expert_id = int(ids[0]), int(ids[-1]) + parts.pop(len(parts) - 1 - parts[::-1].index(str(expert_id))) + fused_name = ".".join(parts) + + # for openai mxfp4 format that all experts are combined + # no need to extract the parameter name with experts fused. 
+ # models: openai/gpt-oss-20b, openai/gpt-oss-120b + elif len(ids) == 1: + layer_id, expert_id = int(ids[0]), None + fused_name = name + + else: + raise NameError( + f"Layer {name} contains more than 2 numeric indices. This is " + "an unexpected condition. Please open an issue if encountered." + ) + + moe_quant_method = _get_moe_weight_dtype(layer_id=layer_id) + + def kv_cache_scale_loader( + quant_config: QuantizationConfig, + name: str, + params_dict: dict[str, typing.Any], + weight: torch.Tensor, + default_weight_loader: Callable[..., None], + loaded_params: set[str], + ) -> tuple[bool, set[str]]: + """ + Load KV cache output scales. + Returns: + Tuple of (bool, set): + - bool: True if KV-cache scale was loaded into loaded_params + - set: Updated set of loaded_params if True else the original set + """ + # load explicit cached KV output scale from quant_config + if quant_config is not None and ( + scale_name := quant_config.get_cache_scale(name) + ): + param = params_dict[scale_name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + if weight.numel() != 1: + raise ValueError( + f"KV cache scale '{scale_name}' is expected to be a " + f"scalar, but got a tensor of shape {weight.shape}." + ) + # Ensure weight is a scalar before passing to loader. 
+ weight_loader(param, weight.flatten()[0]) + loaded_params.add(scale_name) + return True, loaded_params + + return False, loaded_params + + load_kv_cache_scale_completed, loaded_params = kv_cache_scale_loader( + self.quant_config, + name, + params_dict, + loaded_weight, + default_weight_loader, + loaded_params, + ) + if load_kv_cache_scale_completed: + continue + + if ( + all(key in name for key in ["input_scale", "mlp.experts"]) + and expert_id is not None + ): + assert loaded_weight.numel() == 1 + expert_data = params_dict[fused_name].data[expert_id] + expert_data.copy_(loaded_weight) + loaded_params.add(fused_name) + continue + + # Unified handler for mxfp4 weights and scales + elif moe_quant_method == "mxfp4" and any( + name.endswith(suffix) + for suffix in [ + ".w13_weight_scale", + ".w2_weight_scale", + ".w13_weight", + ".w2_weight", + ] + ): + is_w13 = ".w13_" in name + is_scale = "_scale" in name + + # Reshape weight for mxfp4 if needed (not for scales) + if not is_scale and expert_id is None: + if is_w13: + if loaded_weight.dim() < 3: + raise ValueError( + f"Expected w13_weight to have at least 3 " + f"dimensions, got shape " + f"{loaded_weight.shape}" + ) + if loaded_weight.shape[0] != num_experts: + raise ValueError( + f"Expected w13_weight first dimension to be " + f"{num_experts}, got " + f"{loaded_weight.shape[0]}" + ) + loaded_weight = loaded_weight.view( + num_experts, 2 * intermediate_size, -1 + ).contiguous() + else: + if loaded_weight.dim() < 3: + raise ValueError( + f"Expected w2_weight to have at least 3 " + f"dimensions, got shape " + f"{loaded_weight.shape}" + ) + if loaded_weight.shape[0] != num_experts: + raise ValueError( + f"Expected w2_weight first dimension to be " + f"{num_experts}, got " + f"{loaded_weight.shape[0]}" + ) + loaded_weight = loaded_weight.view( + num_experts, -1, intermediate_size // 2 + ).contiguous() + + if use_ep: + sliced_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] 
+ else: + if is_w13: + if expert_id is None: + sliced_weight = loaded_weight[ + :, 2 * tp_rank_start : 2 * tp_rank_end, ... + ] + else: + sliced_weight = loaded_weight[ + 2 * tp_rank_start : 2 * tp_rank_end, ... + ] + else: + if is_scale: + sliced_weight = loaded_weight[ + ..., + tp_rank_start // OCP_MX_BLOCK_SIZE : tp_rank_end + // OCP_MX_BLOCK_SIZE, + ] + else: + sliced_weight = loaded_weight[ + ..., tp_rank_start // 2 : tp_rank_end // 2 + ] + + # NOTE(rob): because gpt-oss ckpt has "unique" structure with + # fused gate_up_proj fused on disk, we cannot use the existing + # weight loaders without added complexity, so just do the + # direct load here. + param = params_dict[fused_name] + expert_data = param.data[expert_id] + dim1 = sliced_weight.shape[0] + dim2 = sliced_weight.shape[1] + expert_data.data[:dim1, :dim2].copy_(sliced_weight) + loaded_params.add(fused_name) + continue + + elif name.endswith(".w13_weight") and moe_quant_method == "fp8": + if use_ep: + narrow_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] + else: + if expert_id is None: + narrow_weight = loaded_weight[ + :, 2 * tp_rank_start : 2 * tp_rank_end, : + ] + else: + narrow_weight = loaded_weight[ + 2 * tp_rank_start : 2 * tp_rank_end, : + ] + + assert fused_name is not None + param = params_dict[fused_name] + + if expert_id is None: + param.data.copy_(narrow_weight) + else: + param.data[expert_id].copy_(narrow_weight) + + loaded_params.add(fused_name) + continue + + elif name.endswith(".w13_weight_scale") and moe_quant_method == "fp8": + assert fused_name is not None + param = params_dict[fused_name] + + # Check if this is per-channel or per-tensor scale + if loaded_weight.numel() > 1 and loaded_weight.dim() == 1: + if use_ep: + narrow_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] 
+ else: + narrow_weight = loaded_weight[ + 2 * tp_rank_start : 2 * tp_rank_end + ] + else: + narrow_weight = loaded_weight + + if expert_id is None: + param.data.copy_(narrow_weight) + else: + param.data[expert_id].copy_(narrow_weight) + + loaded_params.add(fused_name) + continue + + elif name.endswith(".w13_input_scale") and moe_quant_method == "fp8": + assert fused_name is not None + param = params_dict[fused_name] + + if expert_id is None: + param.data.copy_(loaded_weight) + else: + param.data[expert_id].copy_(loaded_weight) + + loaded_params.add(fused_name) + continue + + elif name.endswith(".w2_weight") and moe_quant_method == "fp8": + if use_ep: + narrow_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] + else: + if expert_id is None: + narrow_weight = loaded_weight[..., tp_rank_start:tp_rank_end] + else: + narrow_weight = loaded_weight[..., tp_rank_start:tp_rank_end] + + assert fused_name is not None + param = params_dict[fused_name] + + if expert_id is None: + param.data.copy_(narrow_weight) + else: + param.data[expert_id].copy_(narrow_weight) + + loaded_params.add(fused_name) + continue + + elif name.endswith(".w2_weight_scale") and moe_quant_method == "fp8": + assert fused_name is not None + param = params_dict[fused_name] + + if use_ep: + narrow_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] + else: + narrow_weight = loaded_weight + + if expert_id is None: + param.data.copy_(narrow_weight) + else: + param.data[expert_id].copy_(narrow_weight) + + loaded_params.add(fused_name) + continue + + # Unified handler for bias loading (w13_bias and w2_bias) + elif name.endswith(".w13_bias") or name.endswith(".w2_bias"): + is_w13_bias = name.endswith(".w13_bias") + + if use_ep: + sliced_weight = loaded_weight[ep_rank_start:ep_rank_end, ...] 
+ else: + if is_w13_bias: + if expert_id is None: + sliced_weight = loaded_weight[ + :, 2 * tp_rank_start : 2 * tp_rank_end + ] + else: + sliced_weight = loaded_weight[ + 2 * tp_rank_start : 2 * tp_rank_end + ] + else: + sliced_weight = loaded_weight + if tp_rank != 0: + sliced_weight = sliced_weight.zero_() + + # NOTE(rob): because gpt-oss ckpt has "unique" structure with + # fused gate_up_proj fused on disk, we cannot use the existing + # weight loaders without added complexity, so just do the + # direct load here. + assert fused_name is not None + param = params_dict[fused_name] + expert_data = param.data[expert_id] + dim1 = sliced_weight.shape[0] + expert_data.data[:dim1].copy_(sliced_weight) + loaded_params.add(fused_name) + continue + + elif "sinks" in name: + # Handle attention sinks (distributed across ranks) + param = params_dict[name] + narrow_weight = loaded_weight.narrow(0, head_start, heads_per_rank) + param.data.copy_(narrow_weight) + loaded_params.add(name) + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if ("mlp.experts." in name) and name not in params_dict: + continue + name = name.replace(weight_name, param_name) + + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + + weight_loader(param, loaded_weight, shard_id) + loaded_params.add(name) + break + else: + for mapping in expert_params_mapping: + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + param_name, weight_name, mapping_expert_id, shard_id = mapping + weight_name = ( + weight_name[:-1] if weight_name.endswith(".") else weight_name + ) + + if weight_name not in name: + continue + + param = params_dict[fused_name] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. + weight_loader = typing.cast( + Callable[..., bool], param.weight_loader + ) + # Use checkpoint's expert_id for quark format (when expert_id + # is extracted from weight name), otherwise use mapping's expert_id + actual_expert_id = ( + expert_id if expert_id is not None else mapping_expert_id + ) + success = weight_loader( + param, + loaded_weight, + fused_name, + shard_id=shard_id, + expert_id=actual_expert_id, + return_success=True, + ) + if success: + name = fused_name + loaded_params.add(name) + break + else: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + + loaded_params.add(name) + return loaded_params + def _load_weights_other( self, ep_rank_end: int, @@ -635,6 +1101,7 @@ class GptOssModel(nn.Module): if hasattr(self.config, "quantization_config") else None ) + if quant_method == "mxfp4": return self._load_weights_mxfp4( ep_rank_end, @@ -644,6 +1111,15 @@ class GptOssModel(nn.Module): weights, stacked_params_mapping, ) + elif quant_method == "quark": + return self._load_weights_quark( + ep_rank_end, + ep_rank_start, + heads_per_rank, + head_start, + weights, + 
stacked_params_mapping, + ) else: return self._load_weights_other( ep_rank_end, @@ -676,6 +1152,15 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA): # MoE Bias ".gate_up_proj_bias": ".w13_bias", ".down_proj_bias": ".w2_bias", + # For quark format + ".gate_up_proj.weight": ".w13_weight", + ".gate_up_proj.weight_scale": ".w13_weight_scale", + ".gate_up_proj.bias": ".w13_bias", + ".gate_up_proj.input_scale": ".w13_input_scale", + ".down_proj.weight": ".w2_weight", + ".down_proj.weight_scale": ".w2_weight_scale", + ".down_proj.bias": ".w2_bias", + ".down_proj.input_scale": ".w2_input_scale", }, ) @@ -725,18 +1210,6 @@ class GptOssForCausalLM(nn.Module, SupportsPP, SupportsEagle3, SupportsLoRA): logits = self.logits_processor(self.lm_head, hidden_states) return logits - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - # Params for weights, weight scales, activation scales - # (param_name, weight_name, expert_id, shard_id) - return FusedMoE.make_expert_params_mapping( - self, - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.num_local_experts, - num_redundant_experts=0, - ) - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, -- GitLab From 82e11973cc07909de895a1309ce0f6a2144c576a Mon Sep 17 00:00:00 2001 From: Zhengxu Chen Date: Tue, 10 Feb 2026 10:24:42 -0500 Subject: [PATCH 0050/1166] [compile] Enable AOT compile with 2.10 in trunk. 
(#34155) Signed-off-by: Zhengxu Chen --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19464f2f2..3af85be0a 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -271,7 +271,7 @@ def use_aot_compile() -> bool: default_value = ( "1" - if is_torch_equal_or_newer("2.11.0.dev") and not disable_compile_cache() + if is_torch_equal_or_newer("2.10.0") and not disable_compile_cache() else "0" ) -- GitLab From afdce12c89555ce7b7bd4f3215b5d844de0a32ed Mon Sep 17 00:00:00 2001 From: "Roberto L. Castro" <38211239+LopezCastroRoberto@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:29:52 +0100 Subject: [PATCH 0051/1166] [Perf][Kernel] Add faster topKperRow decode kernel for DeepSeek-V3.2 sparse attention (#33680) Signed-off-by: LopezCastroRoberto Signed-off-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com> Co-authored-by: Claude Sonnet 4.5 --- CMakeLists.txt | 1 + csrc/ops.h | 4 + csrc/sampler.cu | 2 +- csrc/topk.cu | 373 ++++++++++++++++++ csrc/torch_bindings.cpp | 6 + tests/kernels/test_top_k_per_row.py | 111 ++++++ .../layers/sparse_attn_indexer.py | 50 ++- vllm/v1/attention/backends/mla/indexer.py | 19 + 8 files changed, 554 insertions(+), 12 deletions(-) create mode 100644 csrc/topk.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 168376ca1..c9b1bf54e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,6 +293,7 @@ set(VLLM_EXT_SRC "csrc/fused_qknorm_rope_kernel.cu" "csrc/layernorm_quant_kernels.cu" "csrc/sampler.cu" + "csrc/topk.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/w8a8/int8/scaled_quant.cu" diff --git a/csrc/ops.h b/csrc/ops.h index 9ee6bda31..f5dfb0ecc 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -114,6 +114,10 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n, int64_t numRows, int64_t stride0, int64_t stride1, int64_t topK); +void large_context_topk(const torch::Tensor& score, torch::Tensor& indices, + const 
torch::Tensor& lengths, + std::optional row_starts_opt); + void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, double epsilon); diff --git a/csrc/sampler.cu b/csrc/sampler.cu index f7c091f1d..30bfef33c 100644 --- a/csrc/sampler.cu +++ b/csrc/sampler.cu @@ -725,4 +725,4 @@ void top_k_per_row_prefill(const torch::Tensor& logits, static_cast(stride0), static_cast(stride1), static_cast(topK), kSortingAlgorithmThreshold); } -} +} \ No newline at end of file diff --git a/csrc/topk.cu b/csrc/topk.cu new file mode 100644 index 000000000..e2702b2d0 --- /dev/null +++ b/csrc/topk.cu @@ -0,0 +1,373 @@ +// Portions of this file are adapted from SGLang PR: +// https://github.com/sgl-project/sglang/pull/11194 +// and +// https://github.com/sgl-project/sglang/pull/17747 + +#include "cuda_compat.h" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +constexpr int TopK = 2048; // DeepSeek V3 sparse attention top-k +constexpr int kThreadsPerBlock = 1024; // Threads per block + +// Shared memory budget +#if defined(USE_ROCM) +constexpr size_t kSmem = 48 * 1024; // ROCm default: 48KB +#else +// Reduced from 128KB to 32KB to improve occupancy. +// Each radix pass needs at most ~TopK candidates in the threshold bin, +// so 4K entries per round (2 rounds = 8K entries = 32KB) is sufficient. 
+constexpr size_t kSmem = 8 * 1024 * sizeof(uint32_t); // 32KB (bytes) +#endif + +struct FastTopKParams { + const float* __restrict__ input; // [batch, seq_len] Logits + const int32_t* __restrict__ row_starts; // [batch] Offset into each row + // (optional) + int32_t* __restrict__ indices; // [batch, TopK] Output top-k indices + int32_t* __restrict__ lengths; // [batch] Sequence lengths per row + int64_t input_stride; // Stride between rows +}; + +__device__ __forceinline__ auto convert_to_uint32_v2(float x) -> uint32_t { + uint32_t bits = __float_as_uint(x); + return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u); +} + +__device__ __forceinline__ auto convert_to_uint8(float x) -> uint8_t { + __half h = __float2half_rn(x); + uint16_t bits = __half_as_ushort(h); + uint16_t key = (bits & 0x8000) ? static_cast(~bits) + : static_cast(bits | 0x8000); + return static_cast(key >> 8); +} + +__device__ void naive_topk_cuda(const float* __restrict__ logits, + int32_t* __restrict__ output_indices, + int32_t seq_len) { + const int thread_id = threadIdx.x; + for (int i = thread_id; i < TopK; i += kThreadsPerBlock) { + output_indices[i] = (i < seq_len) ? 
i : -1; + } +} + +// Adapted from: +// https://github.com/sgl-project/sglang/blob/v0.5.8/sgl-kernel/csrc/elementwise/topk.cu#L87 +// by: DarkSharpness +// which at the same time is an optimized topk kernel copied from tilelang +// kernel +__device__ void fast_topk_cuda_tl( + const float* __restrict__ logits, // Input logits [seq_len] + int* __restrict__ output_indices, // Output top-k indices [TopK] + int logits_offset, // Starting offset in logits array + int seq_len) // Number of valid logits to process +{ + constexpr int RADIX = 256; + constexpr int MAX_BUFFERED_ITEMS = kSmem / (2 * sizeof(int)); + + alignas(128) __shared__ int shared_histogram[2][RADIX + 128]; + alignas(128) __shared__ int shared_output_count; + alignas(128) __shared__ int shared_threshold_bin; + alignas(128) __shared__ int shared_buffered_count[2]; + + extern __shared__ int buffered_indices[][MAX_BUFFERED_ITEMS]; + + const int thread_id = threadIdx.x; + int remaining_k = TopK; + + // Pass 0: Build coarse 8-bit histogram using FP16 high bits + if (thread_id < RADIX + 1) { + shared_histogram[0][thread_id] = 0; + } + __syncthreads(); + + for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) { + const auto bin = convert_to_uint8(logits[idx + logits_offset]); + ::atomicAdd(&shared_histogram[0][bin], 1); + } + __syncthreads(); + + // Helper: Compute cumulative sum (suffix sum) over histogram using ping-pong + // buffers + auto compute_cumulative_sum = [&]() { + static_assert(1 << 8 == RADIX, + "Radix must be 256 for 8 unrolled iterations"); +#pragma unroll 8 + for (int i = 0; i < 8; ++i) { + if (C10_LIKELY(thread_id < RADIX)) { + const int stride = 1 << i; + const int src_buffer = i & 1; + const int dst_buffer = src_buffer ^ 1; + + int value = shared_histogram[src_buffer][thread_id]; + if (thread_id < RADIX - stride) { + value += shared_histogram[src_buffer][thread_id + stride]; + } + shared_histogram[dst_buffer][thread_id] = value; + } + __syncthreads(); + } + }; + + 
compute_cumulative_sum(); + + // Find threshold bin where cumsum crosses remaining_k + if (thread_id < RADIX && shared_histogram[0][thread_id] > remaining_k && + shared_histogram[0][thread_id + 1] <= remaining_k) { + shared_threshold_bin = thread_id; + shared_buffered_count[0] = 0; + shared_output_count = 0; + } + __syncthreads(); + + const int threshold_bin = shared_threshold_bin; + remaining_k -= shared_histogram[0][threshold_bin + 1]; + + // Early exit if threshold bin perfectly matches remaining_k + if (remaining_k == 0) { + for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) { + const int bin = convert_to_uint8(logits[idx + logits_offset]); + if (bin > threshold_bin) { + const int output_pos = ::atomicAdd(&shared_output_count, 1); + output_indices[output_pos] = idx; + } + } + __syncthreads(); + return; + } + + // Prepare for refinement passes: Process threshold bin + __syncthreads(); + if (thread_id < RADIX + 1) { + shared_histogram[0][thread_id] = 0; + } + __syncthreads(); + + // Scan all elements and: + // 1. Write indices > threshold_bin to output + // 2. Buffer indices == threshold_bin for refinement + // 3. 
Build histogram for next refinement pass (fused optimization) + for (int idx = thread_id; idx < seq_len; idx += kThreadsPerBlock) { + const float logit_value = logits[idx + logits_offset]; + const int bin = convert_to_uint8(logit_value); + + if (bin > threshold_bin) { + // in top-k, write to output + const int output_pos = ::atomicAdd(&shared_output_count, 1); + output_indices[output_pos] = idx; + } else if (bin == threshold_bin) { + // Candidate for top-k, needs refinement + const int buffer_pos = ::atomicAdd(&shared_buffered_count[0], 1); + if (C10_LIKELY(buffer_pos < MAX_BUFFERED_ITEMS)) { + buffered_indices[0][buffer_pos] = idx; + // Fused: Build histogram for next pass + const uint32_t fp32_bits = convert_to_uint32_v2(logit_value); + const int next_bin = (fp32_bits >> 24) & 0xFF; + ::atomicAdd(&shared_histogram[0][next_bin], 1); + } + } + } + __syncthreads(); + + // ============================================================================ + // Passes 1-4: Refine using 8-bit passes over FP32 bits + // ============================================================================ + // FP32 bits [31:0] split into 4 bytes processed MSB-first: + // Pass 1: bits [31:24], Pass 2: bits [23:16], Pass 3: bits [15:8], Pass 4: + // bits [7:0] +#pragma unroll 4 + for (int pass = 0; pass < 4; ++pass) { + __shared__ int shared_final_k; // For final pass: remaining slots to fill + const int src_buffer = pass % 2; + const int dst_buffer = src_buffer ^ 1; + + // Clamp buffered count to prevent overflow + const int raw_buffered = shared_buffered_count[src_buffer]; + const int num_buffered = + (raw_buffered < MAX_BUFFERED_ITEMS) ? 
raw_buffered : MAX_BUFFERED_ITEMS; + + compute_cumulative_sum(); + + // Find threshold bin for this pass + if (thread_id < RADIX && shared_histogram[0][thread_id] > remaining_k && + shared_histogram[0][thread_id + 1] <= remaining_k) { + shared_threshold_bin = thread_id; + shared_buffered_count[dst_buffer] = 0; + shared_final_k = remaining_k - shared_histogram[0][thread_id + 1]; + } + __syncthreads(); + + const int threshold_bin = shared_threshold_bin; + remaining_k -= shared_histogram[0][threshold_bin + 1]; + + // Bit offset for this pass: 24, 16, 8, 0 + const int bit_offset = 24 - pass * 8; + + // Early exit if threshold bin perfectly matches + if (remaining_k == 0) { + for (int i = thread_id; i < num_buffered; i += kThreadsPerBlock) { + const int idx = buffered_indices[src_buffer][i]; + const uint32_t fp32_bits = + convert_to_uint32_v2(logits[idx + logits_offset]); + const int bin = (fp32_bits >> bit_offset) & 0xFF; + if (bin > threshold_bin) { + const int output_pos = ::atomicAdd(&shared_output_count, 1); + output_indices[output_pos] = idx; + } + } + __syncthreads(); + break; + } + + // Continue refinement + __syncthreads(); + if (thread_id < RADIX + 1) { + shared_histogram[0][thread_id] = 0; + } + __syncthreads(); + + for (int i = thread_id; i < num_buffered; i += kThreadsPerBlock) { + const int idx = buffered_indices[src_buffer][i]; + const float logit_value = logits[idx + logits_offset]; + const uint32_t fp32_bits = convert_to_uint32_v2(logit_value); + const int bin = (fp32_bits >> bit_offset) & 0xFF; + + if (bin > threshold_bin) { + // Definitely in top-k + const int output_pos = ::atomicAdd(&shared_output_count, 1); + output_indices[output_pos] = idx; + } else if (bin == threshold_bin) { + if (pass == 3) { + // Final pass (bits [7:0]): No more refinement possible + // Fill remaining slots in reverse order to maintain descending order + const int slot = ::atomicAdd(&shared_final_k, -1); + if (slot > 0) { + output_indices[TopK - slot] = idx; + } + } else { + 
// Buffer for next pass and build next histogram + const int buffer_pos = + ::atomicAdd(&shared_buffered_count[dst_buffer], 1); + if (C10_LIKELY(buffer_pos < MAX_BUFFERED_ITEMS)) { + buffered_indices[dst_buffer][buffer_pos] = idx; + // Fused: Build histogram for next pass + const int next_bit_offset = bit_offset - 8; + const int next_bin = (fp32_bits >> next_bit_offset) & 0xFF; + ::atomicAdd(&shared_histogram[0][next_bin], 1); + } + } + } + } + __syncthreads(); + } +} + +__global__ __launch_bounds__(kThreadsPerBlock) void topk_kernel( + const FastTopKParams params) { + const auto& [input, row_starts, indices, lengths, input_stride] = params; + const uint64_t batch_idx = blockIdx.x; + const int logits_offset = row_starts == nullptr ? 0 : row_starts[batch_idx]; + const int seq_len = lengths[batch_idx]; + int* output_indices = indices + batch_idx * TopK; + const float* logits = input + batch_idx * input_stride; + + if (seq_len <= TopK) { + // Shortcut: All elements are in top-k + return naive_topk_cuda(logits, output_indices, seq_len); + } else { + return fast_topk_cuda_tl(logits, output_indices, logits_offset, seq_len); + } +} + +FastTopKParams get_params( + const at::Tensor& score, const at::Tensor& lengths, + std::optional row_starts_opt = std::nullopt, + std::optional indices_opt = std::nullopt) { + const int64_t batch_size = score.size(0); + + TORCH_CHECK(score.dim() == 2 && score.stride(1) == 1, + "score must be 2D with contiguous rows"); + TORCH_CHECK(lengths.dim() == 1 && lengths.is_contiguous() && + lengths.size(0) == batch_size, + "lengths must be 1D contiguous with size matching batch"); + + const int32_t* row_starts_ptr = nullptr; + if (row_starts_opt.has_value()) { + const auto& row_starts = *row_starts_opt; + TORCH_CHECK(row_starts.dim() == 1 && row_starts.size(0) == batch_size, + "row_starts must be 1D with size matching batch"); + row_starts_ptr = row_starts.data_ptr(); + } + + int32_t* indices_ptr = nullptr; + if (indices_opt.has_value()) { + const 
auto& indices = *indices_opt; + TORCH_CHECK(indices.dim() == 2 && indices.is_contiguous() && + indices.size(0) == batch_size && indices.size(1) == TopK, + "indices must be 2D contiguous [batch, TopK]"); + indices_ptr = indices.data_ptr(); + } + + return FastTopKParams{ + .input = score.data_ptr(), + .row_starts = row_starts_ptr, + .indices = indices_ptr, + .lengths = lengths.data_ptr(), + .input_stride = score.stride(0), + }; +} + +template +void setup_kernel_smem_once() { + static const cudaError_t result = []() -> cudaError_t { +#ifdef USE_ROCM + auto func_ptr = reinterpret_cast(kernel_func); +#else + auto func_ptr = kernel_func; +#endif + return cudaFuncSetAttribute( + func_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes); + }(); + + TORCH_CHECK( + result == cudaSuccess, + "Failed to set kernel shared memory limit: ", cudaGetErrorString(result)); +} + +} // namespace vllm + +void large_context_topk( + const torch::Tensor& logits, torch::Tensor& indices, + const torch::Tensor& seq_lens, + c10::optional row_starts = c10::nullopt) { + TORCH_CHECK(logits.is_cuda(), "logits must be a CUDA tensor"); + TORCH_CHECK(indices.is_cuda(), "indices must be a CUDA tensor"); + TORCH_CHECK(seq_lens.is_cuda(), "seq_lens must be a CUDA tensor"); + if (row_starts.has_value()) { + TORCH_CHECK(row_starts->is_cuda(), "row_starts must be a CUDA tensor"); + } + + const auto params = vllm::get_params(logits, seq_lens, row_starts, indices); + const int64_t batch_size = logits.size(0); + + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const dim3 grid(static_cast(batch_size)); + const dim3 block(vllm::kThreadsPerBlock); + + vllm::setup_kernel_smem_once(); + vllm::topk_kernel<<>>(params); + + const cudaError_t result = cudaGetLastError(); + TORCH_CHECK(result == cudaSuccess, + "large_context_topk kernel failed: ", cudaGetErrorString(result)); +} \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 97c0e80e7..9766b15ea 
100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -190,6 +190,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "int numRows, int stride0, int stride1, int topK) -> ()"); ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode); + ops.def( + "large_context_topk(Tensor score, Tensor indices, Tensor lengths, " + "Tensor? " + "row_starts_opt) -> ()"); + ops.impl("large_context_topk", torch::kCUDA, &large_context_topk); + // Layernorm-quant // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( diff --git a/tests/kernels/test_top_k_per_row.py b/tests/kernels/test_top_k_per_row.py index 2d9dd2a04..9b96e6dfc 100644 --- a/tests/kernels/test_top_k_per_row.py +++ b/tests/kernels/test_top_k_per_row.py @@ -275,3 +275,114 @@ def test_top_k_per_row_decode_large_vocab_size(clean_logits: bool) -> None: _run_top_k_per_row_decode_test( top_k, batch_size, next_n, vocab_size, clean_logits, data_generation ) + + +@pytest.mark.skipif(not current_platform.is_cuda(), reason="This test requires CUDA") +@pytest.mark.parametrize("clean_logits", [True, False]) +@torch.inference_mode() +def test_deepseek_hybrid_topk(clean_logits: bool) -> None: + torch.set_default_device("cuda:0") + + top_k = 2048 + + # Test case 1: Short sequences (< 8192) + batch_size_short = 4 + next_n = 1 + num_rows_short = batch_size_short * next_n + + # Create sequences with max length < 8192 + seq_lens_short = torch.randint( + 4000, 8000, (batch_size_short,), dtype=torch.int32, device="cuda" + ) + + row_starts_short = torch.zeros(num_rows_short, dtype=torch.int32, device="cuda") + row_indices_short = torch.arange(num_rows_short, device="cuda") // next_n + next_n_offset_short = torch.arange(num_rows_short, device="cuda") % next_n + row_ends_short = ( + seq_lens_short[row_indices_short] - next_n + next_n_offset_short + 1 + ) + + logits_short = create_random_logits( + row_starts_short, row_ends_short, torch.float32, 42, clean_logits, "random" + ) + + indices_vllm 
= torch.empty( + (num_rows_short, top_k), dtype=torch.int32, device="cuda" + ) + + # Use vllm's kernel for short sequences + torch.ops._C.top_k_per_row_decode( + logits_short, + next_n, + seq_lens_short, + indices_vllm, + num_rows_short, + logits_short.stride(0), + logits_short.stride(1), + top_k, + ) + + # Test case 2: Long sequences (>= 8192) - should use large_context_topk kernel + batch_size_long = 4 + num_rows_long = batch_size_long * next_n + + # Create sequences with max length >= 8192 + seq_lens_long = torch.randint( + 8192, 16384, (batch_size_long,), dtype=torch.int32, device="cuda" + ) + + row_starts_long = torch.zeros(num_rows_long, dtype=torch.int32, device="cuda") + row_indices_long = torch.arange(num_rows_long, device="cuda") // next_n + next_n_offset_long = torch.arange(num_rows_long, device="cuda") % next_n + row_ends_long = seq_lens_long[row_indices_long] - next_n + next_n_offset_long + 1 + + logits_long = create_random_logits( + row_starts_long, row_ends_long, torch.float32, 43, clean_logits, "random" + ) + + indices = torch.empty((num_rows_long, top_k), dtype=torch.int32, device="cuda") + + # Use large_context_topk kernel for long sequences + if next_n == 1: + lengths = seq_lens_long + else: + offsets = torch.arange(next_n, device=logits_long.device, dtype=torch.int32) + lengths = (seq_lens_long.unsqueeze(1) - next_n + 1 + offsets).flatten() + + torch.ops._C.large_context_topk( + logits_long, + indices, + lengths, + None, + ) + + torch_indices_short = torch.empty( + (num_rows_short, top_k), dtype=torch.int32, device="cuda" + ) + for i in range(num_rows_short): + row_end = int(row_ends_short[i]) + k_i = min(top_k, row_end) + idx = logits_short[i, :row_end].topk(k_i, dim=-1)[1] + torch_indices_short[i, :k_i] = idx + + assert compare_top_k_results( + logits_short, + indices_vllm, + torch_indices_short, + row_starts_short, + row_ends_short, + top_k, + ), "top_k_per_row_decode kernel (short sequences) doesn't match torch.topk" + + torch_indices_long = 
torch.empty( + (num_rows_long, top_k), dtype=torch.int32, device="cuda" + ) + for i in range(num_rows_long): + row_end = int(row_ends_long[i]) + k_i = min(top_k, row_end) + idx = logits_long[i, :row_end].topk(k_i, dim=-1)[1] + torch_indices_long[i, :k_i] = idx + + assert compare_top_k_results( + logits_long, indices, torch_indices_long, row_starts_long, row_ends_long, top_k + ), "large_context_topk kernel (long sequences) doesn't match torch.topk" diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index 9ca7a42b7..bd063de74 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -126,6 +126,15 @@ def sparse_attn_indexer( topk_tokens, ) + # Compute lengths from row spans + # lengths = (chunk.cu_seqlen_ke - chunk.cu_seqlen_ks).to(torch.int32) + # torch.ops._C.large_context_topk( + # logits, + # topk_indices, + # lengths, + # chunk.cu_seqlen_ks, # row_starts + # ) + if has_decode: decode_metadata = attn_metadata.decode # kv_cache size requirement [num_block, block_size, n_head, head_dim], @@ -162,18 +171,37 @@ def sparse_attn_indexer( ) num_rows = logits.shape[0] - topk_indices = topk_indices_buffer[:num_padded_tokens, :topk_tokens] - torch.ops._C.top_k_per_row_decode( - logits, - next_n, - decode_metadata.seq_lens, - topk_indices, - num_rows, - logits.stride(0), - logits.stride(1), - topk_tokens, - ) + + if decode_metadata.use_large_context_topk: + if next_n == 1: + lengths = decode_metadata.seq_lens + else: + # (bs,) -> (bs, 1) + (next_n,) -> (bs, next_n) -> (bs * next_n,) + lengths = ( + decode_metadata.seq_lens.unsqueeze(1) + - next_n + + 1 + + decode_metadata.offsets + ).flatten() + + torch.ops._C.large_context_topk( + logits, + topk_indices, + lengths, + None, + ) + else: + torch.ops._C.top_k_per_row_decode( + logits, + next_n, + decode_metadata.seq_lens, + topk_indices, + num_rows, + logits.stride(0), + logits.stride(1), + 
topk_tokens, + ) if decode_metadata.requires_padding: # if padded, we need to unpack diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index 8c1ea1646..368b217f0 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -86,6 +86,8 @@ class DeepSeekV32IndexerDecodeMetadata: decode_lens: torch.Tensor requires_padding: bool schedule_metadata: torch.Tensor + use_large_context_topk: bool + offsets: torch.Tensor | None # Precomputed offsets for speculative decoding @dataclass @@ -320,6 +322,21 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder): # Use CPU to avoid GPU sync; breaking async scheduling requires_padding = (decode_lens_cpu.max() > decode_lens_cpu.min()).item() + # Decide which top-k kernel to use based on batch size and sequence length + batch_size = num_decodes + _is_large_context = common_attn_metadata.max_seq_len > 8192 + + # Decision logic based on micro-benchmark results: + # - large_context_topk wins for batch <= 128 and seq_len > 8K + # - top_k_per_row_decode wins for batch > 128 or seq_len <= 8K + use_large_context_topk = batch_size <= 128 and _is_large_context + + next_n = 1 + self.num_speculative_tokens + if next_n > 1: + offsets = torch.arange(next_n, device=self.device, dtype=torch.int32) + else: + offsets = None + seq_lens = common_attn_metadata.seq_lens[:num_decodes] if is_deep_gemm_supported(): self.scheduler_metadata_buffer[:] = get_paged_mqa_logits_metadata( @@ -331,6 +348,8 @@ class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder): decode_lens=decode_lens, requires_padding=requires_padding, schedule_metadata=self.scheduler_metadata_buffer, + use_large_context_topk=use_large_context_topk, + offsets=offsets, ) attn_metadata = DeepseekV32IndexerMetadata( -- GitLab From c5a66d16970fbbc4633761d30f12ec1fc98a9523 Mon Sep 17 00:00:00 2001 From: junuxyz <216036880+junuxyz@users.noreply.github.com> Date: Wed, 11 Feb 2026 
00:46:24 +0900 Subject: [PATCH 0052/1166] [Core][BugFix] Fix PP KV cache sharding memory validation (#33698) Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com> --- tests/v1/core/test_kv_cache_utils.py | 93 ++++++++++++++++++++++ vllm/v1/core/kv_cache_utils.py | 114 ++++++++++++++++++--------- 2 files changed, 168 insertions(+), 39 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index d97362e06..b91d59e46 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1046,6 +1046,99 @@ def test_get_kv_cache_configs_multiple_workers(): ) +@pytest.mark.parametrize( + "asymmetric_memory", + [False, True], + ids=["symmetric", "asymmetric"], +) +def test_get_kv_cache_configs_pp_sharding(asymmetric_memory): + model_config = ModelConfig(max_model_len=512) + vllm_config = VllmConfig(model_config=model_config) + + ref_kv_cache_spec = new_kv_cache_spec() + pp_kv_cache_specs = [ + {"layer1": ref_kv_cache_spec}, + {"layer2": ref_kv_cache_spec}, + ] + + expected_num_blocks = model_config.max_model_len // ref_kv_cache_spec.block_size + 1 + avail_memory = ref_kv_cache_spec.page_size_bytes * expected_num_blocks + + # With per-worker validation, each worker only needs memory for its own + # layers. Worker 2 having more memory shouldn't affect worker 1's config. 
+ available_memory = ( + [avail_memory, avail_memory * 2] if asymmetric_memory else [avail_memory] * 2 + ) + + kv_cache_configs = get_kv_cache_configs( + vllm_config, + pp_kv_cache_specs, + available_memory, + ) + + assert kv_cache_configs == [ + KVCacheConfig( + num_blocks=expected_num_blocks, + kv_cache_tensors=[ + KVCacheTensor( + size=ref_kv_cache_spec.page_size_bytes * expected_num_blocks, + shared_by=["layer1"], + ), + ], + kv_cache_groups=[KVCacheGroupSpec(["layer1"], ref_kv_cache_spec)], + ), + KVCacheConfig( + num_blocks=expected_num_blocks, + kv_cache_tensors=[ + KVCacheTensor( + size=ref_kv_cache_spec.page_size_bytes * expected_num_blocks, + shared_by=["layer2"], + ), + ], + kv_cache_groups=[KVCacheGroupSpec(["layer2"], ref_kv_cache_spec)], + ), + ] + + +def test_project_kv_cache_groups_to_worker(): + spec_a = new_kv_cache_spec() + spec_b = new_kv_cache_spec(num_kv_heads=4) + + global_groups = [ + KVCacheGroupSpec(["layer1", "layer2", "layer3"], spec_a), + ] + worker_spec = {"layer1": spec_a, "layer2": spec_a} + projected = kv_cache_utils._project_kv_cache_groups_to_worker( + global_groups, worker_spec + ) + assert len(projected) == 1 + assert projected[0].layer_names == ["layer1", "layer2"] + assert projected[0].kv_cache_spec is spec_a + + projected = kv_cache_utils._project_kv_cache_groups_to_worker( + global_groups, {"layer4": spec_a} + ) + assert len(projected) == 1 + assert projected[0].layer_names == [] + assert projected[0].kv_cache_spec is spec_a + + uniform_spec = UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs={"layer1": spec_a, "layer2": spec_b, "layer3": spec_a}, + ) + global_groups_uniform = [ + KVCacheGroupSpec(["layer1", "layer2", "layer3"], uniform_spec), + ] + projected = kv_cache_utils._project_kv_cache_groups_to_worker( + global_groups_uniform, {"layer1": spec_a, "layer3": spec_a} + ) + assert len(projected) == 1 + assert projected[0].layer_names == ["layer1", "layer3"] + proj_spec = projected[0].kv_cache_spec + assert 
isinstance(proj_spec, UniformTypeKVCacheSpecs) + assert set(proj_spec.kv_cache_specs.keys()) == {"layer1", "layer3"} + + def test_merge_kv_cache_spec(): same_layer_specs = [ new_kv_cache_spec(num_kv_heads=32), diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index fd12dfe04..2f59e71a1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -7,6 +7,7 @@ import os from collections import defaultdict from collections.abc import Callable, Iterable, Iterator, Sequence from dataclasses import dataclass, replace +from functools import partial from typing import Any, NewType, TypeAlias, overload from vllm import envs @@ -1390,7 +1391,7 @@ def _estimate_max_model_len_from_groups( def _auto_fit_max_model_len( vllm_config: VllmConfig, - kv_cache_groups: list[KVCacheGroupSpec], + projected_groups_per_worker: list[list[KVCacheGroupSpec]], available_memory: list[int], ) -> None: """ @@ -1401,14 +1402,13 @@ def _auto_fit_max_model_len( Args: vllm_config: The global VllmConfig (will be modified in-place) - kv_cache_groups: The global KV cache groups (from get_kv_cache_groups). - This correctly accounts for padding in hybrid models. + projected_groups_per_worker: KV cache groups projected to each worker. available_memory: Memory available for KV cache in bytes for each worker. """ original_max = vllm_config.model_config.max_model_len - if not kv_cache_groups: + if all(not groups for groups in projected_groups_per_worker): # All workers have empty specs (attention-free model) logger.info_once( "Auto-fit max_model_len: attention-free model, " @@ -1418,11 +1418,16 @@ def _auto_fit_max_model_len( ) return - # Use minimum available memory across all workers - min_available_memory = min(available_memory) - auto_fit_max = _estimate_max_model_len_from_groups( - vllm_config, kv_cache_groups, min_available_memory - ) + # Find the max_model_len that fits across all workers. 
+ auto_fit_max = original_max + limiting_worker_mem = available_memory[0] + for groups, avail_mem in zip(projected_groups_per_worker, available_memory): + if not groups: + continue + worker_max = _estimate_max_model_len_from_groups(vllm_config, groups, avail_mem) + if worker_max < auto_fit_max: + auto_fit_max = worker_max + limiting_worker_mem = avail_mem if auto_fit_max <= 0: raise ValueError( @@ -1446,11 +1451,47 @@ def _auto_fit_max_model_len( "available GPU memory (%s GiB available for KV cache)", original_max, auto_fit_max, - format_gib(min_available_memory), + format_gib(limiting_worker_mem), scope="local", ) +def _project_kv_cache_groups_to_worker( + global_kv_cache_groups: list[KVCacheGroupSpec], + worker_spec: dict[str, KVCacheSpec], +) -> list[KVCacheGroupSpec]: + """ + Projects global KV cache groups onto a single worker's assigned layers. + + In pipeline parallelism, each worker only owns a subset of layers. This + function filters the global groups to include only layers present on the + given worker, adjusting UniformTypeKVCacheSpecs accordingly. + + Args: + global_kv_cache_groups: The global KV cache groups for the whole model. + worker_spec: The KV cache spec of each layer on this worker. + + Returns: + The projected KV cache groups containing only this worker's layers. 
+ """ + projected_groups: list[KVCacheGroupSpec] = [] + for group in global_kv_cache_groups: + worker_layer_names = [ + layer_name for layer_name in group.layer_names if layer_name in worker_spec + ] + group_spec = group.kv_cache_spec + if worker_layer_names and isinstance(group_spec, UniformTypeKVCacheSpecs): + group_spec = UniformTypeKVCacheSpecs( + block_size=group_spec.block_size, + kv_cache_specs={ + layer_name: group_spec.kv_cache_specs[layer_name] + for layer_name in worker_layer_names + }, + ) + projected_groups.append(KVCacheGroupSpec(worker_layer_names, group_spec)) + return projected_groups + + def get_kv_cache_configs( vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], @@ -1468,7 +1509,8 @@ def get_kv_cache_configs( the whole model. 2. Generate the KV cache groups based on the layer ratio of the whole model. This also handles spec unification for hybrid models. - 3. Handle auto-fit max_model_len and memory checks using the unified specs. + 3. Handle auto-fit max_model_len and memory checks using per-worker + projected groups to account for PP sharding. 4. Generate the KV cache configs for each worker based on the KV cache grouping strategy. (This is reasonable because the layer ratio of different PP stages are similar.) @@ -1506,44 +1548,38 @@ def get_kv_cache_configs( # If original_max_model_len was -1, automatically # determine the maximum model length that fits in available GPU memory. - # We use the global groups here to correctly account for padding. + # We use per-worker projected groups to account for PP sharding. 
+ projected_groups_per_worker = [ + _project_kv_cache_groups_to_worker(global_kv_cache_groups, worker_spec) + for worker_spec in kv_cache_specs + ] + if vllm_config.model_config.original_max_model_len == -1: - _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory) + _auto_fit_max_model_len( + vllm_config, projected_groups_per_worker, available_memory + ) - # Check if the available memory is enough (using min across all workers). - # We use the global groups to correctly account for padding. - if global_kv_cache_groups: + # Check if the available memory is enough per worker. + for groups, avail_mem in zip(projected_groups_per_worker, available_memory): + if not groups: + continue _check_enough_kv_cache_memory( - min(available_memory), - lambda: _max_memory_usage_bytes_from_groups( - vllm_config, global_kv_cache_groups - ), + avail_mem, + partial(_max_memory_usage_bytes_from_groups, vllm_config, groups), vllm_config.model_config.max_model_len, - lambda am: _estimate_max_model_len_from_groups( - vllm_config, global_kv_cache_groups, am - ), + partial(_estimate_max_model_len_from_groups, vllm_config, groups), ) kv_cache_configs: list[KVCacheConfig] = [] - for kv_cache_spec_one_worker, available_memory_one_worker in zip( - kv_cache_specs, available_memory + for projected_groups, kv_cache_spec_one_worker, available_memory_one_worker in zip( + projected_groups_per_worker, kv_cache_specs, available_memory ): - kv_cache_groups_one_worker: list[KVCacheGroupSpec] = [] - for group in global_kv_cache_groups: - group_layer_names_one_worker = [ - layer_name - for layer_name in group.layer_names - if layer_name in kv_cache_spec_one_worker - ] - kv_cache_groups_one_worker.append( - KVCacheGroupSpec(group_layer_names_one_worker, group.kv_cache_spec) - ) - assert sum( - len(group.layer_names) for group in kv_cache_groups_one_worker - ) == len(kv_cache_spec_one_worker), "Some layers are not assigned to any group." 
+ assert sum(len(group.layer_names) for group in projected_groups) == len( + kv_cache_spec_one_worker + ), "Some layers are not assigned to any group." kv_cache_configs.append( get_kv_cache_config_from_groups( - vllm_config, kv_cache_groups_one_worker, available_memory_one_worker + vllm_config, projected_groups, available_memory_one_worker ) ) -- GitLab From 000214c4bb3f4fb61989eea19c625aedd0559ace Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Tue, 10 Feb 2026 19:57:11 +0400 Subject: [PATCH 0053/1166] [BUGFIX] Fix accuracy bugs in Qwen3-Next MTP (#34077) Signed-off-by: Vadim Gimpelson --- vllm/v1/attention/backends/gdn_attn.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 41109ff41..c7a41abe5 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -208,7 +208,9 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] non_spec_query_lens = query_lens[~spec_sequence_masks] num_decodes = (non_spec_query_lens == 1).sum().item() - num_prefills = non_spec_query_lens.size(0) - num_decodes + # Exclude zero-length padded sequences from prefill count. 
+ num_zero_len = (non_spec_query_lens == 0).sum().item() + num_prefills = non_spec_query_lens.size(0) - num_decodes - num_zero_len num_decode_tokens = num_decodes num_prefill_tokens = non_spec_query_lens.sum().item() - num_decode_tokens num_spec_decode_tokens = ( @@ -228,9 +230,15 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] non_spec_token_indx = torch.empty( 0, dtype=torch.int32, device=query_start_loc.device ) - spec_state_indices_tensor = block_table_tensor[:, : self.num_spec + 1] + # Filter by spec_sequence_masks to exclude padded sequences + spec_state_indices_tensor = block_table_tensor[ + spec_sequence_masks, : self.num_spec + 1 + ] non_spec_state_indices_tensor = None - spec_query_start_loc = query_start_loc + # Padded sequences are always at the back, so the first + # num_spec_decodes + 1 entries of query_start_loc already + # contain the correct cumulative token counts. + spec_query_start_loc = query_start_loc[: num_spec_decodes + 1] non_spec_query_start_loc = None non_spec_query_start_loc_cpu = None else: @@ -294,6 +302,12 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] else: has_initial_state = None + # This function relies on the presence of either non-spec decodes or spec decodes,
+ assert not (num_decodes > 0 and num_spec_decodes > 0), ( + f"num_decodes: {num_decodes}, num_spec_decodes: {num_spec_decodes}" + ) + # Prepare tensors for cudagraph # Note: m.num_actual_tokens is already padded by the model runner for CUDAGraph batch_size = m.num_actual_tokens @@ -312,7 +326,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] spec_state_indices_tensor[num_spec_decodes:].fill_(PAD_SLOT_ID) self.spec_sequence_masks[:num_spec_decodes].copy_( - spec_sequence_masks, non_blocking=True + spec_sequence_masks[:num_spec_decodes], non_blocking=True ) spec_sequence_masks = self.spec_sequence_masks[:batch_size] spec_sequence_masks[num_spec_decodes:].fill_(False) -- GitLab From f84a2a8f318abdec197b957babe13c9766abb4ed Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 10 Feb 2026 17:34:43 +0100 Subject: [PATCH 0054/1166] [Docs] Speed up build environment set-up (#34240) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .readthedocs.yaml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index d83d6df35..f372a3fb8 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -9,13 +9,14 @@ build: python: "3.12" jobs: post_checkout: - - git fetch --unshallow || true + - git fetch origin main --unshallow --no-tags --filter=blob:none || true + pre_create_environment: + - pip install uv + create_environment: + - uv venv $READTHEDOCS_VIRTUALENV_PATH + install: + - uv pip install --python $READTHEDOCS_VIRTUALENV_PATH/bin/python --no-cache-dir -r requirements/docs.txt mkdocs: configuration: mkdocs.yaml fail_on_warning: true - -# Optionally declare the Python requirements required to build your docs -python: - install: - - requirements: requirements/docs.txt -- GitLab From a2443de5fa4a0605607f6c3d9219022c7f6ac480 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 10 Feb 2026 08:55:22 -0800 Subject: 
[PATCH 0055/1166] [Model Runner V2] Use pinned memory for write_contents (#34222) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/buffer_utils.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/vllm/v1/worker/gpu/buffer_utils.py b/vllm/v1/worker/gpu/buffer_utils.py index 84d1a6ba0..d2cb20186 100644 --- a/vllm/v1/worker/gpu/buffer_utils.py +++ b/vllm/v1/worker/gpu/buffer_utils.py @@ -7,9 +7,11 @@ import numpy as np import torch from vllm.triton_utils import tl, triton -from vllm.utils.math_utils import next_power_of_2 from vllm.utils.platform_utils import is_uva_available -from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor +from vllm.utils.torch_utils import ( + async_tensor_h2d, + get_accelerator_view_from_cpu_tensor, +) def async_copy_to_gpu( @@ -117,6 +119,7 @@ class StagedWriteTensor: ) self.num_rows = size if isinstance(size, int) else size[0] self.dtype = dtype + self.device = device self.max_concurrency = max_concurrency if not uva_instead_of_gpu: @@ -137,8 +140,6 @@ class StagedWriteTensor: self.write_indices = new_buffer(self.num_rows, dtype=torch.int32) self.write_starts = new_buffer(self.num_rows, dtype=torch.int32) - init_size = next_power_of_2(self.num_rows) - self.write_contents = new_buffer(init_size, dtype=dtype) self.write_cu_lens = new_buffer(self.num_rows, dtype=torch.int32) def stage_write( @@ -170,21 +171,9 @@ class StagedWriteTensor: cu_lens_uva = self.write_cu_lens.copy_to_uva(self._staged_write_cu_lens) # Special handling for write_contents - diff_len = len(self._staged_write_contents) - assert isinstance(self.write_contents.size, int) - if diff_len > self.write_contents.size: - # Re-allocate a larger buffer for the write_contents - new_size = next_power_of_2(diff_len) - self.write_contents = UvaBufferPool( - new_size, dtype=self.dtype, max_concurrency=self.max_concurrency - ) - # NOTE(woosuk): Since the previous write_contents buffer is released, - # we perform a 
synchronization here to ensure that all data transfers - # involving the old buffer have finished before allocating a new one. - # This prevents potential race conditions. The slight overhead is - # negligible because the reallocations are infrequent in practice. - torch.cuda.synchronize() - contents_uva = self.write_contents.copy_to_uva(self._staged_write_contents) + write_contents = async_tensor_h2d( + self._staged_write_contents, self.dtype, self.device, pin_memory=True + ) # Write diffs to the GPU buffer _apply_write_kernel[(n,)]( @@ -192,7 +181,7 @@ class StagedWriteTensor: self.gpu.stride(0), indices_uva, starts_uva, - contents_uva, + write_contents, cu_lens_uva, BLOCK_SIZE=1024, ) -- GitLab From ae871ca9234be3f6cb6966d998e51a7cb672f912 Mon Sep 17 00:00:00 2001 From: Andy Lo Date: Tue, 10 Feb 2026 18:18:30 +0000 Subject: [PATCH 0056/1166] Minor cleanup for Voxtral (#34247) Signed-off-by: Andy Lo --- vllm/model_executor/models/voxtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index a33454005..581664aec 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -792,7 +792,9 @@ class VoxtralEncoderModel(nn.Module): audio_waveforms: torch.Tensor, ) -> torch.Tensor: input_dtype = audio_waveforms.dtype - window = torch.hann_window(self.config.window_size).to(audio_waveforms.device) + window = torch.hann_window( + self.config.window_size, device=audio_waveforms.device + ) stft = torch.stft( audio_waveforms, self.config.window_size, -- GitLab From 1f5febb4b8587378a38ea7050503c3cf0431eef6 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 10 Feb 2026 13:35:58 -0500 Subject: [PATCH 0057/1166] [UX nit] Fix non-default api_server_count message (#34152) Signed-off-by: mgoin --- vllm/entrypoints/cli/serve.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 
8dfa19e16..c12cc7ff2 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -108,6 +108,7 @@ class ServeSubcommand(CLISubcommand): run_multi_api_server(args) else: # Single API server (this process). + args.api_server_count = None uvloop.run(run_server(args)) def validate(self, args: argparse.Namespace) -> None: -- GitLab From 33bcd3dc3bf4d581c051400c8d9bb9433d2c87af Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 10 Feb 2026 10:55:35 -0800 Subject: [PATCH 0058/1166] [Misc] Introduce ec_both role EC (encoder cache) connector (#34182) Signed-off-by: Qi Wang --- vllm/config/ec_transfer.py | 6 +++--- vllm/distributed/ec_transfer/ec_connector/base.py | 5 +++++ vllm/v1/worker/ec_connector_model_runner_mixin.py | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/config/ec_transfer.py b/vllm/config/ec_transfer.py index c7f56557f..a3a927d51 100644 --- a/vllm/config/ec_transfer.py +++ b/vllm/config/ec_transfer.py @@ -7,8 +7,8 @@ from typing import Any, Literal, get_args from vllm.config.utils import config -ECProducer = Literal["ec_producer"] -ECConsumer = Literal["ec_consumer"] +ECProducer = Literal["ec_producer", "ec_both"] +ECConsumer = Literal["ec_consumer", "ec_both"] ECRole = Literal[ECProducer, ECConsumer] @@ -33,7 +33,7 @@ class ECTransferConfig: ec_role: ECRole | None = None """Whether this vLLM instance produces, consumes EC cache, or both. Choices - are 'ec_producer', 'ec_consumer'.""" + are 'ec_producer', 'ec_consumer', 'ec_both'.""" ec_rank: int | None = None """The rank of this vLLM instance in the EC cache transfer. 
Typical value: diff --git a/vllm/distributed/ec_transfer/ec_connector/base.py b/vllm/distributed/ec_transfer/ec_connector/base.py index 2c212c29c..7f1407d0c 100644 --- a/vllm/distributed/ec_transfer/ec_connector/base.py +++ b/vllm/distributed/ec_transfer/ec_connector/base.py @@ -63,6 +63,7 @@ class ECConnectorBase(ABC): self._role = role if vllm_config.ec_transfer_config is not None: self._is_producer = vllm_config.ec_transfer_config.is_ec_producer + self._is_consumer = vllm_config.ec_transfer_config.is_ec_consumer else: raise ValueError("ec_transfer_config must be set for ECConnectorBase") @@ -74,6 +75,10 @@ class ECConnectorBase(ABC): def is_producer(self) -> bool: return self._is_producer + @property + def is_consumer(self) -> bool: + return self._is_consumer + # ============================== # Worker-side methods # ============================== diff --git a/vllm/v1/worker/ec_connector_model_runner_mixin.py b/vllm/v1/worker/ec_connector_model_runner_mixin.py index 1a347a0b9..4d785c4ef 100644 --- a/vllm/v1/worker/ec_connector_model_runner_mixin.py +++ b/vllm/v1/worker/ec_connector_model_runner_mixin.py @@ -72,7 +72,8 @@ class ECConnectorModelRunnerMixin: assert scheduler_output.ec_connector_metadata is not None ec_connector.bind_connector_metadata(scheduler_output.ec_connector_metadata) - if not ec_connector.is_producer: + # Load caches for consumer or both roles + if ec_connector.is_consumer: ec_connector.start_load_caches(encoder_cache, **kwargs) try: -- GitLab From fdd6f2ad58b113fe0fdc3fd9998e63d6064b5f16 Mon Sep 17 00:00:00 2001 From: Reagan Lee <96998476+reaganjlee@users.noreply.github.com> Date: Tue, 10 Feb 2026 11:44:31 -0800 Subject: [PATCH 0059/1166] Convert online APIs to use Renderer (#34084) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Reagan Lee <“reaganjlee@gmail.com”> Co-authored-by: Reagan Lee <“reaganjlee@gmail.com”> --- .../openai/speech_to_text/speech_to_text.py | 26 
+++++++++++++++---- vllm/entrypoints/serve/disagg/serving.py | 16 +++++++++--- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index 454359ffd..8d8f0e6b7 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -471,15 +471,31 @@ class OpenAISpeechToText(OpenAIServing): lora_request=lora_request, ) - list_result_generator = [ - self.engine_client.generate( + trace_headers = ( + None + if raw_request is None + else await self._get_trace_headers(raw_request.headers) + ) + + list_result_generator = [] + for i, prompt in enumerate(prompts): + request_id_item = f"{request_id}_{i}" + engine_request = self.input_processor.process_inputs( + request_id_item, prompt, sampling_params, - f"{request_id}_{i}", lora_request=lora_request, + trace_headers=trace_headers, + priority=0, + ) + list_result_generator.append( + self.engine_client.generate( + engine_request, + sampling_params, + request_id_item, + lora_request=lora_request, + ) ) - for i, prompt in enumerate(prompts) - ] except ValueError as e: return self.create_error_response(e) diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py index 0e61f5ec0..81fab153e 100644 --- a/vllm/entrypoints/serve/disagg/serving.py +++ b/vllm/entrypoints/serve/disagg/serving.py @@ -99,8 +99,6 @@ class ServingTokens(OpenAIServing): if raw_request: raw_request.state.request_metadata = request_metadata - # TODO(NickLucche): Change to EngineCoreRequest once Renderer work is - # completed engine_prompts = await self._preprocess_completion( request, prompt_input=request.token_ids, @@ -132,16 +130,26 @@ class ServingTokens(OpenAIServing): tok_params = request.build_tok_params(self.model_config) tokenization_kwargs = tok_params.get_encode_kwargs() - result_generator = self.engine_client.generate( + 
engine_request = self.input_processor.process_inputs( + request_id, engine_prompt, sampling_params, - request_id, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, trace_headers=trace_headers, priority=request.priority, ) + result_generator = self.engine_client.generate( + engine_request, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + tokenization_kwargs=tokenization_kwargs, + ) + except ValueError as e: return self.create_error_response(str(e)) -- GitLab From 506ad7d7c178ac20f2140cfaac1ae657683e8013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=20Sepp=C3=A4nen?= <83203+jseppanen@users.noreply.github.com> Date: Tue, 10 Feb 2026 22:38:17 +0200 Subject: [PATCH 0060/1166] [Bugfix] Fix weights offloading for sleep mode (#32947) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jarno Seppänen Co-authored-by: Tyler Michael Smith --- vllm/v1/worker/gpu_worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 969627170..1c526bab9 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -286,9 +286,10 @@ class Worker(WorkerBase): # to hijack tensor allocation. 
def load_model(self) -> None: eep_scale_up = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1" - with self._maybe_get_memory_pool_context( - tag="weights" - ) and set_current_vllm_config(self.vllm_config): + with ( + self._maybe_get_memory_pool_context(tag="weights"), + set_current_vllm_config(self.vllm_config), + ): self.model_runner.load_model(eep_scale_up=eep_scale_up) def update_config(self, overrides: dict[str, Any]) -> None: -- GitLab From 4293c00b84b968ed25f80dfd2af3bb34d1eeeef6 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Tue, 10 Feb 2026 16:04:07 -0500 Subject: [PATCH 0061/1166] [Benchmarks] Fix attention benchmark smoke test (#34269) Signed-off-by: Matthew Bonanni --- .buildkite/test_areas/benchmarks.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml index 57080c46f..a30ec60ea 100644 --- a/.buildkite/test_areas/benchmarks.yaml +++ b/.buildkite/test_areas/benchmarks.yaml @@ -22,9 +22,10 @@ steps: device: b200 num_gpus: 2 optional: true + working_dir: "/vllm-workspace/" timeout_in_minutes: 10 source_file_dependencies: - benchmarks/attention_benchmarks/ - vllm/v1/attention/ commands: - - python benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 + - python3 benchmarks/attention_benchmarks/benchmark.py --backends flash flashinfer --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1 -- GitLab From 9615575afc0d9a7d5fe98b65ac2a7150b068472e Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 10 Feb 2026 13:12:31 -0800 Subject: [PATCH 0062/1166] [Bugfix] Fix mamba cache dtype for Qwen3.5 (#34200) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_5.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 61ff6946c..808db2d6f 100644 --- 
a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -870,8 +870,9 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid) cls, vllm_config: "VllmConfig", ) -> tuple[torch.dtype, torch.dtype]: + mamba_ssm_dtype = vllm_config.model_config.hf_text_config.mamba_ssm_dtype return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype + vllm_config.model_config.dtype, mamba_ssm_dtype ) @classmethod -- GitLab From 578977bb5ed208c62cf9cff80d955836775e0d24 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Tue, 10 Feb 2026 13:18:43 -0800 Subject: [PATCH 0063/1166] [SM100] Resubmit FMHA FP8 prefill for MLA (#31195) Signed-off-by: Pavani Majety --- tests/v1/attention/test_mla_backends.py | 7 +- vllm/config/attention.py | 3 + .../layers/attention/mla_attention.py | 158 +++++++++++++++--- 3 files changed, 145 insertions(+), 23 deletions(-) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 815274e1c..ba70c8251 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -27,7 +27,7 @@ from vllm.v1.attention.backend import CommonAttentionMetadata from vllm.v1.attention.backends.fa_utils import flash_attn_supports_mla from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported -from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.kv_cache_interface import MLAAttentionSpec BACKENDS_TO_TEST = [ AttentionBackendEnum.CUTLASS_MLA, @@ -512,7 +512,7 @@ class MockMLAAttentionLayer(AttentionLayerBase): def run_attention_backend( backend: AttentionBackendEnum, - kv_cache_spec: FullAttentionSpec, + kv_cache_spec: MLAAttentionSpec, layer_names: list[str], vllm_config, device: torch.device, @@ -989,7 +989,7 @@ def test_backend_correctness( kv_cache = 
kv_cache_per_block_size[block_size] # Create kv_cache_spec with the correct block_size for this backend - backend_kv_cache_spec = FullAttentionSpec( + backend_kv_cache_spec = MLAAttentionSpec( block_size=block_size, num_kv_heads=vllm_config.model_config.get_num_kv_heads( vllm_config.parallel_config @@ -997,6 +997,7 @@ def test_backend_correctness( head_size=vllm_config.model_config.get_head_size(), dtype=vllm_config.model_config.dtype, sliding_window=vllm_config.model_config.get_sliding_window(), + cache_dtype_str=vllm_config.cache_config.cache_dtype, ) backend_output = run_attention_backend( diff --git a/vllm/config/attention.py b/vllm/config/attention.py index 9379b2878..97a139c79 100644 --- a/vllm/config/attention.py +++ b/vllm/config/attention.py @@ -43,6 +43,9 @@ class AttentionConfig: disable_flashinfer_q_quantization: bool = False """If set, when using fp8 kv, do not quantize Q to fp8.""" + use_prefill_query_quantization: bool = False + """If set, quantize query for attention in prefill.""" + def compute_hash(self) -> str: """ Provide a hash that uniquely identifies all the configs diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index c31aa7b41..c44bf1f16 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -1052,6 +1052,7 @@ class MLACommonPrefillMetadata: query_seq_lens: torch.Tensor | None = None workspace_buffer: torch.Tensor | None = None q_data_type: torch.dtype | None = None + output_dtype: torch.dtype | None = None @dataclass @@ -1145,6 +1146,7 @@ def is_deepseek_r1_mla_compatible(vllm_config: VllmConfig) -> bool: return qk_nope_head_dim == 128 and qk_rope_head_dim == 64 and v_head_dim == 128 +@functools.cache def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. 
@@ -1162,6 +1164,7 @@ def use_flashinfer_prefill() -> bool: return is_deepseek_r1_mla_compatible(vllm_config) +@functools.cache def use_cudnn_prefill() -> bool: from vllm.config import get_current_vllm_config @@ -1174,6 +1177,7 @@ def use_cudnn_prefill() -> bool: ) +@functools.cache def use_trtllm_ragged_deepseek_prefill() -> bool: """Check if TRT-LLM ragged DeepSeek prefill should be used.""" from vllm.config import get_current_vllm_config @@ -1210,6 +1214,27 @@ def get_mla_dims(model_config: ModelConfig) -> MLADims: ) +@functools.cache +def backend_supports_prefill_query_quantization() -> bool: + """Check if the selected MLA backend supports prefill query quantization. + + Currently supported backends: + - FlashInfer prefill + - TRT-LLM ragged DeepSeek prefill + + Not supported: + - cuDNN Prefill + - FlashAttention + - Non-GB200 devices (FP8 prefill requires device capability 100) + """ + # FP8 prefill query quantization requires GB200 (device capability 100) + # for the necessary FP8 kernels at the moment. + if not current_platform.is_device_capability_family(100): + return False + + return use_flashinfer_prefill() or use_trtllm_ragged_deepseek_prefill() + + class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): """ NOTE: Please read the comment at the top of the file before trying to @@ -1262,6 +1287,40 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): return chunked_prefill_workspace_size + @staticmethod + def determine_prefill_query_data_type( + vllm_config: VllmConfig, + model_dtype: torch.dtype, + ) -> torch.dtype: + """ + Determine the query data type for prefill queries. + Return FP8 dtype if cache is FP8 and prefill query quantization + is enabled, else model dtype. 
+ """ + use_fp8 = ( + vllm_config.cache_config.cache_dtype.startswith("fp8") + and vllm_config.attention_config.use_prefill_query_quantization + and backend_supports_prefill_query_quantization() + ) + + if use_fp8: + fp8_dtype = current_platform.fp8_dtype() + logger.info_once( + "FP8 prefill attention enabled: query data type is FP8", scope="local" + ) + return fp8_dtype + elif vllm_config.attention_config.use_prefill_query_quantization: + logger.info_once( + "Unable to perform FP8 prefill attention when" + " use_prefill_query_quantization is enabled. Please" + " ensure that --kv-cache-dtype is set to fp8 and your prefill" + " backend is compatible with FP8 attention.", + scope="local", + ) + return model_dtype + + return model_dtype + def __init__( self, kv_cache_spec: AttentionSpec, @@ -1285,6 +1344,12 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.num_heads = self.model_config.get_num_attention_heads(parallel_config) self.mla_dims = get_mla_dims(self.model_config) self.aot_schedule = current_platform.is_cuda() + + self.kv_cache_spec = kv_cache_spec + self.q_data_type = self.determine_prefill_query_data_type( + vllm_config, self.model_config.dtype + ) + try: self.dcp_world_size = get_dcp_group().world_size self.dcp_rank = get_dcp_group().rank_in_group @@ -1325,7 +1390,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): self.chunked_prefill_workspace_size, self.model_config.get_head_size(), ), - dtype=self.model_config.dtype, + dtype=self.q_data_type, device=device, ) @@ -1435,7 +1500,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): sm_scale=self._global_hyperparameters.sm_scale, window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, - q_data_type=self.model_config.dtype, + q_data_type=self.q_data_type, + o_data_type=prefill.output_dtype, ) # Prepare context prefills @@ -1454,7 +1520,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): 
sm_scale=self._global_hyperparameters.sm_scale, window_left=self._global_hyperparameters.window_left, logits_soft_cap=self._global_hyperparameters.logits_soft_cap, - q_data_type=self.model_config.dtype, + q_data_type=self.q_data_type, + o_data_type=prefill.output_dtype, ) prefill.prefill_main = self._fi_prefill_main @@ -1709,6 +1776,8 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]): query_start_loc=prefill_query_start_loc, max_query_len=max_query_len, chunked_context=chunked_context_metadata, + output_dtype=self.model_config.dtype, + q_data_type=self.q_data_type, ) if self._use_cudnn_prefill: @@ -1894,7 +1963,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): self.kv_b_proj = kv_b_proj self.indexer = indexer self.q_pad_num_heads = q_pad_num_heads - self.supports_quant_query_input = True # Use flashinfer's optimized concat_mla_k kernel when available. @@ -2129,6 +2197,14 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): assert prefill.query_seq_lens is not None assert prefill.workspace_buffer is not None + # allocate BF16 / FP16 output tensor for TRT-LLM ragged attention + out = torch.empty( + q.shape[0], + q.shape[1], + v.shape[2], + device=q.device, + dtype=prefill.output_dtype, + ) ret = trtllm_ragged_attention_deepseek( query=q, @@ -2148,6 +2224,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): enable_pdl=False, is_causal=True, return_lse=return_softmax_lse, + out=out, ) if isinstance(ret, tuple): @@ -2170,7 +2247,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): q.shape[1], v.shape[2], device=q.device, - dtype=q.dtype, + dtype=prefill.output_dtype, ) prefill.workspace_buffer.fill_(0) @@ -2240,29 +2317,59 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): prefill_metadata = attn_metadata.prefill assert prefill_metadata.chunked_context is not None + use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype() + output = None iters = len(prefill_metadata.chunked_context.seq_tot) workspace = 
prefill_metadata.chunked_context.workspace + + if use_fp8_prefill: + q = q.to(prefill_metadata.q_data_type) + for i in range(iters): toks = prefill_metadata.chunked_context.seq_tot[i] - ops.gather_and_maybe_dequant_cache( - src_cache=kv_c_and_k_pe_cache, - dst=workspace, - block_table=prefill_metadata.block_table, - cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i], - token_to_seq=prefill_metadata.chunked_context.token_to_seq[i], - num_tokens=prefill_metadata.chunked_context.chunk_total_token[i], - kv_cache_dtype=self.kv_cache_dtype, - scale=k_scale, - seq_starts=prefill_metadata.chunked_context.starts[i], - ) + if not use_fp8_prefill: + ops.gather_and_maybe_dequant_cache( + src_cache=kv_c_and_k_pe_cache, + dst=workspace, + block_table=prefill_metadata.block_table, + cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i], + token_to_seq=prefill_metadata.chunked_context.token_to_seq[i], + num_tokens=prefill_metadata.chunked_context.chunk_total_token[i], + kv_cache_dtype=self.kv_cache_dtype, + scale=k_scale, + seq_starts=prefill_metadata.chunked_context.starts[i], + ) + else: + # FP8 path: gather cache without dequantization + ops.cp_gather_cache( + src_cache=kv_c_and_k_pe_cache, + dst=workspace, + block_table=prefill_metadata.block_table, + cu_seq_lens=prefill_metadata.chunked_context.cu_seq_lens[i], + batch_size=attn_metadata.num_prefills, + seq_starts=prefill_metadata.chunked_context.starts[i], + ) + # Extract kv_c_normed from workspace kv_c_normed = workspace[:toks][..., : self.kv_lora_rank] - k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1) + # When FP8 weights are used without FP8 prefill, kv_b_proj expects + # model dtype input and will quantize internally. 
+ if ( + use_fp8_prefill + or self.kv_b_proj.weight.dtype != current_platform.fp8_dtype() + ): + kv_c_normed = kv_c_normed.to(self.kv_b_proj.weight.dtype) + k_pe = workspace[:toks][..., self.kv_lora_rank :].unsqueeze(1) kv_nope = self.kv_b_proj(kv_c_normed)[0].view( -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim ) + + # To Do: Use epilogue of kv_b_proj to generate fp8 kv_nope. + if use_fp8_prefill: + kv_nope = kv_nope.to(prefill_metadata.q_data_type) + k_pe = k_pe.to(prefill_metadata.q_data_type) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) k = self._concat_k_nope_k_pe(k_nope, k_pe) @@ -2412,16 +2519,27 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): assert attn_metadata.prefill is not None assert self.dcp_world_size != -1 - has_context = attn_metadata.prefill.chunked_context is not None + prefill_metadata = attn_metadata.prefill + use_fp8_prefill = prefill_metadata.q_data_type == current_platform.fp8_dtype() + + # Convert q to FP8 if FP8 prefill attention is enabled + if use_fp8_prefill: + q = q.to(prefill_metadata.q_data_type) + + has_context = prefill_metadata.chunked_context is not None + kv_nope = self.kv_b_proj(kv_c_normed)[0].view( -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = self._concat_k_nope_k_pe(k_nope, k_pe) + if use_fp8_prefill: + k = k.to(prefill_metadata.q_data_type) + v = v.to(prefill_metadata.q_data_type) + output_prefill = self._run_prefill_new_tokens( - prefill=attn_metadata.prefill, + prefill=prefill_metadata, q=q, k=k, v=v, -- GitLab From f0ca0671c70fae6d1562127e3330eeaedf4abb3f Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:45:38 -0600 Subject: [PATCH 0064/1166] [Feature] Warn about unrecognized environment variables (#33581) Signed-off-by: Gregory Shtrasberg --- tests/config/test_config_generation.py | 24 
++++++++++++++++++++++++ vllm/engine/arg_utils.py | 12 ++++++++++++ vllm/envs.py | 9 +++++++++ 3 files changed, 45 insertions(+) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 23ceb920c..225ac0f22 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -78,3 +78,27 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch): ) ray.shutdown() + + +def test_unrecognized_env(): + import os + + # Test that if fail_on_environ_validation is True, then an error + # is raised when an unrecognized vLLM environment variable is set + os.environ["VLLM_UNRECOGNIZED_ENV_VAR"] = "some_value" + engine_args = EngineArgs( + fail_on_environ_validation=True, + ) + with pytest.raises(ValueError, match="Unknown vLLM environment variable detected"): + engine_args.create_engine_config() + + # Test that if fail_on_environ_validation is False, then no error is raised + engine_args = EngineArgs() + engine_args.create_engine_config() + + # Test that when the unrecognized env var is removed, no error is raised + os.environ.pop("VLLM_UNRECOGNIZED_ENV_VAR", None) + engine_args = EngineArgs( + fail_on_environ_validation=True, + ) + engine_args.create_engine_config() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c7c78ffd8..2d1e2feb9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -593,6 +593,8 @@ class EngineArgs: "weight_transfer_config", ) + fail_on_environ_validation: bool = False + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -1239,6 +1241,14 @@ class EngineArgs: help="Log aggregate rather than per-engine statistics " "when using data parallelism.", ) + + parser.add_argument( + "--fail-on-environ-validation", + help="If set, the engine will raise an error if " + "environment validation fails.", + default=False, + action=argparse.BooleanOptionalAction, + ) return parser 
@classmethod @@ -1396,6 +1406,8 @@ class EngineArgs: device_config = DeviceConfig(device=cast(Device, current_platform.device_type)) + envs.validate_environ(self.fail_on_environ_validation) + # Check if the model is a speculator and override model/tokenizer/config # BEFORE creating ModelConfig, so the config is created with the target model # Skip speculator detection for cloud storage models (eg: S3, GCS) since diff --git a/vllm/envs.py b/vllm/envs.py index 3af85be0a..314f42758 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1606,6 +1606,15 @@ def is_set(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def validate_environ(hard_fail: bool) -> None: + for env in os.environ: + if env.startswith("VLLM_") and env not in environment_variables: + if hard_fail: + raise ValueError(f"Unknown vLLM environment variable detected: {env}") + else: + logger.warning("Unknown vLLM environment variable detected: %s", env) + + def compile_factors() -> dict[str, object]: """Return env vars used for torch.compile cache keys. 
-- GitLab From 67132945bbad23233fd583e6106ebebe859c8366 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 10 Feb 2026 23:19:10 +0100 Subject: [PATCH 0065/1166] [Perf] Move eplb rebalance algo to async thread (#30888) Signed-off-by: ilmarkov Signed-off-by: Tyler Michael Smith Co-authored-by: Tyler Michael Smith Co-authored-by: Tyler Michael Smith --- tests/distributed/test_eplb_execute.py | 7 +- vllm/distributed/eplb/async_worker.py | 107 +++++++++++++--- vllm/distributed/eplb/eplb_state.py | 142 +++++++++++++-------- vllm/distributed/eplb/rebalance_execute.py | 59 +++++---- vllm/distributed/parallel_state.py | 39 +++++- 5 files changed, 251 insertions(+), 103 deletions(-) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index f8f950084..48afc39c6 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -295,12 +295,11 @@ def _test_async_transfer_layer_without_mtp_worker( for layer_idx in range(num_layers): is_unchanged, is_received_locally, recv_metadata = asyncio.run( transfer_layer( - old_global_expert_indices=old_indices_cpu, - new_global_expert_indices=new_indices_cpu, - expert_weights=expert_weights, + old_layer_indices=old_indices_cpu[layer_idx], + new_layer_indices=new_indices_cpu[layer_idx], + expert_weights=expert_weights[layer_idx], expert_weights_buffer=expert_buffer, ep_group=ep_group, - layer=layer_idx, cuda_stream=cuda_stream, ) ) diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py index fbafaf888..b81c7fa9c 100644 --- a/vllm/distributed/eplb/async_worker.py +++ b/vllm/distributed/eplb/async_worker.py @@ -11,13 +11,13 @@ from typing import TYPE_CHECKING import torch from torch.distributed import ProcessGroup -from vllm.distributed.parallel_state import get_ep_group +from vllm.distributed.parallel_state import get_eplb_group from vllm.logger import init_logger from .rebalance_execute import transfer_layer if TYPE_CHECKING: - 
from .eplb_state import EplbState + from .eplb_state import EplbModelState, EplbState logger = init_logger(__name__) @@ -27,8 +27,8 @@ def start_async_worker( rank_mapping: dict[int, int] | None = None, is_profile: bool = False, ) -> threading.Thread: - ep_group = get_ep_group().device_group - rank = ep_group.rank() + eplb_group = get_eplb_group().device_group + rank = eplb_group.rank() device_index = state.cuda_device_index assert state.is_async @@ -42,7 +42,7 @@ def start_async_worker( loop.run_until_complete( transfer_run_periodically( state=state, - ep_group=ep_group, + eplb_group=eplb_group, cuda_stream=cuda_stream, is_profile=is_profile, rank_mapping=rank_mapping, @@ -58,9 +58,53 @@ def start_async_worker( return thread +def run_rebalance_experts( + model_state: "EplbModelState", + eplb_state: "EplbState", + physical_to_logical_map_cpu: torch.Tensor, +) -> None: + assert model_state.eplb_stats is not None + eplb_stats = model_state.eplb_stats + + # Wait for the main thread's all-reduce and clone to complete before + # accessing the global_expert_load_window tensor. + assert model_state.window_ready_event is not None + model_state.window_ready_event.wait() + model_state.window_ready_event = None + + # Move the global expert load window to CPU for computation. 
+ global_expert_load_window = eplb_stats.global_expert_load_window.cpu() + # Compute new expert mappings for the model + ( + new_physical_to_logical_map, + new_logical_to_physical_map, + new_logical_replica_count, + ) = eplb_state.policy.rebalance_experts( + global_expert_load_window, + eplb_stats.num_replicas, + eplb_stats.num_groups, + eplb_stats.num_nodes, + eplb_stats.num_gpus, + physical_to_logical_map_cpu, + ) + assert new_physical_to_logical_map.device == torch.device("cpu") + + model_state.new_physical_to_logical_map = new_physical_to_logical_map + + max_slots = model_state.logical_to_physical_map.shape[-1] + padded_logical = torch.nn.functional.pad( + new_logical_to_physical_map, + (0, max(0, max_slots - new_logical_to_physical_map.shape[-1])), + value=-1, + ).to(model_state.logical_to_physical_map.device) + new_replica = new_logical_replica_count.to(model_state.logical_replica_count.device) + model_state.new_logical_to_physical_map = padded_logical + model_state.new_logical_replica_count = new_replica + + async def transfer_run_periodically( state: "EplbState", - ep_group: ProcessGroup, + eplb_group: ProcessGroup, cuda_stream: torch.cuda.Stream, is_profile: bool = False, rank_mapping: dict[int, int] | None = None, @@ -71,23 +115,51 @@ async def transfer_run_periodically( assert state.is_async for model_state in state.model_states.values(): + rebalancing_algorithm_executed = False + physical_to_logical_map_cpu = None current_num_layers = model_state.model.num_moe_layers while ( model_state.rebalanced and model_state.layer_to_transfer < current_num_layers ): - if ( - not model_state.ep_buffer_ready - and model_state.rebalanced - and model_state.new_physical_to_logical_map is not None - ): - await asyncio.to_thread(model_state.buffer_lock.acquire) + if not model_state.ep_buffer_ready and model_state.rebalanced: + # Polling the lock directly in the async thread avoids + # the thread switch overhead of asyncio.to_thread. 
+ # This is typically faster than offloading to a worker thread. + while not model_state.buffer_lock.acquire(blocking=False): + await asyncio.sleep(0) try: if model_state.layer_to_transfer >= current_num_layers: break + if ( + not rebalancing_algorithm_executed + or model_state.new_physical_to_logical_map is None + ): + # Move the physical_to_logical_map to CPU + # for rebalancing and transfer_layer. + physical_to_logical_map_cpu = ( + model_state.physical_to_logical_map.cpu() + ) + run_rebalance_experts( + model_state, state, physical_to_logical_map_cpu + ) + rebalancing_algorithm_executed = True + logger.info( + "Async worker computed new indices for model %s", + model_state.model_name, + ) + + assert model_state.new_physical_to_logical_map is not None + assert physical_to_logical_map_cpu is not None + + layer_idx = model_state.layer_to_transfer + old_layer_indices = physical_to_logical_map_cpu[layer_idx] + new_layer_indices = model_state.new_physical_to_logical_map[ + layer_idx + ] # Wait for the main thread to finish consuming the buffer - # before overwriting it + # before initiating an EPLB transfer on another layer. 
if model_state.buffer_consumed_event is not None: cuda_stream.wait_event(model_state.buffer_consumed_event) model_state.buffer_consumed_event = None @@ -97,13 +169,12 @@ async def transfer_run_periodically( model_state.is_received_locally, model_state.recv_metadata, ) = await transfer_layer( - old_global_expert_indices=model_state.physical_to_logical_map, - new_global_expert_indices=model_state.new_physical_to_logical_map, - expert_weights=model_state.model.expert_weights, + old_layer_indices=old_layer_indices, + new_layer_indices=new_layer_indices, + expert_weights=model_state.model.expert_weights[layer_idx], expert_weights_buffer=model_state.expert_buffer, - ep_group=ep_group, + ep_group=eplb_group, is_profile=is_profile, - layer=model_state.layer_to_transfer, cuda_stream=cuda_stream, rank_mapping=rank_mapping, ) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 1c84aeb15..7c3701b4e 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -55,6 +55,35 @@ from .rebalance_execute import ( logger = init_logger(__name__) +@dataclass +class EplbStats: + """ + Model stats used in EPLB rebalancing algorithm. + """ + + global_expert_load_window: torch.Tensor + """ + Experts load window. + Shape: (window_size, num_moe_layers, num_physical_experts) + """ + num_replicas: int + """ + Number of physical experts. + """ + num_groups: int + """ + Number of expert groups. + """ + num_nodes: int + """ + Number of nodes. + """ + num_gpus: int + """ + Number of GPUs. + """ + + @dataclass class EplbModelState: """EPLB metrics.""" @@ -156,6 +185,11 @@ class EplbModelState: CUDA event recorded after the main thread finishes consuming the buffer. The async worker waits on this before writing to the buffer again. """ + window_ready_event: torch.cuda.Event | None + """ + CUDA event recorded after all-reduce and clone on the main thread. 
+ The async worker waits on this before accessing global_expert_load_window. + """ ep_buffer_ready: int """ The flag indicates whether the expert buffer is ready for transfer. @@ -173,6 +207,10 @@ class EplbModelState: """ Whether the async EPLB needs to poll peers for buffer readiness. """ + eplb_stats: EplbStats | None + """ + EPLB stats for the model. + """ is_unchanged: np.ndarray """ intermediate variable between `move_to_buffer` and `move_to_workspace`. @@ -508,10 +546,12 @@ class EplbState: buffer_lock=threading.Lock(), buffer_ready_event=None, buffer_consumed_event=None, + window_ready_event=None, ep_buffer_ready=0, layer_to_transfer=0, rebalanced=False, pending_global_ready_check=False, + eplb_stats=None, is_unchanged=np.array([]), is_received_locally=np.array([]), recv_metadata=RecvMetadata( @@ -642,20 +682,6 @@ class EplbState: ep_group=ep_group, is_profile=is_profile, ) - if ( - eplb_model_state.layer_to_transfer - >= eplb_model_state.model.num_moe_layers - ): - self.post_eplb(eplb_model_state, is_profile) - eplb_model_state.rebalanced = False - eplb_model_state.layer_to_transfer = 0 - eplb_model_state.pending_global_ready_check = False - logger.info( - "finish async transfer for model %s rank %d layer %d", - eplb_model_state.model_name, - ep_group.rank(), - eplb_model_state.model.num_moe_layers, - ) if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval: if self.is_async and any( @@ -802,21 +828,21 @@ class EplbState: for eplb_model_state, global_expert_load_window in zip( self.model_states.values(), global_expert_load_windows ): - # Get new expert mappings for the model - ( - new_physical_to_logical_map, - new_logical_to_physical_map, - new_logical_replica_count, - ) = self.policy.rebalance_experts( - global_expert_load_window, - num_replicas, - num_groups, - num_nodes, - num_gpus, - eplb_model_state.physical_to_logical_map, - ) - if not self.is_async or is_profile: + # Get new expert mappings for the model + ( + 
new_physical_to_logical_map, + new_logical_to_physical_map, + new_logical_replica_count, + ) = self.policy.rebalance_experts( + global_expert_load_window, + num_replicas, + num_groups, + num_nodes, + num_gpus, + eplb_model_state.physical_to_logical_map, + ) + # Update expert weights rearrange_expert_weights_inplace( eplb_model_state.physical_to_logical_map, @@ -873,27 +899,25 @@ class EplbState: gpu_elapsed, ) else: - max_slots = eplb_model_state.logical_to_physical_map.shape[-1] - padded_logical = torch.nn.functional.pad( - new_logical_to_physical_map, - (0, max(0, max_slots - new_logical_to_physical_map.shape[-1])), - value=-1, - ).to(eplb_model_state.logical_to_physical_map.device) - new_replica = new_logical_replica_count.to( - eplb_model_state.logical_replica_count.device + eplb_model_state.eplb_stats = EplbStats( + # We copy the tensor to snapshot the global_expert_load_window + # on the main thread so that async worker can access it safely + # while the main thread is running. + global_expert_load_window=global_expert_load_window.clone(), + num_replicas=num_replicas, + num_groups=num_groups, + num_nodes=num_nodes, + num_gpus=num_gpus, ) - - # Move map to cpu in advance - eplb_model_state.new_physical_to_logical_map = ( - new_physical_to_logical_map.cpu() - ) - eplb_model_state.new_logical_to_physical_map = padded_logical - eplb_model_state.new_logical_replica_count = new_replica + # Record event after clone to signal async worker + # that load stats data is ready + sync_event = torch.cuda.Event() + sync_event.record() + eplb_model_state.window_ready_event = sync_event eplb_model_state.rebalanced = True eplb_model_state.layer_to_transfer = 0 eplb_model_state.pending_global_ready_check = True - # Signal async thread to start transferring layers if self.is_async and (not is_profile): self.rearrange_event.set() @@ -925,11 +949,13 @@ class EplbState: target_device = model_state.physical_to_logical_map.device new_physical = model_state.new_physical_to_logical_map 
+ # If the number of physical experts has changed, then the new map needs to + # be copied synchronously to avoid a race condition with the async worker if model_state.physical_to_logical_map.shape[1] != new_physical.shape[1]: model_state.physical_to_logical_map = new_physical.to(target_device) else: model_state.physical_to_logical_map[layer].copy_( - new_physical[layer].to(target_device) + new_physical[layer].to(target_device, non_blocking=True) ) logical_device = model_state.logical_to_physical_map.device @@ -1004,11 +1030,9 @@ class EplbState: model_state.layer_to_transfer ] expert_weights_buffer = model_state.expert_buffer - new_indices = ( - model_state.new_physical_to_logical_map[model_state.layer_to_transfer] - .cpu() - .numpy() - ) + new_indices = model_state.new_physical_to_logical_map[ + model_state.layer_to_transfer + ].numpy() move_from_buffer( expert_weights=expert_weights, expert_weights_buffers=expert_weights_buffer, @@ -1019,7 +1043,7 @@ class EplbState: ep_rank=ep_group.rank(), ) # Record event after consuming buffer to signal async thread - # that it's safe to overwrite the buffer + # that it's safe to overwrite the intermediate buffer consumed_event = torch.cuda.Event() consumed_event.record() model_state.buffer_consumed_event = consumed_event @@ -1034,6 +1058,18 @@ class EplbState: model_state.model_name, transferred_layer, ) + if model_state.layer_to_transfer >= model_state.model.num_moe_layers: + self.post_eplb(model_state, is_profile) + model_state.rebalanced = False + model_state.layer_to_transfer = 0 + model_state.pending_global_ready_check = False + logger.info( + "finish async transfer for model %s rank %d layer %d", + model_state.model_name, + ep_group.rank(), + model_state.model.num_moe_layers, + ) + finally: try: model_state.buffer_lock.release() @@ -1048,9 +1084,7 @@ class EplbState: assert model_state.new_physical_to_logical_map is not None assert model_state.new_logical_to_physical_map is not None assert 
model_state.new_logical_replica_count is not None - if not is_profile: - for layer_idx in range(model_state.physical_to_logical_map.shape[0]): - self._update_layer_mapping_from_new(model_state, layer_idx) + model_state.new_physical_to_logical_map = None model_state.new_logical_to_physical_map = None model_state.new_logical_replica_count = None diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 72bbe1c5d..1be1e2483 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -434,13 +434,12 @@ def move_from_buffer( async def transfer_layer( - old_global_expert_indices: torch.Tensor, - new_global_expert_indices: torch.Tensor, - expert_weights: Sequence[Sequence[torch.Tensor]], + old_layer_indices: torch.Tensor, + new_layer_indices: torch.Tensor, + expert_weights: Sequence[torch.Tensor], expert_weights_buffer: Sequence[torch.Tensor], ep_group: ProcessGroup, is_profile: bool = False, - layer: int = 0, cuda_stream: torch.cuda.Stream | None = None, rank_mapping: dict[int, int] | None = None, ) -> MoveToBufferResult: @@ -451,56 +450,64 @@ async def transfer_layer( while keys are physical. Args: - old_global_expert_indices: Shape (num_moe_layers, num_physical_experts). - new_global_expert_indices: Shape (num_moe_layers, num_physical_experts). - expert_weights: A sequence of shape (num_moe_layers)(weight_count) - of tensors of shape (num_local_physical_experts, hidden_size_i). - For example, a linear layer may have up and down projection, - so weight_count = 2. Each weight's hidden size can be different. + old_layer_indices: Shape (num_physical_experts,). + new_layer_indices: Shape (num_physical_experts,). + expert_weights: Iterable of weight tensors for this layer, each with shape + (num_local_physical_experts, hidden_size_i). + For example, a linear layer may have up and down projection. + expert_weights_buffer: Intermediate buffers (one per weight tensor). 
ep_group: The device process group for expert parallelism. is_profile (bool): If `True`, do not perform any actual weight copy. This is used during profile run, where we only perform dummy communications to reserve enough memory for the buffers. + cuda_stream: CUDA stream for async copies (can be None for sync mode). + rank_mapping: Optional rank mapping for elastic expert parallelism. Returns: - is_unchanged (np.ndarray): (1, num_local_experts), True where expert + is_unchanged (np.ndarray): (num_local_experts,), True where expert is left unchanged. - is_received_locally (np.ndarray): (1, num_local_experts), True where expert + is_received_locally (np.ndarray): (num_local_experts,), True where expert can be received locally. RecvMetadata: Metadata needed for completing remote weight transfers. """ ep_size = ep_group.size() if rank_mapping is not None: + # Add a layer dimension for compatibility with mapping functions + old_layer_indices_2d = old_layer_indices.unsqueeze(0) + new_layer_indices_2d = new_layer_indices.unsqueeze(0) + if len(rank_mapping) == ep_group.size(): # scale down - new_global_expert_indices = _map_new_expert_indices_with_rank_mapping( - new_global_expert_indices, + new_layer_indices_2d = _map_new_expert_indices_with_rank_mapping( + new_layer_indices_2d, rank_mapping, ) else: # scale up - old_global_expert_indices = _map_old_expert_indices_with_rank_mapping( - old_global_expert_indices, + old_layer_indices_2d = _map_old_expert_indices_with_rank_mapping( + old_layer_indices_2d, rank_mapping, ep_group.size(), ) - assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1] - num_moe_layers, num_physical_experts = old_global_expert_indices.shape - assert len(expert_weights) == num_moe_layers + # Remove the layer dimension + old_layer_indices = old_layer_indices_2d.squeeze(0) + new_layer_indices = new_layer_indices_2d.squeeze(0) + + assert old_layer_indices.shape == new_layer_indices.shape + num_physical_experts = 
old_layer_indices.shape[0] assert len(expert_weights[0]) >= 1 - num_local_physical_experts = expert_weights[0][0].shape[0] - assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) + num_local_physical_experts = expert_weights[0].shape[0] assert num_physical_experts == ep_size * num_local_physical_experts - old_global_expert_indices_np = old_global_expert_indices.cpu().numpy() - new_global_expert_indices_np = new_global_expert_indices.cpu().numpy() + old_layer_indices_np = old_layer_indices.cpu().numpy() + new_layer_indices_np = new_layer_indices.cpu().numpy() is_unchanged, is_received_locally, recv_metadata = move_to_buffer( num_local_experts=num_local_physical_experts, - old_indices=old_global_expert_indices_np[layer], - new_indices=new_global_expert_indices_np[layer], - expert_weights=expert_weights[layer], + old_indices=old_layer_indices_np, + new_indices=new_layer_indices_np, + expert_weights=expert_weights, expert_weights_buffers=expert_weights_buffer, cuda_stream=cuda_stream, ep_group=ep_group, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 00366f96c..b8b2607ff 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1143,6 +1143,18 @@ def get_ep_group() -> GroupCoordinator: return _EP +_EPLB: GroupCoordinator | None = None + + +def get_eplb_group() -> GroupCoordinator: + assert _EPLB is not None, ( + "EPLB group is not initialized. " + "EPLB group is only created for MoE models when EPLB is enabled. " + "Ensure parallel_config.enable_eplb is True." + ) + return _EPLB + + _PCP: GroupCoordinator | None = None @@ -1440,12 +1452,29 @@ def initialize_model_parallel( _EP = init_model_parallel_group( group_ranks, get_world_group().local_rank, backend, group_name="ep" ) + + # Create EPLB group with the same ranks as EP if EPLB is enabled. 
+ # This is a separate process group to isolate EPLB communications + # from MoE forward pass collectives and prevent deadlocks when + # using torch.distributed in execution with torch.distributed in EPLB. + global _EPLB + assert _EPLB is None, "EPLB group is already initialized" + if ( + config is not None + and config.parallel_config is not None + and config.parallel_config.enable_eplb + ): + # Reuse the same group_ranks from EP + _EPLB = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="eplb" + ) # If no EP group needed, _EP remains None + # If no EPLB group needed, _EPLB remains None logger.info_once( "rank %s in world size %s is assigned as " "DP rank %s, PP rank %s, PCP rank %s, " - "TP rank %s, EP rank %s", + "TP rank %s, EP rank %s, EPLB rank %s", rank, world_size, _DP.rank_in_group, @@ -1453,6 +1482,7 @@ def initialize_model_parallel( _PCP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group if _EP is not None else "N/A", + _EPLB.rank_in_group if _EPLB is not None else "N/A", ) @@ -1514,6 +1544,8 @@ def prepare_communication_buffer_for_model(model: torch.nn.Module): _DP.prepare_communication_buffer_for_model(model) if _EP is not None: _EP.prepare_communication_buffer_for_model(model) + if _EPLB is not None: + _EPLB.prepare_communication_buffer_for_model(model) def model_parallel_is_initialized(): @@ -1608,6 +1640,11 @@ def destroy_model_parallel(): _EP.destroy() _EP = None + global _EPLB + if _EPLB: + _EPLB.destroy() + _EPLB = None + def destroy_distributed_environment(): global _WORLD, _NODE_COUNT -- GitLab From bb2fc8b5e7beca9c5749e464b4607c753db0b630 Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 10 Feb 2026 23:34:47 +0100 Subject: [PATCH 0066/1166] [BugFix] Fix async EPLB hang with DeepEP LL all2all backend (#32860) Signed-off-by: ilmarkov --- vllm/distributed/eplb/eplb_utils.py | 54 +++++++++++++++++++++++++++++ vllm/v1/worker/gpu_worker.py | 2 ++ 2 files changed, 56 insertions(+) create mode 100644 
vllm/distributed/eplb/eplb_utils.py diff --git a/vllm/distributed/eplb/eplb_utils.py b/vllm/distributed/eplb/eplb_utils.py new file mode 100644 index 000000000..455848341 --- /dev/null +++ b/vllm/distributed/eplb/eplb_utils.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions for EPLB (Expert Parallel Load Balancing).""" + +import os + +from vllm.config import ParallelConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def override_envs_for_eplb(parallel_config: ParallelConfig) -> None: + """ + Override environment variables for EPLB when specific conditions are met. + + Args: + parallel_config: The parallel configuration object. + """ + is_data_parallel = parallel_config.data_parallel_size > 1 + is_eplb_enabled = parallel_config.enable_eplb + async_eplb = parallel_config.eplb_config.use_async + is_deepep_ll = parallel_config.all2all_backend == "deepep_low_latency" + + # Override NCCL_MAX_CTAS to avoid hangs when using async EPLB with the + # DeepEP low-latency backend. + # + # The hang happens when two ranks interleave kernel launches differently + # between NCCL collectives (used by async EPLB weight exchange) and DeepEP + # low-latency (LL) kernels. DeepEP LL uses a cooperative launch and tries + # to reserve a large fraction of the GPU's SMs; if those SMs are currently + # occupied by NCCL, the DeepEP LL launch blocks until enough SMs are + # freed. + # + # If rank A enters DeepEP LL in main thread while rank B is still executing + # NCCL in async thread, rank A can block waiting for SMs, while rank B can + # block inside NCCL waiting for rank A to participate in the collective. + # This circular wait causes a deadlock. + # Limiting NCCL occupancy via NCCL_MAX_CTAS leaves space for the DeepEP + # cooperative kernel to launch and complete, breaking the deadlock. 
+ # See: https://github.com/deepseek-ai/DeepEP/issues/496 + if is_data_parallel and is_eplb_enabled and is_deepep_ll and async_eplb: + current_value_str = os.getenv("NCCL_MAX_CTAS") + + if current_value_str and current_value_str.isdigit(): + return + + override_value = 8 + os.environ["NCCL_MAX_CTAS"] = str(override_value) + logger.info_once( + f"EPLB: Setting NCCL_MAX_CTAS={override_value} " + "for expert parallel with EPLB and deepep_low_latency backend", + scope="global", + ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1c526bab9..2b7d9ff29 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -22,6 +22,7 @@ from vllm.distributed import ( set_custom_all_reduce, ) from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized +from vllm.distributed.eplb.eplb_utils import override_envs_for_eplb from vllm.distributed.kv_transfer import ( ensure_kv_transfer_initialized, ensure_kv_transfer_shutdown, @@ -1035,6 +1036,7 @@ def init_worker_distributed_environment( from vllm.model_executor.layers.batch_invariant import init_batch_invariance init_batch_invariance(attention_config.backend) + override_envs_for_eplb(parallel_config) set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_method = distributed_init_method or "env://" -- GitLab From 6f2f59f2b333151aac19f8ca7bf71d83c1a7c068 Mon Sep 17 00:00:00 2001 From: Zhengkai Zhang <33679250+ZhengkaiZ@users.noreply.github.com> Date: Tue, 10 Feb 2026 14:52:43 -0800 Subject: [PATCH 0067/1166] [Misc][Spec Decode] support different load config for draft model (#34022) Signed-off-by: zzhengkai Co-authored-by: zzhengkai --- vllm/config/speculative.py | 5 +++++ vllm/model_executor/model_loader/__init__.py | 3 ++- vllm/v1/spec_decode/eagle.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 8117349d8..47e4a7bbb 100644 --- a/vllm/config/speculative.py +++ 
b/vllm/config/speculative.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Literal, get_args from pydantic import Field, SkipValidation, model_validator from typing_extensions import Self +from vllm.config import LoadConfig from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.utils import config @@ -160,6 +161,10 @@ class SpeculativeConfig: tokens with estimated probability (based on frequency counts) greater than or equal to this value.""" + draft_load_config: LoadConfig | None = None + """Load config for the draft model. If not specified, will use the load + config from the target model.""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index e1d8d2ead..ff95d5b94 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -128,8 +128,9 @@ def get_model( vllm_config: VllmConfig, model_config: ModelConfig | None = None, prefix: str = "", + load_config: LoadConfig | None = None, ) -> nn.Module: - loader = get_model_loader(vllm_config.load_config) + loader = get_model_loader(load_config or vllm_config.load_config) if model_config is None: model_config = vllm_config.model_config return loader.load_model( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index d29ee00fa..b5532d652 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1286,6 +1286,7 @@ class SpecDecodeBaseProposer: model = get_model( vllm_config=self.vllm_config, model_config=self.speculative_config.draft_model_config, + load_config=self.speculative_config.draft_load_config, ) return model -- GitLab From 341eed3d30b7579b730e9959213d83b5dbd4731c Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 10 Feb 2026 18:02:31 -0500 Subject: [PATCH 0068/1166] [torch.compile] Disable recursive 
pre_grad_passes (#34092) Signed-off-by: Richard Zou --- vllm/compilation/compiler_interface.py | 15 ++++++++++++++- vllm/envs.py | 10 ++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 1d5adb185..c00486af6 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -257,7 +257,20 @@ class InductorStandaloneAdaptor(CompilerInterface): if use_aot: compile_kwargs["aot"] = True # type: ignore[assignment] - compiled_graph = standalone_compile(graph, example_inputs, **compile_kwargs) + # Inductor's pre-grad passes don't do anything for vLLM. + # The pre-grad passes get run even on cache-hit and negatively impact + # vllm cold compile times by O(1s) + # Can remove this after the following issue gets fixed + # https://github.com/pytorch/pytorch/issues/174502 + if envs.VLLM_ENABLE_PREGRAD_PASSES: + ctx: Any = contextlib.nullcontext() + else: + ctx = patch( + "torch._inductor.compile_fx._recursive_pre_grad_passes", + lambda gm, _: gm, + ) + with ctx: + compiled_graph = standalone_compile(graph, example_inputs, **compile_kwargs) if use_aot: from torch._inductor.standalone_compile import AOTCompiledArtifact diff --git a/vllm/envs.py b/vllm/envs.py index 314f42758..039b3239c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -132,6 +132,7 @@ if TYPE_CHECKING: VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_SIZE: int = 1 VLLM_USE_STANDALONE_COMPILE: bool = True + VLLM_ENABLE_PREGRAD_PASSES: bool = False VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 @@ -568,6 +569,15 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_USE_STANDALONE_COMPILE", "1" ) == "1", + # Inductor's pre-grad passes don't do anything for vLLM. 
+ # The pre-grad passes get run even on cache-hit and negatively impact + # vllm cold compile times by O(1s) + # Can remove this after the following issue gets fixed + # https://github.com/pytorch/pytorch/issues/174502 + "VLLM_ENABLE_PREGRAD_PASSES": lambda: os.environ.get( + "VLLM_ENABLE_PREGRAD_PASSES", "0" + ) + == "1", # Debug pattern matching inside custom passes. # Should be set to the fx.Node name (e.g. 'getitem_34' or 'scaled_mm_3'). "VLLM_PATTERN_MATCH_DEBUG": lambda: os.environ.get( -- GitLab From c4b9e6778f9d8054c1665b2d1c2cb0ee36e9e2f5 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 10 Feb 2026 18:13:20 -0500 Subject: [PATCH 0069/1166] [Misc] Add pre-commit hook to catch boolean ops in with-statements (#34271) Signed-off-by: Tyler Michael Smith Co-authored-by: Claude Opus 4.6 --- .pre-commit-config.yaml | 5 ++ .../check_boolean_context_manager.py | 70 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tools/pre_commit/check_boolean_context_manager.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db7321b93..33460222e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -143,6 +143,11 @@ repos: name: Check attention backend documentation is up to date entry: python tools/pre_commit/generate_attention_backend_docs.py --check language: python + - id: check-boolean-context-manager + name: Check for boolean ops in with-statements + entry: python tools/pre_commit/check_boolean_context_manager.py + language: python + types: [python] # Keep `suggestion` last - id: suggestion name: Suggestion diff --git a/tools/pre_commit/check_boolean_context_manager.py b/tools/pre_commit/check_boolean_context_manager.py new file mode 100644 index 000000000..a482451ba --- /dev/null +++ b/tools/pre_commit/check_boolean_context_manager.py @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Lint: detect `with a() and b():` 
(boolean op in with-statement context). + +Using `and`/`or` to combine context managers is almost always a bug: + + with ctx_a() and ctx_b(): # BUG: only ctx_b is entered + with ctx_a() or ctx_b(): # BUG: only ctx_a is entered + +The correct way to combine context managers is: + + with ctx_a(), ctx_b(): # comma-separated + with (ctx_a(), ctx_b()): # parenthesized (Python 3.10+) + with contextlib.ExitStack() ... # ExitStack +""" + +import ast +import sys + + +def check_file(filepath: str) -> list[str]: + try: + with open(filepath, encoding="utf-8") as f: + source = f.read() + except (OSError, UnicodeDecodeError): + return [] + + try: + tree = ast.parse(source, filename=filepath) + except SyntaxError: + return [] + + violations = [] + for node in ast.walk(tree): + if isinstance(node, (ast.With, ast.AsyncWith)): + for item in node.items: + if isinstance(item.context_expr, ast.BoolOp): + op = "and" if isinstance(item.context_expr.op, ast.And) else "or" + violations.append( + f"{filepath}:{item.context_expr.lineno}: " + f"boolean `{op}` used to combine context managers " + f"in `with` statement — use a comma instead" + ) + return violations + + +def main() -> int: + if len(sys.argv) < 2: + print("Usage: check_boolean_context_manager.py ...", file=sys.stderr) + return 1 + + all_violations = [] + for filepath in sys.argv[1:]: + all_violations.extend(check_file(filepath)) + + if all_violations: + print( + "❌ Boolean operator used to combine context managers in `with` " + "statement.\n" + " `with a() and b():` only enters `b()` as a context manager.\n" + " Use `with a(), b():` or `with (a(), b()):` instead.\n" + ) + for v in all_violations: + print(f" {v}") + return 1 + return 0 + + +if __name__ == "__main__": + sys.exit(main()) -- GitLab From dc6de33c3d5e9026cef7b27791dfe0f98e64bbde Mon Sep 17 00:00:00 2001 From: "7. Sun" Date: Wed, 11 Feb 2026 08:45:28 +0800 Subject: [PATCH 0070/1166] [CI] Add pip caching to cleanup_pr_body workflow (#32979) Signed-off-by: 7. 
Sun --- .github/workflows/cleanup_pr_body.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index df8910837..f1a91a7cd 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -19,6 +19,7 @@ jobs: uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: '3.12' + cache: 'pip' - name: Install Python dependencies run: | -- GitLab From d1481ba78323bcba5937f5ff74f3a8d27ab54f88 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Tue, 10 Feb 2026 19:51:07 -0500 Subject: [PATCH 0071/1166] [MoE Refactor] Introduce MoERunner abstraction and move execution logic from FusedMoE to DefaultMoERunner (#32344) Signed-off-by: Bill Nell --- docs/design/moe_kernel_features.md | 2 +- .../moe/modular_kernel_tools/common.py | 2 + tests/kernels/moe/utils.py | 1 + .../model_executor/layers/fused_moe/config.py | 22 +- .../layers/fused_moe/fused_moe_method_base.py | 3 +- .../fused_moe/fused_moe_modular_method.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 741 ++--------------- .../layers/fused_moe/modular_kernel.py | 2 +- .../layers/fused_moe/runner/__init__.py | 2 + .../fused_moe/runner/default_moe_runner.py | 743 ++++++++++++++++++ .../layers/fused_moe/runner/moe_runner.py | 34 + .../layers/fused_moe/shared_fused_moe.py | 64 -- .../fused_moe/unquantized_fused_moe_method.py | 9 +- .../layers/quantization/awq_marlin.py | 1 + .../layers/quantization/bitsandbytes.py | 1 + .../compressed_tensors_moe.py | 13 +- .../layers/quantization/experts_int8.py | 1 + .../model_executor/layers/quantization/fp8.py | 3 +- .../layers/quantization/gguf.py | 1 + .../layers/quantization/gptq_marlin.py | 1 + .../layers/quantization/modelopt.py | 6 +- .../layers/quantization/moe_wna16.py | 1 + .../layers/quantization/mxfp4.py | 1 + .../layers/quantization/quark/quark_moe.py | 3 + 
vllm/v1/worker/gpu_worker.py | 6 +- 25 files changed, 913 insertions(+), 753 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/runner/__init__.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py create mode 100644 vllm/model_executor/layers/fused_moe/runner/moe_runner.py diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 75ebee6ec..9ac31d2c0 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -32,7 +32,7 @@ th { | Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass | |---------|--------------------|--------------|---------------|-------|-----------------------|-----------| -| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] | +| naive | standard | all1 | G,A,T | N | 6 | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE | | pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] | | deepep_high_throughput | standard | fp8 | G(128),A,T2 | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] | | deepep_low_latency | batched | fp8 | G(128),A,T3 | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] | diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 893968b5c..6dfcd5ebe 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -585,6 +585,7 @@ def make_modular_kernel( tp_size_=get_tensor_model_parallel_world_size(), pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, + sp_size_=1, 
vllm_parallel_config=vllm_config.parallel_config, ) @@ -594,6 +595,7 @@ def make_modular_kernel( hidden_dim=config.K, intermediate_size_per_partition=config.N, num_local_experts=config.num_local_experts, + num_logical_experts=config.E, moe_parallel_config=moe_parallel_config, in_dtype=config.dtype, max_num_tokens=next_power_of_2(config.M), diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 897bfddce..984fabc47 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -52,6 +52,7 @@ def make_dummy_moe_config( hidden_dim=hidden_dim, intermediate_size_per_partition=intermediate_size_per_partition, num_local_experts=num_experts, + num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), activation="silu", in_dtype=in_dtype, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index b9fee1dd4..6dce6875d 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -913,12 +913,16 @@ class FusedMoEParallelConfig: pcp_rank: int dp_rank: int ep_rank: int + sp_size: int use_ep: bool # whether to use EP or not all2all_backend: str # all2all backend for MoE communication - is_sequence_parallel: bool # whether sequence parallelism is used enable_eplb: bool # whether to enable expert load balancing + @property + def is_sequence_parallel(self) -> bool: + return self.sp_size > 1 + @property def use_all2all_kernels(self): return self.dp_size > 1 and self.use_ep @@ -974,6 +978,7 @@ class FusedMoEParallelConfig: tp_size_: int, pcp_size_: int, dp_size_: int, + sp_size_: int, vllm_parallel_config: ParallelConfig, ) -> "FusedMoEParallelConfig": """ @@ -1073,9 +1078,9 @@ class FusedMoEParallelConfig: dp_rank=dp_rank, ep_size=1, ep_rank=0, + sp_size=sp_size_, use_ep=False, all2all_backend=vllm_parallel_config.all2all_backend, - is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe, 
enable_eplb=vllm_parallel_config.enable_eplb, ) # DP + EP / TP + EP / DP + TP + EP @@ -1093,9 +1098,9 @@ class FusedMoEParallelConfig: dp_rank=dp_rank, ep_size=ep_size, ep_rank=ep_rank, + sp_size=sp_size_, use_ep=True, all2all_backend=vllm_parallel_config.all2all_backend, - is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe, enable_eplb=vllm_parallel_config.enable_eplb, ) @@ -1111,10 +1116,10 @@ class FusedMoEParallelConfig: dp_rank=0, ep_size=1, ep_rank=0, + sp_size=1, use_ep=False, all2all_backend="naive", enable_eplb=False, - is_sequence_parallel=False, ) @@ -1126,6 +1131,7 @@ class FusedMoEConfig: hidden_dim: int intermediate_size_per_partition: int num_local_experts: int + num_logical_experts: int activation: str device: torch.device | str routing_method: RoutingMethodType @@ -1175,6 +1181,14 @@ class FusedMoEConfig: def ep_size(self): return self.moe_parallel_config.ep_size + @property + def sp_size(self): + return self.moe_parallel_config.sp_size + + @property + def is_sequence_parallel(self): + return self.moe_parallel_config.is_sequence_parallel + @property def tp_rank(self): return self.moe_parallel_config.tp_rank diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index 93db1c545..ac7c71e52 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -121,17 +121,16 @@ class FusedMoEMethodBase(QuantizeMethodBase): def is_monolithic(self) -> bool: return False - # @abstractmethod def apply( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError - # @abstractmethod def apply_monolithic( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 diff --git 
a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 69a6e70fc..1aa9e3a65 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -89,6 +89,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.moe_mk is not None return self.moe_mk( @@ -101,5 +102,5 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): global_num_experts=layer.global_num_experts, apply_router_weight_on_input=layer.apply_router_weight_on_input, expert_map=None if self.disable_expert_map else layer.expert_map, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f35ec87aa..914dc6846 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,13 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable, Generator, Iterable -from contextlib import contextmanager, nullcontext +from collections.abc import Callable, Iterable from enum import Enum from typing import Literal, cast, get_args, overload import torch -import torch.nn.functional as F from torch.nn.parameter import UninitializedParameter import vllm.envs as envs @@ -16,17 +14,10 @@ from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.parallel import ExpertPlacementStrategy from vllm.distributed import ( get_dp_group, - get_ep_group, get_pcp_group, get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce, ) from 
vllm.distributed.eplb.eplb_state import EplbLayerState, EplbState -from vllm.forward_context import ( - ForwardContext, - get_forward_context, - is_forward_context_available, -) from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.fused_moe.config import ( @@ -47,6 +38,9 @@ from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( from vllm.model_executor.layers.fused_moe.router.router_factory import ( create_fused_moe_router, ) +from vllm.model_executor.layers.fused_moe.runner.default_moe_runner import ( + DefaultMoERunner, +) from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) @@ -57,13 +51,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, ) from vllm.platforms import current_platform -from vllm.utils.math_utils import cdiv, round_up -from vllm.utils.torch_utils import ( - aux_stream, - current_stream, - direct_register_custom_op, -) -from vllm.v1.worker.ubatching import dbo_current_ubatch_id +from vllm.utils.math_utils import round_up logger = init_logger(__name__) @@ -264,6 +252,7 @@ def maybe_roundup_hidden_size( ) current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled) + if ( current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS @@ -273,6 +262,7 @@ def maybe_roundup_hidden_size( current_platform.is_rocm() or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 + or current_mxfp4_backend == Mxfp4Backend.MARLIN ): hidden_size = round_up(hidden_size, 256) @@ -338,29 +328,15 @@ class FusedMoE(CustomOp): expert_mapping: list[tuple[str, str, int, str]] | None = None, n_shared_experts: int | None = None, router_logits_dtype: torch.dtype | None = None, - has_shared_experts: bool = False, + gate: torch.nn.Module | None = None, + shared_experts: 
torch.nn.Module | None = None, + routed_input_transform: torch.nn.Module | None = None, ): super().__init__() - # Allow disabling of the separate shared experts stream for - # debug purposes. - # TODO: Remove this after more extensive testings with TP/DP - # and other execution modes - if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: - logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") - self.shared_experts_stream = None - else: - # TODO(rob): enable shared expert overlap with non-cuda-alike. - # aux_stream() returns None on non-cuda-alike platforms. - self.shared_experts_stream = aux_stream() - if self.shared_experts_stream is not None: - logger.debug_once( - "Enabled separate cuda stream for MoE shared_experts", scope="local" - ) - - # For latent MoE: stores original hidden_states before routed_input_transform - # so shared_experts can use it for cloning (they need original dimension) - self._shared_experts_input: torch.Tensor | None = None + self._gate = gate + self._shared_experts = shared_experts + self._routed_input_transform = routed_input_transform if params_dtype is None: params_dtype = torch.get_default_dtype() @@ -392,9 +368,12 @@ class FusedMoE(CustomOp): tp_size_=tp_size_, pcp_size_=pcp_size_, dp_size_=dp_size_, + sp_size_=self.sp_size, vllm_parallel_config=vllm_config.parallel_config, ) + assert self.moe_parallel_config.is_sequence_parallel == is_sequence_parallel + self.global_num_experts = num_experts + num_redundant_experts self.logical_num_experts = num_experts @@ -410,6 +389,7 @@ class FusedMoE(CustomOp): self.layer_name = prefix self.enable_eplb = enable_eplb + # TODO(bnell): should this be owned by router? 
self.eplb_state = EplbLayerState() self.expert_placement_strategy: ExpertPlacementStrategy = ( vllm_config.parallel_config.expert_placement_strategy @@ -506,7 +486,8 @@ class FusedMoE(CustomOp): self.reduce_results = reduce_results self.renormalize = renormalize - # TODO(bnell): these attributes are only used by cpu/xpu/mxfp4 + # TODO(bnell): these attributes are only used by monolithic kernels. + # Put them in a MoERouterConfig dataclass? self.use_grouped_topk = use_grouped_topk if self.use_grouped_topk: assert num_expert_group is not None and topk_group is not None @@ -565,6 +546,7 @@ class FusedMoE(CustomOp): hidden_dim=hidden_size, intermediate_size_per_partition=self.intermediate_size_per_partition, num_local_experts=self.local_num_experts, + num_logical_experts=self.logical_num_experts, moe_parallel_config=self.moe_parallel_config, in_dtype=moe_in_dtype, router_logits_dtype=router_logits_dtype, @@ -576,9 +558,9 @@ class FusedMoE(CustomOp): device=vllm_config.device_config.device, routing_method=self.routing_method_type, # TODO: in_dtype == out_dtype? - disable_inplace=disable_inplace() or has_shared_experts, + disable_inplace=disable_inplace() or self._shared_experts is not None, ) - if self.use_mori_kernels: + if self.moe_config.use_mori_kernels: assert self.rocm_aiter_fmoe_enabled, ( "Mori needs to be used with aiter fused_moe for now." 
) @@ -641,9 +623,36 @@ class FusedMoE(CustomOp): self.quant_method.create_weights(layer=self, **moe_quant_params) - # Chunked all2all staging tensor - self.batched_hidden_states: torch.Tensor | None = None - self.batched_router_logits: torch.Tensor | None = None + # Disable shared expert overlap if: + # - we are using eplb with non-default backend, because of correctness issues + # - we are using flashinfer with DP, since there nothing to gain + # - we are using marlin kernels + backend = self.moe_parallel_config.all2all_backend + self.use_overlapped = ( + not ( + (self.enable_eplb and backend != "allgather_reducescatter") + or self.moe_parallel_config.use_fi_all2allv_kernels + ) + and self._shared_experts is not None + ) + + self.runner = self._init_runner() + + def _init_runner(self): + # Storing the runner in the FusedMoE is an intermediate state, eventually + # the runner will own the FusedMoE layer and provide the execution interface + # for MoE ops. + return DefaultMoERunner( + layer=self, + moe_config=self.moe_config, + router=self.router, + routed_input_transform=self._routed_input_transform, + gate=self.gate, + shared_experts=self.shared_experts, + quant_method=self.quant_method, + reduce_results=self.reduce_results, + enable_dbo=self.vllm_config.parallel_config.enable_dbo, + ) # Note: maybe_init_modular_kernel should only be called by # prepare_communication_buffer_for_model. @@ -673,10 +682,14 @@ class FusedMoE(CustomOp): self.shared_experts, inplace=not self.moe_config.disable_inplace, ) + # We need to force reconstruction of runner because we're swapping out + # the quant_method with a FusedMoEModularMethod. This logic can go + # away once the FusedMoEModularMethod is eliminated. 
+ self.runner = self._init_runner() @property def shared_experts(self) -> torch.nn.Module | None: - return None + return self._shared_experts if self.use_overlapped else None @property def layer_id(self): @@ -687,53 +700,12 @@ class FusedMoE(CustomOp): @property def gate(self) -> torch.nn.Module | None: - return None - - def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Hook to transform hidden_states before passing to routed experts. - For latent MoE: transforms [S, hidden_size] → [S, moe_latent_size]. - The original hidden_states is saved in _shared_experts_input so - shared_experts still receive the original [S, hidden_size]. - - Override in subclasses (e.g., SharedFusedMoE) for latent MoE. - """ - return hidden_states - - @contextmanager - def _set_shared_experts_input( - self, value: torch.Tensor | None - ) -> Generator[None, None, None]: - """Context manager to safely set/clear _shared_experts_input.""" - self._shared_experts_input = value - try: - yield - finally: - self._shared_experts_input = None - - def _get_shared_experts_input(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Get input for shared experts. - - For latent MoE: shared_experts need original [S, hidden_size], - not the transformed [S, latent_size] used by routed experts. 
- """ - return ( - self._shared_experts_input - if self._shared_experts_input is not None - else hidden_states - ) + return self._gate @property def tp_size(self): return self.moe_parallel_config.tp_size - @property - def dp_size(self): - return self.moe_parallel_config.dp_size - - @property - def pcp_size(self): - return self.moe_parallel_config.pcp_size - @property def ep_size(self): return self.moe_parallel_config.ep_size @@ -742,14 +714,6 @@ class FusedMoE(CustomOp): def tp_rank(self): return self.moe_parallel_config.tp_rank - @property - def dp_rank(self): - return self.moe_parallel_config.dp_rank - - @property - def pcp_rank(self): - return self.moe_parallel_config.pcp_rank - @property def ep_rank(self): return self.moe_parallel_config.ep_rank @@ -758,39 +722,10 @@ class FusedMoE(CustomOp): def use_ep(self): return self.moe_parallel_config.use_ep - @property - def use_pplx_kernels(self): - return self.moe_parallel_config.use_pplx_kernels - - @property - def use_deepep_ht_kernels(self): - return self.moe_parallel_config.use_deepep_ht_kernels - - @property - def use_deepep_ll_kernels(self): - return self.moe_parallel_config.use_deepep_ll_kernels - - @property - def use_mori_kernels(self): - return self.moe_parallel_config.use_mori_kernels - - @property - def use_marlin_kernels(self): - return getattr(self.quant_method, "use_marlin", False) - - @property - def use_dp_chunking(self) -> bool: - return ( - self.moe_parallel_config.use_pplx_kernels - or self.moe_parallel_config.use_deepep_ll_kernels - or self.moe_parallel_config.use_mori_kernels - or self.moe_parallel_config.use_fi_all2allv_kernels - ) and envs.VLLM_ENABLE_MOE_DP_CHUNK - @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return False + return self._gate is not None def _maybe_init_expert_routing_tables( self, @@ -799,7 +734,7 @@ class FusedMoE(CustomOp): # with DeepEP-ll all2all backend. 
if ( self.expert_placement_strategy != "round_robin" - or not self.use_deepep_ll_kernels + or not self.moe_parallel_config.use_deepep_ll_kernels ): return None @@ -892,48 +827,6 @@ class FusedMoE(CustomOp): dp_size=get_dp_group().world_size, ) - def _maybe_setup_shared_experts_stream( - self, - hidden_states: torch.Tensor, - has_separate_shared_experts: bool, - use_chunked_impl: bool, - ) -> tuple[bool, torch.Tensor | None]: - use_shared_experts_stream = ( - current_platform.is_cuda() - and has_separate_shared_experts - and not use_chunked_impl - and self.shared_experts_stream is not None - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) - ) - - hidden_states_clone: torch.Tensor | None = None - if use_shared_experts_stream: - assert self.shared_experts_stream is not None - - shared_experts_input = self._get_shared_experts_input(hidden_states) - - # Clone BEFORE switching streams to avoid race condition - # where routed_expert kernel may mutate hidden_states. - hidden_states_clone = shared_experts_input.clone() - - # Record that the clone will be used by shared_experts_stream - # to avoid gc issue from deallocation of hidden_states_clone - # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We don't need shared_output.record_stream(current_stream()) - # because we synch the streams before using shared_output. 
- hidden_states_clone.record_stream(self.shared_experts_stream) - - # Mark sync start point for the separate shared experts - # stream here since we want to run in parallel with the - # router/gate (next op below) - assert self.shared_experts_stream is not None - self.shared_experts_stream.wait_stream(current_stream()) - - return use_shared_experts_stream, hidden_states_clone - def _load_per_tensor_weight_scale( self, shard_id: str, @@ -1191,7 +1084,7 @@ class FusedMoE(CustomOp): # compressed-tensors checkpoints with packed weights are stored flipped # TODO (mgoin): check self.quant_method.quant_config.quant_format # against known CompressionFormat enum values that have this quality - if self.quant_method.__class__.__name__ in ( + if quant_method_name in ( "CompressedTensorsWNA16MarlinMoEMethod", "CompressedTensorsWNA16MoEMethod", ): @@ -1488,7 +1381,7 @@ class FusedMoE(CustomOp): assert all( weight.is_contiguous() for name, weight in weights - if not name.startswith("_shared_experts.") + if not (name.startswith("_shared_experts.") or name.startswith("_gate.")) ) # Filter out the non-expert weights. @@ -1538,32 +1431,6 @@ class FusedMoE(CustomOp): self.ensure_moe_quant_config_init() return self.quant_method.moe_quant_config - def ensure_dp_chunking_init(self): - if not self.use_dp_chunking or self.batched_hidden_states is not None: - return - - states_shape: tuple[int, ...] - logits_shape: tuple[int, ...] 
- - moe = self.moe_config - - if self.vllm_config.parallel_config.enable_dbo: - states_shape = (2, moe.max_num_tokens, self.hidden_size) - logits_shape = (2, moe.max_num_tokens, self.logical_num_experts) - else: - states_shape = (moe.max_num_tokens, self.hidden_size) - logits_shape = (moe.max_num_tokens, self.logical_num_experts) - - self.batched_hidden_states = torch.zeros( - states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() - ) - - self.batched_router_logits = torch.zeros( - logits_shape, - dtype=moe.router_logits_dtype, - device=torch.cuda.current_device(), - ) - def must_reduce_shared_expert_outputs(self) -> bool: """ The shared_experts are typically computed using the RowParallelLinear @@ -1577,100 +1444,24 @@ class FusedMoE(CustomOp): Therefore it is required that we reduce the shared_experts output early. """ - assert self.quant_method is not None - return ( - isinstance(self.quant_method, FusedMoEModularMethod) - and self.quant_method.moe_mk.output_is_reduced() # type: ignore[union-attr] - ) + return self.runner.must_reduce_shared_expert_outputs() def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): """ Some combine kernels reduce across GPU ranks by default. 
""" - if self.must_reduce_shared_expert_outputs(): - return final_hidden_states - else: - return tensor_model_parallel_all_reduce(final_hidden_states) + return self.runner.maybe_all_reduce_tensor_model_parallel(final_hidden_states) def forward_native( self, hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - # For latent MoE: save ORIGINAL hidden_states before transform - # (shared_experts need original dimension, routed experts use transformed) - original_hidden_states = hidden_states - original_hidden_dim = hidden_states.shape[-1] - - # Apply transform for routed experts (e.g., latent projection for latent MoE) - hidden_states = self.apply_routed_input_transform(hidden_states) - - # This is the dimension after transform (for routed expert output slicing) - transformed_hidden_dim = hidden_states.shape[-1] - if self.hidden_size != transformed_hidden_dim: - hidden_states = F.pad( - hidden_states, - (0, self.hidden_size - transformed_hidden_dim), - mode="constant", - value=0.0, - ) - - def reduce_output(states: torch.Tensor) -> torch.Tensor: - if ( - not self.is_sequence_parallel - and not self.use_dp_chunking - and self.reduce_results - and (self.tp_size > 1 or self.ep_size > 1) - ): - states = self.maybe_all_reduce_tensor_model_parallel(states) - return states - - def encode_layer_name() -> str: - # Can be unavailable or None in unittests - if ( - is_forward_context_available() - and get_forward_context().all_moe_layers is not None - ): - return "from_forward_context" - return self.layer_name - - if self.shared_experts is None: - if current_platform.is_tpu() or current_platform.is_cpu(): - # TODO: Once the OOM issue for the TPU backend is resolved, we - # will switch to using the moe_forward custom op. - # Note: CPU doesn't require wrapped forward_impl. 
- fused_output = self.forward_impl(hidden_states, router_logits) - assert not isinstance(fused_output, tuple) - else: - fused_output = torch.ops.vllm.moe_forward( - hidden_states, router_logits, encode_layer_name() - ) - return reduce_output(fused_output)[..., :transformed_hidden_dim] - else: - if current_platform.is_tpu() or current_platform.is_cpu(): - # TODO: Once the OOM issue for the TPU backend is resolved, we - # will switch to using the moe_forward custom op. - # Note: CPU doesn't require wrapped forward_impl. - with self._set_shared_experts_input(original_hidden_states): - shared_output, fused_output = self.forward_impl( - hidden_states, router_logits - ) - else: - # Custom op handles setting/clearing _shared_experts_input internally - # We pass original tensor for shared experts (not transformed) - shared_output, fused_output = torch.ops.vllm.moe_forward_shared( - hidden_states, - router_logits, - encode_layer_name(), - original_hidden_states, - ) - - # shared_output uses original dimension (before transform) - # fused_output uses transformed dimension (after transform) - return ( - reduce_output(shared_output)[..., :original_hidden_dim], - reduce_output(fused_output)[..., :transformed_hidden_dim], - ) + self.ensure_moe_quant_config_init() + return self.runner.forward( + hidden_states, + router_logits, + ) @property def expert_map(self) -> torch.Tensor | None: @@ -1685,312 +1476,6 @@ class FusedMoE(CustomOp): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: return self.forward_native(hidden_states, router_logits) - def forward_impl_chunked( - self, - full_hidden_states: torch.Tensor, - full_router_logits: torch.Tensor, - has_separate_shared_experts: bool, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.batched_hidden_states is not None - assert self.batched_router_logits is not None - assert self.batched_hidden_states.dtype == full_hidden_states.dtype, ( - f"{self.batched_hidden_states.dtype} == {full_hidden_states.dtype}" - ) 
- assert self.batched_router_logits.dtype == full_router_logits.dtype, ( - f"{self.batched_router_logits.dtype} == {full_router_logits.dtype}" - ) - # Check size compatibility. - assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1) - assert self.batched_router_logits.size(-1) == full_router_logits.size(-1) - - full_fused_final_hidden_states = torch.empty_like(full_hidden_states) - if self.shared_experts is not None: - full_shared_final_hidden_states = torch.empty_like(full_hidden_states) - - def process_chunk(chunk_start, chunk_end, skip_result_store=False): - chunk_size = chunk_end - chunk_start - hidden_states = full_hidden_states[chunk_start:chunk_end, :] - router_logits = full_router_logits[chunk_start:chunk_end, :] - - assert self.batched_hidden_states is not None - assert self.batched_router_logits is not None - # This is only true when DBO has been enabled in the config. - # Both tensors will have an outer dimension for the ubatch id - if self.batched_hidden_states.dim() == 3: - assert self.batched_router_logits.dim() == 3 - batch_buffer_idx = dbo_current_ubatch_id() - batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :] - batched_router_logits = self.batched_router_logits[batch_buffer_idx, :] - else: - batched_hidden_states = self.batched_hidden_states - batched_router_logits = self.batched_router_logits - - assert ( - batched_hidden_states.size(0) # type: ignore - >= chunk_size - ) - assert ( - batched_router_logits.size(0) # type: ignore - >= chunk_size - ) - staged_hidden_states = batched_hidden_states[:chunk_size, :] # type: ignore - staged_router_logits = batched_router_logits[:chunk_size, :] # type: ignore - staged_hidden_states.copy_(hidden_states, non_blocking=True) - staged_router_logits.copy_(router_logits, non_blocking=True) - - # Matrix multiply. 
- if self.quant_method.is_monolithic: - final_hidden_states = self.quant_method.apply_monolithic( - layer=self, - x=staged_hidden_states, - router_logits=staged_router_logits, - ) - else: - topk_weights, topk_ids = self.router.select_experts( - hidden_states=staged_hidden_states, - router_logits=staged_router_logits, - ) - - final_hidden_states = self.quant_method.apply( - layer=self, - x=staged_hidden_states, - topk_weights=topk_weights, - topk_ids=topk_ids, - ) - - if has_separate_shared_experts: - assert not isinstance(final_hidden_states, tuple) - assert self.shared_experts is not None - - shared_output = self.shared_experts(staged_hidden_states) - - final_hidden_states = ( - shared_output, - final_hidden_states, - ) - - if not skip_result_store: - if self.shared_experts is None: - full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states, non_blocking=True - ) - else: - full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states[0], non_blocking=True - ) - full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states[1], non_blocking=True - ) - - ctx = get_forward_context() - # flashinfer_cutlass_kernels can handle: optional DP + TP/EP - max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu - moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens - - # If the input to the MoE is sequence parallel then divide by sp_size - # to find the maximum number of tokens for any individual dispatcher. 
- if self.is_sequence_parallel: - max_tokens_across_dispatchers = cdiv( - max_tokens_across_dispatchers, self.sp_size - ) - - num_tokens = full_hidden_states.size(0) - for chunk_idx, chunk_start_ in enumerate( - range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) - ): - chunk_start = chunk_start_ - chunk_end = min( - chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers - ) - # clamp start and end - chunk_start = min(chunk_start, num_tokens - 1) - chunk_end = min(chunk_end, num_tokens) - with ctx.dp_metadata.chunked_sizes( - self.sp_size, moe_dp_chunk_size_per_rank, chunk_idx - ): - process_chunk( - chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens - ) - - if self.shared_experts is None: - return full_fused_final_hidden_states - else: - return (full_shared_final_hidden_states, full_fused_final_hidden_states) - - def forward_impl( - self, - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert self.quant_method is not None - - self.ensure_moe_quant_config_init() - self.ensure_dp_chunking_init() - - has_separate_shared_experts = ( - not self.quant_method.mk_owns_shared_expert - and self.shared_experts is not None - ) - - use_chunked_impl = self.use_dp_chunking - - use_shared_experts_stream, hidden_states_clone = ( - self._maybe_setup_shared_experts_stream( - hidden_states, has_separate_shared_experts, use_chunked_impl - ) - ) - - # If router/gate provided, then apply it here. 
- # (Note: This code runs only when "overlapped mode" is on to allow - # parallel execution of shared experts with the FusedMoE via - # separate cuda stream) - if self.gate is not None: - router_logits, _ = self.gate(hidden_states) - - if use_chunked_impl: - return self.forward_impl_chunked( - hidden_states, router_logits, has_separate_shared_experts - ) - - # NOTE(rob): once we finish migrating all the quant methods to use - # MKs, we can remove the naive dispatch/combine path from here. - do_naive_dispatch_combine = ( - self.dp_size > 1 and not self.quant_method.supports_internal_mk - ) - - ctx = get_forward_context() - sp_ctx = ( - ctx.dp_metadata.sp_local_sizes(self.sp_size) - if ctx.dp_metadata - else nullcontext() - ) - - with sp_ctx: - extra_tensors = None - if do_naive_dispatch_combine: - post_quant_allgather = ( - self.quant_method is not None - and self.dp_size > 1 - and self.use_ep - and getattr(self.quant_method, "do_post_quant_allgather", False) - ) - if post_quant_allgather: - hidden_states_to_dispatch, extra_tensors = ( - self.quant_method.prepare_dp_allgather_tensor( - self, hidden_states, router_logits - ) - ) - else: - hidden_states_to_dispatch = hidden_states - - dispatch_res = get_ep_group().dispatch_router_logits( - hidden_states_to_dispatch, - router_logits, - self.is_sequence_parallel, - extra_tensors=extra_tensors, - ) - if extra_tensors is not None: - ( - orig_hidden_states, - router_logits, - extra_tensors_combined, - ) = dispatch_res - hidden_states_combined = ( - orig_hidden_states, - extra_tensors_combined[0], - ) - else: - hidden_states_combined, router_logits = dispatch_res - orig_hidden_states = hidden_states_combined - else: - orig_hidden_states = hidden_states - - # Run shared experts before matrix multiply. - # because matrix multiply maybe modify the hidden_states. 
- if has_separate_shared_experts and not use_shared_experts_stream: - assert self.shared_experts is not None - shared_input = self._get_shared_experts_input(hidden_states) - shared_output = self.shared_experts(shared_input) - - # NOTE: Similar with DP, PCP also needs dispatch and combine. For - # simplicity, AgRsAll2All was added separately for PCP here. Maybe - # we should modify All2AllManager abstract to better support PCP. - if self.pcp_size > 1: - hidden_states = get_pcp_group().all_gather( - hidden_states, - dim=0, - ) - router_logits = get_pcp_group().all_gather( - router_logits, - dim=0, - ) - - # Matrix multiply. - x = hidden_states_combined if do_naive_dispatch_combine else hidden_states - - # TODO(bnell): deal with fp4 flashinfer tuple hidden states hack (#30014). - # Figure out nicer way to do this. - x_orig = orig_hidden_states if do_naive_dispatch_combine else hidden_states - - if self.quant_method.is_monolithic: - final_hidden_states = self.quant_method.apply_monolithic( - layer=self, - x=x, - router_logits=router_logits, - ) - else: - topk_weights, topk_ids = self.router.select_experts( - hidden_states=x_orig, - router_logits=router_logits, - ) - - final_hidden_states = self.quant_method.apply( - layer=self, - x=x, # The type signture of this is wrong due to the hack. - topk_weights=topk_weights, - topk_ids=topk_ids, - ) - - if has_separate_shared_experts: - assert self.shared_experts is not None - - if use_shared_experts_stream: - # Run shared experts in parallel on a separate stream - # NOTE: We start the separate stream here and mark the - # sync end point immediately after it is done. This is - # important to avoid excessive stream allocations by the cuda - # graph replay later. 
- with torch.cuda.stream(self.shared_experts_stream): - # Note that hidden_states clone() is necessary here to avoid - # conflict with the main stream - shared_output = self.shared_experts(hidden_states_clone) - current_stream().wait_stream(self.shared_experts_stream) - - final_hidden_states = ( - shared_output, - final_hidden_states, - ) - - def combine_output(states: torch.Tensor) -> torch.Tensor: - if do_naive_dispatch_combine: - states = get_ep_group().combine(states, self.is_sequence_parallel) - - if self.pcp_size > 1: - states = get_pcp_group().reduce_scatter( - states, - dim=0, - ) - - return states - - if self.shared_experts is not None: - return ( - final_hidden_states[0], - combine_output(final_hidden_states[1]), - ) - else: - return combine_output(final_hidden_states) - @classmethod def make_expert_params_mapping( cls, @@ -2051,94 +1536,6 @@ class FusedMoE(CustomOp): return s -def get_layer_from_name(layer_name: str) -> FusedMoE: - forward_context: ForwardContext = get_forward_context() - if layer_name == "from_forward_context": - all_moe_layers = forward_context.all_moe_layers - assert all_moe_layers is not None - moe_layer_index = forward_context.moe_layer_index - if moe_layer_index >= len(all_moe_layers): - raise AssertionError( - "We expected the number of MOE layers in `all_moe_layers` " - "to be equal to the number of " - "{vllm.moe_forward, vllm.moe_forward_shared} calls." 
- ) - layer_name = all_moe_layers[moe_layer_index] - forward_context.moe_layer_index += 1 - self = cast(FusedMoE, forward_context.no_compile_layers[layer_name]) - return self - - -def moe_forward( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - self = get_layer_from_name(layer_name) - assert self.shared_experts is None - return self.forward_impl(hidden_states, router_logits) - - -def moe_forward_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - layer_name: str, -) -> torch.Tensor: - return torch.empty_like(hidden_states) - - -direct_register_custom_op( - op_name="moe_forward", - op_func=moe_forward, - mutates_args=["hidden_states"], - fake_impl=moe_forward_fake, - tags=(torch.Tag.needs_fixed_stride_order,), -) - - -def moe_forward_shared( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - layer_name: str, - shared_experts_input: torch.Tensor | None = None, -) -> tuple[torch.Tensor, torch.Tensor]: - self = get_layer_from_name(layer_name) - assert self.shared_experts is not None - - # Set here because torch.compile skips forward_native() setup code - # and calls this op directly. forward_impl() reads from this var. 
- with self._set_shared_experts_input(shared_experts_input): - return self.forward_impl(hidden_states, router_logits) - - -def moe_forward_shared_fake( - hidden_states: torch.Tensor, - router_logits: torch.Tensor, - layer_name: str, - shared_experts_input: torch.Tensor | None = None, -) -> tuple[torch.Tensor, torch.Tensor]: - # Output shapes: - # - fused_out: same as hidden_states (routed experts use transformed size) - # - shared_out: same as shared_experts_input if provided, else same as hidden_states - # (For latent MoE: shared experts use original hidden_size, not latent size) - fused_out = torch.empty_like(hidden_states) - - if shared_experts_input is not None: - shared_out = torch.empty_like(shared_experts_input) - else: - shared_out = torch.empty_like(hidden_states) - - return shared_out, fused_out - - -direct_register_custom_op( - op_name="moe_forward_shared", - op_func=moe_forward_shared, - mutates_args=["hidden_states"], - fake_impl=moe_forward_shared_fake, - tags=(torch.Tag.needs_fixed_stride_order,), -) - # Mark the FusedMoE weight_loader as supporting MoE-specific parameters # to avoid expensive runtime reflection in model loading code FusedMoE.weight_loader.supports_moe_loading = True # type: ignore[attr-defined] diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 8a670216b..e2f77d6c8 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1228,7 +1228,7 @@ class FusedMoEModularKernel(torch.nn.Module): topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, - shared_experts_input: torch.Tensor | None = None, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize diff --git a/vllm/model_executor/layers/fused_moe/runner/__init__.py 
b/vllm/model_executor/layers/fused_moe/runner/__init__.py new file mode 100644 index 000000000..208f01a7c --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py new file mode 100644 index 000000000..12b795f30 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -0,0 +1,743 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import nullcontext + +import torch +import torch.nn.functional as F + +import vllm.envs as envs +from vllm.distributed import ( + get_ep_group, + get_pcp_group, + tensor_model_parallel_all_reduce, +) +from vllm.forward_context import ( + ForwardContext, + get_forward_context, + is_forward_context_available, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, +) +from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( + FusedMoEMethodBase, +) +from vllm.model_executor.layers.fused_moe.router.fused_moe_router import ( + FusedMoERouter, +) +from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner +from vllm.platforms import current_platform +from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import ( + aux_stream, + current_stream, + direct_register_custom_op, +) +from vllm.v1.worker.ubatching import dbo_current_ubatch_id + +logger = init_logger(__name__) + + +def get_layer_from_name(layer_name: str) -> torch.nn.Module: + forward_context: ForwardContext = get_forward_context() + if layer_name == "from_forward_context": + all_moe_layers = forward_context.all_moe_layers + assert all_moe_layers is not None + moe_layer_index = 
forward_context.moe_layer_index + if moe_layer_index >= len(all_moe_layers): + raise AssertionError( + "We expected the number of MOE layers in `all_moe_layers` " + "to be equal to the number of " + "{vllm.moe_forward, vllm.moe_forward_shared} calls." + ) + layer_name = all_moe_layers[moe_layer_index] + forward_context.moe_layer_index += 1 + return forward_context.no_compile_layers[layer_name] + + +def _moe_forward( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> torch.Tensor: + layer = get_layer_from_name(layer_name) + return layer.runner.forward_impl( + layer, hidden_states, router_logits, shared_experts_input + ) + + +def _moe_forward_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +def _moe_forward_shared( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + layer = get_layer_from_name(layer_name) + return layer.runner.forward_impl( + layer, hidden_states, router_logits, shared_experts_input + ) + + +def _moe_forward_shared_fake( + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_experts_input: torch.Tensor | None, + layer_name: str, +) -> tuple[torch.Tensor, torch.Tensor]: + # Output shapes: + # - fused_out: same as hidden_states (routed experts use transformed size) + # - shared_out: same as shared_experts_input if provided, else same as + # hidden_states + # (For latent MoE: shared experts use original hidden_size, not latent size) + fused_out = torch.empty_like(hidden_states) + + if shared_experts_input is not None: + shared_out = torch.empty_like(shared_experts_input) + else: + shared_out = torch.empty_like(hidden_states) + + return shared_out, fused_out + + +direct_register_custom_op( + 
op_name="moe_forward", + op_func=_moe_forward, + mutates_args=["hidden_states"], + fake_impl=_moe_forward_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +direct_register_custom_op( + op_name="moe_forward_shared", + op_func=_moe_forward_shared, + mutates_args=["hidden_states"], + fake_impl=_moe_forward_shared_fake, + tags=(torch.Tag.needs_fixed_stride_order,), +) + + +class DefaultMoERunner(MoERunner): + """ + Default implementation of the MoE runner for executing Mixture of Experts layers. + + This class provides a comprehensive implementation for running MoE computations + with support for: + - Expert routing and token dispatching + - Shared experts computation with optional parallel execution using CUDA streams + - Data parallel (DP) chunking for large batch processing + - Tensor model parallel and expert parallel operations + - Various quantization methods and custom operators + - Both monolithic and decomposed expert execution paths + + The runner handles the complete MoE forward pass including routing tokens to + experts, executing expert computations, and combining results. It supports + advanced features like overlapped execution of shared experts and optimized + kernels for different parallel execution modes. + + Eventually, this class will be split up and specialized for different + configurations, e.g. the presence or absence of shared experts, a gate, etc. 
+ """ + + def __init__( + self, + layer: torch.nn.Module, + moe_config: FusedMoEConfig, + router: FusedMoERouter, + routed_input_transform: torch.nn.Module | None, + gate: torch.nn.Module | None, + shared_experts: torch.nn.Module | None, + quant_method: FusedMoEMethodBase, + reduce_results: bool, + enable_dbo: bool, + ): + super().__init__() + self.moe_config = moe_config + self.router = router + self.routed_input_transform = routed_input_transform + self.gate = gate + self.shared_experts = shared_experts + self.quant_method = quant_method + self.reduce_results = reduce_results + self.enable_dbo = enable_dbo + + # Allow disabling of the separate shared experts stream for + # debug purposes. + # TODO: Remove this after more extensive testings with TP/DP + # and other execution modes + if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: + logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") + self.shared_experts_stream = None + else: + # TODO(rob): enable shared expert overlap with non-cuda-alike. + # aux_stream() returns None on non-cuda-alike platforms. + self.shared_experts_stream = aux_stream() + if self.shared_experts_stream is not None: + logger.debug_once( + "Enabled separate cuda stream for MoE shared_experts", scope="local" + ) + + # Needed for string -> FusedMoE layer lookup in custom ops. + self.layer_name = layer.layer_name + + if current_platform.is_tpu() or current_platform.is_cpu(): + # TODO: Once the OOM issue for the TPU backend is resolved, we + # will switch to using the moe_forward custom op. + # Note: CPU doesn't require wrapped forward_impl. 
+ if self.shared_experts is None: + self.moe_forward = _moe_forward + else: + self.moe_forward = _moe_forward_shared + else: + if self.shared_experts is None: + self.moe_forward = torch.ops.vllm.moe_forward + else: + self.moe_forward = torch.ops.vllm.moe_forward_shared + + # Chunked all2all staging tensor + self.batched_hidden_states: torch.Tensor | None = None + self.batched_router_logits: torch.Tensor | None = None + + @property + def use_dp_chunking(self) -> bool: + return ( + self.moe_config.moe_parallel_config.use_pplx_kernels + or self.moe_config.moe_parallel_config.use_deepep_ll_kernels + or self.moe_config.moe_parallel_config.use_mori_kernels + or self.moe_config.moe_parallel_config.use_fi_all2allv_kernels + ) and envs.VLLM_ENABLE_MOE_DP_CHUNK + + def _maybe_setup_shared_experts_stream( + self, + hidden_states: torch.Tensor, + shared_input: torch.Tensor | None, + has_separate_shared_experts: bool, + use_chunked_impl: bool, + ) -> tuple[bool, torch.Tensor | None]: + use_shared_experts_stream = ( + current_platform.is_cuda() + and has_separate_shared_experts + and not use_chunked_impl + and self.shared_experts_stream is not None + and ( + hidden_states.shape[0] + <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD + ) + ) + + hidden_states_clone: torch.Tensor | None = None + if use_shared_experts_stream: + assert self.shared_experts_stream is not None + + shared_experts_input = ( + shared_input if shared_input is not None else hidden_states + ) + + # Clone BEFORE switching streams to avoid race condition + # where routed_expert kernel may mutate hidden_states. 
+ hidden_states_clone = shared_experts_input.clone() + + # Record that the clone will be used by shared_experts_stream + # to avoid gc issue from deallocation of hidden_states_clone + # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 + # NOTE: We don't need shared_output.record_stream(current_stream()) + # because we synch the streams before using shared_output. + hidden_states_clone.record_stream(self.shared_experts_stream) + + # Mark sync start point for the separate shared experts + # stream here since we want to run in parallel with the + # router/gate (next op below) + assert self.shared_experts_stream is not None + self.shared_experts_stream.wait_stream(current_stream()) + + return use_shared_experts_stream, hidden_states_clone + + def ensure_dp_chunking_init(self): + if not self.use_dp_chunking or self.batched_hidden_states is not None: + return + + states_shape: tuple[int, ...] + logits_shape: tuple[int, ...] + + moe = self.moe_config + + if self.enable_dbo: + states_shape = (2, moe.max_num_tokens, self.moe_config.hidden_dim) + logits_shape = (2, moe.max_num_tokens, self.moe_config.num_logical_experts) + else: + states_shape = (moe.max_num_tokens, self.moe_config.hidden_dim) + logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) + + self.batched_hidden_states = torch.zeros( + states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() + ) + + self.batched_router_logits = torch.zeros( + logits_shape, + dtype=moe.router_logits_dtype, + device=torch.cuda.current_device(), + ) + + def must_reduce_shared_expert_outputs(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. 
Instead we reduce + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and all2all kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + assert self.quant_method is not None + return ( + self.quant_method.moe_mk is not None + and self.quant_method.moe_mk.output_is_reduced() + ) + + def maybe_all_reduce_tensor_model_parallel(self, final_hidden_states: torch.Tensor): + """ + Some combine kernels reduce across GPU ranks by default. + """ + if self.must_reduce_shared_expert_outputs(): + return final_hidden_states + else: + return tensor_model_parallel_all_reduce(final_hidden_states) + + def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Apply transform for routed experts (e.g., latent projection). + + This is called by FusedMoE.forward_native. The original hidden_states + is saved separately so shared experts get [S, hidden_size] while + routed experts get the transformed [S, moe_latent_size]. + + TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be + moved inside SharedFusedMoE to all-reduce on the smaller latent + dimension. + """ + if self.routed_input_transform is not None: + result = self.routed_input_transform(hidden_states) + # ReplicatedLinear returns (output, extra_bias) tuple. + # We only need the output tensor; extra_bias is not used here. 
+ if isinstance(result, tuple): + return result[0] + return result + return hidden_states + + def _reduce_output( + self, + states: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + trunc_sizes: list[int], + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + def trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: + return x[..., :trunc_size] + + def reduce_and_trunc(x: torch.Tensor, trunc_size: int) -> torch.Tensor: + return trunc(self.maybe_all_reduce_tensor_model_parallel(x), trunc_size) + + if ( + not self.moe_config.is_sequence_parallel + and not self.use_dp_chunking + and self.reduce_results + and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1) + ): + func = reduce_and_trunc + else: + func = trunc + + if isinstance(states, tuple): + return tuple( + [func(s, trunc_size) for s, trunc_size in zip(states, trunc_sizes)] + ) + else: + assert len(trunc_sizes) == 1 + return func(states, trunc_sizes[0]) + + def _encode_layer_name(self) -> str: + # Can be unavailable or None in unittests + if ( + is_forward_context_available() + and get_forward_context().all_moe_layers is not None + ): + return "from_forward_context" + return self.layer_name + + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + # For latent MoE: save ORIGINAL hidden_states before transform + # (shared_experts need original dimension, routed experts use transformed) + original_hidden_states = hidden_states + original_hidden_dim = hidden_states.shape[-1] + + # Apply transform for routed experts (e.g., latent projection for latent MoE) + hidden_states = self.apply_routed_input_transform(hidden_states) + + # This is the dimension after transform (for routed expert output slicing) + transformed_hidden_dim = hidden_states.shape[-1] + if self.moe_config.hidden_dim != transformed_hidden_dim: + hidden_states = F.pad( + hidden_states, + (0, self.moe_config.hidden_dim - transformed_hidden_dim), + 
mode="constant", + value=0.0, + ) + + fused_output = self.moe_forward( + hidden_states, + router_logits, + original_hidden_states, + self._encode_layer_name(), + ) + + if isinstance(fused_output, tuple): + orig_hidden_dims = [original_hidden_dim, transformed_hidden_dim] + else: + orig_hidden_dims = [transformed_hidden_dim] + + return self._reduce_output(fused_output, orig_hidden_dims) + + def forward_impl_chunked( + self, + layer: torch.nn.Module, + full_hidden_states: torch.Tensor, + full_router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + has_separate_shared_experts: bool, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + assert self.batched_hidden_states.dtype == full_hidden_states.dtype, ( + f"{self.batched_hidden_states.dtype} == {full_hidden_states.dtype}" + ) + assert self.batched_router_logits.dtype == full_router_logits.dtype, ( + f"{self.batched_router_logits.dtype} == {full_router_logits.dtype}" + ) + # Check size compatibility. + assert self.batched_hidden_states.size(-1) == full_hidden_states.size(-1) + assert self.batched_router_logits.size(-1) == full_router_logits.size(-1) + + # TODO(bnell): Fix shared_expert_inputs w/chunking. + # assert shared_input is None, ( + # "Routed input transform is not currently supported with DP chunking." + # ) + + full_fused_final_hidden_states = torch.empty_like(full_hidden_states) + if self.shared_experts is not None: + full_shared_final_hidden_states = torch.empty_like(full_hidden_states) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + chunk_size = chunk_end - chunk_start + hidden_states = full_hidden_states[chunk_start:chunk_end, :] + router_logits = full_router_logits[chunk_start:chunk_end, :] + + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + # This is only true when DBO has been enabled in the config. 
+ # Both tensors will have an outer dimension for the ubatch id + if self.batched_hidden_states.dim() == 3: + assert self.batched_router_logits.dim() == 3 + batch_buffer_idx = dbo_current_ubatch_id() + batched_hidden_states = self.batched_hidden_states[batch_buffer_idx, :] + batched_router_logits = self.batched_router_logits[batch_buffer_idx, :] + else: + batched_hidden_states = self.batched_hidden_states + batched_router_logits = self.batched_router_logits + + assert ( + batched_hidden_states.size(0) # type: ignore + >= chunk_size + ) + assert ( + batched_router_logits.size(0) # type: ignore + >= chunk_size + ) + staged_hidden_states = batched_hidden_states[:chunk_size, :] # type: ignore + staged_router_logits = batched_router_logits[:chunk_size, :] # type: ignore + staged_hidden_states.copy_(hidden_states, non_blocking=True) + staged_router_logits.copy_(router_logits, non_blocking=True) + + # Matrix multiply. + if self.quant_method.is_monolithic: + final_hidden_states = self.quant_method.apply_monolithic( + layer=layer, + x=staged_hidden_states, + router_logits=staged_router_logits, + ) + else: + topk_weights, topk_ids = self.router.select_experts( + hidden_states=staged_hidden_states, + router_logits=staged_router_logits, + ) + + final_hidden_states = self.quant_method.apply( + layer=layer, + x=staged_hidden_states, + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_input, + ) + + if has_separate_shared_experts: + assert not isinstance(final_hidden_states, tuple) + assert self.shared_experts is not None + + shared_output = self.shared_experts(staged_hidden_states) + + final_hidden_states = ( + shared_output, + final_hidden_states, + ) + + if not skip_result_store: + if self.shared_experts is None: + full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states, non_blocking=True + ) + else: + full_shared_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states[0], non_blocking=True + ) + 
full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states[1], non_blocking=True + ) + + ctx = get_forward_context() + # flashinfer_cutlass_kernels can handle: optional DP + TP/EP + max_tokens_across_dispatchers = ctx.dp_metadata.max_tokens_across_dp_cpu + moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens + + # If the input to the MoE is sequence parallel then divide by sp_size + # to find the maximum number of tokens for any individual dispatcher. + if self.moe_config.is_sequence_parallel: + max_tokens_across_dispatchers = cdiv( + max_tokens_across_dispatchers, self.moe_config.sp_size + ) + + num_tokens = full_hidden_states.size(0) + for chunk_idx, chunk_start_ in enumerate( + range(0, max_tokens_across_dispatchers, moe_dp_chunk_size_per_rank) + ): + chunk_start = chunk_start_ + chunk_end = min( + chunk_start + moe_dp_chunk_size_per_rank, max_tokens_across_dispatchers + ) + # clamp start and end + chunk_start = min(chunk_start, num_tokens - 1) + chunk_end = min(chunk_end, num_tokens) + with ctx.dp_metadata.chunked_sizes( + self.moe_config.sp_size, moe_dp_chunk_size_per_rank, chunk_idx + ): + process_chunk( + chunk_start, chunk_end, skip_result_store=chunk_start_ >= num_tokens + ) + + if self.shared_experts is None: + return full_fused_final_hidden_states + else: + return (full_shared_final_hidden_states, full_fused_final_hidden_states) + + def forward_impl( + self, + layer: torch.nn.Module, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + shared_input: torch.Tensor | None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + assert self.quant_method is not None + + self.ensure_dp_chunking_init() + + has_separate_shared_experts = ( + not self.quant_method.mk_owns_shared_expert + and self.shared_experts is not None + ) + + use_chunked_impl = self.use_dp_chunking + + use_shared_experts_stream, hidden_states_clone = ( + self._maybe_setup_shared_experts_stream( + hidden_states, + shared_input, + 
has_separate_shared_experts, + use_chunked_impl, + ) + ) + + # If router/gate provided, then apply it here. + # (Note: This code runs only when "overlapped mode" is on to allow + # parallel execution of shared experts with the FusedMoE via + # separate cuda stream) + if self.gate is not None: + router_logits, _ = self.gate(hidden_states) + + if use_chunked_impl: + return self.forward_impl_chunked( + layer, + hidden_states, + router_logits, + shared_input, + has_separate_shared_experts, + ) + + # NOTE(rob): once we finish migrating all the quant methods to use + # MKs, we can remove the naive dispatch/combine path from here. + do_naive_dispatch_combine = ( + self.moe_config.dp_size > 1 and not self.quant_method.supports_internal_mk + ) + + ctx = get_forward_context() + sp_ctx = ( + ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size) + if ctx.dp_metadata + else nullcontext() + ) + + with sp_ctx: + extra_tensors = None + if do_naive_dispatch_combine: + post_quant_allgather = ( + self.quant_method is not None + and self.moe_config.dp_size > 1 + and self.moe_config.use_ep + and getattr(self.quant_method, "do_post_quant_allgather", False) + ) + if post_quant_allgather: + hidden_states_to_dispatch, extra_tensors = ( + self.quant_method.prepare_dp_allgather_tensor( + layer, hidden_states, router_logits + ) + ) + else: + hidden_states_to_dispatch = hidden_states + + dispatch_res = get_ep_group().dispatch_router_logits( + hidden_states_to_dispatch, + router_logits, + self.moe_config.is_sequence_parallel, + extra_tensors=extra_tensors, + ) + if extra_tensors is not None: + ( + orig_hidden_states, + router_logits, + extra_tensors_combined, + ) = dispatch_res + hidden_states_combined = ( + orig_hidden_states, + extra_tensors_combined[0], + ) + else: + hidden_states_combined, router_logits = dispatch_res + orig_hidden_states = hidden_states_combined + else: + orig_hidden_states = hidden_states + + # Run shared experts before matrix multiply. 
+ # because matrix multiply may modify the hidden_states. + if has_separate_shared_experts and not use_shared_experts_stream: + assert self.shared_experts is not None + shared_input = ( + shared_input if shared_input is not None else hidden_states + ) + shared_output = self.shared_experts(shared_input) + + # NOTE: Similar with DP, PCP also needs dispatch and combine. For + # simplicity, AgRsAll2All was added separately for PCP here. Maybe + # we should modify All2AllManager abstract to better support PCP. + if self.moe_config.pcp_size > 1: + hidden_states = get_pcp_group().all_gather( + hidden_states, + dim=0, + ) + router_logits = get_pcp_group().all_gather( + router_logits, + dim=0, + ) + + # TODO(bnell): deal with fp4 flashinfer tuple hidden states hack (#30014). + # Figure out nicer way to do this. + if do_naive_dispatch_combine: + x = hidden_states_combined + x_orig = orig_hidden_states + else: + x = hidden_states + x_orig = hidden_states + + # Matrix multiply. + if self.quant_method.is_monolithic: + final_hidden_states = self.quant_method.apply_monolithic( + layer=layer, + x=x, + router_logits=router_logits, + ) + else: + topk_weights, topk_ids = self.router.select_experts( + hidden_states=x_orig, + router_logits=router_logits, + ) + + final_hidden_states = self.quant_method.apply( + layer=layer, + x=x, # The type signature of this is wrong due to the hack. + topk_weights=topk_weights, + topk_ids=topk_ids, + shared_experts_input=shared_input, + ) + + if has_separate_shared_experts: + assert self.shared_experts is not None + + if use_shared_experts_stream: + # Run shared experts in parallel on a separate stream + # NOTE: We start the separate stream here and mark the + # sync end point immediately after it is done. This is + # important to avoid excessive stream allocations by the cuda + # graph replay later. 
+ with torch.cuda.stream(self.shared_experts_stream): + # Note that hidden_states clone() is necessary here to avoid + # conflict with the main stream + shared_output = self.shared_experts(hidden_states_clone) + current_stream().wait_stream(self.shared_experts_stream) + + final_hidden_states = ( + shared_output, + final_hidden_states, + ) + + def combine_output(states: torch.Tensor) -> torch.Tensor: + if do_naive_dispatch_combine: + states = get_ep_group().combine( + states, self.moe_config.is_sequence_parallel + ) + + if self.moe_config.pcp_size > 1: + states = get_pcp_group().reduce_scatter( + states, + dim=0, + ) + + return states + + if self.shared_experts is not None: + return ( + final_hidden_states[0], + combine_output(final_hidden_states[1]), + ) + else: + return combine_output(final_hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/runner/moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py new file mode 100644 index 000000000..b298cc2d0 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/runner/moe_runner.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from abc import ABC, abstractmethod + +import torch + + +class MoERunner(ABC): + """ + Abstract base class for Mixture of Experts (MoE) runners. + + This class defines the interface that all MoE runner implementations must follow. + MoE runners are responsible for executing the forward pass of MoE layers, handling + expert routing, and managing tensor parallel operations. 
+ """ + + @abstractmethod + def forward( + self, + hidden_states: torch.Tensor, + router_logits: torch.Tensor, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError + + @abstractmethod + def must_reduce_shared_expert_outputs(self) -> bool: + raise NotImplementedError + + @abstractmethod + def maybe_all_reduce_tensor_model_parallel( + self, + final_hidden_states: torch.Tensor, + ): + raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 937d13d34..37336df17 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -18,70 +18,6 @@ class SharedFusedMoE(FusedMoE): can be interleaved with the fused all2all dispatch communication step. """ - def __init__( - self, - shared_experts: torch.nn.Module | None, - gate: torch.nn.Module | None = None, - use_overlapped: bool = True, - routed_input_transform: torch.nn.Module | None = None, - **kwargs, - ): - # Pass has_shared_experts so FusedMoE.__init__ can set disable_inplace - # without accessing self.shared_experts (submodules cannot be set before - # Module.__init__()). 
- kwargs["has_shared_experts"] = shared_experts is not None - super().__init__(**kwargs) - self._shared_experts = shared_experts - self._routed_input_transform = routed_input_transform - - # Disable shared expert overlap if: - # - we are using eplb with non-default backend, because of correctness issues - # - we are using flashinfer with DP, since there nothing to gain - # - we are using marlin kernels - backend = self.moe_parallel_config.all2all_backend - self.use_overlapped = ( - use_overlapped - and not ( - (self.enable_eplb and backend != "allgather_reducescatter") - or self.moe_parallel_config.use_fi_all2allv_kernels - ) - and self._shared_experts is not None - ) - - self._gate = gate - - @property - def shared_experts(self) -> torch.nn.Module | None: - return self._shared_experts if self.use_overlapped else None - - @property - def gate(self) -> torch.nn.Module | None: - return self._gate if self.use_overlapped else None - - @property - def is_internal_router(self) -> bool: - return self.gate is not None - - def apply_routed_input_transform(self, hidden_states: torch.Tensor) -> torch.Tensor: - """Apply transform for routed experts (e.g., latent projection). - - This is called by FusedMoE.forward_native. The original hidden_states - is saved separately so shared experts get [S, hidden_size] while - routed experts get the transformed [S, moe_latent_size]. - - TODO: For latent MoE bandwidth optimization, fc2_latent_proj could be - moved inside SharedFusedMoE to all-reduce on the smaller latent - dimension. - """ - if self._routed_input_transform is not None: - result = self._routed_input_transform(hidden_states) - # ReplicatedLinear returns (output, extra_bias) tuple. - # We only need the output tensor; extra_bias is not used here. 
- if isinstance(result, tuple): - return result[0] - return result - return hidden_states - def forward( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 8a35be78b..5c86064a9 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -55,6 +55,8 @@ logger = init_logger(__name__) class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" + # --8<-- [end:unquantized_fused_moe] + def __init__(self, moe: FusedMoEConfig): super().__init__(moe) self.unquantized_backend = select_unquantized_moe_backend( @@ -90,8 +92,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - return self.forward_cuda(layer, x, topk_weights, topk_ids) + return self.forward_cuda(layer, x, topk_weights, topk_ids, shared_experts_input) @property def is_monolithic(self) -> bool: @@ -293,12 +296,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: return self.forward( layer=layer, x=x, topk_weights=topk_weights, topk_ids=topk_ids, + shared_experts_input=shared_experts_input, ) def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig: @@ -316,6 +321,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.kernel is not None @@ -329,6 +335,7 
@@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input=layer.apply_router_weight_on_input, global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, + shared_experts_input=shared_experts_input, ) def forward_monolithic_cuda( diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 642088a45..5b7af3193 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -764,6 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: return fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 2fd567d7f..983c076bd 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -501,6 +501,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 604373c0a..023cf3f67 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -349,6 +349,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: 
torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.moe_mk is not None return self.moe_mk( @@ -361,7 +362,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) @@ -645,6 +646,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic assert layer.activation == "silu", "Only SiLU activation is supported." @@ -673,7 +675,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) @@ -1064,6 +1066,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic assert self.moe_mk is not None @@ -1079,7 +1082,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): # https://github.com/vllm-project/vllm/commit/84166fee9770e6fba71a96978b3e7d149392fb28 # noqa: E501 expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) @property @@ -1203,6 +1206,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, 
topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1713,6 +1717,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.kernel_backend == "Marlin" return fused_marlin_moe( @@ -1961,6 +1966,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -2575,6 +2581,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if layer.enable_eplb: raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 176bfe040..d971f3b5b 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -140,6 +140,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b8040e894..279f97dd6 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ 
b/vllm/model_executor/layers/quantization/fp8.py @@ -1010,6 +1010,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.moe_mk is not None assert not self.is_monolithic @@ -1023,7 +1024,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index ce84d2521..f7d995598 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -635,6 +635,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert layer.activation == "silu", "Only SiLU activation is supported." 
if layer.apply_router_weight_on_input: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index d18c7207d..4c175fddb 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -900,6 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: return fused_marlin_moe( x, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 8b151133b..570317ad3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -958,6 +958,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic @@ -980,7 +981,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + shared_experts_input=shared_experts_input, ) @@ -1524,6 +1525,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic @@ -1551,7 +1553,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): global_num_experts=layer.global_num_experts, expert_map=layer.expert_map, apply_router_weight_on_input=layer.apply_router_weight_on_input, - shared_experts_input=layer._get_shared_experts_input(x), + 
shared_experts_input=shared_experts_input, ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index bca2516d4..4365d1693 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -367,6 +367,7 @@ class MoeWNA16Method(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index d1c9cb6bb..13199124b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -900,6 +900,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic if layer.enable_eplb: diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 190890130..7faa4fcc9 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -419,6 +419,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( @@ -607,6 +608,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | 
tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( rocm_aiter_fused_experts, @@ -977,6 +979,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): x: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, + shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if not self.emulate: if ( diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2b7d9ff29..635402f3d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -816,10 +816,14 @@ class Worker(WorkerBase): for module in moe_modules: module.moe_config.num_experts = num_local_experts * new_ep_size module.global_num_experts = module.moe_config.num_experts + tp_size = get_tp_group().world_size + is_sequence_parallel = parallel_config.use_sequence_parallel_moe + sp_size = tp_size if is_sequence_parallel else 1 module.moe_parallel_config = FusedMoEParallelConfig.make( - tp_size_=get_tp_group().world_size, + tp_size_=tp_size, pcp_size_=get_pcp_group().world_size, dp_size_=get_dp_group().world_size, + sp_size_=sp_size, vllm_parallel_config=parallel_config, ) module.moe_config.moe_parallel_config = module.moe_parallel_config -- GitLab From 4a1550d22d7058e129d0e1257e726b3bf4a77025 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Tue, 10 Feb 2026 19:08:11 -0600 Subject: [PATCH 0072/1166] [ROCm][CI] Fix test_sequence_parallel.py location in AMD CI pipeline (#34280) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 19fc79f61..730613e1f 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1334,7 +1334,7 @@ steps: - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 
VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py + - pytest -v -s compile/correctness_e2e/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py -- GitLab From ba0511fd80b95d05ffab867cce54f3590e57a7fc Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 10 Feb 2026 19:29:49 -0700 Subject: [PATCH 0073/1166] [Misc] Add run one batch script that supports profiling (#32968) Signed-off-by: Lucas Wilkinson --- examples/offline_inference/run_one_batch.py | 112 ++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 examples/offline_inference/run_one_batch.py diff --git a/examples/offline_inference/run_one_batch.py b/examples/offline_inference/run_one_batch.py new file mode 100644 index 000000000..d7692c563 --- /dev/null +++ b/examples/offline_inference/run_one_batch.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from vllm import LLM, EngineArgs +from vllm.config import ProfilerConfig +from vllm.utils.argparse_utils import FlexibleArgumentParser + +DEFAULT_MAX_TOKENS = 16 + + +def create_parser() -> FlexibleArgumentParser: + parser = FlexibleArgumentParser() + EngineArgs.add_cli_args(parser) + parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct") + + batch_group = parser.add_argument_group("Batch parameters") + batch_group.add_argument("--batch-size", type=int, default=1) + batch_group.add_argument("--prompt-size", type=int, default=128) + batch_group.add_argument("--prompt-prefix", type=str, default="Hello, my name is") + + profile_group = parser.add_argument_group("Profiling parameters") + profile_group.add_argument( + "--profile", + choices=["none", "prefill", "decode", "both"], + default="none", + ) + 
profile_group.add_argument( + "--profile-dir", + type=str, + default="", + help="Required when --profile is not 'none'.", + ) + + return parser + + +def _build_prompt(prefix: str, prompt_size: int) -> str: + if prompt_size <= 0: + return "" + if not prefix: + prefix = " " + if len(prefix) >= prompt_size: + return prefix[:prompt_size] + repeat_count = (prompt_size + len(prefix) - 1) // len(prefix) + return (prefix * repeat_count)[:prompt_size] + + +def _build_profiler_config( + profile: str, profile_dir: str, max_tokens: int +) -> ProfilerConfig | None: + if profile == "none": + return None + if not profile_dir: + raise ValueError("--profile-dir must be set when profiling is enabled.") + if profile == "prefill": + delay_iterations = 0 + max_iterations = 1 + elif profile == "decode": + delay_iterations = 1 + max_iterations = max(1, max_tokens) + else: + delay_iterations = 0 + max_iterations = 0 + + return ProfilerConfig( + profiler="torch", + torch_profiler_dir=profile_dir, + delay_iterations=delay_iterations, + max_iterations=max_iterations, + ) + + +def main(args: dict) -> None: + max_tokens = DEFAULT_MAX_TOKENS + batch_size = args.pop("batch_size") + prompt_size = args.pop("prompt_size") + prompt_prefix = args.pop("prompt_prefix") + profile = args.pop("profile") + profile_dir = args.pop("profile_dir") + + profiler_config = _build_profiler_config(profile, profile_dir, max_tokens) + if profiler_config is not None: + args["profiler_config"] = profiler_config + + llm = LLM(**args) + + sampling_params = llm.get_default_sampling_params() + sampling_params.max_tokens = max_tokens + sampling_params.min_tokens = max_tokens + sampling_params.ignore_eos = True + + prompt = _build_prompt(prompt_prefix, prompt_size) + prompts = [prompt] * batch_size + + if profile != "none": + llm.start_profile() + outputs = llm.generate(prompts, sampling_params) + if profile != "none": + llm.stop_profile() + + print("-" * 50) + for output in outputs: + generated_text = output.outputs[0].text 
+ print(f"Prompt: {output.prompt!r}\nGenerated text: {generated_text!r}") + print("-" * 50) + + +if __name__ == "__main__": + parser = create_parser() + main(vars(parser.parse_args())) -- GitLab From 3bcd494ef4bd50c8fa34990d80743728e464c2e0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Feb 2026 11:10:12 +0800 Subject: [PATCH 0075/1166] [Redo] Add `--trust-remote-code` to dataset bench args (#34251) Signed-off-by: DarkLight1337 --- vllm/benchmarks/datasets.py | 5 +++++ vllm/benchmarks/serve.py | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index f06f41a47..86e080b55 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1310,6 +1310,11 @@ class _ValidateDatasetArgs(argparse.Action): def add_dataset_parser(parser: FlexibleArgumentParser): + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( "--num-prompts", diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 534392883..06e67f912 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1313,11 +1313,6 @@ def add_cli_args(parser: argparse.ArgumentParser): "bursty requests. 
A higher burstiness value (burstiness > 1) " "results in a more uniform arrival of requests.", ) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) parser.add_argument( "--disable-tqdm", action="store_true", -- GitLab From e30cedd44be332e1ddc7ec43b8a33bce532e7614 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Tue, 10 Feb 2026 22:15:40 -0500 Subject: [PATCH 0076/1166] [torch.compile] Stop doing unnecessary FakeTensorProp in PiecewiseCompileInterpreter (#34093) Signed-off-by: Richard Zou --- tests/compile/fullgraph/test_simple.py | 41 ++++++++++++++++++++++++-- vllm/compilation/backends.py | 4 ++- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/tests/compile/fullgraph/test_simple.py b/tests/compile/fullgraph/test_simple.py index 36cc1510e..ed9c7a351 100644 --- a/tests/compile/fullgraph/test_simple.py +++ b/tests/compile/fullgraph/test_simple.py @@ -27,10 +27,29 @@ from ...utils import create_new_process_for_each_test from ..silly_attention import get_global_counter, reset_global_counter +# Custom op that returns an unbacked symint during graph capture +@torch.library.custom_op("mylib::foo", mutates_args=()) +def foo(x: torch.Tensor) -> int: + return 3 + + +@foo.register_fake +def _(x): + return torch.library.get_ctx().new_dynamic_size() + + @support_torch_compile class SillyModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", **kwargs) -> None: + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + intermediate_unbacked=False, + **kwargs, + ) -> None: super().__init__() + self.intermediate_unbacked = intermediate_unbacked def forward(self, x: torch.Tensor) -> torch.Tensor: """ @@ -44,6 +63,13 @@ class SillyModel(nn.Module): torch.ops.silly.attention(x, x, x, out) x = out x = x - 2 + + if self.intermediate_unbacked: + # Test for unbacked symints: the following is a fancy way to multiply by 1 + u0 = foo(x) + ones = 
x.new_ones(x.shape[0], u0).sum(-1) / 3 + x = x * ones + x = x - 1 out = torch.empty_like(x) torch.ops.silly.attention(x, x, x, out) @@ -52,6 +78,7 @@ class SillyModel(nn.Module): return x +@torch._dynamo.config.patch(capture_dynamic_output_shape_ops=True) def _run_simple_model( splitting_ops, use_inductor_graph_partition, @@ -60,6 +87,8 @@ def _run_simple_model( expected_num_piecewise_capturable_graphs_seen, expected_num_backend_compilations, expected_num_cudagraph_captured, + *, + intermediate_unbacked=False, ): vllm_config = VllmConfig( compilation_config=CompilationConfig( @@ -72,7 +101,11 @@ def _run_simple_model( ) ) with set_current_vllm_config(vllm_config): - model = SillyModel(vllm_config=vllm_config, prefix="") + model = SillyModel( + vllm_config=vllm_config, + prefix="", + intermediate_unbacked=intermediate_unbacked, + ) inputs = torch.randn(100).cuda() @@ -125,9 +158,10 @@ def _run_simple_model( @pytest.mark.parametrize("backend", ["inductor", "eager"]) +@pytest.mark.parametrize("intermediate_unbacked", [True, False]) @torch.inference_mode() @create_new_process_for_each_test("spawn") -def test_simple_piecewise_compile(backend): +def test_simple_piecewise_compile(backend, intermediate_unbacked): _run_simple_model( splitting_ops=["silly::attention"], use_inductor_graph_partition=False, @@ -140,6 +174,7 @@ def test_simple_piecewise_compile(backend): expected_num_backend_compilations=3, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen expected_num_cudagraph_captured=6, + intermediate_unbacked=intermediate_unbacked, ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index e5cdb2d33..315bac73f 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -570,7 +570,9 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): # type: ignore[misc] ) -> Any: assert isinstance(target, str) - output = super().call_module(target, args, kwargs) + gm = getattr(self.module, target) + outputs = 
gm.graph.output_node().args[0] + output = fx.map_arg(outputs, lambda node: node.meta["example_value"]) if target in self.compile_submod_names: index = self.compile_submod_names.index(target) -- GitLab From 066c6da6a04906a89739fb7e6874ceb6cf714364 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 10 Feb 2026 22:15:43 -0500 Subject: [PATCH 0077/1166] [WideEP] Fix nvfp4 DeepEP High Throughput All2All backend (#33738) Signed-off-by: Tyler Michael Smith Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- .../layers/quantization/utils/flashinfer_fp4_moe.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4783ca5e0..cbdcd348c 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -82,8 +82,12 @@ def _supports_routing_method( def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: - """Supports EP.""" - return True + """ + TRTLLM is a monolithic kernel that requires dispatch_router_logits() for + the naive dispatch/combine path. DeepEP HT only implements dispatch() for + the modular kernel path, so TRTLLM is incompatible with DeepEP HT. 
+ """ + return not moe_parallel_config.use_deepep_ht_kernels def is_supported_config_trtllm( -- GitLab From b5dcb372e4ba04043a012475cea7cc901412f25a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Feb 2026 11:29:29 +0800 Subject: [PATCH 0078/1166] [Misc] Clean up validation logic in input processor (#34144) Signed-off-by: DarkLight1337 --- .../engine/test_process_multi_modal_uuids.py | 1 - vllm/multimodal/encoder_budget.py | 1 + vllm/v1/engine/input_processor.py | 156 ++++++++---------- 3 files changed, 72 insertions(+), 86 deletions(-) diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 4f3dbdf29..4170de173 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -20,7 +20,6 @@ def _build_input_processor( ) -> InputProcessor: model_config = ModelConfig( model="Qwen/Qwen2.5-VL-3B-Instruct", - skip_tokenizer_init=True, max_model_len=128, mm_processor_cache_gb=mm_cache_gb, ) diff --git a/vllm/multimodal/encoder_budget.py b/vllm/multimodal/encoder_budget.py index 821c9e9b5..c51bb255d 100644 --- a/vllm/multimodal/encoder_budget.py +++ b/vllm/multimodal/encoder_budget.py @@ -62,6 +62,7 @@ class MultiModalBudget: processor = mm_registry.create_processor(model_config, cache=cache) self.cache = cache + self.processor = processor mm_config = model_config.get_multimodal_config() enable_mm_embeds = mm_config is not None and mm_config.enable_mm_embeds diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 47180ee59..0e52e2d20 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -72,13 +72,15 @@ class InputProcessor: self.mm_registry = mm_registry self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config) - self.mm_encoder_cache_size: int | None = None - if ( - mm_registry.supports_multimodal_inputs(model_config) - and not 
model_config.skip_tokenizer_init - ): + self.supports_mm_inputs = mm_registry.supports_multimodal_inputs(model_config) + self.mm_encoder_cache_size = 0 + self.skip_prompt_length_check = False + if self.supports_mm_inputs: mm_budget = MultiModalBudget(vllm_config, mm_registry) self.mm_encoder_cache_size = mm_budget.encoder_cache_size + self.skip_prompt_length_check = ( + mm_budget.processor.info.skip_prompt_length_check + ) mm_budget.reset_cache() # Not used anymore self.input_preprocessor = InputPreprocessor( @@ -670,76 +672,25 @@ class InputProcessor: resumable=resumable, ) - def _validate_model_inputs( - self, encoder_inputs: SingletonInputs | None, decoder_inputs: SingletonInputs - ): - if encoder_inputs is not None: - self._validate_model_input(encoder_inputs, prompt_type="encoder") - - self._validate_model_input(decoder_inputs, prompt_type="decoder") - - def _validate_model_input( + def _validate_prompt_len( self, - prompt_inputs: SingletonInputs, - *, + prompt_len: int, prompt_type: Literal["encoder", "decoder"], ): - model_config = self.model_config - - prompt_ids = ( - None - if prompt_inputs["type"] == "embeds" - else prompt_inputs["prompt_token_ids"] - ) - prompt_embeds = ( - prompt_inputs["prompt_embeds"] - if prompt_inputs["type"] == "embeds" - else None - ) - prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds) - if not prompt_ids: - if prompt_type == "encoder" and model_config.is_multimodal_model: - pass # Mllama may have empty encoder inputs for text-only data - elif prompt_inputs["type"] == "embeds": - pass # Prompt embeds should not have prompt_ids. - else: - raise ValueError(f"The {prompt_type} prompt cannot be empty") - - tokenizer = self.tokenizer - if tokenizer is not None: - max_input_id = max(prompt_ids or (), default=0) - - # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while - # self.model_config.get_vocab_size() is the model’s vocab size. 
- # For Qwen3 models, the language model has extra tokens that do - # not exist in the tokenizer, and vice versa for multimodal - # placeholder tokens in some multimodal models. - # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501 - # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501 + if self.skip_prompt_length_check: + return - # Here we take the max of the two to determine if a token id is - # truly out-of-vocabulary. - if max_input_id > max( - tokenizer.max_token_id, self.model_config.get_vocab_size() - 1 - ): - raise ValueError(f"Token id {max_input_id} is out of vocabulary") + if prompt_len == 0 and prompt_type == "decoder": + raise ValueError(f"The {prompt_type} prompt cannot be empty") - max_prompt_len = self.model_config.max_model_len + model_config = self.model_config + max_prompt_len = ( + model_config.max_model_len + if prompt_type == "decoder" + else self.mm_encoder_cache_size + ) if prompt_len > max_prompt_len: - if model_config.is_multimodal_model: - mm_registry = self.input_preprocessor.mm_registry - model_cls = mm_registry._get_model_cls(model_config) - factories = model_cls._processor_factory - ctx = mm_registry._create_processing_ctx( - model_config, - tokenizer=tokenizer, - ) - mm_info = factories.info(ctx) - - if mm_info.skip_prompt_length_check: - return - - if model_config.is_multimodal_model: + if self.supports_mm_inputs: suggestion = ( "Make sure that `max_model_len` is no smaller than the " "number of text tokens plus multimodal tokens. For image " @@ -757,17 +708,7 @@ class InputProcessor: f"longer than the maximum model length of {max_prompt_len}. 
" f"{suggestion}" ) - - # TODO: Find out how many placeholder tokens are there so we can - # check that chunked prefill does not truncate them - # max_batch_len = self.scheduler_config.max_num_batched_tokens - - if ( - prompt_len == max_prompt_len - and prompt_type == "decoder" - and not model_config.is_multimodal_model - and self.model_config.runner_type != "pooling" - ): + elif prompt_len == max_prompt_len and model_config.runner_type == "generate": suggestion = ( "Make sure that `max_model_len` is no smaller than the " "number of text tokens (prompt + requested output tokens)." @@ -778,11 +719,29 @@ class InputProcessor: f"model length of {max_prompt_len}. {suggestion}" ) - if ( - prompt_type == "decoder" - and prompt_inputs["type"] == "multimodal" - and self.mm_encoder_cache_size is not None - ): + def _validate_model_input( + self, + prompt_inputs: SingletonInputs, + prompt_type: Literal["encoder", "decoder"], + ) -> None: + model_config = self.model_config + tokenizer = self.tokenizer + + prompt_ids = ( + None + if prompt_inputs["type"] == "embeds" + else prompt_inputs["prompt_token_ids"] + ) + prompt_embeds = ( + prompt_inputs["prompt_embeds"] + if prompt_inputs["type"] == "embeds" + else None + ) + + prompt_len = length_from_prompt_token_ids_or_embeds(prompt_ids, prompt_embeds) + self._validate_prompt_len(prompt_len, prompt_type) + + if prompt_inputs["type"] == "multimodal": decoder_mm_positions = prompt_inputs["mm_placeholders"] for modality, mm_positions in decoder_mm_positions.items(): for mm_position in mm_positions: @@ -797,6 +756,33 @@ class InputProcessor: f"by setting --limit-mm-per-prompt at startup." ) + if prompt_ids and tokenizer is not None: + max_input_id = max(prompt_ids, default=0) + + # NOTE: tokenizer.max_token_id is the tokenizer’s vocab size while + # self.model_config.get_vocab_size() is the model’s vocab size. 
+ # For Qwen3 models, the language model has extra tokens that do + # not exist in the tokenizer, and vice versa for multimodal + # placeholder tokens in some multimodal models. + # See https://github.com/QwenLM/Qwen3/issues/29#issuecomment-1933720399 # noqa: E501 + # and https://github.com/vllm-project/vllm/pull/22471#discussion_r2312251421 # noqa: E501 + + # Here we take the max of the two to determine if a token id is + # truly out-of-vocabulary. + model_vocab_size = model_config.get_vocab_size() + if max_input_id > max(tokenizer.max_token_id, model_vocab_size - 1): + raise ValueError(f"Token id {max_input_id} is out of vocabulary") + + def _validate_model_inputs( + self, + encoder_inputs: SingletonInputs | None, + decoder_inputs: SingletonInputs, + ): + if encoder_inputs is not None: + self._validate_model_input(encoder_inputs, prompt_type="encoder") + + self._validate_model_input(decoder_inputs, prompt_type="decoder") + def stat_mm_cache(self) -> MultiModalCacheStats | None: return self.input_preprocessor.stat_mm_cache() -- GitLab From 5ee5c86eeb00a4d159e2e2cb4c8c85dcc0733e15 Mon Sep 17 00:00:00 2001 From: Kebe Date: Wed, 11 Feb 2026 12:31:36 +0900 Subject: [PATCH 0079/1166] [Bugfix][DeepSeek-V3.2] fix fp8 kvcache type cast (#33884) Signed-off-by: Kebe --- csrc/cache_kernels.cu | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 969c28c75..10d540a1d 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -1234,8 +1234,13 @@ void cp_gather_and_upconvert_fp8_kv_cache( "src_cache and seq_lens must be on the same device"); TORCH_CHECK(src_cache.device() == workspace_starts.device(), "src_cache and workspace_starts must be on the same device"); - - TORCH_CHECK(src_cache.dtype() == torch::kUInt8, "src_cache must be uint8"); + auto dtype = src_cache.scalar_type(); + TORCH_CHECK( + dtype == at::ScalarType::Byte || // uint8 + dtype == at::ScalarType::Float8_e4m3fn || // 
fp8 e4m3 + dtype == at::ScalarType::Float8_e5m2, // fp8 e5m2 + "src_cache must be uint8, float8_e4m3fn, or float8_e5m2, but got ", + src_cache.dtype()); TORCH_CHECK(dst.dtype() == torch::kBFloat16, "dst must be bfloat16"); TORCH_CHECK(head_dim == 576, "head_dim must be 576 for MLA"); @@ -1244,14 +1249,21 @@ void cp_gather_and_upconvert_fp8_kv_cache( int64_t cache_entry_stride = src_cache.stride(1); int64_t dst_entry_stride = dst.stride(0); + const uint8_t* src_ptr = nullptr; + if (dtype == at::ScalarType::Byte) { + src_ptr = src_cache.data_ptr(); + } else { + // float8_e4m3fn or float8_e5m2 + src_ptr = reinterpret_cast(src_cache.data_ptr()); + } + // Decide on the number of splits based on the batch size int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16; dim3 grid(batch_size, num_splits); dim3 block(576); vllm::cp_gather_and_upconvert_fp8_kv_cache<<>>( - src_cache.data_ptr(), - reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), + src_ptr, reinterpret_cast<__nv_bfloat16*>(dst.data_ptr()), block_table.data_ptr(), seq_lens.data_ptr(), workspace_starts.data_ptr(), block_size, head_dim, block_table_stride, cache_block_stride, cache_entry_stride, -- GitLab From 1485396abb7c575d0196c2f52f4cdff7f9280a19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=94=D0=B7=D0=B5=D1=80=D0=B6=D0=B8=CC=81=D0=BD=D1=81?= =?UTF-8?q?=D0=BA=D0=B8=D0=B9?= <256908701+AstroVoyager7@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:31:51 +0800 Subject: [PATCH 0080/1166] [Kernel] Apply 256bit LDG/STG To Activation Kernels (#33022) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Dzerzhinsky <256908701+AstroVoyager7@users.noreply.github.com> Signed-off-by: Дзержи́нский <256908701+AstroVoyager7@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- csrc/activation_kernels.cu | 524 ++++++++++++++++++++++++++++--------- 1 file changed, 401 insertions(+), 123 deletions(-) diff --git 
a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 8268065ef..f1d4c137c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -9,6 +9,111 @@ namespace vllm { +struct alignas(32) u32x8_t { + uint32_t u0, u1, u2, u3, u4, u5, u6, u7; +}; + +__device__ __forceinline__ void ld256(u32x8_t& val, const u32x8_t* ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 + asm volatile("ld.global.nc.v8.u32 {%0,%1,%2,%3,%4,%5,%6,%7}, [%8];\n" + : "=r"(val.u0), "=r"(val.u1), "=r"(val.u2), "=r"(val.u3), + "=r"(val.u4), "=r"(val.u5), "=r"(val.u6), "=r"(val.u7) + : "l"(ptr)); +#else + const uint4* uint_ptr = reinterpret_cast(ptr); + uint4 top_half = __ldg(&uint_ptr[0]); + uint4 bottom_half = __ldg(&uint_ptr[1]); + val.u0 = top_half.x; + val.u1 = top_half.y; + val.u2 = top_half.z; + val.u3 = top_half.w; + val.u4 = bottom_half.x; + val.u5 = bottom_half.y; + val.u6 = bottom_half.z; + val.u7 = bottom_half.w; +#endif +} + +__device__ __forceinline__ void st256(u32x8_t& val, u32x8_t* ptr) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 1000 + asm volatile("st.global.v8.u32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};\n" + : + : "l"(ptr), "r"(val.u0), "r"(val.u1), "r"(val.u2), "r"(val.u3), + "r"(val.u4), "r"(val.u5), "r"(val.u6), "r"(val.u7) + : "memory"); +#else + uint4* uint_ptr = reinterpret_cast(ptr); + uint_ptr[0] = make_uint4(val.u0, val.u1, val.u2, val.u3); + uint_ptr[1] = make_uint4(val.u4, val.u5, val.u6, val.u7); +#endif +} + +template +struct VecTraits; + +template <> +struct VecTraits { + static constexpr int ARCH_MAX_VEC_SIZE = 32; + using vec_t = u32x8_t; +}; + +template <> +struct VecTraits { + static constexpr int ARCH_MAX_VEC_SIZE = 16; + using vec_t = int4; +}; + +template +struct PackedTraits; + +template <> +struct PackedTraits { + using packed_t = __nv_bfloat162; +}; + +template <> +struct PackedTraits { + using packed_t = __half2; +}; + +template <> +struct PackedTraits { + using packed_t = float2; +}; + +template +__device__ 
__forceinline__ float2 cast_to_float2(const packed_t& val) { + if constexpr (std::is_same_v) { + return __bfloat1622float2(val); + } else if constexpr (std::is_same_v) { + return __half22float2(val); + } else if constexpr (std::is_same_v) { + return float2(val); + } +} + +template +__device__ __forceinline__ packed_t cast_to_packed(const float2& val) { + if constexpr (std::is_same_v) { + return __float22bfloat162_rn(val); + } else if constexpr (std::is_same_v) { + return __float22half2_rn(val); + } else if constexpr (std::is_same_v) { + return float2(val); + } +} + +template +__device__ __forceinline__ packed_t packed_mul(const packed_t& x, + const packed_t& y) { + if constexpr (std::is_same_v || + std::is_same_v) { + return __hmul2(x, y); + } else if constexpr (std::is_same_v) { + return make_float2(x.x * y.x, x.y * y.y); + } +} + template __device__ __forceinline__ scalar_t compute(const scalar_t& x, @@ -16,52 +121,69 @@ __device__ __forceinline__ scalar_t compute(const scalar_t& x, return act_first ? ACT_FN(x) * y : x * ACT_FN(y); } +template +__device__ __forceinline__ packed_t packed_compute(const packed_t& x, + const packed_t& y) { + return act_first ? packed_mul(PACKED_ACT_FN(x), y) + : packed_mul(x, PACKED_ACT_FN(y)); +} + // Check if all pointers are 16-byte aligned for int4 vectorized access -__device__ __forceinline__ bool is_16byte_aligned(const void* ptr) { +__host__ __device__ __forceinline__ bool is_16byte_aligned(const void* ptr) { return (reinterpret_cast(ptr) & 15) == 0; } +// Check if all pointers are 16-byte aligned for longlong4_32a vectorized access +__host__ __device__ __forceinline__ bool is_32byte_aligned(const void* ptr) { + return (reinterpret_cast(ptr) & 31) == 0; +} + // Activation and gating kernel template. 
-template +template __global__ void act_and_mul_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., 2, d] const int d) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* x_ptr = input + blockIdx.x * 2 * d; const scalar_t* y_ptr = x_ptr + d; - scalar_t* out_ptr = out + token_idx * d; + scalar_t* out_ptr = out + blockIdx.x * d; - // Check alignment for 128-bit vectorized access. - // All three pointers must be 16-byte aligned for safe int4 operations. - const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && - is_16byte_aligned(out_ptr); + if constexpr (use_vec) { + // Fast path: 128-bit/256-bit vectorized loop + using vec_t = typename VecTraits::vec_t; + constexpr int ARCH_MAX_VEC_SIZE = VecTraits::ARCH_MAX_VEC_SIZE; + constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t); - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* x_vec = reinterpret_cast(x_ptr); - const int4* y_vec = reinterpret_cast(y_ptr); - int4* out_vec = reinterpret_cast(out_ptr); - const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; + const vec_t* x_vec = reinterpret_cast(x_ptr); + const vec_t* y_vec = reinterpret_cast(y_ptr); + vec_t* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / 2 / VEC_SIZE; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; - auto* xp = reinterpret_cast(&x); - auto* yp = reinterpret_cast(&y); - auto* rp = reinterpret_cast(&r); + vec_t x, y; + if constexpr (use_256b) { + ld256(x, &x_vec[i]); + ld256(y, &y_vec[i]); + } else { + x = VLLM_LDG(&x_vec[i]); + y = VLLM_LDG(&y_vec[i]); + } + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); #pragma unroll for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = compute(xp[j], yp[j]); + xp[j] = + 
packed_compute(xp[j], yp[j]); + } + if constexpr (use_256b) { + st256(x, &out_vec[i]); + } else { + out_vec[i] = x; } - out_vec[i] = r; - } - // Scalar cleanup for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = compute(VLLM_LDG(&x_ptr[i]), - VLLM_LDG(&y_ptr[i])); } } else { // Scalar fallback for unaligned data or small d @@ -79,6 +201,15 @@ __device__ __forceinline__ T silu_kernel(const T& x) { return (T)(((float)x) / (1.0f + expf((float)-x))); } +template +__device__ __forceinline__ packed_t packed_silu_kernel(const packed_t& val) { + // x * sigmoid(x) + float2 fval = cast_to_float2(val); + fval.x = fval.x / (1.0f + expf(-fval.x)); + fval.y = fval.y / (1.0f + expf(-fval.y)); + return cast_to_packed(fval); +} + template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. @@ -89,6 +220,18 @@ __device__ __forceinline__ T gelu_kernel(const T& x) { return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ packed_t packed_gelu_kernel(const packed_t& val) { + // Equivalent to PyTorch GELU with 'none' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 + constexpr float ALPHA = M_SQRT1_2; + float2 fval = cast_to_float2(val); + fval.x = fval.x * 0.5f * (1.0f + ::erf(fval.x * ALPHA)); + fval.y = fval.y * 0.5f * (1.0f + ::erf(fval.y * ALPHA)); + return cast_to_packed(fval); +} + template __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { // Equivalent to PyTorch GELU with 'tanh' approximation. @@ -102,32 +245,83 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { return (T)(0.5f * f * (1.0f + ::tanhf(inner))); } +template +__device__ __forceinline__ packed_t +packed_gelu_tanh_kernel(const packed_t& val) { + // Equivalent to PyTorch GELU with 'tanh' approximation. 
+ // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + float2 fval = cast_to_float2(val); + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + + float x_cube = fval.x * fval.x * fval.x; + float inner = BETA * (fval.x + KAPPA * x_cube); + fval.x = 0.5f * fval.x * (1.0f + ::tanhf(inner)); + + x_cube = fval.y * fval.y * fval.y; + inner = BETA * (fval.y + KAPPA * x_cube); + fval.y = 0.5f * fval.y * (1.0f + ::tanhf(inner)); + return cast_to_packed(fval); +} + } // namespace vllm // Launch activation and gating kernel. // Use ACT_FIRST (bool) indicating whether to apply the activation function // first. -#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, ACT_FIRST) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - if (num_tokens == 0) { \ - return; \ - } \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel", [&] { \ - vllm::act_and_mul_kernel, ACT_FIRST> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); +#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL, PACKED_KERNEL, ACT_FIRST) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = (cc_major >= 10 && num_tokens > 128) ? 
32 : 16; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL::packed_t>, \ + ACT_FIRST, true, true><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL::packed_t>, \ + ACT_FIRST, true, false><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel", [&] { \ + vllm::act_and_mul_kernel< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL::packed_t>, \ + ACT_FIRST, false><<>>( \ + out.data_ptr(), input.data_ptr(), d); \ + }); \ + } void silu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel, + true); } void mul_and_silu(torch::Tensor& out, // [..., d] @@ -135,19 +329,22 @@ void mul_and_silu(torch::Tensor& out, // [..., d] { // The difference between mul_and_silu and silu_and_mul is that mul_and_silu // applies the silu to the latter half of the input. 
- LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, vllm::packed_silu_kernel, + false); } void gelu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel, vllm::packed_gelu_kernel, + true); } void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { - LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, true); + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel, + vllm::packed_gelu_tanh_kernel, true); } namespace vllm { @@ -158,42 +355,57 @@ __device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { return (T)(f > threshold ? f : 0.0f); } -template +template +__device__ __forceinline__ packed_t +packed_fatrelu_kernel(const packed_t& val, const float threshold) { + float2 fval = cast_to_float2(val); + fval.x = fval.x > threshold ? fval.x : 0.0f; + fval.y = fval.y > threshold ? 
fval.y : 0.0f; + return cast_to_packed(fval); +} + +template __global__ void act_and_mul_kernel_with_param( scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, const float param) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* x_ptr = input + token_idx * 2 * d; + const scalar_t* x_ptr = input + blockIdx.x * 2 * d; const scalar_t* y_ptr = x_ptr + d; - scalar_t* out_ptr = out + token_idx * d; + scalar_t* out_ptr = out + blockIdx.x * d; - // Check alignment for 128-bit vectorized access - const bool aligned = is_16byte_aligned(x_ptr) && is_16byte_aligned(y_ptr) && - is_16byte_aligned(out_ptr); + if constexpr (use_vec) { + // Fast path: 128-bit/256-bit vectorized loop + using vec_t = typename VecTraits::vec_t; + constexpr int ARCH_MAX_VEC_SIZE = VecTraits::ARCH_MAX_VEC_SIZE; + constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(packed_t); - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* x_vec = reinterpret_cast(x_ptr); - const int4* y_vec = reinterpret_cast(y_ptr); - int4* out_vec = reinterpret_cast(out_ptr); - const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; + const vec_t* x_vec = reinterpret_cast(x_ptr); + const vec_t* y_vec = reinterpret_cast(y_ptr); + vec_t* out_vec = reinterpret_cast(out_ptr); + const int num_vecs = d / 2 / VEC_SIZE; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 x = VLLM_LDG(&x_vec[i]), y = VLLM_LDG(&y_vec[i]), r; - auto* xp = reinterpret_cast(&x); - auto* yp = reinterpret_cast(&y); - auto* rp = reinterpret_cast(&r); + vec_t x, y; + if constexpr (use_256b) { + ld256(x, &x_vec[i]); + ld256(y, &y_vec[i]); + } else { + x = VLLM_LDG(&x_vec[i]); + y = VLLM_LDG(&y_vec[i]); + } + auto* xp = reinterpret_cast(&x); + auto* yp = reinterpret_cast(&y); #pragma unroll for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = ACT_FN(xp[j], param) * yp[j]; + xp[j] = packed_mul(PACKED_ACT_FN(xp[j], 
param), yp[j]); + } + if constexpr (use_256b) { + st256(x, &out_vec[i]); + } else { + out_vec[i] = x; } - out_vec[i] = r; - } - // Scalar cleanup for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = ACT_FN(VLLM_LDG(&x_ptr[i]), param) * VLLM_LDG(&y_ptr[i]); } } else { // Scalar fallback for unaligned data or small d @@ -276,20 +488,58 @@ __global__ void swigluoai_and_mul_kernel( } // namespace vllm -#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ - int d = input.size(-1) / 2; \ - int64_t num_tokens = input.numel() / input.size(-1); \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES( \ - input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ - vllm::act_and_mul_kernel_with_param> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d, \ - PARAM); \ - }); +#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PACKED_KERNEL, PARAM) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = (cc_major >= 10 && num_tokens > 128) ? 
32 : 16; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL< \ + typename vllm::PackedTraits::packed_t>, \ + true, true><<>>( \ + out.data_ptr(), input.data_ptr(), d, \ + PARAM); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES( \ + dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL< \ + typename vllm::PackedTraits::packed_t>, \ + true, false><<>>( \ + out.data_ptr(), input.data_ptr(), d, \ + PARAM); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param< \ + scalar_t, typename vllm::PackedTraits::packed_t, \ + KERNEL, \ + PACKED_KERNEL::packed_t>, \ + false><<>>( \ + out.data_ptr(), input.data_ptr(), d, PARAM); \ + }); \ + } #define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \ int d = input.size(-1) / 2; \ @@ -309,7 +559,8 @@ __global__ void swigluoai_and_mul_kernel( void fatrelu_and_mul(torch::Tensor& out, // [..., d], torch::Tensor& input, // [..., 2 * d] double threshold) { - LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); + LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM( + vllm::fatrelu_kernel, vllm::packed_fatrelu_kernel, threshold); } void swigluoai_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input, // [..., 2 * d] @@ -319,39 +570,41 @@ void swigluoai_and_mul(torch::Tensor& out, // 
[..., d] namespace vllm { // Element-wise activation kernel template. -template +template __global__ void activation_kernel( scalar_t* __restrict__ out, // [..., d] const scalar_t* __restrict__ input, // [..., d] const int d) { - constexpr int VEC_SIZE = 16 / sizeof(scalar_t); - const int64_t token_idx = blockIdx.x; - const scalar_t* in_ptr = input + token_idx * d; - scalar_t* out_ptr = out + token_idx * d; - - // Check alignment for 128-bit vectorized access - const bool aligned = is_16byte_aligned(in_ptr) && is_16byte_aligned(out_ptr); - - if (aligned && d >= VEC_SIZE) { - // Fast path: 128-bit vectorized loop - const int4* in_vec = reinterpret_cast(in_ptr); - int4* out_vec = reinterpret_cast(out_ptr); + const scalar_t* in_ptr = input + blockIdx.x * d; + scalar_t* out_ptr = out + blockIdx.x * d; + + if constexpr (use_vec) { + // Fast path: 128-bit/256-bit vectorized loop + using vec_t = typename VecTraits::vec_t; + constexpr int ARCH_MAX_VEC_SIZE = VecTraits::ARCH_MAX_VEC_SIZE; + constexpr int VEC_SIZE = ARCH_MAX_VEC_SIZE / sizeof(scalar_t); + const vec_t* in_vec = reinterpret_cast(in_ptr); + vec_t* out_vec = reinterpret_cast(out_ptr); const int num_vecs = d / VEC_SIZE; - const int vec_end = num_vecs * VEC_SIZE; for (int i = threadIdx.x; i < num_vecs; i += blockDim.x) { - int4 v = VLLM_LDG(&in_vec[i]), r; + vec_t v; + if constexpr (use_256b) { + ld256(v, &in_vec[i]); + } else { + v = VLLM_LDG(&in_vec[i]); + } auto* vp = reinterpret_cast(&v); - auto* rp = reinterpret_cast(&r); #pragma unroll for (int j = 0; j < VEC_SIZE; j++) { - rp[j] = ACT_FN(vp[j]); + vp[j] = ACT_FN(vp[j]); + } + if constexpr (use_256b) { + st256(v, &out_vec[i]); + } else { + out_vec[i] = v; } - out_vec[i] = r; - } - // Scalar cleanup for remaining elements - for (int i = vec_end + threadIdx.x; i < d; i += blockDim.x) { - out_ptr[i] = ACT_FN(VLLM_LDG(&in_ptr[i])); } } else { // Scalar fallback for unaligned data or small d @@ -365,18 +618,43 @@ __global__ void activation_kernel( } // namespace 
vllm // Launch element-wise activation kernel. -#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ - int d = input.size(-1); \ - int64_t num_tokens = input.numel() / d; \ - dim3 grid(num_tokens); \ - dim3 block(std::min(d, 1024)); \ - const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \ - vllm::activation_kernel> \ - <<>>(out.data_ptr(), \ - input.data_ptr(), d); \ - }); +#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ + auto dtype = input.scalar_type(); \ + int d = input.size(-1); \ + int64_t num_tokens = input.numel() / input.size(-1); \ + if (num_tokens == 0) { \ + return; \ + } \ + dim3 grid(num_tokens); \ + int cc_major = at::cuda::getCurrentDeviceProperties()->major; \ + int support_vec = (cc_major >= 10 && num_tokens > 128) ? 32 : 16; \ + int vec_size = support_vec / at::elementSize(dtype); \ + const bool use_vec = (d % vec_size == 0); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + if (use_vec) { \ + dim3 block(std::min(d / vec_size, 1024)); \ + if (cc_major >= 10 && num_tokens > 128) { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, true, true> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } else { \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, true, false> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } \ + } else { \ + dim3 block(std::min(d, 1024)); \ + VLLM_DISPATCH_FLOATING_TYPES(dtype, "activation_kernel", [&] { \ + vllm::activation_kernel, false> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d); \ + }); \ + } namespace vllm { -- GitLab From b482f71e9f25ce848c1a53e71e332953d97b0aac Mon Sep 17 00:00:00 2001 From: zofia <110436990+zufangzhu@users.noreply.github.com> Date: Wed, 11 Feb 
2026 11:33:59 +0800 Subject: [PATCH 0081/1166] [XPU][7/N] enable xpu fp8 moe (#34202) Signed-off-by: Zhu, Zufang --- requirements/xpu.txt | 2 +- .../layers/fused_moe/__init__.py | 2 + .../layers/fused_moe/oracle/fp8.py | 10 +++++ .../layers/fused_moe/xpu_fused_moe.py | 43 +++++++++++++++++-- 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/requirements/xpu.txt b/requirements/xpu.txt index f15f0dcd1..050737164 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -15,4 +15,4 @@ torch==2.10.0+xpu torchaudio torchvision -vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.1/vllm_xpu_kernels-0.1.1-cp312-cp312-linux_x86_64.whl \ No newline at end of file +vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index edf7544b9..dc17af87e 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -102,6 +102,7 @@ if HAS_TRITON: ) from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( XPUExperts, + XPUExpertsFp8, ) __all__ += [ @@ -121,6 +122,7 @@ if HAS_TRITON: "BatchedDeepGemmExperts", "TritonOrDeepGemmExperts", "XPUExperts", + "XPUExpertsFp8", ] else: # Some model classes directly use the custom ops. 
Add placeholders diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index b94e4637e..3dd32f5af 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -52,6 +52,7 @@ class Fp8MoeBackend(Enum): AITER = "AITER" VLLM_CUTLASS = "VLLM_CUTLASS" BATCHED_VLLM_CUTLASS = "BATCHED_VLLM_CUTLASS" + XPU = "XPU" def backend_to_kernel_cls( @@ -123,6 +124,13 @@ def backend_to_kernel_cls( return CutlassBatchedExpertsFp8 + elif backend == Fp8MoeBackend.XPU: + from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( + XPUExpertsFp8, + ) + + return XPUExpertsFp8 + else: raise ValueError(f"Unknown FP8 MoE backend: {backend.value}") @@ -154,6 +162,7 @@ def select_fp8_moe_backend( Fp8MoeBackend.TRITON, Fp8MoeBackend.BATCHED_TRITON, Fp8MoeBackend.MARLIN, + Fp8MoeBackend.XPU, ] # NOTE(rob): We need to peak into the P/F selection to determine @@ -393,6 +402,7 @@ def convert_to_fp8_moe_kernel_format( Fp8MoeBackend.BATCHED_TRITON, Fp8MoeBackend.VLLM_CUTLASS, Fp8MoeBackend.BATCHED_VLLM_CUTLASS, + Fp8MoeBackend.XPU, ]: raise ValueError(f"Unsupported FP8 MoE backend: {fp8_backend.value}") diff --git a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py index cfb88f6af..a20679ea6 100644 --- a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py @@ -4,13 +4,16 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, FusedMoEParallelConfig, + FusedMoEQuantConfig, ) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, ) from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, + kFp8DynamicTensorSym, kFp8StaticTensorSym, ) from vllm.platforms import current_platform @@ -20,6 
+23,21 @@ if current_platform.is_xpu(): class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + moe_config: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, + max_num_tokens: int | None = None, + num_dispatchers: int | None = None, + ): + super().__init__( + moe_config, + quant_config, + max_num_tokens, + num_dispatchers, + ) + self.is_fp8 = False + @property def expects_unquantized_inputs(self) -> bool: return True @@ -49,10 +67,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - # TODO: dispatch based on device. SUPPORTED_W_A = [ (None, None), (kFp8StaticTensorSym, None), + (kFp8StaticTensorSym, kFp8DynamicTensorSym), ] return (weight_key, activation_key) in SUPPORTED_W_A @@ -103,10 +121,10 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): xpu_fused_moe( hidden_states=hidden_states, w13=w1, - w13_scales=a1q_scale, + w13_scales=self.w1_scale, w13_bias=self.w1_bias, w2=w2, - w2_scales=a2_scale, + w2_scales=self.w2_scale, w2_bias=self.w2_bias, topk_weights=topk_weights, topk_ids=topk_ids, @@ -116,5 +134,22 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): ep_rank=self.moe_config.ep_rank, ep_size=self.moe_config.ep_size, output=output, + is_fp8=self.is_fp8, + ) + + +class XPUExpertsFp8(XPUExperts): + def __init__( + self, + moe_config: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, + max_num_tokens: int | None = None, + num_dispatchers: int | None = None, + ): + super().__init__( + moe_config, + quant_config, + max_num_tokens, + num_dispatchers, ) - return + self.is_fp8 = True -- GitLab From c9a1923bb470f79a33963ad80cc8ad12bab2ad52 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Feb 2026 11:47:39 +0800 Subject: [PATCH 0082/1166] [Plugin] Simplify IO Processor Plugin interface (#34236) Signed-off-by: DarkLight1337 --- docs/design/io_processor_plugins.md | 40 ++++----- .../prithvi_io_processor/prithvi_processor.py | 49 +++-------- 
.../prithvi_io_processor/types.py | 4 - .../test_io_processor_plugins.py | 8 +- vllm/entrypoints/llm.py | 74 ++++++++--------- vllm/entrypoints/pooling/pooling/protocol.py | 3 - vllm/entrypoints/pooling/pooling/serving.py | 52 ++++++++---- vllm/plugins/io_processors/interface.py | 82 ++++++++++++++----- vllm/utils/collection_utils.py | 6 -- 9 files changed, 167 insertions(+), 151 deletions(-) diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md index 3e029259e..c6945e443 100644 --- a/docs/design/io_processor_plugins.md +++ b/docs/design/io_processor_plugins.md @@ -14,8 +14,26 @@ IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): def __init__(self, vllm_config: VllmConfig): + super().__init__() + self.vllm_config = vllm_config + @abstractmethod + def parse_data(self, data: object) -> IOProcessorInput: + raise NotImplementedError + + def merge_sampling_params( + self, + params: SamplingParams | None = None, + ) -> SamplingParams: + return params or SamplingParams() + + def merge_pooling_params( + self, + params: PoolingParams | None = None, + ) -> PoolingParams: + return params or PoolingParams() + @abstractmethod def pre_process( self, @@ -55,29 +73,13 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): [(i, item) async for i, item in model_output], key=lambda output: output[0] ) collected_output = [output[1] for output in sorted_output] - return self.post_process(collected_output, request_id, **kwargs) - - @abstractmethod - def parse_request(self, request: Any) -> IOProcessorInput: - raise NotImplementedError - - def validate_or_generate_params( - self, params: SamplingParams | PoolingParams | None = None - ) -> SamplingParams | PoolingParams: - return params or PoolingParams() - - @abstractmethod - def output_to_response( - self, plugin_output: IOProcessorOutput - ) -> IOProcessorResponse: - raise NotImplementedError + return 
self.post_process(collected_output, request_id=request_id, **kwargs) ``` -The `parse_request` method is used for validating the user prompt and converting it into the input expected by the `pre_process`/`pre_process_async` methods. +The `parse_data` method is used for validating the user data and converting it into the input expected by the `pre_process*` methods. +The `merge_sampling_params` and `merge_pooling_params` methods merge input `SamplingParams` or `PoolingParams` (if any) with the default one. The `pre_process*` methods take the validated plugin input to generate vLLM's model prompts for regular inference. The `post_process*` methods take `PoolingRequestOutput` objects as input and generate a custom plugin output. -The `validate_or_generate_params` method is used for validating with the plugin any `SamplingParameters`/`PoolingParameters` received with the user request, or to generate new ones if none are specified. The function always returns the validated/generated parameters. -The `output_to_response` method is used only for online serving and converts the plugin output to the `IOProcessorResponse` type that is then returned by the API Server. The implementation of the `/pooling` serving endpoint is available here [vllm/entrypoints/openai/serving_pooling.py](../../vllm/entrypoints/pooling/pooling/serving.py). An example implementation of a plugin that enables generating geotiff images with the PrithviGeospatialMAE model is available [here](https://github.com/IBM/terratorch/tree/main/terratorch/vllm/plugins/segmentation). Please, also refer to our online ([examples/pooling/plugin/prithvi_geospatial_mae_online.py](../../examples/pooling/plugin/prithvi_geospatial_mae_online.py)) and offline ([examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py](../../examples/pooling/plugin/prithvi_geospatial_mae_io_processor.py)) inference examples. 
diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py index 329b09c68..7915da94f 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/prithvi_processor.py @@ -18,18 +18,10 @@ from einops import rearrange from terratorch.datamodules import Sen1Floods11NonGeoDataModule from vllm.config import VllmConfig -from vllm.entrypoints.pooling.pooling.protocol import ( - IOProcessorRequest, - IOProcessorResponse, -) from vllm.inputs.data import PromptType from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput -from vllm.plugins.io_processors.interface import ( - IOProcessor, - IOProcessorInput, - IOProcessorOutput, -) +from vllm.plugins.io_processors.interface import IOProcessor from .types import DataModuleConfig, ImagePrompt, ImageRequestOutput @@ -227,7 +219,7 @@ def load_image( return imgs, temporal_coords, location_coords, metas -class PrithviMultimodalDataProcessor(IOProcessor): +class PrithviMultimodalDataProcessor(IOProcessor[ImagePrompt, ImageRequestOutput]): indices = [0, 1, 2, 3, 4, 5] def __init__(self, vllm_config: VllmConfig): @@ -251,34 +243,15 @@ class PrithviMultimodalDataProcessor(IOProcessor): self.requests_cache: dict[str, dict[str, Any]] = {} self.indices = DEFAULT_INPUT_INDICES - def parse_request(self, request: Any) -> IOProcessorInput: - if type(request) is dict: - image_prompt = ImagePrompt(**request) - return image_prompt - if isinstance(request, IOProcessorRequest): - if not hasattr(request, "data"): - raise ValueError("missing 'data' field in OpenAIBaseModel Request") - - request_data = request.data - - if type(request_data) is dict: - return ImagePrompt(**request_data) - else: - raise ValueError("Unable to parse the request data") - - raise ValueError("Unable to parse request") - - 
def output_to_response( - self, plugin_output: IOProcessorOutput - ) -> IOProcessorResponse: - return IOProcessorResponse( - request_id=plugin_output.request_id, - data=plugin_output, - ) + def parse_data(self, data: object) -> ImagePrompt: + if isinstance(data, dict): + return ImagePrompt(**data) + + raise ValueError("Prompt data should be an `ImagePrompt`") def pre_process( self, - prompt: IOProcessorInput, + prompt: ImagePrompt, request_id: str | None = None, **kwargs, ) -> PromptType | Sequence[PromptType]: @@ -364,7 +337,7 @@ class PrithviMultimodalDataProcessor(IOProcessor): model_output: Sequence[PoolingRequestOutput], request_id: str | None = None, **kwargs, - ) -> IOProcessorOutput: + ) -> ImageRequestOutput: pred_imgs_list = [] if request_id and (request_id in self.requests_cache): @@ -409,5 +382,7 @@ class PrithviMultimodalDataProcessor(IOProcessor): ) return ImageRequestOutput( - type=out_format, format="tiff", data=out_data, request_id=request_id + type=out_format, + format="tiff", + data=out_data, ) diff --git a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py index d1d787321..3a1a9c3be 100644 --- a/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py +++ b/tests/plugins/prithvi_io_processor_plugin/prithvi_io_processor/types.py @@ -38,9 +38,6 @@ class ImagePrompt(BaseModel): """ -MultiModalPromptType = ImagePrompt - - class ImageRequestOutput(BaseModel): """ The output data of an image request to vLLM. 
@@ -54,4 +51,3 @@ class ImageRequestOutput(BaseModel): type: Literal["path", "b64_json"] format: str data: str - request_id: str | None = None diff --git a/tests/plugins_tests/test_io_processor_plugins.py b/tests/plugins_tests/test_io_processor_plugins.py index 2088ee36e..6e820f1a4 100644 --- a/tests/plugins_tests/test_io_processor_plugins.py +++ b/tests/plugins_tests/test_io_processor_plugins.py @@ -75,9 +75,7 @@ async def test_prithvi_mae_plugin_online( # verify the output is formatted as expected for this plugin plugin_data = parsed_response.data - assert all( - plugin_data.get(attr) for attr in ["type", "format", "data", "request_id"] - ) + assert all(plugin_data.get(attr) for attr in ["type", "format", "data"]) # We just check that the output is a valid base64 string. # Raises an exception and fails the test if the string is corrupted. @@ -110,9 +108,7 @@ def test_prithvi_mae_plugin_offline(vllm_runner, model_name: str): output = pooler_output[0].outputs # verify the output is formatted as expected for this plugin - assert all( - hasattr(output, attr) for attr in ["type", "format", "data", "request_id"] - ) + assert all(hasattr(output, attr) for attr in ["type", "format", "data"]) # We just check that the output is a valid base64 string. # Raises an exception and fails the test if the string is corrupted. 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b9147b99c..2b4ed8695 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -85,7 +85,6 @@ from vllm.tasks import PoolingTask from vllm.tokenizers import TokenizerLike from vllm.tokenizers.mistral import MistralTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils.collection_utils import as_iter, is_list_of from vllm.utils.counter import Counter from vllm.v1.engine.llm_engine import LLMEngine from vllm.v1.sample.logits_processor import LogitsProcessor @@ -95,6 +94,7 @@ if TYPE_CHECKING: logger = init_logger(__name__) +_P = TypeVar("_P", bound=SamplingParams | PoolingParams | None) _R = TypeVar("_R", default=Any) @@ -1056,9 +1056,7 @@ class LLM: dict(truncate_prompt_tokens=truncate_prompt_tokens), ) - io_processor_prompt = False - if isinstance(prompts, dict) and "data" in prompts: - io_processor_prompt = True + if use_io_processor := (isinstance(prompts, dict) and "data" in prompts): if self.io_processor is None: raise ValueError( "No IOProcessor plugin installed. 
Please refer " @@ -1068,40 +1066,42 @@ class LLM: ) # Validate the request data is valid for the loaded plugin - validated_prompt = self.io_processor.parse_request(prompts) + validated_prompt = self.io_processor.parse_data(prompts) # obtain the actual model prompts from the pre-processor prompts = self.io_processor.pre_process(prompt=validated_prompt) + prompts_seq = prompt_to_seq(prompts) - if io_processor_prompt: - assert self.io_processor is not None - if is_list_of(pooling_params, PoolingParams): - validated_pooling_params: list[PoolingParams] = [] - for param in as_iter(pooling_params): - validated_pooling_params.append( - self.io_processor.validate_or_generate_params(param) - ) - pooling_params = validated_pooling_params - else: - assert not isinstance(pooling_params, Sequence) - pooling_params = self.io_processor.validate_or_generate_params( - pooling_params + params_seq: Sequence[PoolingParams] = [ + self.io_processor.merge_pooling_params(param) + for param in self._params_to_seq( + pooling_params, + len(prompts_seq), ) - - if pooling_params is None: - # Use default pooling params. - pooling_params = PoolingParams() - - for param in as_iter(pooling_params): - if param.task is None: - param.task = pooling_task - elif param.task != pooling_task: - msg = f"You cannot overwrite {param.task=!r} with {pooling_task=!r}!" - raise ValueError(msg) + ] + for p in params_seq: + if p.task is None: + p.task = "plugin" + else: + if pooling_params is None: + # Use default pooling params. + pooling_params = PoolingParams() + + prompts_seq = prompt_to_seq(prompts) + params_seq = self._params_to_seq(pooling_params, len(prompts_seq)) + + for param in params_seq: + if param.task is None: + param.task = pooling_task + elif param.task != pooling_task: + msg = ( + f"You cannot overwrite {param.task=!r} with {pooling_task=!r}!" 
+ ) + raise ValueError(msg) outputs = self._run_completion( - prompts=prompts, - params=pooling_params, + prompts=prompts_seq, + params=params_seq, use_tqdm=use_tqdm, lora_request=lora_request, tokenization_kwargs=tokenization_kwargs, @@ -1111,12 +1111,10 @@ class LLM: outputs, PoolingRequestOutput ) - if io_processor_prompt: + if use_io_processor: # get the post-processed model outputs assert self.io_processor is not None - processed_outputs = self.io_processor.post_process( - model_output=model_outputs - ) + processed_outputs = self.io_processor.post_process(model_outputs) return [ PoolingRequestOutput[Any]( @@ -1662,11 +1660,9 @@ class LLM: def _params_to_seq( self, - params: SamplingParams - | PoolingParams - | Sequence[SamplingParams | PoolingParams], + params: _P | Sequence[_P], num_requests: int, - ) -> Sequence[SamplingParams | PoolingParams]: + ) -> Sequence[_P]: if isinstance(params, Sequence): if len(params) != num_requests: raise ValueError( diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index ab2d82d8e..6a5a743cd 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -100,9 +100,6 @@ class IOProcessorRequest(PoolingBasicRequestMixin, EncodingRequestMixin, Generic data: T task: PoolingTask = "plugin" - def to_pooling_params(self): - return PoolingParams(task=self.task) - class IOProcessorResponse(OpenAIBaseModel, Generic[T]): request_id: str | None = None diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 3ad5786db..5c5d649f6 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -85,7 +85,6 @@ class OpenAIServingPooling(OpenAIServing): request_id = f"pool-{self._base_request_id(raw_request)}" created_time = int(time.time()) - is_io_processor_request = isinstance(request, IOProcessorRequest) try: lora_request = 
self._maybe_get_adapters(request) @@ -95,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing): ) engine_prompts: Sequence[PromptType | TokPrompt] - if is_io_processor_request: + if use_io_processor := isinstance(request, IOProcessorRequest): if self.io_processor is None: raise ValueError( "No IOProcessor plugin installed. Please refer " @@ -104,7 +103,7 @@ class OpenAIServingPooling(OpenAIServing): "offline inference example for more details." ) - validated_prompt = self.io_processor.parse_request(request) + validated_prompt = self.io_processor.parse_data(request.data) raw_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id @@ -141,13 +140,18 @@ class OpenAIServingPooling(OpenAIServing): # Schedule the request and get the result generator. generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] try: - if is_io_processor_request: - assert self.io_processor is not None and isinstance( - request, IOProcessorRequest - ) - pooling_params = self.io_processor.validate_or_generate_params() + if use_io_processor: + assert self.io_processor is not None + + pooling_params = self.io_processor.merge_pooling_params() + if pooling_params.task is None: + pooling_params.task = "plugin" + + tokenization_kwargs: dict[str, Any] = {} else: - pooling_params = request.to_pooling_params() + pooling_params = request.to_pooling_params() # type: ignore + tok_params = request.build_tok_params(self.model_config) # type: ignore + tokenization_kwargs = tok_params.get_encode_kwargs() for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" @@ -165,12 +169,6 @@ class OpenAIServingPooling(OpenAIServing): else await self._get_trace_headers(raw_request.headers) ) - if is_io_processor_request: - tokenization_kwargs: dict[str, Any] = {} - else: - tok_params = request.build_tok_params(self.model_config) # type: ignore - tokenization_kwargs = tok_params.get_encode_kwargs() - generator = self.engine_client.encode( 
engine_prompt, pooling_params, @@ -187,13 +185,31 @@ class OpenAIServingPooling(OpenAIServing): result_generator = merge_async_iterators(*generators) - if is_io_processor_request: + if use_io_processor: assert self.io_processor is not None output = await self.io_processor.post_process_async( - model_output=result_generator, + result_generator, request_id=request_id, ) - return self.io_processor.output_to_response(output) + + if callable( + output_to_response := getattr( + self.io_processor, "output_to_response", None + ) + ): + logger.warning_once( + "`IOProcessor.output_to_response` is deprecated. To ensure " + "consistency between offline and online APIs, " + "`IOProcessorResponse` will become a transparent wrapper " + "around output data from v0.19 onwards.", + ) + + if hasattr(output, "request_id") and output.request_id is None: + output.request_id = request_id # type: ignore + + return output_to_response(output) # type: ignore + + return IOProcessorResponse(request_id=request_id, data=output) assert isinstance(request, (PoolingCompletionRequest, PoolingChatRequest)) num_prompts = len(engine_prompts) diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index d2dd8b1bd..a978b1e74 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -1,12 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +import warnings from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Sequence -from typing import Any, Generic, TypeVar +from typing import Generic, TypeVar from vllm.config import VllmConfig -from vllm.entrypoints.pooling.pooling.protocol import IOProcessorResponse from vllm.inputs.data import PromptType from vllm.outputs import PoolingRequestOutput from vllm.pooling_params import PoolingParams @@ -18,8 +17,68 @@ IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, 
Generic[IOProcessorInput, IOProcessorOutput]): def __init__(self, vllm_config: VllmConfig): + super().__init__() + self.vllm_config = vllm_config + def parse_data(self, data: object) -> IOProcessorInput: + if callable(parse_request := getattr(self, "parse_request", None)): + warnings.warn( + "`parse_request` has been renamed to `parse_data`. " + "Please update your IO Processor Plugin to use the new name. " + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return parse_request(data) # type: ignore + + raise NotImplementedError + + def merge_sampling_params( + self, + params: SamplingParams | None = None, + ) -> SamplingParams: + if callable( + validate_or_generate_params := getattr( + self, "validate_or_generate_params", None + ) + ): + warnings.warn( + "`validate_or_generate_params` has been split into " + "`merge_sampling_params` and `merge_pooling_params`." + "Please update your IO Processor Plugin to use the new methods. " + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return validate_or_generate_params(params) # type: ignore + + return params or SamplingParams() + + def merge_pooling_params( + self, + params: PoolingParams | None = None, + ) -> PoolingParams: + if callable( + validate_or_generate_params := getattr( + self, "validate_or_generate_params", None + ) + ): + warnings.warn( + "`validate_or_generate_params` has been split into " + "`merge_sampling_params` and `merge_pooling_params`." + "Please update your IO Processor Plugin to use the new methods. 
" + "The old name will be removed in v0.19.", + DeprecationWarning, + stacklevel=2, + ) + + return validate_or_generate_params(params) # type: ignore + + return params or PoolingParams(task="plugin") + @abstractmethod def pre_process( self, @@ -59,19 +118,4 @@ class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): [(i, item) async for i, item in model_output], key=lambda output: output[0] ) collected_output = [output[1] for output in sorted_output] - return self.post_process(collected_output, request_id, **kwargs) - - @abstractmethod - def parse_request(self, request: Any) -> IOProcessorInput: - raise NotImplementedError - - def validate_or_generate_params( - self, params: SamplingParams | PoolingParams | None = None - ) -> SamplingParams | PoolingParams: - return params or PoolingParams() - - @abstractmethod - def output_to_response( - self, plugin_output: IOProcessorOutput - ) -> IOProcessorResponse: - raise NotImplementedError + return self.post_process(collected_output, request_id=request_id, **kwargs) diff --git a/vllm/utils/collection_utils.py b/vllm/utils/collection_utils.py index aefaf84ee..e0bd2045f 100644 --- a/vllm/utils/collection_utils.py +++ b/vllm/utils/collection_utils.py @@ -51,12 +51,6 @@ def as_list(maybe_list: Iterable[T]) -> list[T]: return maybe_list if isinstance(maybe_list, list) else list(maybe_list) -def as_iter(obj: T | Iterable[T]) -> Iterable[T]: - if isinstance(obj, str) or not isinstance(obj, Iterable): - return [obj] # type: ignore[list-item] - return obj - - def is_list_of( value: object, typ: type[T] | tuple[type[T], ...], -- GitLab From 7a048ee65f0b8da2c2493ef76cbee89cf612baa6 Mon Sep 17 00:00:00 2001 From: Matthias Gehre Date: Wed, 11 Feb 2026 04:58:56 +0100 Subject: [PATCH 0083/1166] [Bugfix] Fix benchmark_moe.py inplace assertion with torch >= 2.9 (#34149) Signed-off-by: Matthias Gehre --- benchmarks/kernels/benchmark_moe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c35cdb121..c5e3dabe5 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -226,9 +226,10 @@ def benchmark_config( x, input_gating, topk, renormalize=not use_deep_gemm ) + inplace = not disable_inplace() if use_deep_gemm: return deep_gemm_experts( - x, w1, w2, topk_weights, topk_ids, inplace=True + x, w1, w2, topk_weights, topk_ids, inplace=inplace ) return fused_experts( x, @@ -236,7 +237,7 @@ def benchmark_config( w2, topk_weights, topk_ids, - inplace=True, + inplace=inplace, quant_config=quant_config, ) -- GitLab From 1b3540e6c6d3833118d448c3246434de1a60e558 Mon Sep 17 00:00:00 2001 From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Date: Tue, 10 Feb 2026 19:59:14 -0800 Subject: [PATCH 0084/1166] Threshold fix wvSplitk for occasional CI fails (#34013) Signed-off-by: Hashem Hashemi --- tests/kernels/quantization/test_rocm_skinny_gemms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 566cb0239..7606c2a91 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -270,6 +270,9 @@ def test_rocm_wvsplitk_fp8_kernel( out = ops.wvSplitKQ(B, A, dtype, scale_a, scale_b, get_cu_count(), BIAS) if xnorm: - assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8) + torch.testing.assert_close(out, ref_out, atol=1e-3, rtol=1e-8) + elif k >= 32 * 1024: + # wider pytorch thresh for large-K & no xnorm + torch.testing.assert_close(out, ref_out, atol=0.07, rtol=5e-2) else: - assert torch.allclose(out, ref_out, 0.01) + torch.testing.assert_close(out, ref_out, atol=0.01, rtol=0.01) -- GitLab From 9b17c57460bb5f6595f27b43e43caba144a8ec3c Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> 
Date: Wed, 11 Feb 2026 00:00:00 -0500 Subject: [PATCH 0085/1166] [ModelBash][DSR1 NVFp4] Removed Bf16 Bias Cast (#34298) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- .../layers/quantization/utils/flashinfer_fp4_moe.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index cbdcd348c..bbe206800 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform if TYPE_CHECKING: + from vllm.model_executor.layers.fused_moe.layer import FusedMoE from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( NvFp4MoeBackend, ) @@ -316,11 +317,7 @@ def flashinfer_trtllm_fp4_moe( if use_llama4_routing: routing_method_type = flashinfer.RoutingMethodType.Llama4 - # Prepare routing bias - routing_bias = e_score_correction_bias - if routing_bias is not None: - routing_bias = routing_bias.to(torch.bfloat16) - + # Cast to Fp32 (required by kernel). 
router_logits = ( router_logits.to(torch.float32) if routing_method_type == RoutingMethodType.DeepSeekV3 @@ -330,7 +327,7 @@ def flashinfer_trtllm_fp4_moe( # Call TRT-LLM FP4 block-scale MoE kernel out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( routing_logits=router_logits, - routing_bias=routing_bias, + routing_bias=e_score_correction_bias, hidden_states=hidden_states_fp4, hidden_states_scale=hidden_states_scale_linear_fp4.view( torch.float8_e4m3fn @@ -447,7 +444,7 @@ def flashinfer_trtllm_fp4_routed_moe( def prepare_nvfp4_moe_layer_for_fi_or_cutlass( backend: "NvFp4MoeBackend", - layer: torch.nn.Module, + layer: "FusedMoE", w13: torch.Tensor, w13_scale: torch.Tensor, w13_scale_2: torch.Tensor, -- GitLab From d7982daff5334b9465b29fa943a1954c064ab226 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 11 Feb 2026 00:15:52 -0500 Subject: [PATCH 0086/1166] [Bugfix] Fix fused MoE IMA (sans chunking) by using int64 for strides (#34279) Co-authored-by: Claude Opus 4.6 --- .../layers/fused_moe/fused_moe.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 63aae43c3..6ca3213fb 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -95,19 +95,19 @@ def fused_moe_kernel_gptq_awq( # moving by 1 element in a particular dimension. E.g. `stride_am` is # how much to increase `a_ptr` by to get the element one row down # (A has M rows). 
- stride_am, - stride_ak, - stride_be, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - stride_bse, - stride_bsk, - stride_bsn, - stride_bze, - stride_bzk, - stride_bzn, + stride_am: tl.int64, + stride_ak: tl.int64, + stride_be: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + stride_bze: tl.int64, + stride_bzk: tl.int64, + stride_bzn: tl.int64, block_k_diviable: tl.constexpr, group_size: tl.constexpr, # Meta-parameters @@ -329,20 +329,20 @@ def fused_moe_kernel( # moving by 1 element in a particular dimension. E.g. `stride_am` is # how much to increase `a_ptr` by to get the element one row down # (A has M rows). - stride_am, - stride_ak, - stride_be, - stride_bk, - stride_bn, - stride_cm, - stride_cn, - stride_asm, - stride_ask, - stride_bse, - stride_bsk, - stride_bsn, - stride_bbe, # bias expert stride - stride_bbn, # bias N stride + stride_am: tl.int64, + stride_ak: tl.int64, + stride_be: tl.int64, + stride_bk: tl.int64, + stride_bn: tl.int64, + stride_cm: tl.int64, + stride_cn: tl.int64, + stride_asm: tl.int64, + stride_ask: tl.int64, + stride_bse: tl.int64, + stride_bsk: tl.int64, + stride_bsn: tl.int64, + stride_bbe: tl.int64, # bias expert stride + stride_bbn: tl.int64, # bias N stride # Block size for block-wise quantization group_n: tl.constexpr, group_k: tl.constexpr, -- GitLab From 0b20469c627e94060d1015170b186d19de1db583 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 10 Feb 2026 21:37:14 -0800 Subject: [PATCH 0087/1166] [Bugfix] Fix weight naming in Qwen3.5 (#34313) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_5.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 808db2d6f..c317c1e1a 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -206,7 +206,7 @@ 
class Qwen3_5GatedDeltaNet(Qwen3NextGatedDeltaNet): output_size=self.num_v_heads, bias=False, quant_config=quant_config, - prefix=f"{prefix}.in_proj_ba", + prefix=f"{prefix}.in_proj_b", ) self.in_proj_a = ColumnParallelLinear( input_size=self.hidden_size, -- GitLab From d1b837f0ae6a0152d820194a181e809ffaef6864 Mon Sep 17 00:00:00 2001 From: R3hankhan Date: Wed, 11 Feb 2026 12:11:42 +0530 Subject: [PATCH 0088/1166] [CPU] Enable FP16 (Half dtype) support for s390x (#34116) Signed-off-by: Rehan Khan --- csrc/cpu/cpu_attn_impl.hpp | 2 +- csrc/cpu/cpu_types_vxe.hpp | 247 ++++++++++++++++++++++++++++++++++++- csrc/cpu/mla_decode.cpp | 4 +- 3 files changed, 244 insertions(+), 9 deletions(-) diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 89cf2dc3a..fbe0e8778 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -821,7 +821,7 @@ struct VecTypeTrait { using vec_t = vec_op::BF16Vec16; }; -#if !defined(__powerpc__) && !defined(__s390x__) +#if !defined(__powerpc__) template <> struct VecTypeTrait { using vec_t = vec_op::FP16Vec16; diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp index 9efd8b7ec..700ba0306 100644 --- a/csrc/cpu/cpu_types_vxe.hpp +++ b/csrc/cpu/cpu_types_vxe.hpp @@ -16,10 +16,12 @@ namespace vec_op { #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left -// FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +// NOTE: FP16 (Half) is supported on s390x via custom bit-manipulation +// conversion. PyTorch itself lacks native s390x FP16 support. +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -86,6 +88,39 @@ struct BF16Vec8 : public Vec { } }; +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit FP16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {} + explicit FP16Vec8(const FP32Vec8&); + + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + // Load 256 bits (16 FP16 values) in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); + } + + explicit FP16Vec16(const FP32Vec16&); + + void save(void* ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); + } +}; + struct BF16Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; @@ -108,6 +143,92 @@ struct BF16Vec16 : public Vec { const static __vector signed short zero = vec_splats((signed short)0); +FORCE_INLINE __vector float fp16_to_fp32_bits(__vector unsigned int x) { + const __vector unsigned int mask_sign = {0x8000, 0x8000, 0x8000, 0x8000}; + const __vector unsigned int mask_exp = {0x7C00, 0x7C00, 0x7C00, 0x7C00}; + const __vector unsigned int mask_mant = {0x03FF, 0x03FF, 0x03FF, 0x03FF}; + const __vector unsigned int bias_adj = {112, 112, 112, 112}; + const __vector unsigned int exp_max_fp16 = {0x1F, 0x1F, 0x1F, + 0x1F}; // FP16 NaN/Inf exponent + const __vector unsigned int exp_max_fp32 = {0xFF, 0xFF, 0xFF, + 0xFF}; // FP32 NaN/Inf exponent + + __vector unsigned 
int s = (x & mask_sign) << 16; + __vector unsigned int e = (x & mask_exp) >> 10; + __vector unsigned int m = (x & mask_mant) << 13; + + // Check for NaN/Inf: exponent = 0x1F in FP16 + __vector __bool int is_nan_inf = vec_cmpeq(e, exp_max_fp16); + + // Normal: adjust bias; NaN/Inf: set to 0xFF + __vector unsigned int e_normal = e + bias_adj; + e = vec_sel(e_normal, exp_max_fp32, is_nan_inf); + + return (__vector float)(s | (e << 23) | m); +} + +FORCE_INLINE __vector unsigned int fp32_to_fp16_bits(__vector float f_in) { + __vector unsigned int in = (__vector unsigned int)f_in; + + const __vector unsigned int mask_sign_32 = {0x80000000, 0x80000000, + 0x80000000, 0x80000000}; + const __vector unsigned int mask_exp_32 = {0x7F800000, 0x7F800000, 0x7F800000, + 0x7F800000}; + const __vector unsigned int mask_mant_32 = {0x007FFFFF, 0x007FFFFF, + 0x007FFFFF, 0x007FFFFF}; + + // Use SIGNED integers for exponent math to handle underflow check + const __vector signed int bias_adj = {112, 112, 112, 112}; + const __vector signed int zero = {0, 0, 0, 0}; + const __vector signed int max_exp = {31, 31, 31, 31}; // Max FP16 exp + const __vector unsigned int exp_max_fp32 = {0xFF, 0xFF, 0xFF, 0xFF}; + const __vector unsigned int exp_max_fp16 = {0x1F, 0x1F, 0x1F, 0x1F}; + + __vector unsigned int s = (in & mask_sign_32) >> 16; + __vector unsigned int e_u = (in & mask_exp_32) >> 23; + + // Check for NaN/Inf: exponent = 0xFF in FP32 + __vector __bool int is_nan_inf = vec_cmpeq(e_u, exp_max_fp32); + + __vector signed int e_s = (__vector signed int)e_u; + e_s = vec_sub(e_s, bias_adj); + e_s = vec_max(e_s, zero); + e_s = vec_min(e_s, max_exp); + __vector unsigned int e_normal = (__vector unsigned int)e_s; + + __vector unsigned int e_final = vec_sel(e_normal, exp_max_fp16, is_nan_inf); + + const __vector unsigned int one_v = {1, 1, 1, 1}; + const __vector unsigned int mask_sticky = {0xFFF, 0xFFF, 0xFFF, 0xFFF}; + + __vector unsigned int round_bit = (in >> 12) & one_v; + __vector unsigned int 
sticky = in & mask_sticky; + __vector unsigned int m = (in & mask_mant_32) >> 13; + __vector unsigned int lsb = m & one_v; // LSB of mantissa for tie-breaking + + // Round up if: round_bit && (sticky || lsb) + __vector __bool int sticky_nonzero = + vec_cmpgt(sticky, (__vector unsigned int){0, 0, 0, 0}); + __vector __bool int lsb_set = vec_cmpeq(lsb, one_v); + __vector __bool int round_up = + vec_and(vec_cmpeq(round_bit, one_v), vec_or(sticky_nonzero, lsb_set)); + + m = vec_sel(m, m + one_v, round_up); + + const __vector unsigned int mant_mask = {0x3FF, 0x3FF, 0x3FF, 0x3FF}; + const __vector unsigned int max_normal_exp = {0x1E, 0x1E, 0x1E, 0x1E}; + __vector __bool int mant_overflows = vec_cmpgt(m, mant_mask); + __vector __bool int would_overflow_to_inf = + vec_and(mant_overflows, vec_cmpeq(e_final, max_normal_exp)); + __vector unsigned int e_inc = vec_min(e_final + one_v, exp_max_fp16); + e_final = vec_sel(e_final, e_inc, mant_overflows); + m = vec_and(m, mant_mask); + e_final = vec_sel(e_final, max_normal_exp, would_overflow_to_inf); + m = vec_sel(m, mant_mask, would_overflow_to_inf); + + return s | (e_final << 10) | m; +} + struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; @@ -180,6 +301,18 @@ struct FP32Vec8 : public Vec { reg.val[1] = (__vector float)vec_mergel(v.reg, zero); } + explicit FP32Vec8(const FP16Vec8& v) { + // Cast to UNSIGNED short vector to prevent sign-extension during unpack + __vector unsigned short raw_u = (__vector unsigned short)v.reg; + + // Unpack 8x16-bit to two 4x32-bit vectors (Zero extended) + __vector unsigned int raw_hi = (__vector unsigned int)vec_unpackh(raw_u); + __vector unsigned int raw_lo = (__vector unsigned int)vec_unpackl(raw_u); + + reg.val[0] = fp16_to_fp32_bits(raw_hi); + reg.val[1] = fp16_to_fp32_bits(raw_lo); + } + float reduce_sum() const { AliasReg ar; ar.reg = reg; @@ -531,6 +664,22 @@ struct FP32Vec16 : public Vec { reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero); } + explicit 
FP32Vec16(const FP16Vec16& v) { + __vector unsigned int raw_hi_0 = + (__vector unsigned int)vec_unpackh(v.reg.val[0]); + __vector unsigned int raw_lo_0 = + (__vector unsigned int)vec_unpackl(v.reg.val[0]); + reg.val[0] = fp16_to_fp32_bits(raw_hi_0); + reg.val[1] = fp16_to_fp32_bits(raw_lo_0); + + __vector unsigned int raw_hi_1 = + (__vector unsigned int)vec_unpackh(v.reg.val[1]); + __vector unsigned int raw_lo_1 = + (__vector unsigned int)vec_unpackl(v.reg.val[1]); + reg.val[2] = fp16_to_fp32_bits(raw_hi_1); + reg.val[3] = fp16_to_fp32_bits(raw_lo_1); + } + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} FP32Vec16 operator*(const FP32Vec16& b) const { @@ -628,8 +777,10 @@ struct VecType { using vec_type = BF16Vec8; }; -// On s390x, FP16 (Half) is not natively supported, use FP32 vectors instead -using FP16Vec16 = FP32Vec16; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; template void storeFP32(float v, T* ptr) { @@ -650,6 +801,52 @@ inline void storeFP32(float v, c10::BFloat16* ptr) { *ptr = *(v_ptr + 1); } +template <> +inline void storeFP32<::c10::Half>(float v, ::c10::Half* ptr) { + // Use bit-manipulation for IEEE FP32 to FP16 conversion since vector + // intrinsics for FP32 to FP16 conversion does not use IEEE rounding and can + // produce incorrect results for some inputs. Process each of the 4 vectors + // separately. 
+ uint32_t in; + std::memcpy(&in, &v, sizeof(in)); + + uint32_t s = (in & 0x80000000) >> 16; // Sign + uint32_t e = (in & 0x7F800000) >> 23; // Exponent + uint32_t round_bit = (in >> 12) & 1; + uint32_t sticky = (in & 0xFFF) != 0; // Any bits in [11..0] + uint32_t m = (in & 0x007FFFFF) >> 13; + uint32_t lsb = m & 1; // LSB of mantissa for tie-breaking + + // Check for NaN/Inf before rounding + bool is_nan_inf = (e == 0xFF); + + if (round_bit && (sticky || lsb)) { + m++; + // Handle mantissa overflow: if m overflows 10 bits, increment exponent + if (m > 0x3FF) { + m = 0; + e++; + } + } + + if (is_nan_inf) { + // NaN/Inf: preserve it + e = 0x1F; + } else { + // Normal: adjust bias (127 - 15), flush subnormals to zero + e = (e >= 112) ? (e - 112) : 0; + // If exponent overflows to Inf range, saturate to max normal FP16 value + if (e > 0x1E) { + e = 0x1E; // Max normal exponent + m = 0x3FF; // Max mantissa + } + } + + uint16_t fp16 = (uint16_t)(s | (e << 10) | m); + + *reinterpret_cast(ptr) = fp16; +} + #ifndef __VEC_CLASS_FP_NAN #define __VEC_CLASS_FP_NAN (1 << 6) #endif @@ -803,6 +1000,44 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); } +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) { + // Use bit-manipulation for IEEE FP32 to FP16 conversion since vector + // intrinsics for FP32 to FP16 conversion does not use IEEE rounding and can + // produce incorrect results for some inputs. Process each of the 4 vectors + // separately. 
+ __vector unsigned int res_hi = fp32_to_fp16_bits(v.reg.val[0]); + __vector unsigned int res_lo = fp32_to_fp16_bits(v.reg.val[1]); + + const __vector unsigned char perm_pack = { + 2, 3, 6, 7, 10, 11, 14, 15, // Select lower 2 bytes from res_hi + 18, 19, 22, 23, 26, 27, 30, 31 // Select lower 2 bytes from res_lo + }; + + reg = vec_perm((__vector signed short)res_hi, (__vector signed short)res_lo, + perm_pack); +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + // Use bit-manipulation for IEEE FP32 to FP16 conversion since vector + // intrinsics for FP32 to FP16 conversion does not use IEEE rounding and can + // produce incorrect results for some inputs. Process each of the 4 vectors + // separately. + __vector unsigned int res_0 = fp32_to_fp16_bits(v.reg.val[0]); + __vector unsigned int res_1 = fp32_to_fp16_bits(v.reg.val[1]); + __vector unsigned int res_2 = fp32_to_fp16_bits(v.reg.val[2]); + __vector unsigned int res_3 = fp32_to_fp16_bits(v.reg.val[3]); + + const __vector unsigned char perm_pack = { + 2, 3, 6, 7, 10, 11, 14, 15, // Lower 2 bytes from first vector + 18, 19, 22, 23, 26, 27, 30, 31 // Lower 2 bytes from second vector + }; + + reg.val[0] = vec_perm((__vector signed short)res_0, + (__vector signed short)res_1, perm_pack); + reg.val[1] = vec_perm((__vector signed short)res_2, + (__vector signed short)res_3, perm_pack); +} + // 1D softmax over `n` elements in `input`, writes result to `output`. // Uses FP32Vec8 for main body, scalar tail handling. 
// Requirement: n > 0 diff --git a/csrc/cpu/mla_decode.cpp b/csrc/cpu/mla_decode.cpp index 564055ef5..582c480c3 100644 --- a/csrc/cpu/mla_decode.cpp +++ b/csrc/cpu/mla_decode.cpp @@ -18,8 +18,8 @@ struct KernelVecType { template <> struct KernelVecType { -#if defined(__powerpc64__) || defined(__s390x__) - // Power and s390x architecture-specific vector types +#if defined(__powerpc64__) + // Power specific vector types using qk_load_vec_type = vec_op::FP32Vec16; using qk_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::FP32Vec16; -- GitLab From 21dfb842d76c61204d44f6f1dd1e99f55a9b2cf4 Mon Sep 17 00:00:00 2001 From: AllenDou Date: Wed, 11 Feb 2026 15:37:09 +0800 Subject: [PATCH 0089/1166] [model] support FunASR model (#33247) Signed-off-by: zixiao Co-authored-by: zixiao --- docs/models/supported_models.md | 1 + .../openai_transcription_client.py | 19 +- tests/models/registry.py | 4 + vllm/model_executor/models/funasr.py | 1057 +++++++++++++++++ vllm/model_executor/models/registry.py | 1 + .../transformers_utils/processors/__init__.py | 2 + .../processors/funasr_processor.py | 504 ++++++++ 7 files changed, 1585 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/models/funasr.py create mode 100644 vllm/transformers_utils/processors/funasr_processor.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7ff9531c5..7f20d2052 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -790,6 +790,7 @@ Speech2Text models trained specifically for Automatic Speech Recognition. | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|-------------------|----------------------|---------------------------| +| `FunASRForConditionalGeneration` | FunASR | `allendou/Fun-ASR-Nano-2512-vllm`, etc. | | | | `Gemma3nForConditionalGeneration` | Gemma3n | `google/gemma-3n-E2B-it`, `google/gemma-3n-E4B-it`, etc. 
| | | | `GlmAsrForConditionalGeneration` | GLM-ASR | `zai-org/GLM-ASR-Nano-2512` | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | `ibm-granite/granite-speech-3.3-2b`, `ibm-granite/granite-speech-3.3-8b`, etc. | ✅︎ | ✅︎ | diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index 966bfd2a4..478a0a7ea 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -26,7 +26,9 @@ from openai import AsyncOpenAI, OpenAI from vllm.assets.audio import AudioAsset -def sync_openai(audio_path: str, client: OpenAI, model: str): +def sync_openai( + audio_path: str, client: OpenAI, model: str, *, repetition_penalty: float = 1.3 +): """ Perform synchronous transcription using OpenAI-compatible API. """ @@ -40,7 +42,7 @@ def sync_openai(audio_path: str, client: OpenAI, model: str): # Additional sampling params not provided by OpenAI API. extra_body=dict( seed=4419, - repetition_penalty=1.3, + repetition_penalty=repetition_penalty, ), ) print("transcription result [sync]:", transcription.text) @@ -129,7 +131,12 @@ def main(args): print(f"Using model: {model}") # Run the synchronous function - sync_openai(args.audio_path if args.audio_path else mary_had_lamb, client, model) + sync_openai( + audio_path=args.audio_path if args.audio_path else mary_had_lamb, + client=client, + model=model, + repetition_penalty=args.repetition_penalty, + ) # Run the asynchronous function if "openai" in model: @@ -161,5 +168,11 @@ if __name__ == "__main__": default=None, help="The path to the audio file to transcribe.", ) + parser.add_argument( + "--repetition_penalty", + type=float, + default=1.3, + help="repetition penalty", + ) args = parser.parse_args() main(args) diff --git a/tests/models/registry.py b/tests/models/registry.py index d2c67cf7e..abc621d8e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -713,6 
+713,10 @@ _MULTIMODAL_EXAMPLE_MODELS = { "baidu/ERNIE-4.5-VL-28B-A3B-PT", trust_remote_code=True, ), + "FunASRForConditionalGeneration": _HfExamplesInfo( + "allendou/Fun-ASR-Nano-2512-vllm", + is_available_online=False, + ), "FunAudioChatForConditionalGeneration": _HfExamplesInfo( "funaudiochat", is_available_online=False ), diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py new file mode 100644 index 000000000..b4d4fb5b7 --- /dev/null +++ b/vllm/model_executor/models/funasr.py @@ -0,0 +1,1057 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import math +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Literal, cast + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from transformers import ( + BatchFeature, + Qwen3Config, +) + +from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.inputs.data import PromptType +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY +from vllm.model_executor.layers.attention.mm_encoder_attention import ( + MMEncoderAttention, +) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.whisper_utils import ( + ISO639_1_SUPPORTED_LANGS, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from 
def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
    """Build a padding mask from per-sequence lengths.

    Args:
        lengths: Tensor of valid lengths; a trailing axis is appended to it
            before comparing against the position index.
        maxlen: Width of the mask. Defaults to ``lengths.max()``.
        dtype: Dtype of the returned mask.
        device: Optional device to move the result to; when ``None`` the mask
            stays on ``lengths.device``.

    Returns:
        A tensor of shape ``(*lengths.shape, maxlen)`` holding 1 where the
        position index is smaller than the corresponding length and 0
        elsewhere, cast to ``dtype``.
    """
    if maxlen is None:
        maxlen = lengths.max()
    # Create the position index directly on the target device instead of
    # allocating it on the host and copying it over.
    positions = torch.arange(0, maxlen, 1, device=lengths.device)
    mask = (positions < lengths.unsqueeze(-1)).type(dtype)
    return mask if device is None else mask.to(device)
class MultiHeadedAttentionSANM(nn.Module):
    """Multi-head self-attention with an additive FSMN memory branch.

    A single fused projection produces query, key and value. The value is
    additionally passed through a depthwise 1-D convolution (the FSMN block)
    and the result is added to the attention output.
    """

    def __init__(
        self,
        n_head: int,
        in_feat: int,
        n_feat: int,
        kernel_size: int,
        sanm_shift: int = 0,
    ):
        super().__init__()
        assert n_feat % n_head == 0
        # We assume d_v always equals d_k
        self.h = n_head
        self.d_k = n_feat // n_head

        self.out_proj = ReplicatedLinear(
            input_size=n_feat,
            output_size=n_feat,
            bias=True,
        )
        # One projection emits q, k and v concatenated on the last axis.
        self.linear_q_k_v = ReplicatedLinear(
            input_size=in_feat,
            output_size=n_feat * 3,
            bias=True,
        )
        self.attn = None

        # Depthwise convolution (groups == channels) without built-in padding;
        # the padding is applied explicitly by `pad_fn`.
        self.fsmn_block = nn.Conv1d(
            n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
        )
        # Split kernel_size - 1 padding frames between the two sides; a
        # positive sanm_shift moves that many frames to the left side.
        pad_left = (kernel_size - 1) // 2
        if sanm_shift > 0:
            pad_left += sanm_shift
        pad_right = kernel_size - 1 - pad_left
        self.pad_fn = nn.ConstantPad1d((pad_left, pad_right), 0.0)

    def forward_fsmn(
        self,
        inputs: torch.Tensor,
        mask: torch.Tensor,
        mask_shfit_chunk: torch.Tensor = None,
    ):
        """Run the FSMN branch on a 3-D ``inputs`` tensor.

        When ``mask`` is given it is reshaped to ``(batch, -1, 1)``, optionally
        multiplied by ``mask_shfit_chunk``, and applied both before the
        convolution and to the final result.
        """
        batch, _, _ = inputs.size()
        if mask is not None:
            mask = mask.reshape(batch, -1, 1)
            if mask_shfit_chunk is not None:
                mask = mask * mask_shfit_chunk
            inputs = inputs * mask

        # Conv1d works on (batch, channels, time), hence the transposes.
        memory = self.fsmn_block(self.pad_fn(inputs.transpose(1, 2)))
        memory = memory.transpose(1, 2)
        memory += inputs  # residual connection around the convolution
        if mask is not None:
            memory = memory * mask
        return memory

    def forward_qkv(self, x: torch.Tensor):
        """Project ``x`` and split it into per-head q, k, v.

        Returns:
            ``(q_h, k_h, v_h, v)`` where the first three have shape
            ``(batch, heads, time, d_k)`` and ``v`` is the un-split value
            tensor consumed by the FSMN branch.
        """
        batch, steps, _ = x.size()
        fused, _ = self.linear_q_k_v(x)
        q, k, v = torch.split(fused, int(self.h * self.d_k), dim=-1)

        def split_heads(proj: torch.Tensor) -> torch.Tensor:
            return proj.reshape(batch, steps, self.h, self.d_k).transpose(1, 2)

        return split_heads(q), split_heads(k), split_heads(v), v

    def forward_attention(
        self,
        value: torch.Tensor,
        scores: torch.Tensor,
        mask: torch.Tensor,
        mask_att_chunk_encoder: torch.Tensor = None,
    ):
        """Turn attention ``scores`` into a context tensor and project it.

        Positions where ``mask`` equals 0 receive a score of -inf before the
        softmax and a weight of exactly 0 after it.
        """
        n_batch = value.size(0)
        if mask is None:
            weights = torch.softmax(scores, dim=-1)
        else:
            if mask_att_chunk_encoder is not None:
                mask = mask * mask_att_chunk_encoder
            blocked = mask.unsqueeze(1).eq(0)
            scores = scores.masked_fill(blocked, -float("inf"))
            # Re-mask after the softmax so blocked positions are exactly 0.
            weights = torch.softmax(scores, dim=-1).masked_fill(blocked, 0.0)

        context = torch.matmul(weights, value)
        # (batch, heads, time, d_k) -> (batch, time, heads * d_k)
        context = context.transpose(1, 2).contiguous()
        context = context.view(n_batch, -1, self.h * self.d_k)

        projected, _ = self.out_proj(context)
        return projected

    def forward(
        self,
        hidden_states: torch.Tensor,
        mask: torch.Tensor,
        mask_shfit_chunk: torch.Tensor = None,
        mask_att_chunk_encoder: torch.Tensor = None,
    ):
        """Return scaled dot-product attention output plus the FSMN memory."""
        q_h, k_h, v_h, v = self.forward_qkv(hidden_states)
        fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
        scaled_q = q_h * self.d_k ** (-0.5)
        scores = torch.matmul(scaled_q, k_h.transpose(-2, -1))
        attended = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
        return attended + fsmn_memory
def forward(self, hidden_states: torch.Tensor):
    """Add the sinusoidal position encoding to ``hidden_states``.

    ``hidden_states`` must be 3-D; its second axis supplies the number of
    positions (numbered from 1) and its last axis the encoding depth passed
    to ``self.encode``.
    """
    _, num_steps, feat_dim = hidden_states.size()
    target_device = hidden_states.device
    # Positions start at 1 and carry a leading axis of size 1.
    step_index = torch.arange(1, num_steps + 1, device=target_device)[None, :]
    pos_encoding = self.encode(step_index, feat_dim, hidden_states.dtype)
    return hidden_states + pos_encoding.to(target_device)
def forward(
    self,
    xs_pad: torch.Tensor,
    ilens: torch.Tensor,
):
    """Encode padded input features.

    Args:
        xs_pad: Padded input features; axis 1 is the padded length used to
            size the mask. The tensor is not modified.
        ilens: Valid length of each sequence in ``xs_pad``.

    Returns:
        ``(encoded, olens)`` where ``olens`` is the per-sequence sum of the
        mask taken after ``after_norm``, as an int tensor.
    """
    masks = sequence_mask(
        ilens, maxlen=xs_pad.shape[1], dtype=ilens.dtype, device=ilens.device
    )[:, None, :]

    # Scale out of place: an in-place `*=` would rescale the tensor owned by
    # the caller, corrupting it if the same features are ever reused.
    xs_pad = xs_pad * self.output_size() ** 0.5
    xs_pad = self.embed(xs_pad)

    # Each layer returns a tuple whose first two entries are the updated
    # hidden states and mask.
    for encoder_layer in self.encoders0:
        xs_pad, masks = encoder_layer(xs_pad, masks)[:2]

    for encoder_layer in self.encoders:
        xs_pad, masks = encoder_layer(xs_pad, masks)[:2]

    xs_pad = self.after_norm(xs_pad)

    olens = masks.squeeze(1).sum(1).int()

    for encoder_layer in self.tp_encoders:
        xs_pad, masks = encoder_layer(xs_pad, masks)[:2]

    xs_pad = self.tp_norm(xs_pad)
    return xs_pad, olens
class EncoderLayer(nn.Module):
    """Pre-norm block: self-attention then feed-forward, each with a residual."""

    def __init__(
        self,
        size: int,
        self_attn: nn.Module,
        feed_forward: nn.Module,
    ):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.norm1 = LayerNorm(size)
        self.norm2 = LayerNorm(size)

    def forward(self, hidden_states: torch.Tensor):
        """Apply both sub-layers to ``hidden_states`` and return the result.

        The attention module is called with ``None`` for its second and third
        positional arguments.
        """
        attn_input = self.norm1(hidden_states)
        hidden_states = hidden_states + self.self_attn(attn_input, None, None)

        ffn_input = self.norm2(hidden_states)
        return hidden_states + self.feed_forward(ffn_input)
+ ) + + self.scaling = self.head_dim**-0.5 + + self.qkv = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + total_num_kv_heads=self.num_heads, + bias=True, + prefix=f"{prefix}.qkv", + ) + + self.out_proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + bias=True, + prefix=f"{prefix}.out_proj", + ) + + self.attn = MMEncoderAttention( + num_heads=self.num_local_heads, + head_size=self.head_dim, + scale=self.scaling, + ) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + max_seqlen: torch.Tensor | None, + ) -> torch.Tensor: + bs, seq_length, _ = hidden_states.size() + qkv, _ = self.qkv(hidden_states) + q, k, v = qkv.chunk(3, dim=-1) + q = q.view(bs, seq_length, -1, self.head_dim) + k = k.view(bs, seq_length, -1, self.head_dim) + v = v.view(bs, seq_length, -1, self.head_dim) + + attn_output = self.attn( + query=q, + key=k, + value=v, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + + attn_output = attn_output.view(bs, seq_length, -1) + output, _ = self.out_proj(attn_output) + return output + + +class Transformer(nn.Module): + def __init__( + self, + downsample_rate=2, + encoder_dim=1280, + llm_dim=4096, + ffn_dim: int = 2048, + prefix: str = "", + **kwargs, + ): + super().__init__() + self.k = downsample_rate + self.encoder_dim = encoder_dim + self.llm_dim = llm_dim + self.linear1 = ColumnParallelLinear( + input_size=self.encoder_dim * self.k, + output_size=ffn_dim, + bias=True, + ) + self.relu = nn.ReLU() + self.linear2 = RowParallelLinear( + input_size=ffn_dim, + output_size=self.llm_dim, + bias=True, + ) + + self.blocks = None + if kwargs.get("n_layer", 2) > 0: + self.blocks = nn.ModuleList( + [ + EncoderLayer( + llm_dim, + FunASRAudioAttention( + kwargs.get("attention_heads", 8), + llm_dim, + prefix=f"{prefix}.self_attn", + ), + PositionwiseFeedForward( + llm_dim, + llm_dim // 4, + ), + ) + for _ in range(kwargs.get("n_layer", 2)) 
+ ] + ) + + def forward(self, hidden_states: torch.Tensor, ilens: int = 0): + batch_size, seq_len, dim = hidden_states.size() + chunk_num = (seq_len - 1) // self.k + 1 + pad_num = chunk_num * self.k - seq_len + hidden_states = F.pad(hidden_states, (0, 0, 0, pad_num, 0, 0), value=0.0) + seq_len = hidden_states.size(1) + + hidden_states = hidden_states.contiguous() + hidden_states = hidden_states.view(batch_size, chunk_num, dim * self.k) + hidden_states, _ = self.linear1(hidden_states) + hidden_states = self.relu(hidden_states) + hidden_states, _ = self.linear2(hidden_states) + + olens = None + olens = (ilens - 1) // self.k + 1 + + if self.blocks is not None: + for layer, block in enumerate(self.blocks): + hidden_states = block(hidden_states) + return hidden_states, olens + + +class FunASRAudioInputs(TensorSchema): + """ + Dimensions: + - b: Batch size + - nmb: Number of mel bins + - t: Time frames (M) + """ + + input_features: Annotated[ + list[torch.Tensor] | None, + TensorShape("b", "nmb", "t"), + ] + speech_lengths: Annotated[ + list[torch.Tensor] | None, + TensorShape("b"), + ] + + +class FunASREncoder(nn.Module): + def __init__( + self, *, vllm_config: VllmConfig, prefix: str = "", init_in_fp32: bool = False + ): + super().__init__() + self.audio_encoder = SenseVoiceEncoderSmall( + input_size=560, **vllm_config.model_config.hf_config.audio_encoder_conf + ) + self.audio_adaptor = Transformer( + downsample_rate=1, + use_low_frame_rate=True, + ffn_dim=2048, + llm_dim=1024, + encoder_dim=512, + n_layer=2, + freeze=True, + prefix=maybe_prefix(prefix, "audio_encoder"), + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights with mapping from HuggingFace format.""" + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("self_attn.qkv.", "self_attn.q_proj.", "q"), + ("self_attn.qkv.", "self_attn.k_proj.", "k"), + ("self_attn.qkv.", "self_attn.v_proj.", "v"), + ] + params_dict = 
class FunASRModel(nn.Module):
    """FunASR audio encoder paired with a Qwen3 decoder."""

    # When true, the last two axes of `speech` are swapped before it is fed to
    # the audio encoder. Kept as a class-level default so that
    # `get_encoder_outputs` does not write module state on every call.
    feat_permute = False

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        self.encoder = FunASREncoder(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "encoder")
        )
        self.decoder = Qwen3Model(
            vllm_config=vllm_config, prefix=maybe_prefix(prefix, "decoder")
        )

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        inputs_embeds: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run the decoder and return its output unchanged."""
        return self.decoder(
            input_ids=input_ids,
            positions=positions,
            inputs_embeds=inputs_embeds,
        )

    def get_encoder_outputs(
        self,
        speech: torch.Tensor | list[torch.Tensor] | None,
        speech_lengths: torch.Tensor | list[torch.Tensor] | None,
    ) -> torch.Tensor | None:
        """Encode ``speech`` and pass the result through the audio adaptor.

        Returns only the adapted hidden states; the output lengths reported
        by the adaptor are discarded.
        """
        if self.feat_permute:
            speech = speech.permute(0, 2, 1)

        encoder_out, encoder_out_lens = self.encoder.audio_encoder(
            speech, speech_lengths
        )
        encoder_out, _ = self.encoder.audio_adaptor(encoder_out, encoder_out_lens)
        return encoder_out
class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]):
    """Builds placeholder prompts and audio for FunASR."""

    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
        """Return one ``<|AUDIO|>`` placeholder per requested audio item."""
        return "<|AUDIO|>" * mm_counts.get("audio", 0)

    def get_dummy_mm_data(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
        mm_options: Mapping[str, BaseDummyOptions] | None = None,
    ) -> MultiModalDataDict:
        """Return dummy audio sized from the feature extractor settings.

        Each clip has ``chunk_length * sampling_rate`` samples; ``seq_len``
        is not used.
        """
        extractor = self.info.get_feature_extractor()
        sampling_rate = extractor.sampling_rate
        samples_per_clip = extractor.chunk_length * sampling_rate

        overrides = mm_options.get("audio") if mm_options else None
        dummy_audios = self._get_dummy_audios(
            length=samples_per_clip,
            num_audios=mm_counts.get("audio", 0),
            overrides=overrides,
        )
        return {"audio": dummy_audios}
+ mm_data = dict(audio=mm_data.pop("audios")) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + tok_kwargs=tok_kwargs, + ) + if "labels" in processed_outputs: + processed_outputs["input_ids"] = processed_outputs.pop("labels") + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + input_features=MultiModalFieldConfig.batched("audio"), + speech_lengths=MultiModalFieldConfig.batched("audio"), + fake_token_len=MultiModalFieldConfig.batched("audio"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + + audio_token_id = vocab[audio_token] + + out_mm_data = out_mm_kwargs.get_data() + + fake_token_len = out_mm_data.get("fake_token_len") + if fake_token_len is None: + audio_output_lengths = [] + else: + assert isinstance(fake_token_len, torch.Tensor) + + audio_output_lengths = fake_token_len.tolist() + + def get_replacement_qwen2_audio(item_idx: int): + if audio_output_lengths: + num_features = audio_output_lengths[item_idx] + else: + audio_embeds = out_mm_data["audio_embeds"][item_idx] + assert len(audio_embeds.shape) == 2, "audio_embeds must be a 2D tensor" + num_features = audio_embeds.shape[0] + + audio_tokens = [audio_token_id] * num_features + + return PromptUpdateDetails.select_token_id( + audio_tokens, + embed_token_id=audio_token_id, + ) + + return [ + 
PromptReplacement( + modality="audio", + target=audio_token, + replacement=get_replacement_qwen2_audio, + ) + ] + + +@MULTIMODAL_REGISTRY.register_processor( + FunASRMultiModalProcessor, + info=FunASRProcessingInfo, + dummy_inputs=FunASRDummyInputsBuilder, +) +class FunASRForConditionalGeneration( + nn.Module, SupportsTranscription, SupportsMultiModal +): + packed_modules_mapping = { + "self_attn.qkv_proj": [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + ], + "encoder_attn.kv_proj": ["encoder_attn.k_proj", "encoder_attn.v_proj"], + } + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + "linear_q.": "q_proj.", + "linear_k.": "k_proj.", + "linear_v.": "v_proj.", + "linear_out.": "out_proj.", + } + ) + + supports_transcription_only = True + supports_segment_timestamp = True + supported_languages = ISO639_1_SUPPORTED_LANGS + + @classmethod + def validate_language(cls, language: str | None) -> str | None: + if language is None: + # TODO language should be optional and can be guessed. + # For now we default to en. See + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/generation_whisper.py#L1520 + logger.warning( + "Defaulting to language='en'. If you wish to transcribe " + "audio in a different language, pass the `language` field " + "in the TranscriptionRequest." 
+ ) + language = "en" + return super().validate_language(language) + + @classmethod + def get_generation_prompt( + cls, + audio: np.ndarray, + model_config: ModelConfig, # not needed here + stt_config: SpeechToTextConfig, + language: str | None, + task_type: Literal["transcribe", "translate"], + request_prompt: str, + to_language: str | None, + ) -> PromptType: + if language is None: + raise ValueError( + "Language must be specified when creating the funasr prompt" + ) + + funasr_prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n语音转写:<|AUDIO|><|im_end|>\n<|im_start|>assistant\n" # noqa: E501 + prompt = { + "prompt": funasr_prompt, + "multi_modal_data": { + "audio": (audio, stt_config.sample_rate), + }, + } + return cast(PromptType, prompt) + + @classmethod + def get_speech_to_text_config( + cls, model_config: ModelConfig, task_type: str + ) -> SpeechToTextConfig: + processor = cached_processor_from_config(model_config) + + return SpeechToTextConfig( + max_audio_clip_s=processor.feature_extractor.chunk_length, + sample_rate=processor.feature_extractor.sampling_rate, + ) + + @classmethod + def get_num_audio_tokens( + cls, + audio_duration_s: float, + stt_config: SpeechToTextConfig, + model_config: ModelConfig, + ) -> int | None: + processor = cached_processor_from_config(model_config) + hop_length = processor.feature_extractor.hop_length + assert hop_length is not None + return math.ceil(audio_duration_s * stt_config.sample_rate / hop_length) + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.dtype = vllm_config.model_config.dtype + + self.model = FunASRModel( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + logit_scale = getattr(config, "logit_scale", 1.0) + + if config.tie_word_embeddings: + self.lm_head = self.model.decoder.embed_tokens + else: + 
self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + self.logits_processor = LogitsProcessor(config.vocab_size, scale=logit_scale) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + decoder_outputs = self.model( + input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + ) + return decoder_outputs + + def get_language_model(self) -> torch.nn.Module: + return self.model.decoder + + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: + audio_input = self._parse_and_validate_audio_input(**kwargs) + + speech = audio_input["input_features"] + speech_lengths = audio_input["speech_lengths"] + enc_output = self.model.get_encoder_outputs( + speech=speech, speech_lengths=speech_lengths + ) + + return enc_output + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + handle_oov_mm_token: bool = False, + ) -> torch.Tensor: + inputs_embeds = self.model.decoder.embed_input_ids(input_ids) + + return _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=_require_is_multimodal(is_multimodal), + ) + + def _parse_and_validate_audio_input(self, **kwargs: object) -> FunASRAudioInputs: + input_features = kwargs.pop("input_features", None) + speech_lengths = kwargs.pop("speech_lengths", None) + + if input_features is not None: + input_features = json_map_leaves(lambda x: x.to(self.dtype), input_features) + + if speech_lengths is not None: + speech_lengths = json_map_leaves(lambda x: x.to(self.dtype), speech_lengths) + + return FunASRAudioInputs( + input_features=input_features, speech_lengths=speech_lengths + ) + + def compute_logits(self, hidden_states: 
torch.Tensor) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states) + return logits + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + ) + + # add fake zeros bias for k_proj to state_dict + weights = _create_fake_bias_for_k_proj(weights) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + +def _create_fake_bias_for_k_proj( + weights: Iterable[tuple[str, torch.Tensor]], +) -> Iterable[tuple[str, torch.Tensor]]: + """ + Create full zeros bias for k_proj weight in self-attn and x-attn layers. + So that the bias for k_proj in qkv_proj can be initialized with zeros. + """ + for name, weight in weights: + if name.endswith(".k_proj.weight"): + bias = torch.zeros(weight.size(0)) + bias_name = name.replace("weight", "bias") + yield from [(name, weight), (bias_name, bias)] + else: + yield name, weight diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 1871591c9..59fcd9117 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -325,6 +325,7 @@ _MULTIMODAL_MODELS = { "ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration", ), + "FunASRForConditionalGeneration": ("funasr", "FunASRForConditionalGeneration"), # noqa: E501 "FunAudioChatForConditionalGeneration": ( "funaudiochat", "FunAudioChatForConditionalGeneration", diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index af25dbe4c..d726fd39a 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -10,6 +10,7 @@ reasons: from vllm.transformers_utils.processors.bagel import BagelProcessor from vllm.transformers_utils.processors.deepseek_vl2 import DeepseekVLV2Processor +from vllm.transformers_utils.processors.funasr_processor import FunASRProcessor from vllm.transformers_utils.processors.hunyuan_vl import 
HunYuanVLProcessor from vllm.transformers_utils.processors.hunyuan_vl_image import HunYuanVLImageProcessor from vllm.transformers_utils.processors.ovis import OvisProcessor @@ -18,6 +19,7 @@ from vllm.transformers_utils.processors.ovis2_5 import Ovis2_5Processor __all__ = [ "BagelProcessor", "DeepseekVLV2Processor", + "FunASRProcessor", "HunYuanVLProcessor", "HunYuanVLImageProcessor", "OvisProcessor", diff --git a/vllm/transformers_utils/processors/funasr_processor.py b/vllm/transformers_utils/processors/funasr_processor.py new file mode 100644 index 000000000..4807c87d3 --- /dev/null +++ b/vllm/transformers_utils/processors/funasr_processor.py @@ -0,0 +1,504 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import numpy as np +import torch +import torch.nn as nn +import torchaudio.compliance.kaldi as kaldi +from torch.nn.utils.rnn import pad_sequence +from transformers import ( + AutoFeatureExtractor, + AutoProcessor, + BatchFeature, +) +from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor +from transformers.processing_utils import ProcessorMixin +from transformers.utils import TensorType + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def apply_cmvn(inputs, cmvn): # noqa + """ + Apply CMVN with mvn data + """ + + device = inputs.device + # dtype = inputs.dtype + frame, dim = inputs.shape + + means = cmvn[0:1, :dim] + vars = cmvn[1:2, :dim] + inputs += means.to(device) + inputs *= vars.to(device) + + return inputs.type(torch.float32) + + +def apply_lfr(inputs, lfr_m, lfr_n): + # LFR_inputs = [] + T = inputs.shape[0] + T_lfr = int(np.ceil(T / lfr_n)) + left_padding = inputs[0].repeat((lfr_m - 1) // 2, 1) + inputs = torch.vstack((left_padding, inputs)) + T = T + (lfr_m - 1) // 2 + feat_dim = inputs.shape[-1] + strides = (lfr_n * feat_dim, 1) + sizes = (T_lfr, lfr_m * feat_dim) + last_idx = (T - lfr_m) // lfr_n + 1 + num_padding = lfr_m - (T 
- last_idx * lfr_n) + if num_padding > 0: + num_padding = ( + (2 * lfr_m - 2 * T + (T_lfr - 1 + last_idx) * lfr_n) + / 2 + * (T_lfr - last_idx) + ) + inputs = torch.vstack([inputs] + [inputs[-1:]] * int(num_padding)) + LFR_outputs = inputs.as_strided(sizes, strides) + return LFR_outputs.clone().type(torch.float32) + + +def load_cmvn(cmvn_file): + with open(cmvn_file, encoding="utf-8") as f: + lines = f.readlines() + means_list = [] + vars_list = [] + for i in range(len(lines)): + line_item = lines[i].split() + if line_item[0] == "<AddShift>": + line_item = lines[i + 1].split() + if line_item[0] == "<LearnRateCoef>": + add_shift_line = line_item[3 : (len(line_item) - 1)] + means_list = list(add_shift_line) + continue + elif line_item[0] == "<Rescale>": + line_item = lines[i + 1].split() + if line_item[0] == "<LearnRateCoef>": + rescale_line = line_item[3 : (len(line_item) - 1)] + vars_list = list(rescale_line) + continue + means = np.array(means_list).astype(np.float32) + vars = np.array(vars_list).astype(np.float32) + cmvn = np.array([means, vars]) + cmvn = torch.as_tensor(cmvn, dtype=torch.float32) + return cmvn + + +class WavFrontend(nn.Module): + """Conventional frontend structure for ASR.""" + + def __init__( + self, + cmvn_file: str = "null", + fs: int = 16000, + window: str = "hamming", + n_mels: int = 80, + frame_length: int = 25, + frame_shift: int = 10, + filter_length_min: int = -1, + filter_length_max: int = -1, + lfr_m: int = 1, + lfr_n: int = 1, + dither: float = 1.0, + snip_edges: bool = True, + upsacle_samples: bool = True, + **kwargs, + ): + super().__init__() + self.fs = fs + self.window = window + self.n_mels = n_mels + self.frame_length = frame_length + self.frame_shift = frame_shift + self.filter_length_min = filter_length_min + self.filter_length_max = filter_length_max + self.lfr_m = lfr_m + self.lfr_n = lfr_n + self.cmvn_file = cmvn_file + self.dither = dither + self.snip_edges = snip_edges + self.upsacle_samples = upsacle_samples + self.cmvn = None if self.cmvn_file is None else 
load_cmvn(self.cmvn_file) + + def output_size(self) -> int: + return self.n_mels * self.lfr_m + + def forward( + self, + input: torch.Tensor, + input_lengths, + **kwargs, + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + waveform_length = input_lengths[i] + waveform = input[i][:waveform_length] + if self.upsacle_samples: + waveform = waveform * (1 << 15) + waveform = waveform.unsqueeze(0) + mat = kaldi.fbank( + waveform, + num_mel_bins=self.n_mels, + frame_length=min(self.frame_length, waveform_length / self.fs * 1000), + frame_shift=self.frame_shift, + dither=self.dither, + energy_floor=0.0, + window_type=self.window, + sample_frequency=self.fs, + snip_edges=self.snip_edges, + ) + + if self.lfr_m != 1 or self.lfr_n != 1: + mat = apply_lfr(mat, self.lfr_m, self.lfr_n) + if self.cmvn is not None: + mat = apply_cmvn(mat, self.cmvn) + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + if batch_size == 1: + feats_pad = feats[0][None, :, :] + else: + feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0) + return feats_pad, feats_lens + + def forward_fbank( + self, input: torch.Tensor, input_lengths: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + waveform_length = input_lengths[i] + waveform = input[i][:waveform_length] + waveform = waveform * (1 << 15) + waveform = waveform.unsqueeze(0) + mat = kaldi.fbank( + waveform, + num_mel_bins=self.n_mels, + frame_length=self.frame_length, + frame_shift=self.frame_shift, + dither=self.dither, + energy_floor=0.0, + window_type=self.window, + sample_frequency=self.fs, + ) + + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + feats_pad = pad_sequence(feats, batch_first=True, 
padding_value=0.0) + return feats_pad, feats_lens + + def forward_lfr_cmvn( + self, input: torch.Tensor, input_lengths: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + batch_size = input.size(0) + feats = [] + feats_lens = [] + for i in range(batch_size): + mat = input[i, : input_lengths[i], :] + if self.lfr_m != 1 or self.lfr_n != 1: + mat = apply_lfr(mat, self.lfr_m, self.lfr_n) + if self.cmvn is not None: + mat = apply_cmvn(mat, self.cmvn) + feat_length = mat.size(0) + feats.append(mat) + feats_lens.append(feat_length) + + feats_lens = torch.as_tensor(feats_lens) + feats_pad = pad_sequence(feats, batch_first=True, padding_value=0.0) + return feats_pad, feats_lens + + +class FunASRFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a FunASR feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_ + utils.SequenceFeatureExtractor`] which contains most of the main + methods. Users should refer to this superclass for more information + regarding those methods. + + This class extracts mel-filter bank features from raw speech using a custom + numpy implementation of the `Short Time Fourier Transform` which should + match pytorch's `torch.stft` equivalent. + + Args: + feature_size (`int`, *optional*, defaults to 80): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized + expressed in hertz (Hz). + hop_length (`int`, *optional*, defaults to 160): + Length of the overlapping windows for the STFT used to obtain the + Mel Frequency coefficients. + chunk_length (`int`, *optional*, defaults to 30): + The maximum number of chunks of `sampling_rate` samples used to + trim and pad longer or shorter audio sequences. + n_fft (`int`, *optional*, defaults to 400): + Size of the Fourier transform. + padding_value (`float`, *optional*, defaults to 0.0): + Padding value used to pad the audio. 
Should correspond to silences. + dither (`float`, *optional*, defaults to 0.0): + Adds dithering. In other words, adds a small Gaussian noise to each frame. + E.g. use 0.0001 to add dithering with a normal distribution centered + around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range + of raw_speech). The value 0.0 means no dithering. + Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces + the high log_mel_fbank values for signals with hard-zero sections, + when VAD cutoff is present in the signal. + """ + + model_input_names = ["input_features"] + + def __init__( + self, + feature_size=80, + sampling_rate=16000, + hop_length=160, + chunk_length=30, + n_fft=400, + padding_value=0.0, + dither=0.0, + return_attention_mask=False, + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + return_attention_mask=return_attention_mask, + **kwargs, + ) + self.frontend_conf = kwargs.get("frontend_conf", {}) + self.n_fft = n_fft + self.hop_length = hop_length + self.chunk_length = chunk_length + self.n_samples = chunk_length * sampling_rate + self.nb_max_frames = self.n_samples // hop_length + self.sampling_rate = sampling_rate + self.dither = dither + + def extract_fbank( + self, data, data_len=None, data_type: str = "sound", frontend=None, **kwargs + ): + if isinstance(data, np.ndarray): + data = torch.from_numpy(data) + if len(data.shape) < 2: + data = data[None, :] # data: [batch, N] + data_len = [data.shape[1]] if data_len is None else data_len + elif isinstance(data, torch.Tensor): + if len(data.shape) < 2: + data = data[None, :] # data: [batch, N] + data_len = [data.shape[1]] if data_len is None else data_len + elif isinstance(data, (list, tuple)): + data_list, data_len = [], [] + for data_i in data: + if isinstance(data_i, np.ndarray): + data_i = torch.from_numpy(data_i) + data_list.append(data_i) + data_len.append(data_i.shape[0]) + data = pad_sequence(data_list, 
batch_first=True) + + data, data_len = frontend(data, data_len, **kwargs) + + if isinstance(data_len, (list, tuple)): + data_len = torch.tensor([data_len]) + return data.to(torch.float32), data_len.to(torch.int32) + + def __call__( + self, + raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]], + truncation: bool = True, + pad_to_multiple_of: int | None = None, + return_tensors: str | TensorType | None = None, + return_attention_mask: bool | None = None, + padding: str | None = "max_length", + max_length: int | None = None, + sampling_rate: int | None = None, + do_normalize: bool | None = None, + device: str | None = "cpu", + return_token_timestamps: bool | None = None, + **kwargs, + ) -> BatchFeature: + is_batched = isinstance(raw_speech, (list, tuple)) and ( + isinstance(raw_speech[0], (np.ndarray, tuple, list)) + ) + + if is_batched: + raw_speech = [ + np.asarray([speech], dtype=np.float32).T for speech in raw_speech + ] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype( + np.float64 + ): + raw_speech = raw_speech.astype(np.float32) + + if not is_batched: + raw_speech = [np.asarray([raw_speech]).T] + + batched_speech = BatchFeature({"input_features": raw_speech}) + + padded_inputs = self.pad( + batched_speech, + padding=padding, + max_length=max_length if max_length else self.n_samples, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask or do_normalize, + ) + + input_features = padded_inputs.get("input_features").transpose(2, 0, 1) + + self.frontend = WavFrontend(**self.frontend_conf) + input_features, speech_lengths = self.extract_fbank( + input_features[0], + data_type=kwargs.get("data_type", "sound"), + frontend=self.frontend, + is_final=True, + ) + olens = 1 + (speech_lengths - 3 + 2 * 1) // 2 + olens = 1 + (olens - 3 + 2 * 1) // 2 
+ fake_token_len = (olens - 1) // 2 + 1 + if isinstance(input_features[0], list): + padded_inputs["input_features"] = [ + np.asarray(feature, dtype=np.float32) for feature in input_features + ] + + else: + padded_inputs["input_features"] = input_features + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + padded_inputs["speech_lengths"] = speech_lengths + padded_inputs["fake_token_len"] = fake_token_len + + return padded_inputs + + +class FunASRProcessor(ProcessorMixin): + r""" + Constructs a FunASR processor which wraps a FunASR feature extractor and + a FunASR tokenizer into a single processor. + + [`FunASRProcessor`] offers all the functionalities of + [`FunASRFeatureExtractor`] and [`Qwen2Tokenizer`]. See the + [`~FunASRProcessor.__call__`] and [`~FunASRProcessor.decode`] for more + information. + + Args: + feature_extractor (`FunASRFeatureExtractor`): An instance of + [`FunASRFeatureExtractor`]. + The feature extractor is a required input. + tokenizer (`Qwen2Tokenizer`): + An instance of [`Qwen2Tokenizer`]. The tokenizer is a required + input. 
+ """ + + feature_extractor_class = "FunASRFeatureExtractor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__( + self, + feature_extractor, + tokenizer, + audio_token="<|AUDIO|>", + ): + super().__init__(feature_extractor, tokenizer) + self.current_processor = self.feature_extractor + self._in_target_context_manager = False + self.audio_token = ( + tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token + ) + self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token) + + def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True): + return self.tokenizer.get_decoder_prompt_ids( + task=task, language=language, no_timestamps=no_timestamps + ) + + def __call__(self, *args, **kwargs): + """ + Forwards the `audio` argument to FunASRFeatureExtractor's + [`~FunASRFeatureExtractor.__call__`] and the `text` argument to + [`~Qwen2Tokenizer.__call__`]. Please refer to the docstring of the + above two methods for more information. + """ + if self._in_target_context_manager: + return self.current_processor(*args, **kwargs) + + audio = kwargs.pop("audio", None) + sampling_rate = kwargs.pop("sampling_rate", None) + text = kwargs.pop("text", None) + if len(args) > 0: + audio = args[0] + args = args[1:] + + if text is None: + raise ValueError("You need to specify `text` input to process.") + elif isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError( + "Invalid input text. 
Please provide a string, or a list of strings" + ) + + if audio is not None: + # ensure we have as much audios as audio tokens + num_audio_tokens = sum(sample.count(self.audio_token) for sample in text) + num_audios = 1 if type(audio) is np.ndarray else len(audio) + if num_audio_tokens != num_audios: + raise ValueError( + f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}" # noqa: E501 + ) + inputs = self.feature_extractor( + audio, *args, sampling_rate=sampling_rate, **kwargs + ) + + expanded_text = [] + for sample in text: + replace_str = [] + while self.audio_token in sample: + num_audio_tokens = inputs["fake_token_len"].item() + + expanded_audio_token = self.audio_token * num_audio_tokens + + replace_str.append(expanded_audio_token) + sample = sample.replace(self.audio_token, "<placeholder>", 1) + + while "<placeholder>" in sample: + sample = sample.replace("<placeholder>", replace_str.pop(0), 1) + expanded_text.append(sample) + text = expanded_text + + if text is not None: + encodings = self.tokenizer(text, **kwargs) + + if text is None: + return inputs + + elif audio is None: + return encodings + else: + inputs["labels"] = encodings["input_ids"] + + return inputs + + def get_prompt_ids(self, text: str, return_tensors="np"): + return self.tokenizer.get_prompt_ids(text, return_tensors=return_tensors) + + +AutoFeatureExtractor.register("FunASRFeatureExtractor", FunASRFeatureExtractor) +AutoProcessor.register("FunASRProcessor", FunASRProcessor) -- GitLab From cb9574eb8528fca1ecd13ef4cb81cd30a643dbb9 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 11 Feb 2026 16:27:15 +0800 Subject: [PATCH 0090/1166] [XPU][9/N] clean up existing ipex code/doc (#34111) Signed-off-by: Kunshang Ji --- docker/Dockerfile.cpu | 1 - .../installation/gpu.xpu.inc.md | 9 +++--- tests/quantization/test_cpu_wna16.py | 2 +- tests/quantization/test_ipex_quant.py | 32 ------------------- vllm/{_ipex_ops.py => 
_xpu_ops.py} | 6 ++-- .../layers/quantization/mxfp4.py | 2 +- .../layers/sparse_attn_indexer.py | 2 +- vllm/platforms/cpu.py | 1 - vllm/v1/attention/backends/fa_utils.py | 9 +++--- vllm/v1/attention/ops/paged_attn.py | 2 +- 10 files changed, 16 insertions(+), 50 deletions(-) delete mode 100644 tests/quantization/test_ipex_quant.py rename vllm/{_ipex_ops.py => _xpu_ops.py} (96%) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 98f99d089..063d3e6e4 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -134,7 +134,6 @@ WORKDIR /vllm-workspace # Copy test requirements COPY requirements/test.in requirements/cpu-test.in -# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version RUN \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ remove_packages_not_supported_on_aarch64() { \ diff --git a/docs/getting_started/installation/gpu.xpu.inc.md b/docs/getting_started/installation/gpu.xpu.inc.md index 7e9c6a2b9..d8b84ace2 100644 --- a/docs/getting_started/installation/gpu.xpu.inc.md +++ b/docs/getting_started/installation/gpu.xpu.inc.md @@ -6,10 +6,11 @@ vLLM initially supports basic model inference and serving on Intel GPU platform. # --8<-- [start:requirements] - Supported Hardware: Intel Data Center GPU, Intel ARC GPU -- OneAPI requirements: oneAPI 2025.1 +- OneAPI requirements: oneAPI 2025.3 +- Dependency: [vllm-xpu-kernels](https://github.com/vllm-project/vllm-xpu-kernels): a package provide all necessary vllm custom kernel when running vLLM on Intel GPU platform, - Python: 3.12 !!! warning - The provided IPEX whl is Python3.12 specific so this version is a MUST. + The provided vllm-xpu-kernels whl is Python3.12 specific so this version is a MUST. # --8<-- [end:requirements] # --8<-- [start:set-up-using-python] @@ -24,7 +25,7 @@ Currently, there are no pre-built XPU wheels. 
# --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] -- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.1 or later. +- First, install required [driver](https://dgpu-docs.intel.com/driver/installation.html#installing-gpu-drivers) and [Intel OneAPI](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) 2025.3 or later. - Second, install Python packages for vLLM XPU backend building: ```bash @@ -37,7 +38,7 @@ pip install -v -r requirements/xpu.txt - Then, build and install vLLM XPU backend: ```bash -VLLM_TARGET_DEVICE=xpu python setup.py install +VLLM_TARGET_DEVICE=xpu pip install --no-build-isolation -e . -v ``` # --8<-- [end:build-wheel-from-source] diff --git a/tests/quantization/test_cpu_wna16.py b/tests/quantization/test_cpu_wna16.py index 56b9c39b0..6c8a8f3d5 100644 --- a/tests/quantization/test_cpu_wna16.py +++ b/tests/quantization/test_cpu_wna16.py @@ -17,7 +17,7 @@ DTYPE = ["bfloat16"] @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", DTYPE) -def test_ipex_quant(vllm_runner, model, dtype): +def test_cpu_quant(vllm_runner, model, dtype): with vllm_runner(model, dtype=dtype) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=32) assert output diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py deleted file mode 100644 index 4f3c52df6..000000000 --- a/tests/quantization/test_ipex_quant.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Test model set-up and inference for quantized HF models supported -on the CPU/GPU backend using IPEX (including AWQ/GPTQ). - -Validating the configuration and printing results for manual checking. 
- -Run `pytest tests/quantization/test_ipex_quant.py`. -""" - -import pytest - -from vllm.platforms import current_platform - -MODELS = [ - "AMead10/Llama-3.2-1B-Instruct-AWQ", - "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx -] -DTYPE = ["bfloat16"] - - -@pytest.mark.skipif( - not current_platform.is_cpu() and not current_platform.is_xpu(), - reason="only supports Intel CPU/XPU backend.", -) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", DTYPE) -def test_ipex_quant(vllm_runner, model, dtype): - with vllm_runner(model, dtype=dtype, enforce_eager=True) as llm: - output = llm.generate_greedy(["The capital of France is"], max_tokens=4) - assert output - print(output) diff --git a/vllm/_ipex_ops.py b/vllm/_xpu_ops.py similarity index 96% rename from vllm/_ipex_ops.py rename to vllm/_xpu_ops.py index 22133eaef..e40b18f81 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_xpu_ops.py @@ -53,7 +53,7 @@ if hasattr(torch.ops._xpu_C, "int4_gemm_w4a16"): return torch.empty((M, N), dtype=input.dtype, device=input.device) -class ipex_ops: +class xpu_ops: @staticmethod def flash_attn_varlen_func( q: torch.Tensor, @@ -73,7 +73,7 @@ class ipex_ops: cu_seqlens_k: torch.Tensor | None = None, # passed in qwen vl dropout_p: float = 0.0, - # The following parameters are not used in ipex kernel currently, + # The following parameters are not used in xpu kernel currently, # we keep API compatible to CUDA's. scheduler_metadata=None, fa_version: int = 2, @@ -153,6 +153,6 @@ class ipex_ops: sm_margin=0, # Can be tuned if some SMs are used for communication ) -> None: logger.warning_once( - "get_scheduler_metadata is not implemented for ipex_ops, returning None." + "get_scheduler_metadata is not implemented for xpu_ops, returning None." 
) return None diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 13199124b..75501076a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -160,7 +160,7 @@ def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: logger.info_once("Using Triton backend") return Mxfp4Backend.TRITON elif current_platform.is_xpu(): - logger.info_once("Using ipex marlin backend on XPU") + logger.info_once("Using xpu backend on XPU") return Mxfp4Backend.MARLIN elif current_platform.is_rocm() and has_triton_kernels(): logger.info_once("Using Triton backend") diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index bd063de74..538860ca6 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -20,7 +20,7 @@ from vllm.v1.worker.workspace import current_workspace_manager if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops + from vllm._xpu_ops import xpu_ops as ops logger = init_logger(__name__) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 60180b272..3edc83b15 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -345,7 +345,6 @@ class CpuPlatform(Platform): ld_preload_str += pytorch_libgomp_so os.environ["LD_PRELOAD"] = ld_preload_str - # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size ) diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index ccf52aff2..3150ad9a5 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -23,12 +23,11 @@ if current_platform.is_cuda(): elif current_platform.is_xpu(): from vllm import _custom_ops as 
ops + from vllm._xpu_ops import xpu_ops reshape_and_cache_flash = ops.reshape_and_cache_flash - from vllm._ipex_ops import ipex_ops - - flash_attn_varlen_func = ipex_ops.flash_attn_varlen_func # type: ignore[assignment] - get_scheduler_metadata = ipex_ops.get_scheduler_metadata # type: ignore[assignment] + flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func # type: ignore[assignment] + get_scheduler_metadata = xpu_ops.get_scheduler_metadata # type: ignore[assignment] elif current_platform.is_rocm(): try: from flash_attn import flash_attn_varlen_func # type: ignore[no-redef] @@ -153,7 +152,7 @@ def is_flash_attn_varlen_func_available() -> bool: Platform-specific sources: - CUDA: vllm.vllm_flash_attn.flash_attn_varlen_func - - XPU: ipex_ops.flash_attn_varlen_func + - XPU: xpu_ops.flash_attn_varlen_func - ROCm: upstream flash_attn.flash_attn_varlen_func (if available) Note: This is separate from the AITER flash attention backend (rocm_aiter_fa.py) diff --git a/vllm/v1/attention/ops/paged_attn.py b/vllm/v1/attention/ops/paged_attn.py index 73995fc93..896e929b5 100644 --- a/vllm/v1/attention/ops/paged_attn.py +++ b/vllm/v1/attention/ops/paged_attn.py @@ -9,7 +9,7 @@ from vllm.platforms import current_platform if current_platform.is_cuda_alike(): from vllm import _custom_ops as ops elif current_platform.is_xpu(): - from vllm._ipex_ops import ipex_ops as ops # type: ignore[no-redef] + from vllm._xpu_ops import xpu_ops as ops # type: ignore[no-redef] class PagedAttention: -- GitLab From 675a22ed66c4a34be7d2a60cac77078578f49892 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 11 Feb 2026 16:29:51 +0800 Subject: [PATCH 0091/1166] [Chore] Move `BaseRenderer` to `base.py` (#34308) Signed-off-by: DarkLight1337 --- vllm/renderers/__init__.py | 2 +- vllm/renderers/{protocol.py => base.py} | 0 vllm/renderers/deepseek_v32.py | 2 +- vllm/renderers/grok2.py | 2 +- vllm/renderers/hf.py | 2 +- vllm/renderers/mistral.py | 2 +- vllm/renderers/registry.py | 2 +- 
vllm/renderers/terratorch.py | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename vllm/renderers/{protocol.py => base.py} (100%) diff --git a/vllm/renderers/__init__.py b/vllm/renderers/__init__.py index 58d9ed70a..db186e1f0 100644 --- a/vllm/renderers/__init__.py +++ b/vllm/renderers/__init__.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from .base import BaseRenderer from .params import ChatParams, TokenizeParams, merge_kwargs -from .protocol import BaseRenderer from .registry import RendererRegistry, renderer_from_config __all__ = [ diff --git a/vllm/renderers/protocol.py b/vllm/renderers/base.py similarity index 100% rename from vllm/renderers/protocol.py rename to vllm/renderers/base.py diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py index d10a596b2..e4cc3f0fb 100644 --- a/vllm/renderers/deepseek_v32.py +++ b/vllm/renderers/deepseek_v32.py @@ -14,10 +14,10 @@ from vllm.tokenizers import cached_get_tokenizer from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer from ..tokenizers.hf import HfTokenizer +from .base import BaseRenderer from .inputs import DictPrompt from .inputs.preprocess import parse_dec_only_prompt from .params import ChatParams -from .protocol import BaseRenderer logger = init_logger(__name__) diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py index c5c3afe86..141c72aa7 100644 --- a/vllm/renderers/grok2.py +++ b/vllm/renderers/grok2.py @@ -13,10 +13,10 @@ from vllm.logger import init_logger from vllm.tokenizers import cached_get_tokenizer from vllm.tokenizers.grok2 import Grok2Tokenizer +from .base import BaseRenderer from .inputs import DictPrompt from .inputs.preprocess import parse_dec_only_prompt from .params import ChatParams -from .protocol import BaseRenderer logger = init_logger(__name__) diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index 5425bd888..83b17e961 100644 --- 
a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -32,10 +32,10 @@ from vllm.transformers_utils.chat_templates import get_chat_template_fallback_pa from vllm.transformers_utils.processor import cached_get_processor from vllm.utils.func_utils import supports_kw +from .base import BaseRenderer from .inputs import DictPrompt from .inputs.preprocess import parse_dec_only_prompt from .params import ChatParams -from .protocol import BaseRenderer if TYPE_CHECKING: from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py index 0d15b37e0..3d3141bdc 100644 --- a/vllm/renderers/mistral.py +++ b/vllm/renderers/mistral.py @@ -15,10 +15,10 @@ from vllm.tokenizers import cached_get_tokenizer from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils.async_utils import make_async +from .base import BaseRenderer from .inputs import DictPrompt from .inputs.preprocess import parse_dec_only_prompt from .params import ChatParams -from .protocol import BaseRenderer logger = init_logger(__name__) diff --git a/vllm/renderers/registry.py b/vllm/renderers/registry.py index dde17a6f9..3abc7c9fe 100644 --- a/vllm/renderers/registry.py +++ b/vllm/renderers/registry.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.tokenizers.registry import tokenizer_args_from_config from vllm.utils.import_utils import resolve_obj_by_qualname -from .protocol import BaseRenderer +from .base import BaseRenderer if TYPE_CHECKING: from vllm.config import ModelConfig diff --git a/vllm/renderers/terratorch.py b/vllm/renderers/terratorch.py index 58c1459d2..2d00ebccb 100644 --- a/vllm/renderers/terratorch.py +++ b/vllm/renderers/terratorch.py @@ -12,10 +12,10 @@ from vllm.entrypoints.chat_utils import ( from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike +from .base import BaseRenderer from .inputs import DictPrompt from .inputs.preprocess import parse_dec_only_prompt from .params 
import ChatParams -from .protocol import BaseRenderer logger = init_logger(__name__) -- GitLab From addac0e65343e4c24a109975d54c4673bbfb029c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Wed, 11 Feb 2026 03:30:00 -0500 Subject: [PATCH 0092/1166] [torch.compile] Enable AR+rms fusion by default available for `-O2` (#34299) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič --- vllm/config/compilation.py | 3 +-- vllm/config/vllm.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index fb7a1466b..f1909ace6 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -115,7 +115,7 @@ class PassConfig: """Fuse the custom SiluMul + quant ops.""" fuse_attn_quant: bool = Field(default=None) """Fuse the custom attention + quant ops.""" - eliminate_noops: bool = Field(default=None) + eliminate_noops: bool = Field(default=True) """Eliminate no-op ops.""" enable_sp: bool = Field(default=None) """Enable sequence parallelism.""" @@ -194,7 +194,6 @@ class PassConfig: "fuse_norm_quant", "fuse_act_quant", "fuse_attn_quant", - "eliminate_noops", "enable_sp", "fuse_gemm_comms", "fuse_allreduce_rms", diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index c1ef8e6aa..eccaa6ce6 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -102,6 +102,19 @@ def enable_act_fusion(cfg: "VllmConfig") -> bool: ) or cfg.compilation_config.is_custom_op_enabled("quant_fp8") +def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool: + """Enable if TP > 1 and Hopper+ and flashinfer installed.""" + from vllm.platforms import current_platform + from vllm.utils.flashinfer import has_flashinfer + + return ( + cfg.parallel_config.tensor_parallel_size > 1 + and current_platform.is_cuda() + and current_platform.has_device_capability(90) + and has_flashinfer() + ) + + def enable_norm_pad_fusion(cfg: 
"VllmConfig") -> bool: """Enable if using AITER RMSNorm and AITER Triton GEMMs and hidden size is 2880 i.e. gpt-oss; otherwise Inductor handles fusion.""" @@ -118,7 +131,6 @@ def enable_norm_pad_fusion(cfg: "VllmConfig") -> bool: OPTIMIZATION_LEVEL_00 = { "compilation_config": { "pass_config": { - "eliminate_noops": False, "fuse_norm_quant": False, "fuse_act_quant": False, "fuse_allreduce_rms": False, @@ -137,7 +149,6 @@ OPTIMIZATION_LEVEL_00 = { OPTIMIZATION_LEVEL_01 = { "compilation_config": { "pass_config": { - "eliminate_noops": True, "fuse_norm_quant": enable_norm_fusion, "fuse_act_quant": enable_act_fusion, "fuse_allreduce_rms": False, @@ -156,10 +167,9 @@ OPTIMIZATION_LEVEL_01 = { OPTIMIZATION_LEVEL_02 = { "compilation_config": { "pass_config": { - "eliminate_noops": True, "fuse_norm_quant": enable_norm_fusion, "fuse_act_quant": enable_act_fusion, - "fuse_allreduce_rms": False, + "fuse_allreduce_rms": enable_allreduce_rms_fusion, "fuse_attn_quant": IS_QUANTIZED, "enable_sp": IS_DENSE, "fuse_gemm_comms": IS_DENSE, @@ -175,10 +185,9 @@ OPTIMIZATION_LEVEL_02 = { OPTIMIZATION_LEVEL_03 = { "compilation_config": { "pass_config": { - "eliminate_noops": True, "fuse_norm_quant": enable_norm_fusion, "fuse_act_quant": enable_act_fusion, - "fuse_allreduce_rms": False, + "fuse_allreduce_rms": enable_allreduce_rms_fusion, "fuse_attn_quant": IS_QUANTIZED, "enable_sp": IS_DENSE, "fuse_gemm_comms": IS_DENSE, -- GitLab From 79504027ef93a742846856e81fc25de369dc5e22 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 11 Feb 2026 00:30:09 -0800 Subject: [PATCH 0093/1166] [Misc] Bump `fastsafetensors` version for latest fixes (#34273) Signed-off-by: Nick Hill --- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 3 +-- setup.py | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index 8dcbe2a71..a45634d0c 100644 --- 
a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -43,5 +43,5 @@ tritonclient>=2.51.0 numba == 0.61.2 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs]==0.15.3 -fastsafetensors>=0.1.10 +fastsafetensors>=0.2.2 pydantic>=2.12 # 2.11 leads to error on python 3.13 diff --git a/requirements/test.in b/requirements/test.in index e8abcc04e..8a97c0e88 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -53,7 +53,7 @@ arctic-inference == 0.1.1 # Required for suffix decoding test numba == 0.61.2 # Required for N-gram speculative decoding numpy runai-model-streamer[s3,gcs]==0.15.3 -fastsafetensors>=0.1.10 +fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage pydantic>=2.12 # 2.11 leads to error on python 3.13 decord==0.6.0 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test diff --git a/requirements/test.txt b/requirements/test.txt index 9090fe3c2..fbe3228d2 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -224,7 +224,7 @@ fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 # via cupy-cuda12x -fastsafetensors==0.1.10 +fastsafetensors==0.2.2 # via -r requirements/test.in filelock==3.16.1 # via @@ -1174,7 +1174,6 @@ torch==2.10.0+cu129 # bitsandbytes # efficientnet-pytorch # encodec - # fastsafetensors # kornia # lightly # lightning diff --git a/setup.py b/setup.py index 14325cdfc..8dea355da 100644 --- a/setup.py +++ b/setup.py @@ -1035,7 +1035,7 @@ setup( extras_require={ "bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"], "tensorizer": ["tensorizer==2.10.1"], - "fastsafetensors": ["fastsafetensors >= 0.1.10"], + "fastsafetensors": ["fastsafetensors >= 0.2.2"], "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"], "audio": [ "librosa", -- GitLab From 786806dd4431959ac7b370838ab3a9aa5ea93ef3 Mon Sep 17 00:00:00 2001 From: Tianqi Ren Date: Wed, 11 Feb 2026 17:03:41 +0800 Subject: [PATCH 0094/1166] [Doc] 
Update Marlin support matrix for Turing (#34319) Signed-off-by: Tianqi Ren --- docs/features/quantization/README.md | 3 ++- docs/features/quantization/fp8.md | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md index 77213bb35..58c4e0bb5 100644 --- a/docs/features/quantization/README.md +++ b/docs/features/quantization/README.md @@ -48,7 +48,7 @@ th:not(:first-child) { |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------| | AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | | GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | -| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | +| Marlin (GPTQ/AWQ/FP8/FP4) | ❌ | ✅︎* | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | | INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | | FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | @@ -59,6 +59,7 @@ th:not(:first-child) { - ✅︎ indicates that the quantization method is supported on the specified hardware. - ❌ indicates that the quantization method is not supported on the specified hardware. - All Intel Gaudi quantization support has been migrated to [vLLM-Gaudi](https://github.com/vllm-project/vllm-gaudi). +- *Turing does not support Marlin MXFP4. !!! note For information on quantization support on Google TPU, please refer to the [TPU-Inference Recommended Models and Features](https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/) documentation. diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index f17ef89a5..76fc04710 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -2,7 +2,7 @@ vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x. Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8. 
-Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. +Turing/Ampere GPUs are supported for W8A16 (weight-only FP8) utilizing Marlin kernels. Quantization of models with FP8 allows for a 2x reduction in model memory requirements and up to a 1.6x improvement in throughput with minimal impact on accuracy. Please visit the HF collection of [quantized FP8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127). @@ -13,8 +13,8 @@ The FP8 types typically supported in hardware have two distinct representations, - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. !!! note - FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). - FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. + FP8 computation is supported on NVIDIA GPUs with compute capability >= 8.9 (Ada Lovelace, Hopper). + FP8 models will run on compute capability >= 7.5 (Turing) as weight-only W8A16, utilizing FP8 Marlin. 
## Installation -- GitLab From e09546cf05f12c041083c289c24ecb48896f9620 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 11 Feb 2026 02:03:24 -0800 Subject: [PATCH 0095/1166] [Frontend] Exploit tokenizers "new stream" in FastIncrementalDetokenizer (#34217) Signed-off-by: Nick Hill --- vllm/v1/engine/detokenizer.py | 48 +++++++++++++---------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 18e4c98f8..da950c2a0 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -19,9 +19,9 @@ from vllm.v1.engine import EngineCoreRequest logger = init_logger(__name__) -# Only tokenizers >= 0.21.1 supports DecodeStream used for -# FastIncrementalDetokenizer. -USE_FAST_DETOKENIZER = version.parse(tokenizers.__version__) >= version.parse("0.21.1") +# Only tokenizers >= 0.22.0 supports DecodeStream with native prefill +# (ids parameter) used for FastIncrementalDetokenizer. +USE_FAST_DETOKENIZER = version.parse(tokenizers.__version__) >= version.parse("0.22.0") # Error string from https://github.com/huggingface/tokenizers/blob/909fdde2a4ffedd9295206f705eb612be2a91b12/tokenizers/src/tokenizer/mod.rs#L1042 INVALID_PREFIX_ERR_MSG = "Invalid prefix encountered" @@ -154,11 +154,10 @@ class BaseIncrementalDetokenizer(IncrementalDetokenizer, ABC): # We return the full output text if the sequence is finished. 
buffer_length = 0 if finished else self.stop_buffer_length if not delta: - return ( - self.output_text[:-buffer_length] - if buffer_length - else (self.output_text) - ) + if not buffer_length: + return self.output_text + return self.output_text[:-buffer_length] + length = len(self.output_text) - buffer_length last_offset = self._last_output_text_offset if last_offset < length: @@ -176,24 +175,14 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): self.request_id = request.request_id self.skip_special_tokens = sampling_params.skip_special_tokens - self.stream = DecodeStream(skip_special_tokens=self.skip_special_tokens) self.tokenizer: Tokenizer = tokenizer._tokenizer - # Find a safe place to start. - prompt_token_ids = request.prompt_token_ids or [] - prompt_suffix = prompt_token_ids - prompt_len = len(prompt_suffix) - if prompt_len > 4: - for i in range(4, min(prompt_len + 1, 24)): - suffix = prompt_token_ids[-i:] - if "�" not in self.tokenizer.decode(suffix): - prompt_suffix = suffix - break - - # Prime the stream. - for tid in prompt_suffix: - self._protected_step(tid) + # Use native prefill to prime the decode stream with prompt tokens. + self.stream = DecodeStream( + ids=request.prompt_token_ids, + skip_special_tokens=self.skip_special_tokens, + ) self.spaces_between_special_tokens = ( sampling_params.skip_special_tokens @@ -203,9 +192,8 @@ class FastIncrementalDetokenizer(BaseIncrementalDetokenizer): if not self.spaces_between_special_tokens: # Store dict of added token ids so that we can suppress # the spaces between them. 
- if ( - added_token_ids := getattr(self.tokenizer, "added_token_ids", None) - ) is None: + added_token_ids = getattr(self.tokenizer, "added_token_ids", None) + if added_token_ids is None: self.tokenizer.added_token_ids = added_token_ids = { tid: tok.content for tid, tok in self.tokenizer.get_added_tokens_decoder().items() @@ -290,11 +278,9 @@ class SlowIncrementalDetokenizer(BaseIncrementalDetokenizer): @property def output_token_ids(self) -> list[int]: - return ( - self.token_ids - if not self.prompt_len - else (self.token_ids[self.prompt_len :]) - ) + if self.prompt_len: + return self.token_ids[self.prompt_len :] + return self.token_ids def num_output_tokens(self) -> int: return len(self.token_ids) - self.prompt_len -- GitLab From 5045d5c9831a3a4a423a409ccea521d299a43a9a Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Wed, 11 Feb 2026 02:25:04 -0800 Subject: [PATCH 0096/1166] Patch protobuf for CVE-2026-0994 (#34253) Signed-off-by: Seiji Eicher Co-authored-by: Kevin H. Luu --- requirements/build.txt | 2 +- requirements/common.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/build.txt b/requirements/build.txt index 994635309..6c6c9fc8a 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -9,5 +9,5 @@ wheel jinja2>=3.1.6 regex build -protobuf +protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* grpcio-tools==1.78.0 # Required for grpc entrypoints diff --git a/requirements/common.txt b/requirements/common.txt index f8402410b..297447cf2 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -9,7 +9,7 @@ blake3 py-cpuinfo transformers >= 4.56.0, < 5 tokenizers >= 0.21.1 # Required for fast incremental detokenization. -protobuf # Required by LlamaTokenizer, gRPC. 
+protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. aiohttp >= 3.13.3 openai >= 1.99.1 # For Responses API with reasoning content -- GitLab From 40b8f553588371bfd71d30117845cd305a785265 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:56:02 +0100 Subject: [PATCH 0097/1166] [Docs] Reduce time spent generating API docs (#34255) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- mkdocs.yaml | 4 ++-- vllm/config/model.py | 1 + vllm/engine/async_llm_engine.py | 1 + vllm/engine/llm_engine.py | 1 + vllm/inputs/data.py | 1 + vllm/model_executor/layers/fused_moe/cpu_fused_moe.py | 2 ++ vllm/model_executor/layers/fused_moe/cutlass_moe.py | 6 ++++++ vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 2 ++ vllm/model_executor/layers/fused_moe/fused_marlin_moe.py | 4 ++++ vllm/model_executor/layers/fused_moe/fused_moe.py | 3 +++ .../layers/fused_moe/gpt_oss_triton_kernels_moe.py | 3 +++ .../layers/fused_moe/pplx_prepare_finalize.py | 2 ++ vllm/model_executor/layers/fused_moe/prepare_finalize.py | 2 ++ .../model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 + vllm/model_executor/layers/fused_moe/trtllm_moe.py | 2 ++ .../compressed_tensors/compressed_tensors_moe.py | 2 ++ vllm/model_executor/layers/quantization/mxfp4.py | 2 ++ vllm/model_executor/models/blip2.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/llava_next.py | 2 ++ vllm/multimodal/processing/processor.py | 2 ++ vllm/platforms/interface.py | 2 ++ vllm/plugins/__init__.py | 1 + vllm/plugins/io_processors/interface.py | 2 ++ vllm/v1/engine/async_llm.py | 2 ++ 25 files changed, 50 insertions(+), 2 deletions(-) diff --git a/mkdocs.yaml b/mkdocs.yaml index 
d5d6852f3..ecc0ab692 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -63,8 +63,9 @@ plugins: - git-revision-date-localized: # exclude autogenerated files exclude: - - argparse/* + - api/* - examples/* + - generated/* - minify: minify_html: true minify_js: true @@ -92,7 +93,6 @@ plugins: - "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs summary: modules: true - show_if_no_docstring: true show_signature_annotations: true separate_signature: true show_overloads: true diff --git a/vllm/config/model.py b/vllm/config/model.py index 749af0d5d..5fd7d2d73 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -1557,6 +1557,7 @@ class ModelConfig: @property def attn_type(self) -> AttnTypeStr: + """Determine the attention type based on model configuration.""" if self.pooler_config is not None: seq_pooling_type = self._model_info.default_seq_pooling_type if seq_pooling_type == "CLS": diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index ede027759..fc1cea023 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -4,3 +4,4 @@ from vllm.v1.engine.async_llm import AsyncLLM AsyncLLMEngine = AsyncLLM # type: ignore +"""The `AsyncLLMEngine` class is an alias of [vllm.v1.engine.async_llm.AsyncLLM][].""" diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a0fe38eb3..419139c4b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -4,3 +4,4 @@ from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine LLMEngine = V1LLMEngine # type: ignore +"""The `LLMEngine` class is an alias of [vllm.v1.engine.llm_engine.LLMEngine][].""" diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 7848c2c03..157ab337e 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -298,6 +298,7 @@ which can be passed to SingletonInputs: TypeAlias = DecoderOnlyInputs | MultiModalEncDecInputs +"""The inputs for a single encoder/decoder prompt.""" @dataclass diff --git 
a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index ee4798d84..e929074d5 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -206,6 +206,8 @@ class SGLFusedMOE: class CPUFusedMOE: + """CPU-based fused MoE implementation.""" + def __init__(self, layer: torch.nn.Module) -> None: use_grouped_gemm, isa = self.check_grouped_gemm(layer) self.isa = isa diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index ac5a86067..77d439d32 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -376,6 +376,8 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): class CutlassExpertsFp8(CutlassExpertsFp8Base): + """CUTLASS FP8 fused MoE expert implementation.""" + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -423,6 +425,8 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): + """Batched CUTLASS FP8 fused MoE expert implementation.""" + @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: # BATCHED activation format works with EP because @@ -651,6 +655,8 @@ def run_cutlass_moe_fp4( class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): + """CUTLASS FP4 fused MoE expert implementation.""" + @property def expects_unquantized_inputs(self) -> bool: return True diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 00d55bfb7..59dde3ca9 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -113,6 +113,8 @@ def _valid_deep_gemm( class 
DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + """DeepGemm-based fused MoE expert implementation.""" + def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig): super().__init__(moe_config=moe_config, quant_config=quant_config) assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout() diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 5d382cfc9..3d3a21f81 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -637,6 +637,8 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): class MarlinExperts(MarlinExpertsBase): + """Marlin-based fused MoE expert implementation.""" + def supports_expert_map(self) -> bool: return True @@ -738,6 +740,8 @@ class MarlinExperts(MarlinExpertsBase): class BatchedMarlinExperts(MarlinExpertsBase): + """Batched Marlin-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 6ca3213fb..352288e17 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1527,6 +1527,7 @@ def fused_experts( expert_map: torch.Tensor | None = None, quant_config: FusedMoEQuantConfig | None = None, ) -> torch.Tensor: + """Run fused MoE expert computation using Triton kernels.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG @@ -1879,6 +1880,8 @@ def fused_experts_impl( class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + """Triton-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index eafdf97a9..5aaf2a8c3 
100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -221,6 +221,7 @@ def triton_kernel_fused_experts( intermediate_cache: torch.Tensor | None = None, a1q_scale: torch.Tensor | None = None, ) -> torch.Tensor: + """Triton implementation of fused expert computation using OAI kernels.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG @@ -444,6 +445,8 @@ class BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): class OAITritonExperts(BaseOAITritonExperts): + """OAI Triton-based fused MoE expert implementation.""" + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 78b941498..289ac0d14 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -63,6 +63,8 @@ def pplx_hidden_dim_scale_bytes( class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """PPLX-based prepare and finalize for expert parallelism.""" + def __init__( self, a2a: pplx.AllToAll, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index d10476702..7b8dd3b77 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -131,6 +131,8 @@ class MoEPrepareAndFinalizeNaiveEP(mk.FusedMoEPrepareAndFinalize): class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + """MoE prepare and finalize without expert parallelism.""" + @property def activation_format(self) -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py 
b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 33150da6f..535abc420 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -192,6 +192,7 @@ def rocm_aiter_fused_experts( num_local_tokens: torch.Tensor | None = None, output_dtype: torch.dtype | None = None, ) -> torch.Tensor: + """ROCm AITER fused MoE expert computation.""" if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index aa7185040..074b8154a 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -18,6 +18,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): + """TensorRT-LLM-based fused MoE expert implementation.""" + def __init__( self, moe_config: FusedMoEConfig, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 023cf3f67..690ff0454 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -680,6 +680,8 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): + """W8A8 FP8 MoE quantization using compressed tensors.""" + def __init__( self, weight_quant: QuantizationArgs, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 75501076a..5cd6d5d79 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -235,6 +235,8 @@ class 
Mxfp4Config(QuantizationConfig): class Mxfp4MoEMethod(FusedMoEMethodBase): + """MXFP4 MoE quantization method.""" + def __init__(self, moe: FusedMoEConfig): super().__init__(moe) self.weight_dtype = "mxfp4" diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 0441996f6..f812eb849 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -73,6 +73,7 @@ class Blip2ImageEmbeddingInputs(TensorSchema): Blip2ImageInputs: TypeAlias = Blip2ImagePixelInputs | Blip2ImageEmbeddingInputs +"""Alias for supported BLIP-2 image input types.""" class Blip2QFormerMultiHeadAttention(nn.Module): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 2f9aaa3f3..c35728183 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -121,6 +121,7 @@ class LlavaImageEmbeddingInputs(TensorSchema): LlavaImageInputs: TypeAlias = ( LlavaImagePixelInputs | PixtralHFImagePixelInputs | LlavaImageEmbeddingInputs ) +"""Alias for supported LLaVA image input types.""" class LlavaMultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 9f83c7910..4ea58ce71 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -78,6 +78,7 @@ class LlavaNextImageEmbeddingInputs(TensorSchema): LlavaNextImageInputs: TypeAlias = ( LlavaNextImagePixelInputs | LlavaNextImageEmbeddingInputs ) +"""Alias for supported LLaVA-NeXT image input types.""" class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): @@ -106,6 +107,7 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): image_width: int, image_height: int, ) -> int: + """Get the number of image tokens for the given image dimensions.""" hf_config = self.get_hf_config() vision_encoder_info = self.get_vision_encoder_info() diff --git a/vllm/multimodal/processing/processor.py 
b/vllm/multimodal/processing/processor.py index 5f98cce3d..e1a164d4e 100644 --- a/vllm/multimodal/processing/processor.py +++ b/vllm/multimodal/processing/processor.py @@ -1110,6 +1110,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self, mm_items: MultiModalDataItems, ) -> tuple[Mapping[str, object], Mapping[str, object]]: + """Extract processor and passthrough data from multi-modal items.""" processor_data = dict[str, object]() passthrough_data = dict[str, object]() @@ -1616,6 +1617,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): token_ids: list[int], mm_prompt_updates: MultiModalPromptUpdates, ) -> tuple[list[int], Mapping[str, list[PlaceholderFeaturesInfo]]]: + """Apply multi-modal prompt updates to token IDs.""" tokenizer = self.info.get_tokenizer() new_token_ids, match_result = self._apply_token_matches( diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 45dde6e47..27f5ea517 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -35,6 +35,8 @@ def in_wsl() -> bool: class PlatformEnum(enum.Enum): + """Enumeration of supported hardware platforms.""" + CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4c59d5364..89fadad7a 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -26,6 +26,7 @@ plugins_loaded = False def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: + """Load plugins registered under the given entry point group.""" from importlib.metadata import entry_points allowed_plugins = envs.VLLM_PLUGINS diff --git a/vllm/plugins/io_processors/interface.py b/vllm/plugins/io_processors/interface.py index a978b1e74..fa71b4ca0 100644 --- a/vllm/plugins/io_processors/interface.py +++ b/vllm/plugins/io_processors/interface.py @@ -16,6 +16,8 @@ IOProcessorOutput = TypeVar("IOProcessorOutput") class IOProcessor(ABC, Generic[IOProcessorInput, IOProcessorOutput]): + """Abstract interface for 
pre/post-processing of engine I/O.""" + def __init__(self, vllm_config: VllmConfig): super().__init__() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bb4fffb69..072d2a164 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -69,6 +69,8 @@ class InputStreamError(Exception): class AsyncLLM(EngineClient): + """An asynchronous wrapper for the vLLM engine.""" + def __init__( self, vllm_config: VllmConfig, -- GitLab From 05339a7b207e2f32b56c29398c18d577c74cef3b Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 11 Feb 2026 19:07:23 +0800 Subject: [PATCH 0098/1166] [Bugfix][CPU] Fix llama4 inference on CPU (#34321) Signed-off-by: jiang1.li --- .gitignore | 3 ++ csrc/cpu/cpu_fused_moe.cpp | 13 +++++-- csrc/cpu/torch_bindings.cpp | 5 +-- vllm/_custom_ops.py | 2 ++ .../layers/fused_moe/cpu_fused_moe.py | 36 ++++++++++++++----- vllm/v1/worker/cpu_worker.py | 19 +++++++--- 6 files changed, 60 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 375b1b7eb..8e864d090 100644 --- a/.gitignore +++ b/.gitignore @@ -238,3 +238,6 @@ ep_kernels_workspace/ vllm/grpc/vllm_engine_pb2.py vllm/grpc/vllm_engine_pb2_grpc.py vllm/grpc/vllm_engine_pb2.pyi + +# Ignore generated cpu headers +csrc/cpu/cpu_attn_dispatch_generated.h diff --git a/csrc/cpu/cpu_fused_moe.cpp b/csrc/cpu/cpu_fused_moe.cpp index 090e2d4cd..1a8264539 100644 --- a/csrc/cpu/cpu_fused_moe.cpp +++ b/csrc/cpu/cpu_fused_moe.cpp @@ -147,7 +147,7 @@ void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input, const int32_t token_num, const int32_t expert_num, const int32_t topk_num, const int32_t input_size_13, const int32_t output_size_13, const int32_t input_size_2, - const int32_t output_size_2) { + const int32_t output_size_2, const bool skip_weighted) { using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t; constexpr int32_t gemm_n_tile_size = gemm_t::NSize; constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize; @@ 
-582,6 +582,11 @@ void fused_moe_impl(scalar_t* __restrict__ output, scalar_t* __restrict__ input, scalar_t* __restrict__ curr_output_buffer = output + token_id * output_size_2; + if (skip_weighted) { + // Only for topk_num == 1 + *curr_weight = 1.0f; + } + if (topk_num > 1) { { int32_t w2_output_idx = curr_expand_token_id_index_buffer[0]; @@ -699,7 +704,7 @@ void cpu_fused_moe( const std::optional& w2_bias, // [expert_num, output_size_2] const torch::Tensor& topk_weights, // [token_num, k], float32 const torch::Tensor& topk_id, // [token_num, k], int32 - const std::string& act, const std::string& isa) { + const bool skip_weighted, const std::string& act, const std::string& isa) { const int32_t token_num = input.size(0); const int32_t input_size_13 = input.size(1); const int64_t input_stride = input.stride(0); @@ -711,6 +716,8 @@ void cpu_fused_moe( const int32_t topk_num = topk_id.size(1); const FusedMOEAct act_type = get_act_type(act); cpu_utils::ISA isa_type = cpu_utils::get_isa(isa); + TORCH_CHECK(!skip_weighted || topk_num == 1, + "skip_weighted is only supported for topk=1 on CPU"); VLLM_DISPATCH_FLOATING_TYPES(w13.scalar_type(), "cpu_fused_moe", [&]() { CPU_ISA_DISPATCH_IMPL(isa_type, [&]() { @@ -721,7 +728,7 @@ void cpu_fused_moe( w2_bias.has_value() ? 
w2_bias->data_ptr() : nullptr, topk_weights.data_ptr(), topk_id.data_ptr(), act_type, token_num, expert_num, topk_num, input_size_13, output_size_13, - input_size_2, output_size_2); + input_size_2, output_size_2, skip_weighted); }); }); } diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index b54447b7d..11e1305c6 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -119,8 +119,8 @@ void cpu_fused_moe(torch::Tensor& output, const torch::Tensor& input, const std::optional& w13_bias, const std::optional& w2_bias, const torch::Tensor& topk_weights, - const torch::Tensor& topk_id, const std::string& act, - const std::string& isa); + const torch::Tensor& topk_id, const bool skip_weighted, + const std::string& act, const std::string& isa); TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -320,6 +320,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "cpu_fused_moe(Tensor(a0!) output, Tensor input, Tensor w13, Tensor w2, " "Tensor? w13_bias, Tensor? 
w2_bias, Tensor topk_weights, Tensor topk_id, " + "bool skip_weighted, " "str act, str isa) -> ()"); ops.impl("cpu_fused_moe", torch::kCPU, &cpu_fused_moe); #endif diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index ea44beda5..d04edf8e2 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -3078,6 +3078,7 @@ def cpu_fused_moe( topk_ids: torch.Tensor, act: str, isa: str, + skip_weighted: bool = False, ) -> torch.Tensor: output = torch.empty_like(input) torch.ops._C.cpu_fused_moe( @@ -3089,6 +3090,7 @@ def cpu_fused_moe( w2_bias, topk_weights, topk_ids, + skip_weighted, act, isa, ) diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index e929074d5..127538822 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -238,7 +238,6 @@ class CPUFusedMOE: activation: str = "silu", ) -> torch.Tensor: assert activation in _CPU_MOE_ACT_FN, f"{activation} is not supported." 
- assert not apply_router_weight_on_input topk_weights, topk_ids = select_experts( hidden_states=x, @@ -261,6 +260,7 @@ class CPUFusedMOE: topk_ids, activation, global_num_experts, + apply_router_weight_on_input, ) def check_grouped_gemm( @@ -355,7 +355,14 @@ class CPUFusedMOE: topk_ids: torch.Tensor, activation: str, global_num_experts: int = -1, + skip_weighted: bool = False, ) -> torch.Tensor: + if skip_weighted: + assert topk_ids.size(1) == 1, ( + "apply_router_weight_on_input is only implemented for topk=1" + ) + input.mul_(topk_weights.to(input.dtype)) + output = cpu_fused_moe( input, layer.w13_weight, @@ -366,6 +373,7 @@ class CPUFusedMOE: topk_ids, activation, self.isa, + skip_weighted, ) return output @@ -377,7 +385,14 @@ class CPUFusedMOE: topk_ids: torch.Tensor, activation: str, global_num_experts: int = -1, + skip_weighted: bool = False, ) -> torch.Tensor: + if skip_weighted: + assert topk_ids.size(1) == 1, ( + "apply_router_weight_on_input is only implemented for topk=1" + ) + input.mul_(topk_weights.to(input.dtype)) + output = torch.empty_like(input) layer_id = id(layer) torch.ops.vllm.cpu_fused_moe_torch( @@ -388,6 +403,7 @@ class CPUFusedMOE: topk_ids, activation, global_num_experts, + skip_weighted, ) return output @@ -401,6 +417,7 @@ def cpu_fused_moe_torch( topk_ids: torch.Tensor, activation: str, global_num_experts: int = -1, + skip_weighted: bool = False, ) -> None: layer = _CPU_MOE_LAYER_CACHE[layer_id]() @@ -434,13 +451,16 @@ def cpu_fused_moe_torch( new_x = torch.empty_like(outs) new_x[idxs] = outs - final_out = ( - new_x.view(*topk_ids.shape, -1) - .type(topk_weights.dtype) - .mul_(topk_weights.unsqueeze(dim=-1)) - .sum(dim=1) - .type(new_x.dtype) - ) + if skip_weighted: + final_out = new_x + else: + final_out = ( + new_x.view(*topk_ids.shape, -1) + .type(topk_weights.dtype) + .mul_(topk_weights.unsqueeze(dim=-1)) + .sum(dim=1) + .type(new_x.dtype) + ) output.copy_(final_out) diff --git a/vllm/v1/worker/cpu_worker.py 
b/vllm/v1/worker/cpu_worker.py index 8ccd45bb0..2fbcc9c44 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -160,12 +160,21 @@ class CPUWorker(Worker): x for x in logical_cpu_list if x.numa_node == selected_numa_node ] else: - assert len(logical_cpu_list) >= self.parallel_config.world_size - logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.numa_node) - sim_cpu_num_per_node = ( - len(logical_cpu_list) // self.parallel_config.world_size + # This is a bit tricky because the internal DP size + # is always 1 for non-MoE models + world_size_across_dp = ( + self.parallel_config.world_size + * self.parallel_config._api_process_count ) - start_idx = self.local_rank * sim_cpu_num_per_node + assert len(logical_cpu_list) >= world_size_across_dp + logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.numa_node) + sim_cpu_num_per_node = len(logical_cpu_list) // world_size_across_dp + assert self.parallel_config.data_parallel_rank_local is not None + start_idx = ( + self.local_rank + + self.parallel_config.world_size + * self.parallel_config.data_parallel_rank_local + ) * sim_cpu_num_per_node logical_cpu_list = logical_cpu_list[ start_idx : (start_idx + sim_cpu_num_per_node) ] -- GitLab From 1e9204bff31f021dce8290d894c7aaf26bb4642e Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 11 Feb 2026 13:13:23 +0100 Subject: [PATCH 0099/1166] Make Qwen3VL compatible with Transformers v5 (#34262) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- vllm/model_executor/models/qwen3_vl.py | 26 ++++++++-------- vllm/model_executor/models/qwen3_vl_moe.py | 36 ++++++++-------------- 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 34ff881aa..908f6342d 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ 
b/vllm/model_executor/models/qwen3_vl.py @@ -1112,17 +1112,6 @@ class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]) } ) class Qwen3LLMModel(Qwen3Model): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - vision_config = vllm_config.model_config.hf_config.vision_config - if not get_pp_group().is_first_rank and hasattr( - vision_config, "deepstack_visual_indexes" - ): - assert self.start_layer >= len(vision_config.deepstack_visual_indexes), ( - "start_layer should be greater than or equal to " - "len(deepstack_visual_indexes)" - ) - def forward( self, input_ids: torch.Tensor | None, @@ -1178,7 +1167,7 @@ class Qwen3LLMModel(Qwen3Model): class Qwen3LLMForCausalLM(Qwen3ForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super(Qwen3ForCausalLM, self).__init__() - config = vllm_config.model_config.hf_config.text_config + config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config @@ -1298,7 +1287,18 @@ class Qwen3VLForConditionalGeneration( with self._mark_language_model(vllm_config): self.language_model = Qwen3LLMForCausalLM( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model"), + ) + + if not get_pp_group().is_first_rank and hasattr( + config.vision_config, "deepstack_visual_indexes" + ): + assert self.language_model.start_layer >= len( + config.vision_config.deepstack_visual_indexes + ), ( + "start_layer should be greater than or equal to " + "len(deepstack_visual_indexes)" ) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 8ac2dc945..80815616b 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -48,7 +48,6 @@ 
from vllm.sequence import IntermediateTensors from .interfaces import MixtureOfExperts from .qwen3_moe import ( - Qwen3MoeDecoderLayer, Qwen3MoeForCausalLM, Qwen3MoeModel, Qwen3MoeSparseMoeBlock, @@ -83,27 +82,6 @@ class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo): } ) class Qwen3MoeLLMModel(Qwen3MoeModel): - def __init__( - self, - *, - vllm_config: VllmConfig, - prefix: str = "", - decoder_layer_type: type[torch.nn.Module] = Qwen3MoeDecoderLayer, - ): - super().__init__( - vllm_config=vllm_config, - prefix=prefix, - decoder_layer_type=decoder_layer_type, - ) - vision_config = vllm_config.model_config.hf_config.vision_config - if not get_pp_group().is_first_rank and hasattr( - vision_config, "deepstack_visual_indexes" - ): - assert self.start_layer >= len(vision_config.deepstack_visual_indexes), ( - "start_layer should be greater than or equal to " - "len(deepstack_visual_indexes)" - ) - def forward( self, input_ids: torch.Tensor | None, @@ -352,7 +330,7 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): class Qwen3MoeLLMForCausalLM(Qwen3MoeForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super(Qwen3MoeForCausalLM, self).__init__() - self.config = vllm_config.model_config.hf_config.text_config + self.config = vllm_config.model_config.hf_config self.quant_config = vllm_config.quant_config self.model = Qwen3MoeLLMModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") @@ -473,10 +451,20 @@ class Qwen3VLMoeForConditionalGeneration( with self._mark_language_model(vllm_config): self.language_model = Qwen3MoeLLMForCausalLM( - vllm_config=vllm_config, + vllm_config=vllm_config.with_hf_config(config.text_config), prefix=maybe_prefix(prefix, "language_model"), ) + if not get_pp_group().is_first_rank and hasattr( + config.vision_config, "deepstack_visual_indexes" + ): + assert self.language_model.start_layer >= len( + config.vision_config.deepstack_visual_indexes + ), ( + "start_layer should be greater than or equal to " + 
"len(deepstack_visual_indexes)" + ) + # Whether to include the gate_up_proj mapping is determined by # the language model. self.packed_modules_mapping = ( -- GitLab From 0f5e55e7a8de564407ee54ad8ab5ab1d2cb3bb5a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 11 Feb 2026 13:30:37 +0100 Subject: [PATCH 0100/1166] Make JAIS compatible with Transformers v5 (#34264) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/jais.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 5685acd75..2e122e3db 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -244,7 +244,6 @@ class JAISModel(nn.Module): quant_config = vllm_config.quant_config self.config = config - assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx assert not config.reorder_and_upcast_attn self.embed_dim = config.hidden_size -- GitLab From 275e0d2a993b271cfaec9da87711868719d50d8c Mon Sep 17 00:00:00 2001 From: Linda <57756729+Linda-Stadter@users.noreply.github.com> Date: Wed, 11 Feb 2026 13:38:11 +0100 Subject: [PATCH 0101/1166] [NVIDIA][test] Tests for flashinfer TRTLLM BF16 MoE (#33715) Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com> Co-authored-by: Pavani Majety --- .../Llama-4-Scout-BF16-fi-cutlass.yaml | 2 + .../Mixtral-8x7B-BF16-fi-cutlass.yaml | 1 + tests/kernels/moe/test_flashinfer.py | 41 ++++++ tests/kernels/moe/test_moe.py | 100 +++++++++++++ .../moe/test_unquantized_backend_selection.py | 132 ++++++++++++++++++ tests/quantization/test_blackwell_moe.py | 8 ++ .../layers/fused_moe/oracle/unquantized.py | 13 +- 7 files changed, 296 insertions(+), 1 deletion(-) create mode 100644 tests/kernels/moe/test_unquantized_backend_selection.py diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml 
b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml index fe099f9f1..5416d9232 100644 --- a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml +++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-BF16-fi-cutlass.yaml @@ -5,3 +5,5 @@ num_fewshot: 5 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel" env: VLLM_USE_FLASHINFER_MOE_FP16: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" + diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml index 5f4a76b0a..cc8df6292 100644 --- a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml +++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-BF16-fi-cutlass.yaml @@ -5,3 +5,4 @@ num_fewshot: 5 server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2 --enable-expert-parallel" env: VLLM_USE_FLASHINFER_MOE_FP16: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index e62cf7941..ddcd221ef 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -318,3 +318,44 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( torch.testing.assert_close( output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2 ) + + +@pytest.mark.parametrize( + "num_experts,intermediate,hidden", + [ + (8, 2048, 1536), + (64, 4096, 4096), + ], +) +def test_convert_moe_weights_to_flashinfer_trtllm_block_layout( + num_experts, intermediate, hidden +): + from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + convert_moe_weights_to_flashinfer_trtllm_block_layout, + ) + + w13 = torch.randn( + (num_experts, 2 * intermediate, hidden), dtype=torch.bfloat16, device="cuda" + ) + w2 = torch.randn( + (num_experts, hidden, intermediate), dtype=torch.bfloat16, device="cuda" + ) + + 
cache: dict[torch.Size, torch.Tensor] = {} + w13_converted, w2_converted = convert_moe_weights_to_flashinfer_trtllm_block_layout( + cache, w13, w2 + ) + + assert w13_converted.ndim == 4, ( + f"Expected 4D tensor, got shape {w13_converted.shape}" + ) + assert w2_converted.ndim == 4, f"Expected 4D tensor, got shape {w2_converted.shape}" + + assert w13_converted.numel() == w13.numel(), "W13 element count should be preserved" + assert w2_converted.numel() == w2.numel(), "W2 element count should be preserved" + + assert w13_converted.dtype == torch.bfloat16 + assert w2_converted.dtype == torch.bfloat16 + + assert w13_converted.shape[0] == num_experts + assert w2_converted.shape[0] == num_experts diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 53fb43e3c..6a622ac8e 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1558,3 +1558,103 @@ def test_batched_fused_marlin_moe( marlin_output = br.run(a, kwargs) torch.testing.assert_close(marlin_output, ref_marlin_output, atol=1e-3, rtol=0) + + +@pytest.mark.parametrize("m,n,k", [(32, 1024, 1024)]) +@pytest.mark.parametrize("e,topk", [(8, 2)]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.skipif( + not current_platform.is_device_capability_family(100), + reason="TRTLLM backend test only runs on Blackwell GPUs (SM10x).", +) +def test_unquantized_bf16_flashinfer_trtllm_backend( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, + monkeypatch, + workspace_init, +): + """ + Test BF16 unquantized MoE with FlashInfer TRTLLM backend. 
+ """ + set_random_seed(7) + + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1") + + from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEParallelConfig, + RoutingMethodType, + ) + from vllm.model_executor.layers.fused_moe.oracle.unquantized import ( + UnquantizedMoeBackend, + ) + from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( + UnquantizedFusedMoEMethod, + ) + + # Setup test data + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + router_logits = torch.randn((m, e), device="cuda", dtype=dtype) + + moe_config = FusedMoEConfig( + num_experts=e, + experts_per_token=topk, + hidden_dim=k, + intermediate_size_per_partition=n, + num_local_experts=e, + activation="silu", + device="cuda", + moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), + in_dtype=dtype, + is_act_and_mul=True, + routing_method=RoutingMethodType.Renormalize, + max_num_tokens=m, + ) + + with set_current_vllm_config(vllm_config): + quant_method = UnquantizedFusedMoEMethod(moe_config) + + # Verify TRTLLM backend was selected + assert ( + quant_method.unquantized_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM + ), f"Expected FLASHINFER_TRTLLM backend, got {quant_method.unquantized_backend}" + + # Verify it's using monolithic path + assert quant_method.is_monolithic, ( + "FLASHINFER_TRTLLM backend should use monolithic forward" + ) + layer = torch.nn.Module() + layer.w13_weight = Parameter(w1.clone(), requires_grad=False) + layer.w2_weight = Parameter(w2.clone(), requires_grad=False) + layer.global_num_experts = e + layer.local_num_experts = e + layer.top_k = topk + layer.num_expert_group = 1 + layer.topk_group = 1 + layer.intermediate_size_per_partition = n + layer.ep_rank = 0 + layer.activation = "silu" + layer.e_score_correction_bias = None + layer.routing_method_type = 
RoutingMethodType.Renormalize + + quant_method.process_weights_after_loading(layer) + + trtllm_output = quant_method.forward_monolithic_cuda( + layer=layer, + x=a, + router_logits=router_logits, + ) + + # Compute torch baseline + w1_original = w1.clone() + w2_original = w2.clone() + baseline_output = torch_moe(a, w1_original, w2_original, router_logits, topk) + + close = torch.isclose(trtllm_output, baseline_output, atol=1e-1, rtol=0.85) + assert close.float().mean() > 0.925 diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py new file mode 100644 index 000000000..fcb79ee8f --- /dev/null +++ b/tests/kernels/moe/test_unquantized_backend_selection.py @@ -0,0 +1,132 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import patch + +import pytest + +from tests.kernels.moe.utils import make_dummy_moe_config +from vllm.model_executor.layers.fused_moe.oracle.unquantized import ( + UnquantizedMoeBackend, + select_unquantized_moe_backend, +) + + +@pytest.mark.parametrize( + "platform_method,expected_backend", + [ + ("is_cuda", UnquantizedMoeBackend.TRITON), # Default CUDA without FlashInfer + ("is_rocm", UnquantizedMoeBackend.TRITON), + ("is_cpu", UnquantizedMoeBackend.CPU), + ("is_xpu", UnquantizedMoeBackend.XPU), + ("is_tpu", UnquantizedMoeBackend.TPU), + ("is_out_of_tree", UnquantizedMoeBackend.OOT), + ], +) +@patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer", + return_value=False, +) +def test_select_default_backend_by_platform( + mock_has_flashinfer, + monkeypatch, + platform_method, + expected_backend, +): + """Test backend selection for different platforms.""" + with patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform" + ) as mock_platform: + # Set all platform checks to False + mock_platform.is_cuda.return_value = False + 
mock_platform.is_rocm.return_value = False + mock_platform.is_cpu.return_value = False + mock_platform.is_xpu.return_value = False + mock_platform.is_tpu.return_value = False + mock_platform.is_out_of_tree.return_value = False + + # Set only the specified platform to True + getattr(mock_platform, platform_method).return_value = True + + moe_config = make_dummy_moe_config() + selected_backend = select_unquantized_moe_backend( + moe_config=moe_config, + use_ep=False, + use_dp=False, + ) + + assert selected_backend == expected_backend + + +@patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer", + return_value=True, +) +@patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16", + return_value=(True, None), +) +def test_select_cuda_flashinfer_trtllm_backend( + mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch +): + """Test CUDA backend selection when FlashInfer TRTLLM is available and enabled.""" + with patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform" + ) as mock_platform: + # Set as CUDA platform + mock_platform.is_cuda.return_value = True + mock_platform.is_rocm.return_value = False + mock_platform.is_cpu.return_value = False + mock_platform.is_xpu.return_value = False + mock_platform.is_tpu.return_value = False + mock_platform.is_out_of_tree.return_value = False + + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1") + + moe_config = make_dummy_moe_config() + + selected_backend = select_unquantized_moe_backend( + moe_config=moe_config, + use_ep=True, + use_dp=False, + ) + + assert selected_backend == UnquantizedMoeBackend.FLASHINFER_TRTLLM + + +@patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.has_flashinfer", + return_value=True, +) +@patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.is_supported_config_trtllm_bf16", + return_value=(False, None), +) +def test_select_cuda_flashinfer_cutlass_backend( + 
mock_has_flashinfer, mock_is_supported_trtllm, monkeypatch +): + """Test CUDA backend selection when FlashInfer TRTLLM is not available + and FlashInfer CUTLASS is available.""" + with patch( + "vllm.model_executor.layers.fused_moe.oracle.unquantized.current_platform" + ) as mock_platform: + # Set as CUDA platform with Hopper capability + mock_platform.is_cuda.return_value = True + mock_platform.is_rocm.return_value = False + mock_platform.is_cpu.return_value = False + mock_platform.is_xpu.return_value = False + mock_platform.is_tpu.return_value = False + mock_platform.is_out_of_tree.return_value = False + mock_platform.has_device_capability.return_value = True # SM90+ + + # Enable FlashInfer via env var + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1") + + moe_config = make_dummy_moe_config() + + selected_backend = select_unquantized_moe_backend( + moe_config=moe_config, + use_ep=True, # CUTLASS requires EP + use_dp=False, # CUTLASS doesn't support DP + ) + + assert selected_backend == UnquantizedMoeBackend.FLASHINFER_CUTLASS diff --git a/tests/quantization/test_blackwell_moe.py b/tests/quantization/test_blackwell_moe.py index a43d2abfd..07da2b454 100644 --- a/tests/quantization/test_blackwell_moe.py +++ b/tests/quantization/test_blackwell_moe.py @@ -178,3 +178,11 @@ def test_gptoss_eager(monkeypatch: pytest.MonkeyPatch): hf_overrides=HF_OVERRIDE_TEXT, extra_args=["--enforce-eager"], ) + + +## Qwen3 Next ## + + +def test_qwen3_next_bf16_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP16", "1") + can_initialize("Qwen/Qwen3-Next-80B-A3B-Instruct", hf_overrides=HF_OVERRIDE_TEXT) diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index c4a19ecb6..61aaa6927 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -78,7 +78,10 @@ def 
select_unquantized_moe_backend( activation_format=activation_format, ) flashinfer_trtllm_moe_enabled = ( - has_flashinfer() and envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported + has_flashinfer() + and envs.VLLM_USE_FLASHINFER_MOE_FP16 + and trtllm_supported + and envs.VLLM_FLASHINFER_MOE_BACKEND == "latency" ) # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS flashinfer_cutlass_moe_enabled = ( @@ -98,11 +101,19 @@ def select_unquantized_moe_backend( backend = UnquantizedMoeBackend.FLASHINFER_TRTLLM elif flashinfer_cutlass_moe_enabled: backend = UnquantizedMoeBackend.FLASHINFER_CUTLASS + if trtllm_supported: + logger.info_once( + "FlashInfer TRTLLM MoE is available but not enabled, " + "consider setting VLLM_FLASHINFER_MOE_BACKEND=latency " + "to enable it for better performance.", + scope="local", + ) else: if not envs.VLLM_USE_FLASHINFER_MOE_FP16 and trtllm_supported: logger.info_once( "FlashInfer TRTLLM MoE is available but not enabled, " "consider setting VLLM_USE_FLASHINFER_MOE_FP16=1 " + "and VLLM_FLASHINFER_MOE_BACKEND=latency " "to enable it for better performance.", scope="local", ) -- GitLab From 1b8756562e1cc50bade1335e52aa36547d62e477 Mon Sep 17 00:00:00 2001 From: Adam Binford Date: Wed, 11 Feb 2026 08:14:28 -0500 Subject: [PATCH 0102/1166] Responses harmony system message structured (#34268) Signed-off-by: Adam Binford --- .../openai/responses/test_harmony.py | 33 ++++++++++++++++--- vllm/entrypoints/openai/responses/serving.py | 16 +++++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/responses/test_harmony.py b/tests/entrypoints/openai/responses/test_harmony.py index b6842f3db..641171e3c 100644 --- a/tests/entrypoints/openai/responses/test_harmony.py +++ b/tests/entrypoints/openai/responses/test_harmony.py @@ -1302,16 +1302,17 @@ async def test_system_prompt_override(client: OpenAI, model_name: str): # Message structure may vary, skip this specific check pass + 
custom_system_prompt_2 = ( + "You are a helpful assistant that always responds in exactly 5 words." + ) + # Test 3: Test with different custom system prompt response_2 = await client.responses.create( model=model_name, input=[ { "role": "system", - "content": ( - "You are a helpful assistant that always " - "responds in exactly 5 words." - ), + "content": custom_system_prompt_2, }, {"role": "user", "content": "What is the weather like?"}, ], @@ -1328,3 +1329,27 @@ async def test_system_prompt_override(client: OpenAI, model_name: str): assert 3 <= word_count <= 8, ( f"Expected around 5 words, got {word_count} words: {response_2.output_text}" ) + + # Test 4: Test with structured content + response_3 = await client.responses.create( + model=model_name, + input=[ + { + "role": "system", + "content": [{"type": "input_text", "text": custom_system_prompt_2}], + }, + {"role": "user", "content": "What is the weather like?"}, + ], + temperature=0.0, + ) + + assert response_3 is not None + assert response_3.status == "completed" + assert response_3.output_text is not None + + # Count words in response (approximately, allowing for punctuation) + word_count = len(response_3.output_text.split()) + # Allow some flexibility (3-8 words) since the model might not be perfectly precise + assert 3 <= word_count <= 8, ( + f"Expected around 5 words, got {word_count} words: {response_3.output_text}" + ) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 9f54a8081..2af7f578e 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -980,7 +980,9 @@ class OpenAIServingResponses(OpenAIServing): output_items.extend(last_items) return output_items - def _extract_system_message_from_request(self, request) -> str | None: + def _extract_system_message_from_request( + self, request: ResponsesRequest + ) -> str | None: system_msg = None if not isinstance(request.input, str): for
response_msg in request.input: @@ -988,7 +990,17 @@ class OpenAIServingResponses(OpenAIServing): isinstance(response_msg, dict) and response_msg.get("role") == "system" ): - system_msg = response_msg.get("content") + content = response_msg.get("content") + if isinstance(content, str): + system_msg = content + elif isinstance(content, list): + for param in content: + if ( + isinstance(param, dict) + and param.get("type") == "input_text" + ): + system_msg = param.get("text") + break break return system_msg -- GitLab From c7914d30f90bc47f1c959d3330666885a0034f7d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 11 Feb 2026 08:07:56 -0700 Subject: [PATCH 0103/1166] Reapply [Attention][FA3] Update FA3 to include new swizzle optimization (#34043) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- tests/v1/cudagraph/test_cudagraph_dispatch.py | 22 ++++++----- vllm/forward_context.py | 18 ++------- vllm/v1/attention/backends/flash_attn.py | 13 ++++++- .../attention/backends/mla/flashattn_mla.py | 12 +++++- vllm/v1/cudagraph_dispatcher.py | 37 +++++++++++-------- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b51934a3a..41c4e308d 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 188be16520ceefdc625fdf71365585d2ee348fe2 + GIT_TAG 5824e6e2008271063c3229ab3e7032bd74abbbc6 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 2b0f8a95d..debf9aeaa 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ 
b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import replace from unittest.mock import MagicMock, patch import pytest @@ -132,36 +133,39 @@ class TestCudagraphDispatcher: # Test dispatch logic # 1. non-uniform batch, size in cudagraph size list - desc_full_exact = BatchDescriptor( - num_tokens=8, - uniform=False, - ) + # FULL mode uses exact keys with num_reqs set + desc_full_with_reqs = BatchDescriptor(num_tokens=8, num_reqs=8, uniform=False) + # PIECEWISE mode uses relaxed keys with num_reqs=None + desc_piecewise = BatchDescriptor(num_tokens=8, num_reqs=None, uniform=False) rt_mode, key = dispatcher.dispatch( num_tokens=8, uniform_decode=False, has_lora=False ) if cudagraph_mode_str == "FULL": assert rt_mode == CUDAGraphMode.FULL - assert key == desc_full_exact + assert key == desc_full_with_reqs elif cudagraph_mode_str in ["FULL_AND_PIECEWISE", "PIECEWISE"]: assert rt_mode == CUDAGraphMode.PIECEWISE - assert key == desc_full_exact + assert key == desc_piecewise else: assert rt_mode == CUDAGraphMode.NONE # 2. 
uniform decode batch, size in cudagraph size list desc_uniform_exact = BatchDescriptor(num_tokens=8, num_reqs=8, uniform=True) + desc_non_uniform = BatchDescriptor(num_tokens=8, num_reqs=8, uniform=False) rt_mode, key = dispatcher.dispatch( num_tokens=8, uniform_decode=True, has_lora=False ) if cudagraph_mode_str == "FULL": + # Pure FULL mode uses non-uniform keys for all batches assert rt_mode == CUDAGraphMode.FULL - assert key == desc_uniform_exact.relax_for_mixed_batch_cudagraphs() + assert key == desc_non_uniform elif cudagraph_mode_str in ["FULL_DECODE_ONLY", "FULL_AND_PIECEWISE"]: + # These modes have separate uniform decode keys assert rt_mode == CUDAGraphMode.FULL assert key == desc_uniform_exact elif cudagraph_mode_str == "PIECEWISE": assert rt_mode == CUDAGraphMode.PIECEWISE - assert key == desc_uniform_exact.relax_for_mixed_batch_cudagraphs() + assert key == replace(desc_uniform_exact, num_reqs=None, uniform=False) else: assert rt_mode == CUDAGraphMode.NONE @@ -180,7 +184,7 @@ class TestCudagraphDispatcher: if "PIECEWISE" in cudagraph_mode_str: # string contains check assert rt_mode == CUDAGraphMode.PIECEWISE - assert key == desc_full_exact.relax_for_mixed_batch_cudagraphs() + assert key == replace(desc_full_exact, num_reqs=None, uniform=False) else: assert rt_mode == CUDAGraphMode.NONE diff --git a/vllm/forward_context.py b/vllm/forward_context.py index d357c8929..a0753b19e 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -5,7 +5,7 @@ import time from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass, field -from typing import Any, NamedTuple +from typing import Any import torch @@ -26,7 +26,8 @@ batchsize_logging_interval: float = envs.VLLM_LOG_BATCHSIZE_INTERVAL batchsize_forward_time: defaultdict = defaultdict(list) -class BatchDescriptor(NamedTuple): +@dataclass(frozen=True) +class BatchDescriptor: """ Batch descriptor for cudagraph dispatching. 
We should keep the num of items as minimal as possible to properly and uniquely describe the padded @@ -56,19 +57,6 @@ class BatchDescriptor(NamedTuple): to be properly captured. """ - def relax_for_mixed_batch_cudagraphs(self) -> "BatchDescriptor": - """ - Return a relaxed version of current batch descriptor that is still compatible - with PIECEWISE cudagraphs (or mixed prefill-decode FA cudagraphs). - """ - return BatchDescriptor( - self.num_tokens, - num_reqs=None, - uniform=False, - has_lora=self.has_lora, - num_active_loras=self.num_active_loras, - ) - def _compute_sp_num_tokens( num_tokens_across_dp_cpu: torch.Tensor, sequence_parallel_size: int diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e786ab3bc..ecd1b274c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) from vllm.platforms.interface import DeviceCapability -from vllm.utils.math_utils import cdiv +from vllm.utils.math_utils import cdiv, round_up from vllm.v1.attention.backend import ( AttentionCGSupport, AttentionMetadataBuilder, @@ -310,8 +310,17 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size if self.use_full_cuda_graph and self.aot_schedule: + # FA3 scheduler_metadata size: 1 + round_up(batch_size, 4) * 4 + # The +1 is for the tile_count_semaphore (synchronization). 
+ # The 4 slots per batch element (num_prepare_batch_vectors) are: + # prepare_varlen + dynamic_split + sort_batches + head_swizzle + # See: https://github.com/vllm-project/flash-attention/blob/5824e6e/hopper/flash_api.cpp#L664-L671 # noqa: E501 + max_batch_size = max( + vllm_config.scheduler_config.max_num_seqs, + self.max_cudagraph_size or 0, + ) self.scheduler_metadata = torch.zeros( - vllm_config.scheduler_config.max_num_seqs + 1, + 1 + round_up(max_batch_size, 4) * 4, dtype=torch.int32, device=self.device, ) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index e160d3255..33f896035 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) from vllm.platforms.interface import DeviceCapability +from vllm.utils.math_utils import round_up from vllm.v1.attention.backend import ( AttentionCGSupport, AttentionLayer, @@ -129,8 +130,17 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size if self.use_full_cuda_graph and self.fa_aot_schedule: + # FA3 scheduler_metadata size: 1 + round_up(batch_size, 4) * 4 + # The +1 is for the tile_count_semaphore (synchronization). 
+ # The 4 slots per batch element (num_prepare_batch_vectors) are: + # prepare_varlen + dynamic_split + sort_batches + head_swizzle + # See: https://github.com/vllm-project/flash-attention/blob/5824e6e/hopper/flash_api.cpp#L664-L671 # noqa: E501 + max_batch_size = max( + vllm_config.scheduler_config.max_num_seqs, + self.max_cudagraph_size or 0, + ) self.scheduler_metadata = torch.zeros( - vllm_config.scheduler_config.max_num_seqs + 1, + 1 + round_up(max_batch_size, 4) * 4, dtype=torch.int32, device=self.device, ) diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index 6f3e029c7..6817c571b 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import replace from itertools import product from vllm.config import CUDAGraphMode, VllmConfig @@ -180,12 +181,14 @@ class CudagraphDispatcher: for bs, num_active_loras in product( self.compilation_config.cudagraph_capture_sizes, lora_cases ): - self.add_cudagraph_key( - cudagraph_mode.mixed_mode(), - self._create_padded_batch_descriptor( - bs, False, num_active_loras > 0, num_active_loras - ).relax_for_mixed_batch_cudagraphs(), + batch_desc = self._create_padded_batch_descriptor( + bs, False, num_active_loras > 0, num_active_loras ) + # Only relax for PIECEWISE mode. FULL mode needs exact num_reqs + # because FA3's scheduler_metadata computation depends on it. + if cudagraph_mode.mixed_mode() == CUDAGraphMode.PIECEWISE: + batch_desc = replace(batch_desc, num_reqs=None, uniform=False) + self.add_cudagraph_key(cudagraph_mode.mixed_mode(), batch_desc) # if decode cudagraph mode is FULL, and we don't already have mixed # mode full cudagraphs then add them here. 
@@ -264,21 +267,23 @@ class CudagraphDispatcher: batch_desc = self._create_padded_batch_descriptor( num_tokens, uniform_decode, has_lora, effective_num_active_loras ) - relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() - - if not disable_full: - # check if key exists for full cudagraph - if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: - return CUDAGraphMode.FULL, batch_desc - # otherwise, check if the relaxed key exists - if relaxed_batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: - return CUDAGraphMode.FULL, relaxed_batch_desc + # check if key exists for full cudagraph + # For pure FULL mode, keys are registered with uniform=False. + batch_desc_to_check = batch_desc + if self.cudagraph_mode == CUDAGraphMode.FULL: + batch_desc_to_check = replace(batch_desc, uniform=False) + if ( + not disable_full + and batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.FULL] + ): + return CUDAGraphMode.FULL, batch_desc_to_check # also check if the relaxed key exists for more "general" # piecewise cudagraph - if relaxed_batch_desc in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]: - return CUDAGraphMode.PIECEWISE, relaxed_batch_desc + batch_desc_to_check = replace(batch_desc, num_reqs=None, uniform=False) + if batch_desc_to_check in self.cudagraph_keys[CUDAGraphMode.PIECEWISE]: + return CUDAGraphMode.PIECEWISE, batch_desc_to_check # finally, just return no cudagraphs and a trivial batch descriptor return CUDAGraphMode.NONE, BatchDescriptor(num_tokens) -- GitLab From 67a42b5a44fe196250142f1e8ddee44d7061500f Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 11 Feb 2026 17:09:40 +0100 Subject: [PATCH 0104/1166] Don't try and run GLM-ASR with remote code (#34352) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/models/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index abc621d8e..21188bf39 100644 --- 
a/tests/models/registry.py +++ b/tests/models/registry.py @@ -725,7 +725,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Gemma3nForConditionalGeneration": _HfExamplesInfo("google/gemma-3n-E2B-it"), "GlmAsrForConditionalGeneration": _HfExamplesInfo( "zai-org/GLM-ASR-Nano-2512", - trust_remote_code=True, min_transformers_version="5.0.0", ), "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"), -- GitLab From fd618871b41c0cf9259379cde9cca230a56c4096 Mon Sep 17 00:00:00 2001 From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:12:05 -0600 Subject: [PATCH 0105/1166] [Bugfix]: Fix ROCm fusion attn test; use AttentionBackend utils to create kv cache (#33948) Signed-off-by: Rohan138 --- tests/compile/passes/test_fusion_attn.py | 79 ++++++++---------------- 1 file changed, 27 insertions(+), 52 deletions(-) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 2b29cf605..ffa01563e 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -92,6 +92,8 @@ class AttentionQuantPatternModel(torch.nn.Module): def build_attn_metadata(self, batch_size: int) -> AttentionMetadata: """Initialize attention metadata.""" + # TODO (Rohan138) reuse utils from vllm/v1/worker/gpu/attn_utils.py + # Create common attn metadata batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size) common_attn_metadata = create_common_attn_metadata( @@ -100,58 +102,31 @@ class AttentionQuantPatternModel(torch.nn.Module): max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size num_blocks = batch_size * max_blocks - backend = self.attn.backend - - # TODO(luka) use get_kv_cache_stride_order - # Create dummy KV cache for the selected backend - if backend == AttentionBackendEnum.ROCM_ATTN: - # k/v as 1st dimention - # HND: [num_blocks, num_kv_heads, block_size, head_size] - kv_cache = torch.zeros( - 2, - num_blocks, - 
self.num_kv_heads, - self.block_size, - self.head_size, - dtype=self.kv_cache_dtype, - device=self.device, - ) - elif backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN: - # k/v as 1st dimention - # NHD: [num_blocks, block_size, num_kv_heads, head_size] - kv_cache = torch.zeros( - 2, - num_blocks, - self.block_size, - self.num_kv_heads, - self.head_size, - dtype=self.kv_cache_dtype, - device=self.device, - ) - elif backend == AttentionBackendEnum.TRITON_ATTN: - # k/v as 2nd dimention - # NHD: [num_blocks, block_size, num_kv_heads, head_size] - kv_cache = torch.zeros( - num_blocks, - 2, - self.num_kv_heads, - self.block_size, - self.head_size, - dtype=self.kv_cache_dtype, - device=self.device, - ) - elif backend == AttentionBackendEnum.FLASHINFER: - kv_cache = torch.zeros( - num_blocks, - 2, - self.num_kv_heads, - self.block_size, - self.head_size, - dtype=self.kv_cache_dtype, - device=self.device, - ).permute(0, 1, 3, 2, 4) - else: - raise ValueError(f"Unsupported backend: {backend}") + + # Fetch the attention backend and kv cache shape and stride order + attn_backend = self.attn.attn_backend + kv_cache_shape = attn_backend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size + ) + try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order() + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple(range(len(kv_cache_shape))) + + kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order) + inv_order = [ + kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order)) + ] + + # Create dummy KV cache + raw_tensor = torch.zeros( + 2 * num_blocks * self.block_size * self.num_kv_heads * self.head_size, + dtype=self.kv_cache_dtype, + device=self.device, + ) + raw_tensor = raw_tensor.view(kv_cache_shape) + kv_cache = raw_tensor.permute(*inv_order) + self.attn.kv_cache = [kv_cache] # Build attn metadata -- GitLab From 64f570ab56cab7e8977c611b78f9a44a9a9f033c Mon Sep 17 00:00:00 
2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Thu, 12 Feb 2026 00:26:44 +0800 Subject: [PATCH 0106/1166] [ROCm] [aiter] Split KV cache update for AiterFlashAttention (#33681) Signed-off-by: kliuae --- vllm/v1/attention/backends/rocm_aiter_fa.py | 108 ++++++++++++-------- 1 file changed, 68 insertions(+), 40 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 28b5a7f41..4be650f93 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -11,6 +11,7 @@ from vllm._aiter_ops import rocm_aiter_ops from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.attention.attention import get_attention_context from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.utils.platform_utils import get_cu_count @@ -687,6 +688,8 @@ class AiterFlashAttentionBackend(AttentionBackend): def get_supported_head_sizes(cls) -> list[int]: return [64, 128, 256] + forward_includes_kv_cache_update: bool = False + @staticmethod def get_name() -> str: return "FLASH_ATTN" @@ -982,49 +985,10 @@ class AiterFlashAttentionImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens key_cache, value_cache = kv_cache.unbind(0) - # key and value may be None in the case of cross attention. They are - # calculated once based on the output from the encoder and then cached - # in KV cache. + if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(current_platform.fp8_dtype()) value_cache = value_cache.view(current_platform.fp8_dtype()) - if ( - self.kv_sharing_target_layer_name is None - and key is not None - and value is not None - ): - # Reshape the input keys and values and store them in the cache. 
- # Skip this if sharing KV cache with an earlier attention layer. - # NOTE(woosuk): Here, key and value are padded while slot_mapping - # is not padded. However, we don't need to do - # key[:num_actual_tokens] and value[:num_actual_tokens] because - # the reshape_and_cache_flash op uses the slot_mapping's shape - # to determine the number of actual tokens. - if rocm_aiter_ops.is_shuffle_kv_cache_enabled(): - # We may calculate per token quant scale in - # reshape_and_cache_shuffle_triton which might differ from - # vllm's style when shuffle layout is used. - reshape_and_cache_shuffle_triton( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - attn_metadata.k_scale, - attn_metadata.v_scale, - ) - else: - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) # decode:extend:prefill query = query[:num_actual_tokens] @@ -1215,3 +1179,67 @@ class AiterFlashAttentionImpl(AttentionImpl): ) return output + + def do_kv_cache_update( + self, + layer: Attention, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, + ): + attn_metadata, _, _ = get_attention_context(layer.layer_name) + if attn_metadata is None: + # Profiling run. + return + + key_cache, value_cache = kv_cache.unbind(0) + + # key and value may be None in the case of cross attention. They are + # calculated once based on the output from the encoder and then cached + # in KV cache. + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(current_platform.fp8_dtype()) + value_cache = value_cache.view(current_platform.fp8_dtype()) + if ( + self.kv_sharing_target_layer_name is None + and key is not None + and value is not None + ): + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. 
+ # NOTE(woosuk): Here, key and value are padded while slot_mapping + # is not padded. However, we don't need to do + # key[:num_actual_tokens] and value[:num_actual_tokens] because + # the reshape_and_cache_flash op uses the slot_mapping's shape + # to determine the number of actual tokens. + if rocm_aiter_ops.is_shuffle_kv_cache_enabled(): + # We may calculate per token quant scale in + # reshape_and_cache_shuffle_triton which might differ from + # vllm's style when shuffle layout is used. + k_scale = attn_metadata.k_scale + v_scale = attn_metadata.v_scale + assert k_scale is not None and v_scale is not None, ( + "k_scale and v_scale are required for shuffled update" + ) + reshape_and_cache_shuffle_triton( + key, + value, + key_cache, + value_cache, + slot_mapping, + self.kv_cache_dtype, + k_scale, + v_scale, + ) + else: + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) -- GitLab From 48134a2c227541dd47b1651bfd96a70a714b0f6e Mon Sep 17 00:00:00 2001 From: SorenDreano <71752785+SorenDreano@users.noreply.github.com> Date: Wed, 11 Feb 2026 18:02:27 +0100 Subject: [PATCH 0107/1166] [Docs] Fix typo ("defult") and double spacing (#34348) Signed-off-by: SorenDreano <71752785+SorenDreano@users.noreply.github.com> Co-authored-by: Soren Dreano Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/config/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index eccaa6ce6..e9f6b37ab 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -278,7 +278,7 @@ class VllmConfig: optimization_level: OptimizationLevel = OptimizationLevel.O2 """The optimization level. These levels trade startup time cost for performance, with -O0 having the best startup time and -O3 having the best - performance. -02 is used by defult. 
See OptimizationLevel for full + performance. -O2 is used by default. See OptimizationLevel for full description.""" weight_transfer_config: WeightTransferConfig | None = None -- GitLab From fa7e0bfacfb44ec77a4bda77ba499d320b14ae7c Mon Sep 17 00:00:00 2001 From: junuxyz <216036880+junuxyz@users.noreply.github.com> Date: Thu, 12 Feb 2026 02:03:48 +0900 Subject: [PATCH 0108/1166] =?UTF-8?q?[CI][BugFix]=20Fix=20silent=20failure?= =?UTF-8?q?=20in=20shellcheck=20hook=20and=20baseline=20exist=E2=80=A6=20(?= =?UTF-8?q?#32458)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: junuxyz <216036880+junuxyz@users.noreply.github.com> --- tools/pre_commit/shellcheck.baseline | 89 ++++++++++++++++++++++++++++ tools/pre_commit/shellcheck.sh | 39 +++++++++++- 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 tools/pre_commit/shellcheck.baseline diff --git a/tools/pre_commit/shellcheck.baseline b/tools/pre_commit/shellcheck.baseline new file mode 100644 index 000000000..7433bb331 --- /dev/null +++ b/tools/pre_commit/shellcheck.baseline @@ -0,0 +1,89 @@ +benchmarks/auto_tune/auto_tune.sh:SC2034 +benchmarks/auto_tune/auto_tune.sh:SC2086 +benchmarks/auto_tune/batch_auto_tune.sh:SC2086 +benchmarks/run_structured_output_benchmark.sh:SC2028 +benchmarks/run_structured_output_benchmark.sh:SC2034 +benchmarks/run_structured_output_benchmark.sh:SC2086 +.buildkite/image_build/image_build_cpu_arm64.sh:SC2086 +.buildkite/image_build/image_build_cpu.sh:SC2086 +.buildkite/image_build/image_build_hpu.sh:SC2086 +.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh:SC2086 +.buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh:SC2034 +.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh:SC2027 +.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh:SC2086 +.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh:SC2126 
+.buildkite/scripts/annotate-rocm-release.sh:SC2086 +.buildkite/scripts/cache-rocm-base-wheels.sh:SC2012 +.buildkite/scripts/cherry-pick-from-milestone.sh:SC2064 +.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh:SC2086 +.buildkite/scripts/hardware_ci/run-cpu-test.sh:SC2086 +.buildkite/scripts/hardware_ci/run-hpu-test.sh:SC2086 +.buildkite/scripts/hardware_ci/run-npu-test.sh:SC1090 +.buildkite/scripts/hardware_ci/run-npu-test.sh:SC2006 +.buildkite/scripts/hardware_ci/run-npu-test.sh:SC2086 +.buildkite/scripts/hardware_ci/run-npu-test.sh:SC2181 +.buildkite/scripts/hardware_ci/run-xpu-test.sh:SC2086 +.buildkite/scripts/push-nightly-builds.sh:SC2086 +.buildkite/scripts/run-multi-node-test.sh:SC2086 +.buildkite/scripts/run-multi-node-test.sh:SC2089 +.buildkite/scripts/run-multi-node-test.sh:SC2090 +.buildkite/scripts/run-prime-rl-test.sh:SC2086 +.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh:SC2086 +.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh:SC2086 +.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh:SC2086 +.buildkite/scripts/tpu/docker_run_bm.sh:SC1090 +.buildkite/scripts/tpu/docker_run_bm.sh:SC2086 +.buildkite/scripts/tpu/run_bm.sh:SC2034 +.buildkite/scripts/tpu/run_bm.sh:SC2086 +.buildkite/scripts/upload-nightly-wheels.sh:SC2086 +.buildkite/scripts/upload-nightly-wheels.sh:SC2115 +.buildkite/scripts/upload-nightly-wheels.sh:SC2236 +.buildkite/scripts/upload-release-wheels-pypi.sh:SC2086 +.buildkite/scripts/upload-rocm-wheels.sh:SC2012 +examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh:SC2086 +examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh:SC2086 +examples/online_serving/disaggregated_prefill.sh:SC2086 +examples/online_serving/disaggregated_serving/kv_events.sh:SC2086 +examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh:SC2046 
+examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh:SC2086 +examples/online_serving/disaggregated_serving/mooncake_connector/run_mooncake_connector.sh:SC2317 +examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh:SC2046 +examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh:SC2086 +examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh:SC2317 +examples/online_serving/elastic_ep/bench.sh:SC2086 +examples/online_serving/elastic_ep/serve_deepseek_v2.sh:SC2086 +examples/online_serving/multi-node-serving.sh:SC2006 +examples/online_serving/multi-node-serving.sh:SC2086 +examples/online_serving/multi-node-serving.sh:SC2181 +examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh:SC2046 +examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh:SC2126 +examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh:SC2181 +examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh:SC2206 +examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh:SC2086 +examples/pooling/embed/openai_embedding_long_text/service.sh:SC2086 +tests/standalone_tests/python_only_compile.sh:SC2086 +tests/v1/ec_connector/integration/run_epd_correctness_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh:SC2005 +tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh:SC2124 +tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh:SC2126 +tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh:SC2206 +tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/run_edge_case_test.sh:SC2153 
+tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh:SC2089 +tests/v1/kv_connector/nixl_integration/run_tpu_disagg_accuracy_test.sh:SC2090 +tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh:SC2086 +tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh:SC2089 +tests/v1/kv_connector/nixl_integration/run_tpu_edge_case_test.sh:SC2090 +tools/ep_kernels/elastic_ep/install_eep_libraries.sh:SC2086 +tools/ep_kernels/install_python_libraries.sh:SC2086 +tools/ep_kernels/install_python_libraries.sh:SC2196 +tools/flashinfer-build.sh:SC2086 +tools/flashinfer-build.sh:SC2269 +tools/install_deepgemm.sh:SC2035 +tools/install_deepgemm.sh:SC2295 +tools/pre_commit/shellcheck.sh:SC2016 +tools/vllm-rocm/generate-rocm-wheels-root-index.sh:SC2295 +tools/vllm-tpu/build.sh:SC2145 diff --git a/tools/pre_commit/shellcheck.sh b/tools/pre_commit/shellcheck.sh index 59ce40038..4adee5d57 100755 --- a/tools/pre_commit/shellcheck.sh +++ b/tools/pre_commit/shellcheck.sh @@ -1,7 +1,8 @@ #!/bin/bash -set -e +set -euo pipefail scversion="stable" +baseline="tools/pre_commit/shellcheck.baseline" if [ -d "shellcheck-${scversion}" ]; then export PATH="$PATH:$(pwd)/shellcheck-${scversion}" @@ -19,4 +20,38 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/scripts/hardware_ci/run-amd-test.sh -find . -name "*.sh" ".git" -prune -not -path "./.buildkite/scripts/hardware_ci/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' +# collects warnings as "file:SCcode" pairs for baseline comparison. +collect() { + find . 
-path ./.git -prune -o -name "*.sh" \ + -not -path "./.buildkite/scripts/hardware_ci/run-amd-test.sh" -print0 | \ + xargs -0 sh -c 'for f in "$@"; do git check-ignore -q "$f" || shellcheck -s bash -f gcc "$f" || true; done' -- | \ + sed -nE 's|^\./||; s|^([^:]+):[0-9]+:[0-9]+:.*\[(SC[0-9]+)\]$|\1:\2|p' | \ + sort -u +} + +if [[ "${1:-}" == "--generate-baseline" ]]; then + collect > "$baseline" + echo "Wrote baseline to $baseline" + exit 0 +fi + +if [[ ! -f "$baseline" ]]; then + echo "Baseline not found: $baseline (run: $0 --generate-baseline)" + exit 1 +fi + +current="$(mktemp)" +trap 'rm -f "$current"' EXIT +collect > "$current" + +# finds new warnings not in baseline +new_errors="$(comm -23 "$current" <(sort -u "$baseline") || true)" +if [ -n "$new_errors" ]; then + echo "$new_errors" | cut -d: -f1 | sort -u | while IFS= read -r file; do + if [[ -f "$file" ]]; then + codes=$(echo "$new_errors" | awk -F: -v f="$file" '$1==f {print $2}' | paste -sd ',' -) + shellcheck -s bash --include="$codes" "$file" 2>&1 || true + fi + done + exit 1 +fi -- GitLab From ffb3d553cc9258049bf4d48214c9f4106cc67cfb Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Thu, 12 Feb 2026 01:12:13 +0800 Subject: [PATCH 0109/1166] [Model Runner V2] Init cuda graph pool when necessary (#33217) Signed-off-by: Xinyu Chen --- vllm/v1/worker/gpu/cudagraph_utils.py | 4 +++- vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index bf55b99af..d5a22d6a0 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -45,7 +45,9 @@ class CudaGraphManager: ) self.graphs: dict[int, torch.cuda.CUDAGraph] = {} - self.pool = torch.cuda.graph_pool_handle() + self.pool = None + if self.cudagraph_mode != CUDAGraphMode.NONE: + self.pool = torch.cuda.graph_pool_handle() self.hidden_states: torch.Tensor | None = None def 
needs_capture(self) -> bool: diff --git a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py index 48e7cb110..1ea7ffcb5 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle_cudagraph.py @@ -44,7 +44,9 @@ class EagleCudaGraphManager: ) self.graphs: dict[int, torch.cuda.CUDAGraph] = {} - self.pool = torch.cuda.graph_pool_handle() + self.pool = None + if self.cudagraph_mode != CUDAGraphMode.NONE: + self.pool = torch.cuda.graph_pool_handle() def get_cudagraph_size(self, num_tokens: int) -> int | None: return self.cudagraph_sizes.get(num_tokens) -- GitLab From 0ab06100f469fe29b8a71cf0311b6b9da99db23e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 12 Feb 2026 01:37:40 +0800 Subject: [PATCH 0110/1166] [Multimodal] Expose `mm_processor_kwargs` for `DummyInputsBuilder` (#34330) Signed-off-by: Isotr0py --- vllm/model_executor/models/aria.py | 1 + vllm/model_executor/models/audioflamingo3.py | 5 ++++- vllm/model_executor/models/aya_vision.py | 1 + vllm/model_executor/models/bagel.py | 1 + vllm/model_executor/models/bee.py | 1 + vllm/model_executor/models/blip2.py | 1 + vllm/model_executor/models/chameleon.py | 1 + vllm/model_executor/models/clip.py | 1 + vllm/model_executor/models/cohere2_vision.py | 1 + vllm/model_executor/models/deepseek_ocr.py | 1 + vllm/model_executor/models/deepseek_ocr2.py | 1 + vllm/model_executor/models/deepseek_vl2.py | 1 + vllm/model_executor/models/dots_ocr.py | 3 +++ vllm/model_executor/models/ernie45_vl.py | 1 + vllm/model_executor/models/funasr.py | 5 ++++- vllm/model_executor/models/funaudiochat.py | 7 +++++-- vllm/model_executor/models/fuyu.py | 1 + vllm/model_executor/models/gemma3_mm.py | 1 + vllm/model_executor/models/gemma3n_mm.py | 1 + vllm/model_executor/models/glm4_1v.py | 1 + vllm/model_executor/models/glm4v.py | 1 + vllm/model_executor/models/glmasr.py | 5 ++++- vllm/model_executor/models/granite_speech.py | 1 + 
vllm/model_executor/models/hunyuan_vision.py | 1 + vllm/model_executor/models/hyperclovax_vision.py | 1 + vllm/model_executor/models/idefics3.py | 14 +++----------- vllm/model_executor/models/interns1.py | 1 + vllm/model_executor/models/internvl.py | 2 ++ vllm/model_executor/models/isaac.py | 1 + vllm/model_executor/models/kanana_v.py | 1 + vllm/model_executor/models/keye.py | 1 + vllm/model_executor/models/kimi_k25.py | 1 + vllm/model_executor/models/kimi_vl.py | 1 + vllm/model_executor/models/lfm2_vl.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/llava_next_video.py | 1 + vllm/model_executor/models/llava_onevision.py | 1 + vllm/model_executor/models/midashenglm.py | 1 + vllm/model_executor/models/minicpmo.py | 1 + vllm/model_executor/models/minicpmv.py | 1 + vllm/model_executor/models/mistral3.py | 1 + vllm/model_executor/models/mllama4.py | 1 + vllm/model_executor/models/molmo.py | 1 + vllm/model_executor/models/molmo2.py | 1 + vllm/model_executor/models/nano_nemotron_vl.py | 2 ++ vllm/model_executor/models/nemotron_parse.py | 1 + vllm/model_executor/models/nvlm_d.py | 1 + vllm/model_executor/models/ovis.py | 1 + vllm/model_executor/models/ovis2_5.py | 1 + vllm/model_executor/models/paddleocr_vl.py | 1 + vllm/model_executor/models/paligemma.py | 1 + vllm/model_executor/models/phi3v.py | 1 + vllm/model_executor/models/phi4mm.py | 1 + vllm/model_executor/models/pixtral.py | 2 ++ vllm/model_executor/models/qwen2_5_omni_thinker.py | 9 +++++++-- vllm/model_executor/models/qwen2_audio.py | 5 ++++- vllm/model_executor/models/qwen2_vl.py | 6 +++++- vllm/model_executor/models/qwen3_asr.py | 5 ++++- .../models/qwen3_omni_moe_thinker.py | 2 +- vllm/model_executor/models/qwen3_vl.py | 8 ++++++-- vllm/model_executor/models/qwen_vl.py | 1 + vllm/model_executor/models/rvl.py | 1 + vllm/model_executor/models/siglip.py | 1 + vllm/model_executor/models/skyworkr1v.py | 1 + vllm/model_executor/models/step3_vl.py | 1 + 
vllm/model_executor/models/terratorch.py | 1 + .../models/transformers/multimodal.py | 1 + vllm/model_executor/models/ultravox.py | 5 ++++- vllm/model_executor/models/voxtral.py | 2 ++ vllm/model_executor/models/whisper.py | 5 ++++- vllm/multimodal/processing/dummy_inputs.py | 12 +++++++++++- vllm/multimodal/registry.py | 2 ++ 72 files changed, 131 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index b8e742362..fc1720296 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -445,6 +445,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: vision_config = self.info.get_vision_config() diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py index 599f3d29f..111b99461 100644 --- a/vllm/model_executor/models/audioflamingo3.py +++ b/vllm/model_executor/models/audioflamingo3.py @@ -253,8 +253,11 @@ class AudioFlamingo3DummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate audio_len = MAX_AUDIO_LEN * sampling_rate num_audios = mm_counts.get("audio", 0) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index af72f0bc4..ce3b990c3 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -192,6 +192,7 @@ class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo 
seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = self.info.get_image_size_with_most_features() diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index ac16538e9..657e8cefb 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -250,6 +250,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) hf_config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/bee.py b/vllm/model_executor/models/bee.py index 4f0342df4..5c3a1a4f1 100644 --- a/vllm/model_executor/models/bee.py +++ b/vllm/model_executor/models/bee.py @@ -91,6 +91,7 @@ class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f812eb849..fe9db19ea 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -446,6 +446,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/chameleon.py 
b/vllm/model_executor/models/chameleon.py index c4b885cc9..2c21d70ed 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -117,6 +117,7 @@ class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: config = self.info.get_hf_config() diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 4ffeedf46..3f189eacc 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -171,6 +171,7 @@ class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index ebdb4bcb8..4aefd2ead 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -221,6 +221,7 @@ class Cohere2VisionDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = self.info.get_image_size_with_most_features() diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 3425b1570..146b05002 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -256,6 +256,7 @@ class DeepseekOCRDummyInputsBuilder(BaseDummyInputsBuilder[DeepseekOCRProcessing seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, 
BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/deepseek_ocr2.py b/vllm/model_executor/models/deepseek_ocr2.py index cead43685..6ababf9f2 100644 --- a/vllm/model_executor/models/deepseek_ocr2.py +++ b/vllm/model_executor/models/deepseek_ocr2.py @@ -138,6 +138,7 @@ class DeepseekOCR2DummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index cb98640ce..83ab54f60 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -215,6 +215,7 @@ class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2Processing seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index d2f39553d..0d2fefb73 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -107,10 +107,13 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) + mm_processor_kwargs = mm_processor_kwargs or {} target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 + mm_processor_kwargs.get("max_pixels", None) ) image_overrides = mm_options.get("image") if 
mm_options else None diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 0ada8a223..50d3954b6 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1153,6 +1153,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index b4d4fb5b7..3e4a6131c 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -745,8 +745,11 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate diff --git a/vllm/model_executor/models/funaudiochat.py b/vllm/model_executor/models/funaudiochat.py index b7b8659a4..a89a5c104 100644 --- a/vllm/model_executor/models/funaudiochat.py +++ b/vllm/model_executor/models/funaudiochat.py @@ -611,8 +611,11 @@ class FunAudioChatDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate 
= int(feature_extractor.sampling_rate) # Dummy inputs are used for profiling; construct the worst-case audio @@ -656,7 +659,7 @@ class FunAudioChatMultiModalProcessor( if not audios: return BatchFeature({"input_ids": input_ids}) - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) sr = int(feature_extractor.sampling_rate) min_samples = int(getattr(feature_extractor, "n_fft", 400) or 400) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 50708f4b9..c4f1118f7 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -143,6 +143,7 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 18437528e..1e803f89b 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -256,6 +256,7 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 8b5e7b8bb..8588e51f5 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -182,6 +182,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, 
BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_audios = mm_counts.get("audio", 0) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 5333042cb..8440c3946 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1143,6 +1143,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 56504029d..4d86900e9 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -493,6 +493,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index b9bdb3aa2..4e223b15f 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -727,8 +727,11 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = 
feature_extractor.sampling_rate num_audios = mm_counts.get("audio", 0) audio_overrides = mm_options.get("audio") if mm_options else None diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 6956f92ee..9d37a0683 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -217,6 +217,7 @@ class GraniteSpeechDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) audio_overrides = mm_options.get("audio") if mm_options else None diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 729b6cb6c..edd00c5cd 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -702,6 +702,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 1) diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index 6a1f58af2..ea10d764f 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -166,6 +166,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/idefics3.py 
b/vllm/model_executor/models/idefics3.py index d51c50af0..e2cfd1d63 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -42,7 +42,7 @@ from vllm.multimodal.inputs import ( MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems +from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.processing import ( BaseDummyInputsBuilder, BaseMultiModalProcessor, @@ -285,15 +285,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): return num_patches * processor.image_seq_len - def get_image_size_with_most_features(self) -> ImageSize: - processor = self.get_hf_processor() - image_processor: Idefics3ImageProcessor = processor.image_processor - - return ImageSize( - width=image_processor.size["longest_edge"], - height=image_processor.size["longest_edge"], - ) - class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]): def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: @@ -309,9 +300,10 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]) seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) - hf_processor = self.info.get_hf_processor() + hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {})) image_processor: Idefics3ImageProcessor = hf_processor.image_processor longest_edge = image_processor.max_image_size["longest_edge"] diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index de306341c..dd1332dfd 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -298,6 +298,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) seq_len: int, mm_counts: 
Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = self.info.get_num_frames_with_most_features( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index bcce1c800..334ee3cbe 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -766,6 +766,7 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) @@ -938,6 +939,7 @@ class InternVLDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data( seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index ed10e8200..8ed9ddda4 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -850,6 +850,7 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/kanana_v.py b/vllm/model_executor/models/kanana_v.py index 06ea26155..b679241b5 100644 --- a/vllm/model_executor/models/kanana_v.py +++ b/vllm/model_executor/models/kanana_v.py @@ -445,6 +445,7 @@ class 
KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) return { diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index e57e5c6f3..960915af6 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1159,6 +1159,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py index cb07cfe98..bc6fffa3b 100644 --- a/vllm/model_executor/models/kimi_k25.py +++ b/vllm/model_executor/models/kimi_k25.py @@ -238,6 +238,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: # TODO: Support mm_options for vision_chunk to allow user configuration dummy_items = self.get_dummy_mm_items() diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index cb7719777..e280f8245 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -216,6 +216,7 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = 
mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index 445ecdce7..7bded977a 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -319,6 +319,7 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c35728183..ecd2c895b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -232,6 +232,7 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 1aee7f9c5..6696a0009 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -166,6 +166,7 @@ class LlavaNextVideoDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index d49c08eb3..39633eaf9 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -277,6 +277,7 @@ class LlavaOnevisionDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | 
None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 3f75e60fd..4bba0ad71 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -566,6 +566,7 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 39b79e4b1..33df0f785 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -302,6 +302,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) audio_len = ( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index ebe2eca32..6a1686100 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -708,6 +708,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 08f5d45e2..33d94e9ff 100644 --- 
a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -237,6 +237,7 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 58f63597a..3752a7704 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -704,6 +704,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 1ee177656..6edec9719 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1278,6 +1278,7 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index 30f639c8b..e0f74ce46 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -2079,6 +2079,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = 
None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 1c36b681f..fb683487f 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1385,6 +1385,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) processor = self.info.get_hf_processor() @@ -1457,6 +1458,7 @@ class NanoNemotronVLDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: dummy_image = super().get_dummy_mm_data( seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py index f9acae3e0..b94b606a1 100644 --- a/vllm/model_executor/models/nemotron_parse.py +++ b/vllm/model_executor/models/nemotron_parse.py @@ -642,6 +642,7 @@ class NemotronParseDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 73dd8dfd0..840918953 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -93,6 +93,7 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]) seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + 
mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 3a058bb94..7e02d87ec 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -303,6 +303,7 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index a787a0bf8..69c0600d8 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -302,6 +302,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index b3873c160..8d287e342 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -204,6 +204,7 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 4ab0067f3..e551f9fc9 100644 --- 
a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -128,6 +128,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3dde6dfd7..8f33cc859 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -380,6 +380,7 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 97a29b353..d11483a6b 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -826,6 +826,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 3a5dee3c2..7d12cffcd 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -261,6 +261,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: 
Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) @@ -282,6 +283,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 3b50ae74d..974de8068 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -358,12 +358,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - feature_extractor = self.info.get_feature_extractor() + mm_processor_kwargs = mm_processor_kwargs or {} + feature_extractor = self.info.get_feature_extractor(**mm_processor_kwargs) target_audio_length = ( min( @@ -372,7 +374,10 @@ class Qwen2_5OmniThinkerDummyInputsBuilder( ) * feature_extractor.sampling_rate ) - target_width, target_height = self.info.get_image_size_with_most_features() + + target_width, target_height = self.info.get_image_size_with_most_features( + max_pixels=mm_processor_kwargs.get("max_pixels", None), + ) target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts ) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 2115d5140..51a24b0ae 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -195,8 +195,11 @@ class 
Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d911fb1dd..fa9bf6cfe 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1016,11 +1016,15 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - target_width, target_height = self.info.get_image_size_with_most_features() + mm_processor_kwargs = mm_processor_kwargs or {} + target_width, target_height = self.info.get_image_size_with_most_features( + max_pixels=mm_processor_kwargs.get("max_pixels", None) + ) target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts ) diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py index 9dac8d75b..5f56088cb 100644 --- a/vllm/model_executor/models/qwen3_asr.py +++ b/vllm/model_executor/models/qwen3_asr.py @@ -147,10 +147,13 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo]) seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 
0) - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) target_audio_length = ( min( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index b06503031..50fbb8be1 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -1169,7 +1169,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor( return x # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) hop_length = feature_extractor.hop_length if audios: # NOTE: Qwen3-Omni processor accept "audio" diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 908f6342d..7d9785141 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -796,14 +796,18 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) image_overrides = mm_options.get("image") if mm_options else None video_overrides = mm_options.get("video") if mm_options else None + mm_processor_kwargs = mm_processor_kwargs or {} target_image_width, target_image_height = ( - self.info.get_image_size_with_most_features() + self.info.get_image_size_with_most_features( + max_pixels=mm_processor_kwargs.get("max_pixels", None), + ) ) # treat videos as special images @@ -828,7 +832,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): target_num_frames = min(target_num_frames, num_frames_override) target_num_frames 
= max(target_num_frames, 2) - video_processor = self.info.get_video_processor() + video_processor = self.info.get_video_processor(**(mm_processor_kwargs or {})) video_max_pixels = video_processor.size["longest_edge"] # video_max_pixels contains the temporal compression factor, # so we divide by 2 to get the maximum number of image pixels. diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index ed61bb140..66b669a9c 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -618,6 +618,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.visual diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py index 92352febe..f6ddaa8fa 100644 --- a/vllm/model_executor/models/rvl.py +++ b/vllm/model_executor/models/rvl.py @@ -41,6 +41,7 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 9f1bbd596..92ecc7579 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -155,6 +155,7 @@ class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git 
a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 29a0389b9..4fadad14d 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -533,6 +533,7 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 11081b040..8050f6b85 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -565,6 +565,7 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index b817383ab..804eccbc4 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -154,6 +154,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: # Dummy data is generated based on the 'input' section # defined in the HF configuration file diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 890b486b8..64dc5bf8b 100644 --- 
a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -98,6 +98,7 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, "BaseDummyOptions"] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 944dc5e12..d7a9bd4fd 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -161,8 +161,11 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]) seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate audio_len = ( diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 581664aec..715d6aa25 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -220,6 +220,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) @@ -238,6 +239,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> ProcessorInputs: tokenizer = 
self.info.get_tokenizer() diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 7462d9f6e..26c7b62e8 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -685,8 +685,11 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate diff --git a/vllm/multimodal/processing/dummy_inputs.py b/vllm/multimodal/processing/dummy_inputs.py index a93fd2c24..0b02861e3 100644 --- a/vllm/multimodal/processing/dummy_inputs.py +++ b/vllm/multimodal/processing/dummy_inputs.py @@ -63,6 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: """ Build the multimodal input which, after processing, results in @@ -83,6 +84,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> ProcessorInputs: """ Build the input which, after processing, results in @@ -92,9 +94,16 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): seq_len: Sequence length mm_counts: Count of items per modality mm_options: Configurable options per modality (optional) + mm_processor_kwargs: Additional keyword arguments + for hf_processor (optional) """ dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = 
self.get_dummy_mm_data(seq_len, mm_counts, mm_options) + dummy_mm_data = self.get_dummy_mm_data( + seq_len, + mm_counts, + mm_options, + mm_processor_kwargs=mm_processor_kwargs, + ) dummy_mm_items = self.info.parse_mm_data(dummy_mm_data, validate=False) tokenization_kwargs = {"truncation": False} @@ -102,6 +111,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): return ProcessorInputs( prompt=dummy_text, mm_items=dummy_mm_items, + hf_processor_mm_kwargs=mm_processor_kwargs or {}, tokenization_kwargs=tokenization_kwargs, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 6c7e86a4f..340754d16 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -257,10 +257,12 @@ class MultiModalRegistry: if processor is None: processor = self.create_processor(model_config, cache=cache) + mm_config = model_config.get_multimodal_config() processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs( seq_len=seq_len, mm_counts=mm_counts, mm_options=self._extract_mm_options(model_config), + mm_processor_kwargs=mm_config.mm_processor_kwargs, ) mm_inputs = processor.apply( prompt=processor_inputs.prompt, -- GitLab From be7f3d5d2016b326d12ff582a8c9f96a68217c7a Mon Sep 17 00:00:00 2001 From: Xinyu Dong Date: Thu, 12 Feb 2026 02:20:45 +0800 Subject: [PATCH 0111/1166] [Bugfix] fix default is_neox_style is True for deepseek (#34353) Signed-off-by: dongxinyu03 --- vllm/model_executor/models/deepseek_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index ab4f498b9..e62af24a8 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -836,7 +836,7 @@ class DeepseekV2MLAAttention(nn.Module): qk_rope_head_dim, max_position=max_position_embeddings, rope_parameters=config.rope_parameters, - is_neox_style=not getattr(config, "indexer_rope_interleave", True), + is_neox_style=not 
getattr(config, "indexer_rope_interleave", False), ) self.indexer = Indexer( vllm_config, -- GitLab From 11c7ace340610e0be376d531b677bcee1ae84ad4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eldar=20Kurti=C4=87?= <8884008+eldarkurtic@users.noreply.github.com> Date: Wed, 11 Feb 2026 19:24:22 +0100 Subject: [PATCH 0112/1166] [Bugfix] Enable attn quantization of Llama-4 by correctly permuting scales for rope (int8, fp8) (#34243) Signed-off-by: Your Name Co-authored-by: Your Name --- vllm/model_executor/models/llama4.py | 34 ++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 0cdb4989e..4050bf045 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -44,6 +44,9 @@ from vllm.model_executor.layers.linear import ( RowParallelLinear, ) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.compressed_tensors import ( + compressed_tensors as ct, +) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, @@ -829,11 +832,20 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): loaded_weight: torch.Tensor, ) -> tuple[str, torch.Tensor]: # Helper function to permute the weight's channels - def permute(w: torch.Tensor, n_heads: int, is_weight_scale: bool): + def permute( + w: torch.Tensor, + n_heads: int, + is_nvfp4_weight_scale: bool, + is_ct_int8_or_fp8_weight_scale: bool, + ): # Calculate the expected shape of the weight. # Do not rely on w's shape, as it may be in another layout. attn_in = self.config.head_dim * n_heads - attn_out = self.config.hidden_size + attn_out = ( + self.config.hidden_size + if not is_ct_int8_or_fp8_weight_scale + else w.shape[-1] + ) # If the weight is FP4 packed as uint8, we need to divide attn_out # by 2. 
@@ -844,7 +856,7 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): # block size, which is currently 16. elif ( w.dtype == torch.float8_e4m3fn - and is_weight_scale + and is_nvfp4_weight_scale and w.shape[1] * 16 == attn_out ): attn_out = attn_out // 16 @@ -862,19 +874,31 @@ class Llama4ForCausalLM(LlamaForCausalLM, MixtureOfExperts): is_nvfp4_weight_scale = ( modules[-1] == "weight_scale" and loaded_weight.dtype == torch.float8_e4m3fn ) - - if is_weight or is_nvfp4_weight_scale: + is_ct_int8_or_fp8_weight_scale = False + if modules[-1] == "weight_scale" and isinstance( + self.model.quant_config, ct.CompressedTensorsConfig + ): + from compressed_tensors import CompressionFormat + + is_ct_int8_or_fp8_weight_scale = self.model.quant_config.quant_format in [ + CompressionFormat.int_quantized.value, + CompressionFormat.float_quantized.value, + ] and loaded_weight.dtype in [torch.float16, torch.bfloat16, torch.float32] + + if is_weight or is_nvfp4_weight_scale or is_ct_int8_or_fp8_weight_scale: if "wk" in modules or "k_proj" in modules: loaded_weight = permute( loaded_weight, self.config.num_key_value_heads, is_nvfp4_weight_scale, + is_ct_int8_or_fp8_weight_scale, ) elif "wq" in modules or "q_proj" in modules: loaded_weight = permute( loaded_weight, self.config.num_attention_heads, is_nvfp4_weight_scale, + is_ct_int8_or_fp8_weight_scale, ) return name, loaded_weight -- GitLab From 500121136995ff0b261f4d2f68e4831896e32d63 Mon Sep 17 00:00:00 2001 From: TJian Date: Thu, 12 Feb 2026 02:50:44 +0800 Subject: [PATCH 0113/1166] [ROCm] [CI] fix test_unrecognized_env (#34350) Signed-off-by: tjtanaa --- tests/config/test_config_generation.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 225ac0f22..c7edf2b97 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -80,12 +80,19 @@ def 
test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch): ray.shutdown() -def test_unrecognized_env(): +def test_unrecognized_env(monkeypatch): import os + from vllm.envs import environment_variables + + # Remove any existing unrecognized VLLM env vars that might interfere + for env in list(os.environ): + if env.startswith("VLLM_") and env not in environment_variables: + monkeypatch.delenv(env, raising=False) + # Test that if fail_on_environ_validation is True, then an error # is raised when an unrecognized vLLM environment variable is set - os.environ["VLLM_UNRECOGNIZED_ENV_VAR"] = "some_value" + monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value") engine_args = EngineArgs( fail_on_environ_validation=True, ) @@ -97,7 +104,7 @@ def test_unrecognized_env(): engine_args.create_engine_config() # Test that when the unrecognized env var is removed, no error is raised - os.environ.pop("VLLM_UNRECOGNIZED_ENV_VAR", None) + monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR") engine_args = EngineArgs( fail_on_environ_validation=True, ) -- GitLab From 83e26c834ef188ca84b2459199840e2d58c75c32 Mon Sep 17 00:00:00 2001 From: elvischenv <219235043+elvischenv@users.noreply.github.com> Date: Thu, 12 Feb 2026 04:29:29 +0800 Subject: [PATCH 0114/1166] [GPT-OSS] Remove unnecessary contiguous (#34337) Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com> --- vllm/model_executor/models/gpt_oss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index 28c37c64b..503bcd3d0 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -140,7 +140,6 @@ class OAIAttention(nn.Module): qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - v = v.contiguous() attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) return output -- GitLab From 
144d9b7cc8352c5868eb407dd970be94f02b572f Mon Sep 17 00:00:00 2001 From: Tomas Ruiz Date: Wed, 11 Feb 2026 21:57:57 +0100 Subject: [PATCH 0115/1166] [Benchmarks] Reduce ready checker log verbosity (#34349) Signed-off-by: Tomas Ruiz --- vllm/benchmarks/lib/ready_checker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/benchmarks/lib/ready_checker.py b/vllm/benchmarks/lib/ready_checker.py index 0cfd053f5..eec4a42cb 100644 --- a/vllm/benchmarks/lib/ready_checker.py +++ b/vllm/benchmarks/lib/ready_checker.py @@ -66,7 +66,8 @@ async def wait_for_endpoint( pbar.close() return output else: - logger.warning("Endpoint is not ready. Error='%s'", output.error) + err_last_line = str(output.error).rstrip().rsplit("\n", 1)[-1] + logger.warning("Endpoint is not ready. Error='%s'", err_last_line) except aiohttp.ClientConnectorError: pass -- GitLab From 5458eb835d66323a11d4a252ad551d001ce00ac8 Mon Sep 17 00:00:00 2001 From: Junseo Park <53421022+pjs102793@users.noreply.github.com> Date: Thu, 12 Feb 2026 06:01:53 +0900 Subject: [PATCH 0116/1166] [Bugfix] send None sentinel on final commit so server properly sends transcription.done (#33963) Signed-off-by: pjs102793 Co-authored-by: Nick Hill --- tests/entrypoints/openai/test_realtime_validation.py | 2 +- vllm/entrypoints/openai/realtime/connection.py | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/openai/test_realtime_validation.py b/tests/entrypoints/openai/test_realtime_validation.py index 7f12bcaca..946843e0b 100644 --- a/tests/entrypoints/openai/test_realtime_validation.py +++ b/tests/entrypoints/openai/test_realtime_validation.py @@ -129,5 +129,5 @@ async def test_multi_chunk_streaming( " First words I spoke in the original phonograph." " A little piece of practical poetry. Mary had a little lamb," " it sleeps with quite a flow, and everywhere that Mary went," - " the lamb was sure to go" + " the lamb was sure to go." 
) diff --git a/vllm/entrypoints/openai/realtime/connection.py b/vllm/entrypoints/openai/realtime/connection.py index 6b779c720..fe1b0f5f3 100644 --- a/vllm/entrypoints/openai/realtime/connection.py +++ b/vllm/entrypoints/openai/realtime/connection.py @@ -48,7 +48,6 @@ class RealtimeConnection: self.generation_task: asyncio.Task | None = None self._is_connected = False - self._is_input_finished = False self._is_model_validated = False self._max_audio_filesize_mb = envs.VLLM_MAX_AUDIO_CLIP_FILESIZE_MB @@ -145,7 +144,7 @@ class RealtimeConnection: commit_event = InputAudioBufferCommit(**event) # final signals that the audio is finished if commit_event.final: - self._is_input_finished = True + self.audio_queue.put_nowait(None) else: await self.start_generation() else: @@ -239,11 +238,6 @@ class RealtimeConnection: # finish because websocket connection was killed break - if self.audio_queue.empty() and self._is_input_finished: - # finish because client signals that audio input - # is finished - break - usage = UsageInfo( prompt_tokens=prompt_token_ids_len, completion_tokens=completion_tokens_len, -- GitLab From 527ca32197b327e55bc718c0ecfea27ff8995902 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 11 Feb 2026 22:02:05 +0100 Subject: [PATCH 0117/1166] [Bugfix] Fix more multimodal tests for transformers V5 (#34334) Signed-off-by: raushan --- tests/models/multimodal/processing/test_common.py | 1 + vllm/model_executor/models/glmasr.py | 6 +++--- vllm/model_executor/models/glmasr_utils.py | 4 ++-- vllm/model_executor/models/lfm2_vl.py | 4 +++- vllm/model_executor/models/qwen2_vl.py | 14 +++++++++----- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index ae2ec1bc0..4c99c9bad 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -108,6 +108,7 @@ _ADD_SPECIAL_TOKENS_OVERRIDES = { 
"paligemma": False, "ultravox": False, "whisper": False, + "lfm2_vl": False, } _IGNORE_MM_KEYS = { diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index 4e223b15f..b7d67b1e4 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -810,9 +810,9 @@ class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"]) # Postprocess: rename mask and add chunk counts # Handle different key names from different transformers versions - if "input_feature_mask" in outputs: - outputs["feature_attention_mask"] = outputs.pop("input_feature_mask") - elif "feature_attention_mask" not in outputs and "input_features" in outputs: + if "input_features_mask" in outputs: + outputs["feature_attention_mask"] = outputs.pop("input_features_mask") + elif "input_features_mask" not in outputs and "input_features" in outputs: # If no mask is provided, create one from input_features input_features = outputs["input_features"] if isinstance(input_features, torch.Tensor): diff --git a/vllm/model_executor/models/glmasr_utils.py b/vllm/model_executor/models/glmasr_utils.py index 80c903da7..ed0551540 100644 --- a/vllm/model_executor/models/glmasr_utils.py +++ b/vllm/model_executor/models/glmasr_utils.py @@ -18,8 +18,8 @@ def _calculate_conv_output_length( input_length: torch.Tensor, padding: int, kernel_size: int, stride: int ) -> torch.Tensor: """Calculate Conv1d output length using standard formula.""" - # Standard formula: floor((input + 2*padding - kernel_size) / stride) + 1 - return (input_length + 2 * padding - kernel_size) // stride + 1 + # in sync with `hf_processor._get_audio_token_length` + return (input_length + 2 * padding - (kernel_size - 1) - 1) // stride + 1 def _as_list_chunk_counts( diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index 7bded977a..b77b93196 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py 
@@ -347,7 +347,9 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]): ) -> BatchFeature: # Text-only input not supported in composite processor if not (images := mm_data.get("images", [])): - prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self.info.get_tokenizer().encode( + prompt, add_special_tokens=False + ) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index fa9bf6cfe..62df900ad 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1467,15 +1467,15 @@ class Tarsier2ImageProcessor(Qwen2VLImageProcessor): class Tarsier2Processor(Qwen2VLProcessor): def __init__( self, - vision_config: dict, + image_processor: Tarsier2ImageProcessor, tokenizer: TokenizerLike, + video_processor: Qwen2VLVideoProcessor, **kwargs, ): - self.image_processor = Tarsier2ImageProcessor(**vision_config) super().__init__( - image_processor=self.image_processor, + image_processor=image_processor, tokenizer=tokenizer, - video_processor=Qwen2VLVideoProcessor(**vision_config), + video_processor=video_processor, chat_template=None, **kwargs, ) @@ -1489,8 +1489,12 @@ class Tarsier2ProcessingInfo(Qwen2VLProcessingInfo): return correct_config def get_hf_processor(self, **kwargs: object) -> Tarsier2Processor: + vision_config = self.ctx.get_hf_image_processor_config() + image_processor = Tarsier2ImageProcessor(**vision_config) + video_processor = Qwen2VLVideoProcessor(**vision_config) return Tarsier2Processor( - vision_config=self.ctx.get_hf_image_processor_config(), + image_processor=image_processor, + video_processor=video_processor, tokenizer=self.get_tokenizer(), **kwargs, ) -- GitLab From 5aff2699bdcedd9ee91fe936fc21b26466203ae1 Mon Sep 17 00:00:00 2001 From: Wei Zhao <51183510+wzhao18@users.noreply.github.com> Date: Wed, 11 
Feb 2026 17:17:16 -0500 Subject: [PATCH 0118/1166] Fix CI failure - Flashinfer Kernel tests (#34316) Signed-off-by: wzhao18 --- tests/kernels/moe/test_flashinfer.py | 1 + tests/kernels/moe/test_flashinfer_moe.py | 1 + tests/kernels/moe/test_pplx_cutlass_moe.py | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index ddcd221ef..c5d34ef0b 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -287,6 +287,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( hidden_dim=k, intermediate_size_per_partition=n, num_local_experts=e, + num_logical_experts=e, activation=activation, device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 113649afe..c61bca313 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -97,6 +97,7 @@ def test_flashinfer_fp4_moe_no_graph( hidden_dim=k, intermediate_size_per_partition=n, num_local_experts=e, + num_logical_experts=e, activation=activation, device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 213d28cda..894e57fe2 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -147,6 +147,7 @@ def pplx_cutlass_moe( hidden_dim=hidden_dim, intermediate_size_per_partition=intermediate_dim, num_local_experts=num_local_experts, + num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), activation="silu", in_dtype=torch.bfloat16, -- GitLab From 31d992d215a05ad2e4f17653ddff0f515f865914 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 11 Feb 2026 17:33:14 -0500 Subject: [PATCH 0119/1166] [Bugfix] Fix some 
issues with MoERunner PR #32344 (#34371) Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/layer.py | 5 ++--- .../layers/fused_moe/runner/default_moe_runner.py | 4 ++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 914dc6846..5a8f51de6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -700,7 +700,7 @@ class FusedMoE(CustomOp): @property def gate(self) -> torch.nn.Module | None: - return self._gate + return self._gate if self.use_overlapped else None @property def tp_size(self): @@ -725,7 +725,7 @@ class FusedMoE(CustomOp): @property def is_internal_router(self) -> bool: # By default, router/gate is called before FusedMoE forward pass - return self._gate is not None + return self.gate is not None def _maybe_init_expert_routing_tables( self, @@ -1457,7 +1457,6 @@ class FusedMoE(CustomOp): hidden_states: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - self.ensure_moe_quant_config_init() return self.runner.forward( hidden_states, router_logits, diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 12b795f30..b265cbb41 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -63,6 +63,8 @@ def _moe_forward( layer_name: str, ) -> torch.Tensor: layer = get_layer_from_name(layer_name) + # TODO(bnell): this can be removed after MK migration is complete. 
+ layer.ensure_moe_quant_config_init() return layer.runner.forward_impl( layer, hidden_states, router_logits, shared_experts_input ) @@ -84,6 +86,8 @@ def _moe_forward_shared( layer_name: str, ) -> tuple[torch.Tensor, torch.Tensor]: layer = get_layer_from_name(layer_name) + # TODO(bnell): this can be removed after MK migration is complete. + layer.ensure_moe_quant_config_init() return layer.runner.forward_impl( layer, hidden_states, router_logits, shared_experts_input ) -- GitLab From fb7b30c7162d37d47160b46c5ddb1c82e8073e45 Mon Sep 17 00:00:00 2001 From: Micah Williamson Date: Wed, 11 Feb 2026 17:52:34 -0600 Subject: [PATCH 0120/1166] [ROCm][CI] Revert Test Groups From mi325_8 to mi325_1 Agent Pool In AMD CI (#34384) Signed-off-by: Micah Williamson --- .buildkite/test-amd.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 730613e1f..2f5c2fe4c 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -552,7 +552,7 @@ steps: - label: LoRA Test %N # 20min each timeout_in_minutes: 30 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - vllm/lora @@ -648,7 +648,7 @@ steps: - label: Kernels Attention Test %N # 23min timeout_in_minutes: 35 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/attention/ @@ -663,7 +663,7 @@ steps: - label: Kernels Quantization Test %N # 64min timeout_in_minutes: 90 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/quantization/ @@ -676,7 +676,7 @@ steps: - label: Kernels MoE Test %N # 40min timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking source_file_dependencies: - csrc/quantization/cutlass_w8a8/moe/ @@ 
-839,7 +839,7 @@ steps: - label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -901,7 +901,7 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -922,7 +922,7 @@ steps: - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_1 # grade: Blocking torch_nightly: true source_file_dependencies: -- GitLab From 83b47f67b1dfad505606070ae4d9f83e50ad4ebd Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 11 Feb 2026 16:54:17 -0800 Subject: [PATCH 0121/1166] [ci] Integrate AMD tests into CI (#33626) Signed-off-by: Kevin H. Luu Signed-off-by: khluu Co-authored-by: TJian --- .buildkite/hardware_tests/amd.yaml | 3 ++- .buildkite/test_areas/basic_correctness.yaml | 5 +++++ .buildkite/test_areas/entrypoints.yaml | 5 +++++ .buildkite/test_areas/models_basic.yaml | 8 ++++++-- .buildkite/test_areas/models_language.yaml | 7 ------- .buildkite/test_areas/samplers.yaml | 7 +++++++ 6 files changed, 25 insertions(+), 10 deletions(-) diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index ea10624f9..0fd8d3485 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,6 +1,7 @@ -group: Hardware +group: Hardware - AMD Build steps: - label: "AMD: :docker: build image" + key: image-build-amd depends_on: [] device: amd_cpu no_plugin: true diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml index 759d2b535..5259a66a3 100644 --- a/.buildkite/test_areas/basic_correctness.yaml +++ b/.buildkite/test_areas/basic_correctness.yaml 
@@ -14,3 +14,8 @@ steps: - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index 0c72e3d9b..6aebb9aab 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -24,6 +24,11 @@ steps: - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd - label: Entrypoints Integration (API Server 1) timeout_in_minutes: 130 diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml index df0a98dc9..de0f3994d 100644 --- a/.buildkite/test_areas/models_basic.yaml +++ b/.buildkite/test_areas/models_basic.yaml @@ -4,7 +4,6 @@ depends_on: steps: - label: Basic Models Tests (Initialization) timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -16,7 +15,6 @@ steps: - label: Basic Models Tests (Extra Initialization) %N timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -38,6 +36,12 @@ steps: - tests/models/test_registry.py commands: - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + - label: Basic Models Test (Other CPU) # 5min depends_on: diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml index 7a64604c3..8982dccc4 100644 --- 
a/.buildkite/test_areas/models_language.yaml +++ b/.buildkite/test_areas/models_language.yaml @@ -4,7 +4,6 @@ depends_on: steps: - label: Language Models Tests (Standard) timeout_in_minutes: 25 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -16,7 +15,6 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/model_executor/models/ @@ -32,7 +30,6 @@ steps: - label: Language Models Tests (Hybrid) %N timeout_in_minutes: 75 - mirror_hardwares: [amdexperimental] torch_nightly: true source_file_dependencies: - vllm/ @@ -48,7 +45,6 @@ steps: - label: Language Models Test (Extended Generation) # 80min timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ @@ -62,7 +58,6 @@ steps: - label: Language Models Test (PPL) timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ @@ -72,7 +67,6 @@ steps: - label: Language Models Test (Extended Pooling) # 36min timeout_in_minutes: 50 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ @@ -82,7 +76,6 @@ steps: - label: Language Models Test (MTEB) timeout_in_minutes: 110 - mirror_hardwares: [amdexperimental] optional: true source_file_dependencies: - vllm/ diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml index ad377148f..7a71fa433 100644 --- a/.buildkite/test_areas/samplers.yaml +++ b/.buildkite/test_areas/samplers.yaml @@ -12,3 +12,10 @@ steps: commands: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + mirror: + amd: + device: mi325_1 + depends_on: + - image-build-amd + commands: + - pytest -v -s -m 'not skip_v1' samplers -- GitLab From ff1f83b056aedcf3e2d978d267011b2b79c08aca Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 11 Feb 2026 20:29:32 -0500 
Subject: [PATCH 0122/1166] [Refactor] Replace `activation: str` with `MoEActivation` enum (#33843) Signed-off-by: mgoin Signed-off-by: Michael Goin --- .../kernels/benchmark_cutlass_moe_fp8.py | 3 +- benchmarks/kernels/benchmark_moe.py | 4 +- .../moe/modular_kernel_tools/common.py | 3 +- tests/kernels/moe/test_cpu_fused_moe.py | 9 +- tests/kernels/moe/test_cutlass_moe.py | 3 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 3 +- tests/kernels/moe/test_deepep_moe.py | 3 +- tests/kernels/moe/test_flashinfer.py | 20 +-- tests/kernels/moe/test_flashinfer_moe.py | 11 +- .../moe/test_modular_oai_triton_moe.py | 3 +- tests/kernels/moe/test_moe.py | 24 +++- tests/kernels/moe/test_pplx_cutlass_moe.py | 3 +- .../kernels/moe/test_triton_moe_no_act_mul.py | 28 ++-- tests/kernels/moe/utils.py | 3 +- tests/kernels/utils.py | 7 +- .../layers/fused_moe/__init__.py | 8 +- .../layers/fused_moe/activation.py | 136 ++++++++++++++++++ .../layers/fused_moe/batched_deep_gemm_moe.py | 9 +- .../model_executor/layers/fused_moe/config.py | 3 +- .../layers/fused_moe/cpu_fused_moe.py | 22 +-- .../layers/fused_moe/cutlass_moe.py | 51 ++++--- .../layers/fused_moe/deep_gemm_moe.py | 13 +- .../layers/fused_moe/fallback.py | 7 +- .../fused_moe/flashinfer_cutedsl_moe.py | 9 +- .../fused_moe/flashinfer_cutlass_moe.py | 13 +- .../layers/fused_moe/flashinfer_trtllm_moe.py | 5 +- .../layers/fused_moe/fused_batched_moe.py | 25 ++-- .../layers/fused_moe/fused_marlin_moe.py | 39 ++--- .../layers/fused_moe/fused_moe.py | 34 +++-- .../fused_moe/gpt_oss_triton_kernels_moe.py | 18 ++- vllm/model_executor/layers/fused_moe/layer.py | 5 +- .../layers/fused_moe/modular_kernel.py | 28 ++-- .../layers/fused_moe/rocm_aiter_fused_moe.py | 21 +-- .../layers/fused_moe/triton_cutlass_moe.py | 3 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 3 +- .../layers/fused_moe/trtllm_moe.py | 7 +- vllm/model_executor/layers/fused_moe/utils.py | 60 -------- .../layers/fused_moe/xpu_fused_moe.py | 15 +- 
.../compressed_tensors_moe.py | 29 ++-- .../model_executor/layers/quantization/fp8.py | 3 +- .../layers/quantization/gguf.py | 16 +-- .../layers/quantization/modelopt.py | 8 +- .../layers/quantization/moe_wna16.py | 5 +- .../layers/quantization/mxfp4.py | 6 +- .../layers/quantization/quark/quark_moe.py | 3 +- .../quantization/utils/flashinfer_fp4_moe.py | 13 +- .../layers/quantization/utils/mxfp4_utils.py | 5 +- vllm/model_executor/models/nemotron_h.py | 7 +- 48 files changed, 474 insertions(+), 282 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/activation.py diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index f1234d821..b33282523 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -11,6 +11,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from tests.kernels.moe.utils import make_dummy_moe_config from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk @@ -161,7 +162,7 @@ def bench_run( w2_fp8q_cutlass, topk_weights, topk_ids, - activation="silu", + activation=MoEActivation.SILU, global_num_experts=num_experts, ) torch.cuda.synchronize() diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c5e3dabe5..5ee1cf199 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -16,6 +16,7 @@ import torch from ray.experimental.tqdm_ray import tqdm from vllm.model_executor.layers.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from 
vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -211,7 +212,8 @@ def benchmark_config( hidden_dim=hidden_size, intermediate_size_per_partition=shard_intermediate_size, num_local_experts=num_experts, - activation="silu", + num_logical_experts=num_experts, + activation=MoEActivation.SILU, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=init_dtype, routing_method=RoutingMethodType.TopK, diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 6dfcd5ebe..87cf0453b 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -22,6 +22,7 @@ from vllm.distributed import ( ) from vllm.forward_context import set_forward_context from vllm.model_executor.layers.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.all2all_utils import ( maybe_make_prepare_finalize, ) @@ -599,7 +600,7 @@ def make_modular_kernel( moe_parallel_config=moe_parallel_config, in_dtype=config.dtype, max_num_tokens=next_power_of_2(config.M), - activation="silu", + activation=MoEActivation.SILU, device=vllm_config.device_config.device, routing_method=RoutingMethodType.DeepSeekV3, ) diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py index 681f42091..839eceeeb 100644 --- a/tests/kernels/moe/test_cpu_fused_moe.py +++ b/tests/kernels/moe/test_cpu_fused_moe.py @@ -6,6 +6,7 @@ import torch from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT_FN from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed @@ -19,7 
+20,7 @@ EXPERT_NUM = [ HIDDEN_DIM = [128, 2880] INTERMEDIATE_DIM = [128, 2880] BATCH_SIZE = [1, 64, 256] -ACT = ["silu", "swigluoai"] +ACT = [MoEActivation.SILU, MoEActivation.SWIGLUOAI] USE_BIAS = [True, False] ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] DTYPE = [torch.bfloat16] @@ -33,7 +34,7 @@ def ref_fused_moe( w2_bias: torch.Tensor | None, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, ) -> torch.Tensor: len_experts = w13.size(0) @@ -103,7 +104,7 @@ def test_cpu_fused_moe( intermediate_size: int, use_bias: bool, dtype: torch.dtype, - act: str, + act: MoEActivation, isa: str, ): set_random_seed(0) @@ -153,7 +154,7 @@ def test_cpu_fused_moe( w2_bias, topk_weight, topk_ids, - act, + act.value, isa, ) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index d232d00fc..ec23008df 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -12,6 +12,7 @@ from tests.kernels.moe.utils import make_dummy_moe_config from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_experts, fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEQuantConfig, @@ -531,7 +532,7 @@ def test_run_cutlass_moe_fp8( c_strides1 = torch.full((e,), 2 * n, device="cuda", dtype=torch.int64) c_strides2 = torch.full((e,), k, device="cuda", dtype=torch.int64) - activation = "silu" + activation = MoEActivation.SILU a1q, a1q_scale = moe_kernel_quantize_input( mt.a, mt.a_scale, torch.float8_e4m3fn, per_act_token ) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 11f535715..2b8240482 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ 
b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -16,6 +16,7 @@ from typing_extensions import ParamSpec from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, @@ -324,7 +325,7 @@ def deepep_deepgemm_moe_impl( w2=w2, topk_weights=test_tensors.topk_weights, topk_ids=test_tensors.topk, - activation="silu", + activation=MoEActivation.SILU, global_num_experts=num_experts, expert_map=build_expert_map(), apply_router_weight_on_input=False, diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 8d3ca1650..01f340730 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -15,6 +15,7 @@ from vllm import _custom_ops as ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import TritonExperts +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, ) @@ -260,7 +261,7 @@ def deep_ep_moe_impl( w2=w2, topk_weights=topk_weights_chunk, topk_ids=topk_chunk, - activation="silu", + activation=MoEActivation.SILU, global_num_experts=num_experts, expert_map=build_expert_map(), apply_router_weight_on_input=False, diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index c5d34ef0b..9c31d9325 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from 
vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -93,9 +94,14 @@ class TestData: @staticmethod def make_moe_tensors_8bit( - m: int, k: int, n: int, e: int, is_trtllm: bool, activation: str = "silu" + m: int, + k: int, + n: int, + e: int, + is_trtllm: bool, + activation: MoEActivation = MoEActivation.SILU, ) -> "TestData": - is_gated = activation != "relu2_no_mul" + is_gated = activation.is_gated hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 w13 = torch.randn( @@ -194,7 +200,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - activation="silu", + activation=MoEActivation.SILU, global_num_experts=e, expert_map=None, apply_router_weight_on_input=True, @@ -219,21 +225,19 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("activation", ["silu", "relu2_no_mul"]) +@pytest.mark.parametrize("activation", [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]) def test_flashinfer_cutlass_moe_fp8_no_graph( m: int, n: int, k: int, e: int, topk: int, - activation: str, + activation: MoEActivation, monkeypatch, workspace_init, ): set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") - assert activation in ["silu", "relu2_no_mul"] - is_act_and_mul = activation == "silu_and_mul" with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit( m, k, n, e, is_trtllm=False, activation=activation @@ -292,7 +296,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), in_dtype=torch.bfloat16, - is_act_and_mul=is_act_and_mul, + is_act_and_mul=activation.is_gated, routing_method=RoutingMethodType.TopK, ) diff --git a/tests/kernels/moe/test_flashinfer_moe.py 
b/tests/kernels/moe/test_flashinfer_moe.py index c61bca313..1f1349cff 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -13,6 +13,7 @@ from tests.kernels.utils import torch_moe from vllm import _custom_ops as ops from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -54,7 +55,7 @@ MNK_FACTORS = [ @pytest.mark.parametrize("e", [40, 64, 256]) @pytest.mark.parametrize("topk", [1, 6, 8]) @pytest.mark.parametrize("dtype", [torch.bfloat16]) -@pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"]) +@pytest.mark.parametrize("activation", [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]) @torch.inference_mode() def test_flashinfer_fp4_moe_no_graph( m: int, @@ -63,7 +64,7 @@ def test_flashinfer_fp4_moe_no_graph( e: int, topk: int, dtype: torch.dtype, - activation: str, + activation: MoEActivation, workspace_init, ): set_random_seed(7) @@ -73,7 +74,7 @@ def test_flashinfer_fp4_moe_no_graph( a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 quant_blocksize = 16 - is_gated_act = activation == "silu_and_mul" + is_gated_act = activation.is_gated w1_q, w2_q, quant_config = make_test_quant_config( e, @@ -112,15 +113,13 @@ def test_flashinfer_fp4_moe_no_graph( inplace=False, ) - fi_activation = {"silu_and_mul": "silu", "relu2": "relu2_no_mul"}[activation] - flashinfer_output = flashinfer_experts( hidden_states=a, w1=w1_q, w2=w2_q, topk_weights=topk_weights, topk_ids=topk_ids, - activation=fi_activation, + activation=activation, ) # Reference check: diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py index bebf18ef0..cf9ff1863 100644 --- a/tests/kernels/moe/test_modular_oai_triton_moe.py +++ 
b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -7,6 +7,7 @@ Test modular OAI Triton MoE import pytest import torch +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.utils.import_utils import has_triton_kernels if not has_triton_kernels(): @@ -192,7 +193,7 @@ def oai_triton_moe_impl( w2=w2, topk_weights=topk_weights, topk_ids=topk_ids, - activation="swigluoai", + activation=MoEActivation.SWIGLUOAI, global_num_experts=num_experts, expert_map=None, apply_router_weight_on_input=False, diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 6a622ac8e..eddc395cc 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -29,6 +29,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed.parallel_state import init_distributed_environment from vllm.forward_context import get_forward_context, set_forward_context from vllm.model_executor.layers.fused_moe import ( + MoEActivation, fused_topk, ) from vllm.model_executor.layers.fused_moe.config import ( @@ -1155,7 +1156,10 @@ def test_fused_marlin_moe_with_bias(m): @pytest.mark.parametrize("m", [1, 64, 256]) @pytest.mark.parametrize("n,k", [(1024, 1024), (2048, 2048)]) @pytest.mark.parametrize("e,topk", [(8, 2), (64, 4)]) -def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int): +@pytest.mark.parametrize("activation", [MoEActivation.RELU2_NO_MUL]) +def test_fused_marlin_moe_non_gated( + m: int, n: int, k: int, e: int, topk: int, activation: MoEActivation +): """Test Marlin MoE with non-gated activation (relu2_no_mul). 
Non-gated activations like relu2 don't have the gate-up projection pattern, @@ -1198,7 +1202,7 @@ def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int): w2_data.w_ref, score, topk, - activation="relu2", + activation=activation, ) marlin_output = fused_marlin_moe( @@ -1223,7 +1227,7 @@ def test_fused_marlin_moe_non_gated(m: int, n: int, k: int, e: int, topk: int): w2_zeros=w2_data.zeros, quant_type_id=quant_type.id, is_k_full=is_k_full, - activation="relu2_no_mul", + activation=activation, ) torch.testing.assert_close(marlin_output, torch_output, atol=1e-1, rtol=0) @@ -1330,9 +1334,18 @@ def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype): @pytest.mark.parametrize("topk", [2]) @pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) @pytest.mark.parametrize("with_bias", [False, True]) -@pytest.mark.parametrize("activation", ["silu"]) +@pytest.mark.parametrize("activation", [MoEActivation.SILU]) @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only test") -def test_cpu_fused_moe_basic(m, n, k, e, topk, dtype, with_bias, activation): +def test_cpu_fused_moe_basic( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, + with_bias: bool, + activation: MoEActivation, +): from vllm.model_executor.layers.fused_moe.cpu_fused_moe import CPUFusedMOE device = "cpu" @@ -1608,6 +1621,7 @@ def test_unquantized_bf16_flashinfer_trtllm_backend( hidden_dim=k, intermediate_size_per_partition=n, num_local_experts=e, + num_logical_experts=e, activation="silu", device="cuda", moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index 894e57fe2..d8a660074 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -9,6 +9,7 @@ from tests.kernels.utils import torch_experts from vllm import _custom_ops as ops from vllm.config import VllmConfig, 
set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_topk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -149,7 +150,7 @@ def pplx_cutlass_moe( num_local_experts=num_local_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), - activation="silu", + activation=MoEActivation.SILU, in_dtype=torch.bfloat16, device="cuda", routing_method=RoutingMethodType.Llama4, diff --git a/tests/kernels/moe/test_triton_moe_no_act_mul.py b/tests/kernels/moe/test_triton_moe_no_act_mul.py index ab15f898b..1dfac3cf0 100644 --- a/tests/kernels/moe/test_triton_moe_no_act_mul.py +++ b/tests/kernels/moe/test_triton_moe_no_act_mul.py @@ -11,15 +11,11 @@ import pytest import torch from tests.kernels.moe.utils import make_dummy_moe_config +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, ) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.model_executor.layers.fused_moe.utils import ( - GELU_NO_MUL, - RELU2_NO_MUL, - SILU_NO_MUL, -) from vllm.platforms import current_platform # Test parameters @@ -28,7 +24,11 @@ N_SIZES = [128, 256] K_SIZES = [64, 128] TOPK_VALUES = [1, 2] NUM_EXPERTS = 8 -NO_MUL_ACTIVATIONS = [SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL] +NO_MUL_ACTIVATIONS = [ + MoEActivation.SILU_NO_MUL, + MoEActivation.GELU_NO_MUL, + MoEActivation.RELU2_NO_MUL, +] def make_test_tensors( @@ -73,7 +73,7 @@ def test_triton_experts_no_mul_activation( n: int, k: int, topk: int, - activation: str, + activation: MoEActivation, ): hidden_states, w1, w2, topk_weights, topk_ids = make_test_tensors( m, n, k, NUM_EXPERTS, topk @@ -161,11 +161,11 @@ def test_workspace_shapes_no_mul_vs_gated(): ) ws1_no_mul, _, out_no_mul = experts.workspace_shapes( - M, N, K, 
topk, 8, 8, None, SILU_NO_MUL + M, N, K, topk, 8, 8, None, MoEActivation.SILU_NO_MUL ) ws1_gated, _, out_gated = experts.workspace_shapes( - M, N, K, topk, 8, 8, None, "silu" + M, N, K, topk, 8, 8, None, MoEActivation.SILU ) # For no_mul: activation_out_dim = N @@ -202,10 +202,10 @@ def test_adjust_n_for_activation(): N = 256 # Gated activations should return N // 2 - assert experts.adjust_N_for_activation(N, "silu") == N // 2 - assert experts.adjust_N_for_activation(N, "gelu") == N // 2 + assert experts.adjust_N_for_activation(N, MoEActivation.SILU) == N // 2 + assert experts.adjust_N_for_activation(N, MoEActivation.GELU) == N // 2 # Non-gated activations should return N - assert experts.adjust_N_for_activation(N, SILU_NO_MUL) == N - assert experts.adjust_N_for_activation(N, GELU_NO_MUL) == N - assert experts.adjust_N_for_activation(N, RELU2_NO_MUL) == N + assert experts.adjust_N_for_activation(N, MoEActivation.SILU_NO_MUL) == N + assert experts.adjust_N_for_activation(N, MoEActivation.GELU_NO_MUL) == N + assert experts.adjust_N_for_activation(N, MoEActivation.RELU2_NO_MUL) == N diff --git a/tests/kernels/moe/utils.py b/tests/kernels/moe/utils.py index 984fabc47..6cf01ac47 100644 --- a/tests/kernels/moe/utils.py +++ b/tests/kernels/moe/utils.py @@ -12,6 +12,7 @@ from vllm.model_executor.layers.fused_moe import ( fused_experts, fused_topk, ) +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -54,7 +55,7 @@ def make_dummy_moe_config( num_local_experts=num_experts, num_logical_experts=num_experts, moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(), - activation="silu", + activation=MoEActivation.SILU, in_dtype=in_dtype, device="cuda", routing_method=RoutingMethodType.TopK, diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 9c6cc4dab..c1a111e1f 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -15,6 
+15,7 @@ from torch._prims_common import TensorLikeType from tests.kernels.quant_utils import native_w8a8_block_matmul from vllm.model_executor.custom_op import op_registry from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.utils.torch_utils import make_tensor_with_pad from vllm.v1.attention.backend import AttentionType @@ -840,7 +841,7 @@ def torch_experts( per_act_token_quant=False, block_shape: list[int] | None = None, apply_router_weights_on_input: bool = False, - activation: str = "silu_and_mul", + activation: MoEActivation = MoEActivation.SILU, ) -> torch.Tensor: assert ( global_num_experts == -1 @@ -883,7 +884,7 @@ def torch_experts( f32 = torch.float32 - act = op_registry[activation] + act = op_registry[activation.custom_op_name] for i in range(num_experts): mask = topk_ids == i @@ -973,7 +974,7 @@ def torch_moe( b_bias2: torch.Tensor | None = None, global_num_experts: int = -1, expert_map: torch.Tensor | None = None, - activation: str = "silu_and_mul", + activation: MoEActivation = MoEActivation.SILU, ) -> torch.Tensor: score = torch.softmax(score, dim=-1, dtype=torch.float32) topk_weight, topk_ids = torch.topk(score, topk) diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index dc17af87e..c6cb31b62 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -4,6 +4,11 @@ from contextlib import contextmanager from typing import Any +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + activation_without_mul, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, RoutingMethodType, @@ -27,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE from 
vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import ( UnquantizedFusedMoEMethod, ) -from vllm.model_executor.layers.fused_moe.utils import activation_without_mul from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import ( ZeroExpertFusedMoE, ) @@ -54,6 +58,7 @@ __all__ = [ "FusedMoERouter", "FusedMoEConfig", "FusedMoEMethodBase", + "MoEActivation", "UnquantizedFusedMoEMethod", "FusedMoeWeightScaleSupported", "FusedMoEPermuteExpertsUnpermute", @@ -63,6 +68,7 @@ __all__ = [ "SharedFusedMoE", "ZeroExpertFusedMoE", "activation_without_mul", + "apply_moe_activation", "override_config", "get_config", ] diff --git a/vllm/model_executor/layers/fused_moe/activation.py b/vllm/model_executor/layers/fused_moe/activation.py new file mode 100644 index 000000000..3112b3054 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/activation.py @@ -0,0 +1,136 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""MoE activation function enum and utilities.""" + +from enum import Enum + +import torch +import torch.nn.functional as F + + +class MoEActivation(Enum): + """Activation functions for MoE layers.""" + + # Gated activations (gate * activation(up)) expect input of shape [..., 2*d] + # and produce output of shape [..., d] + SILU = "silu" + GELU = "gelu" + RELU2 = "relu2" + SWIGLUOAI = "swigluoai" + SWIGLUSTEP = "swiglustep" + + # Non-gated activations (no mul with gate) expect input of shape [..., d] + # and produce output of shape [..., d]. + # NOTE: Non-gated activations require the "_no_mul" suffix to be present. + SILU_NO_MUL = "silu_no_mul" + GELU_NO_MUL = "gelu_no_mul" + RELU2_NO_MUL = "relu2_no_mul" + + @property + def is_gated(self) -> bool: + """Returns True if activation expects gate*activation(up) pattern. + + Gated activations expect input tensor with 2x the output size, + where the first half is the gate and second half is the up projection. 
+ """ + return not self.value.endswith("_no_mul") + + @property + def custom_op_name(self) -> str: + """Maps to the CustomOp name of activations + in vllm/model_executor/layers/activation.py.""" + return _CUSTOM_OP_NAMES[self] + + def without_mul(self) -> "MoEActivation": + """Get the non-gated variant of this activation. + + For activations that have a _no_mul variant, returns that variant. + For activations without a _no_mul variant (or already _no_mul), + returns self. + """ + return _WITHOUT_MUL.get(self, self) + + @classmethod + def from_str(cls, s: str) -> "MoEActivation": + """Parse from string for backward compatibility.""" + for member in cls: + if member.value == s: + return member + valid = [m.value for m in cls] + raise ValueError(f"Unknown MoE activation: {s!r}. Valid activations: {valid}") + + +# Module-level lookup tables used by MoEActivation functions. +_CUSTOM_OP_NAMES: dict[MoEActivation, str] = { + MoEActivation.SILU: "silu_and_mul", + MoEActivation.GELU: "gelu_and_mul", + MoEActivation.SWIGLUOAI: "swigluoai_and_mul", + MoEActivation.SWIGLUSTEP: "swiglustep_and_mul", + MoEActivation.RELU2: "relu2", + MoEActivation.SILU_NO_MUL: "silu_and_mul", + MoEActivation.GELU_NO_MUL: "gelu_and_mul", + MoEActivation.RELU2_NO_MUL: "relu2", +} + +_WITHOUT_MUL: dict[MoEActivation, MoEActivation] = { + MoEActivation.SILU: MoEActivation.SILU_NO_MUL, + MoEActivation.GELU: MoEActivation.GELU_NO_MUL, + MoEActivation.RELU2: MoEActivation.RELU2_NO_MUL, +} + + +def activation_without_mul(activation: str) -> str: + """Get the non-gated variant of an activation function. 
+ + Args: + activation: The activation function name (e.g., "silu", "gelu") + + Returns: + The non-gated activation name (e.g., "silu_no_mul", "gelu_no_mul") + """ + return MoEActivation.from_str(activation).without_mul().value + + +def apply_moe_activation( + activation: MoEActivation, + output: torch.Tensor, + input: torch.Tensor, +) -> torch.Tensor: + """Apply MoE activation function.""" + assert input.dim() == 2, "Input must be 2D" + assert output.dim() == 2, "Output must be 2D" + if activation.is_gated: + assert output.size(-1) * 2 == input.size(-1), ( + f"{activation.value} expects 2x ratio: " + f"{output.size(-1) * 2} vs {input.size(-1)}" + ) + else: + assert output.size(-1) == input.size(-1), ( + f"{activation.value} expects equal sizes: " + f"{output.size(-1)} vs {input.size(-1)}" + ) + + # Activations with gated multiplication (gate × activation(up)) + if activation == MoEActivation.SILU: + torch.ops._C.silu_and_mul(output, input) + elif activation == MoEActivation.GELU: + torch.ops._C.gelu_and_mul(output, input) + elif activation == MoEActivation.SWIGLUOAI: + torch.ops._C.swigluoai_and_mul(output, input) + elif activation == MoEActivation.SWIGLUSTEP: + from vllm.model_executor.layers.activation import swiglustep_and_mul_triton + + swiglustep_and_mul_triton(output, input) + + # Activations without gated multiplication + elif activation == MoEActivation.SILU_NO_MUL: + output.copy_(F.silu(input)) + elif activation == MoEActivation.GELU_NO_MUL: + output.copy_(F.gelu(input)) + elif activation == MoEActivation.RELU2_NO_MUL: + F.relu(input, inplace=True) + torch.square(input, out=output) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + return output diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index ac37cff93..405965c53 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -303,8 +304,8 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation == MoEActivation.SILU @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -338,7 +339,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # FIXME (varun): We should be able to dispatch only from the leader # DP ranks in the case of TP > 1. 
At the moment, all the Ranks @@ -389,7 +390,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 6dce6875d..c999673e8 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -14,6 +14,7 @@ from vllm.distributed import ( get_tensor_model_parallel_rank, ) from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.quantization.utils.ocp_mx_utils import ( OCP_MX_DTYPES, OCP_MX_Scheme, @@ -1132,7 +1133,7 @@ class FusedMoEConfig: intermediate_size_per_partition: int num_local_experts: int num_logical_experts: int - activation: str + activation: MoEActivation device: torch.device | str routing_method: RoutingMethodType moe_parallel_config: FusedMoEParallelConfig diff --git a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py index 127538822..7a78faafb 100644 --- a/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/cpu_fused_moe.py @@ -9,6 +9,7 @@ from torch.nn import functional as F from vllm import _custom_ops as ops from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter from vllm.utils.torch_utils import direct_register_custom_op @@ -36,9 +37,9 @@ def _swigluoai_forward_native( # Map activation names to their native forward functions. 
# Uses static methods or standalone functions to avoid instantiating CustomOp # classes, which would call get_current_vllm_config() before config is set. -_CPU_MOE_ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = { - "silu": SiluAndMul.forward_native, - "swigluoai": _swigluoai_forward_native, +_CPU_MOE_ACT_FN: dict[MoEActivation, Callable[[torch.Tensor], torch.Tensor]] = { + MoEActivation.SILU: SiluAndMul.forward_native, + MoEActivation.SWIGLUOAI: _swigluoai_forward_native, } @@ -168,9 +169,9 @@ class SGLFusedMOE: routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, ) -> torch.Tensor: - assert activation == "silu", f"{activation} is not supported." + assert activation == MoEActivation.SILU, f"{activation} is not supported." assert not apply_router_weight_on_input topk_weights, topk_ids = select_experts( hidden_states=x, @@ -235,7 +236,7 @@ class CPUFusedMOE: routed_scaling_factor: float = 1.0, e_score_correction_bias: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, ) -> torch.Tensor: assert activation in _CPU_MOE_ACT_FN, f"{activation} is not supported." 
@@ -353,7 +354,7 @@ class CPUFusedMOE: input: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int = -1, skip_weighted: bool = False, ) -> torch.Tensor: @@ -371,7 +372,7 @@ class CPUFusedMOE: getattr(layer, "w2_bias", None), topk_weights, topk_ids, - activation, + activation.value, self.isa, skip_weighted, ) @@ -383,7 +384,7 @@ class CPUFusedMOE: input: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int = -1, skip_weighted: bool = False, ) -> torch.Tensor: @@ -419,6 +420,7 @@ def cpu_fused_moe_torch( global_num_experts: int = -1, skip_weighted: bool = False, ) -> None: + act = MoEActivation.from_str(activation) layer = _CPU_MOE_LAYER_CACHE[layer_id]() # Ref code from https://github.com/sgl-project/sglang/blob/716e682721397df103f347d22da8bd46c6016dab/python/sglang/srt/layers/moe/fused_moe_native.py#L53 @@ -442,7 +444,7 @@ def cpu_fused_moe_torch( tokens_for_this_expert = sorted_tokens[start_idx:end_idx] gate_up = layer.gate_up_linear[i](tokens_for_this_expert) # type: ignore - gate_up = _CPU_MOE_ACT_FN[activation](gate_up) + gate_up = _CPU_MOE_ACT_FN[act](gate_up) expert_out = layer.down_linear[i](gate_up) # type: ignore outputs.append(expert_out) start_idx = end_idx diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 77d439d32..4f8948778 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -7,6 +7,10 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, 
@@ -25,7 +29,6 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( ) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, - apply_moe_activation, ) from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, @@ -51,7 +54,7 @@ def run_cutlass_moe_fp8( w1: torch.Tensor, w2: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, w1_scale: torch.Tensor | None, @@ -73,7 +76,7 @@ def run_cutlass_moe_fp8( ): a1q = hidden_states - assert not activation.endswith("_no_mul"), "Only gated activation is supported" + assert activation.is_gated, "Only gated activation is supported" assert w1_scale is not None assert w2_scale is not None assert w1.dtype == torch.float8_e4m3fn @@ -310,8 +313,12 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "gelu", "swigluoai"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [ + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + ] def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: # Let PrepareAndFinalize::finalize() decide the impl. 
@@ -325,7 +332,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -415,7 +422,7 @@ class CutlassExpertsFp8(CutlassExpertsFp8Base): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: activation_out_dim = self.adjust_N_for_activation(N, activation) workspace1 = (M * topk, max(N, K)) @@ -456,7 +463,7 @@ class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: num_dp = self.num_dispatchers assert num_dp is not None @@ -489,7 +496,7 @@ def run_cutlass_moe_fp4( w2_alphas: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, workspace13: torch.Tensor, workspace2: torch.Tensor, m: int, @@ -612,7 +619,7 @@ def run_cutlass_moe_fp4( blockscale_offsets[:-1], ) del rep_a_fp4, rep_a_blockscale - if activation == "silu": + if activation == MoEActivation.SILU: # Fused SiLU+Mul+NVFP4 quantization # Note: c2 workspace is no longer needed since SiLU is fused with quantization. # c3 reuses workspace13 after c1 is consumed. 
@@ -682,8 +689,12 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) == (kNvfp4Static, kNvfp4Dynamic) @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "gelu", "swigluoai"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [ + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + ] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -716,7 +727,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: workspace1 = (M * topk, max(2 * N, K)) workspace2 = (M * topk, N) @@ -731,7 +742,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, # unused @@ -776,7 +787,7 @@ def run_cutlass_moe_w4a8_fp8( w1: torch.Tensor, w2: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, w1_scale: torch.Tensor | None, @@ -970,7 +981,7 @@ class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: raise NotImplementedError( "CutlassExpertsW4A8Fp8 is not yet used by an Oracle. " "This method should not be called." 
@@ -1005,7 +1016,7 @@ class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: activation_out_dim = self.adjust_N_for_activation(N, activation) workspace1 = (M * topk, max(N, K)) @@ -1021,7 +1032,7 @@ class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -1094,7 +1105,7 @@ def cutlass_moe_w4a8_fp8( s_strides2: torch.Tensor, quant_config: FusedMoEQuantConfig, moe_config: FusedMoEConfig, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, @@ -1137,7 +1148,7 @@ def cutlass_moe_w4a8_fp8( dtype: torch.int64 - per_act_token (Optional[bool]): Whether the scale is per-token or per-tensor. - - activation (str): The activation function to use. + - activation (MoEActivation): The activation function to use. - expert_map (Optional[torch.Tensor]): In the case of Expert parallel, every Rank is responsible for a subset of experts. expert_map is a mapping from global expert-id to local expert-id. 
When expert_map[i] diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 59dde3ca9..69ca7c91c 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -145,8 +146,8 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "swiglustep"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [MoEActivation.SILU, MoEActivation.SWIGLUSTEP] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -171,7 +172,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: assert self.block_shape is not None block_m = self.block_shape[0] @@ -187,7 +188,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): return (workspace1, workspace2, output) def _act_mul_quant( - self, input: torch.Tensor, output: torch.Tensor, activation: str + self, input: torch.Tensor, output: torch.Tensor, activation: MoEActivation ) -> tuple[torch.Tensor, torch.Tensor]: assert self.block_shape is not None block_k = self.block_shape[1] @@ -210,7 +211,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): return a2q, a2q_scale # 2. 
Hopper / non‑E8M0: prefer the fused SiLU+mul+quant kernel - if activation == "silu": + if activation == MoEActivation.SILU: use_ue8m0 = scale_fmt == DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0 return silu_mul_per_token_group_quant_fp8_colmajor( input=input, @@ -235,7 +236,7 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/fallback.py b/vllm/model_executor/layers/fused_moe/fallback.py index 07e5b8005..4b6458e7f 100644 --- a/vllm/model_executor/layers/fused_moe/fallback.py +++ b/vllm/model_executor/layers/fused_moe/fallback.py @@ -6,6 +6,7 @@ from abc import ABC, abstractmethod import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import FusedMoEParallelConfig from vllm.model_executor.layers.quantization.utils.quant_utils import QuantKey @@ -76,7 +77,7 @@ class FallbackExperts(mk.FusedMoEPermuteExpertsUnpermute, ABC): ) and fallback_cls._supports_quant_scheme(weight_key, activation_key) @classmethod - def _supports_activation(cls, activation: str) -> bool: + def _supports_activation(cls, activation: MoEActivation) -> bool: experts_cls, fallback_cls = cls.get_clses() return experts_cls._supports_activation( activation @@ -138,7 +139,7 @@ class FallbackExperts(mk.FusedMoEPermuteExpertsUnpermute, ABC): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: raise NotImplementedError @@ -159,7 +160,7 @@ class FallbackExperts(mk.FusedMoEPermuteExpertsUnpermute, ABC): w2: torch.Tensor, topk_weights: 
torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py index 2ad949577..d0cf7533d 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py @@ -6,6 +6,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import envs from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -72,8 +73,8 @@ class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation == MoEActivation.SILU @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -101,7 +102,7 @@ class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # We use global_num_experts due to how moe_align_block_size handles # expert_maps. 
@@ -135,7 +136,7 @@ class FlashInferCuteDSLExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 85df6cb66..4ec76ee98 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEParallelConfig, FusedMoEQuantConfig, @@ -130,8 +131,8 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "relu2_no_mul"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -164,7 +165,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # We use global_num_experts due to how moe_align_block_size handles # expert_maps. 
@@ -201,7 +202,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -214,8 +215,8 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute): from flashinfer.fused_moe.core import ActivationType activation_str_to_value_map = { - "silu": ActivationType.Swiglu, # This is the default - "relu2_no_mul": ActivationType.Relu2, + MoEActivation.SILU: ActivationType.Swiglu, # This is the default + MoEActivation.RELU2_NO_MUL: ActivationType.Relu2, } assert activation in activation_str_to_value_map, ( f"{activation=} missing from {activation_str_to_value_map.keys()=}" diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index 9af18485e..a50ad6722 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -4,6 +4,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -50,9 +51,9 @@ def _supports_quant_scheme( return (weight_key, activation_key) in SUPPORTED_W_A -def _supports_activation(activation: str) -> bool: +def _supports_activation(activation: MoEActivation) -> bool: """Supports silu activation only.""" - return activation in ["silu"] + return activation == MoEActivation.SILU def _supports_routing_method( diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 8822b8a8a..fbd47f8c4 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ 
b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -698,7 +699,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: raise NotImplementedError( "NaiveBatchedExperts is not yet used by an Oracle. " "This method should not be called." @@ -730,7 +731,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: assert self.num_dispatchers is not None assert self.max_num_tokens is not None @@ -757,7 +758,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -942,14 +943,14 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: return activation in [ - "silu", - "gelu", - "swigluoai", - "silu_no_mul", - "gelu_no_mul", - "relu2_no_mul", + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + MoEActivation.SILU_NO_MUL, + MoEActivation.GELU_NO_MUL, + MoEActivation.RELU2_NO_MUL, ] @staticmethod @@ -975,7 +976,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: 
mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: assert self.num_dispatchers is not None assert self.max_num_tokens is not None @@ -996,7 +997,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 3d3a21f81..57fb3561d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -8,6 +8,10 @@ import torch import vllm._custom_ops as ops import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -23,7 +27,6 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( ) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, - apply_moe_activation, disable_inplace, ) from vllm.model_executor.layers.quantization.utils.marlin_utils import ( @@ -59,9 +62,9 @@ def _fused_marlin_moe( sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, num_tokens_post_padded: torch.Tensor, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, activation_func: Callable[ - [str, torch.Tensor, torch.Tensor], None + [MoEActivation, torch.Tensor, torch.Tensor], None ] = apply_moe_activation, input_global_scale1: torch.Tensor | None = None, input_global_scale2: torch.Tensor | None = None, @@ -83,7 +86,7 @@ def _fused_marlin_moe( assert hidden_states.ndim == 2 M, K = hidden_states.size() N = 
marlin_moe_intermediate_size(w1, w2) - w13_num_shards = 1 if "no_mul" in activation else 2 + w13_num_shards = 2 if activation.is_gated else 1 if workspace is None: workspace = marlin_make_workspace_new(hidden_states.device, 4) @@ -215,9 +218,9 @@ def fused_marlin_moe( quant_type_id: int, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, activation_func: Callable[ - [str, torch.Tensor, torch.Tensor], None + [MoEActivation, torch.Tensor, torch.Tensor], None ] = apply_moe_activation, moe_sum: Callable[[torch.Tensor, torch.Tensor], None] | None = None, expert_map: torch.Tensor | None = None, @@ -377,7 +380,7 @@ def batched_fused_marlin_moe( quant_type_id: int, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, - activation: str | None = "silu", + activation: MoEActivation = MoEActivation.SILU, expert_map: torch.Tensor | None = None, global_scale1: torch.Tensor | None = None, global_scale2: torch.Tensor | None = None, @@ -579,14 +582,14 @@ class MarlinExpertsBase(mk.FusedMoEPermuteExpertsUnpermute): return weight_key in SUPPORTED_W @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: return activation in [ - "silu", - "gelu", - "swigluoai", - "silu_no_mul", - "gelu_no_mul", - "relu2_no_mul", + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + MoEActivation.SILU_NO_MUL, + MoEActivation.GELU_NO_MUL, + MoEActivation.RELU2_NO_MUL, ] @staticmethod @@ -661,7 +664,7 @@ class MarlinExperts(MarlinExpertsBase): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # Modular Kernel provisions output buffer from workspace1. 
However in # the fused_marlin_moe() function, the final torch.sum(), is defined @@ -692,7 +695,7 @@ class MarlinExperts(MarlinExpertsBase): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -788,7 +791,7 @@ class BatchedMarlinExperts(MarlinExpertsBase): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: assert self.num_dispatchers is not None assert self.max_num_tokens is not None @@ -808,7 +811,7 @@ class BatchedMarlinExperts(MarlinExpertsBase): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 352288e17..f988e91c2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -17,6 +17,10 @@ from vllm.logger import init_logger from vllm.model_executor.layers.batch_invariant import ( vllm_is_batch_invariant, ) +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEConfig, @@ -32,7 +36,6 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( ) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, - apply_moe_activation, disable_inplace, moe_kernel_quantize_input, ) @@ -1468,6 +1471,7 @@ def outplace_fused_experts_fake( topk_weights: torch.Tensor, topk_ids: torch.Tensor, activation: str = "silu", + apply_router_weight_on_input: bool = 
False, use_fp8_w8a8: bool = False, use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, @@ -1521,7 +1525,7 @@ def fused_experts( topk_weights: torch.Tensor, topk_ids: torch.Tensor, inplace: bool = False, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, expert_map: torch.Tensor | None = None, @@ -1539,7 +1543,7 @@ def fused_experts( w2=w2, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, + activation=activation.value, apply_router_weight_on_input=apply_router_weight_on_input, use_fp8_w8a8=quant_config.use_fp8_w8a8, use_int8_w8a8=quant_config.use_int8_w8a8, @@ -1618,6 +1622,9 @@ def fused_experts_impl( w1_bias: torch.Tensor | None = None, w2_bias: torch.Tensor | None = None, ) -> torch.Tensor: + # Convert string activation to enum for internal use + activation_enum = MoEActivation.from_str(activation) + # Check constraints. if use_int4_w4a16: assert hidden_states.size(1) // 2 == w1.size(2), "Hidden size mismatch" @@ -1692,7 +1699,7 @@ def fused_experts_impl( # This needs separate memory since it's used concurrently with cache1 activation_out_dim = mk.FusedMoEPermuteExpertsUnpermute.adjust_N_for_activation( - N, activation + N, activation_enum ) intermediate_cache2 = torch.empty( (M * top_k_num, activation_out_dim), @@ -1832,7 +1839,7 @@ def fused_experts_impl( ) apply_moe_activation( - activation, intermediate_cache2, intermediate_cache1.view(-1, N) + activation_enum, intermediate_cache2, intermediate_cache1.view(-1, N) ) qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( @@ -1932,8 +1939,13 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "gelu", "swigluoai", "swiglustep"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [ + 
MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + MoEActivation.SWIGLUSTEP, + ] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -1957,7 +1969,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: activation_out_dim = self.adjust_N_for_activation(N, activation) workspace1 = (M, topk, max(activation_out_dim, K)) @@ -1973,7 +1985,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -2138,7 +2150,7 @@ class TritonWNA16Experts(TritonExperts): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: raise NotImplementedError( "TritonWNA16Experts is not yet used by an Oracle. " "This method should not be called." 
@@ -2159,7 +2171,7 @@ class TritonWNA16Experts(TritonExperts): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 5aaf2a8c3..70d11f44f 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEParallelConfig, @@ -172,7 +173,7 @@ def triton_kernel_moe_forward( gating_output: torch.Tensor, topk: int, renormalize: bool, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SWIGLUOAI, quant_config: FusedMoEQuantConfig | None = None, apply_router_weight_on_input: bool = False, global_num_experts: int = -1, @@ -211,7 +212,7 @@ def triton_kernel_fused_experts( gather_indx, # GatherIndx scatter_indx, # ScatterIndx topk: int, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SWIGLUOAI, quant_config: FusedMoEQuantConfig | None = None, swiglu_alpha: float = 1.702, swiglu_limit: float = 7.0, @@ -222,6 +223,9 @@ def triton_kernel_fused_experts( a1q_scale: torch.Tensor | None = None, ) -> torch.Tensor: """Triton implementation of fused expert computation using OAI kernels.""" + assert activation == MoEActivation.SWIGLUOAI, ( + "Only SWIGLUOAI activation is supported" + ) if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG @@ -379,7 +383,7 @@ class 
BaseOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: raise NotImplementedError( "OAITritonExperts is not yet used by an Oracle. " "This method should not be called." @@ -463,7 +467,7 @@ class OAITritonExperts(BaseOAITritonExperts): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # workspace are allocated inside the kernel activation_out_dim = self.adjust_N_for_activation(N, activation) @@ -480,7 +484,7 @@ class OAITritonExperts(BaseOAITritonExperts): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -547,7 +551,7 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # workspace are allocated inside the kernel activation_out_dim = self.adjust_N_for_activation(N, activation) @@ -567,7 +571,7 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5a8f51de6..a181b18c9 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -20,6 +20,7 @@ from vllm.distributed import ( from vllm.distributed.eplb.eplb_state import EplbLayerState, 
EplbState from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -500,7 +501,7 @@ class FusedMoE(CustomOp): # TODO(bnell): end attributes self.apply_router_weight_on_input = apply_router_weight_on_input - self.activation = activation + self.activation = MoEActivation.from_str(activation) self.router = create_fused_moe_router( top_k=top_k, @@ -554,7 +555,7 @@ class FusedMoE(CustomOp): has_bias=has_bias, is_act_and_mul=is_act_and_mul, is_lora_enabled=vllm_config.lora_config is not None, - activation=activation, + activation=self.activation, device=vllm_config.device_config.device, routing_method=self.routing_method_type, # TODO: in_dtype == out_dtype? diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index e2f77d6c8..7e6855778 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -12,6 +12,10 @@ import torch import vllm.envs as envs from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -19,7 +23,6 @@ from vllm.model_executor.layers.fused_moe.config import ( ) from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, - apply_moe_activation, count_expert_num_tokens, disable_inplace, ) @@ -536,7 +539,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): @staticmethod @abstractmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: """ Whether the kernel supports a 
particular act function. """ @@ -658,7 +661,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): global_num_experts: int, local_num_experts: int, expert_tokens_meta: ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: """ Compute the shapes for the temporary and final outputs of the two gemms @@ -690,7 +693,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): raise NotImplementedError @staticmethod - def adjust_N_for_activation(N: int, activation: str) -> int: + def adjust_N_for_activation(N: int, activation: MoEActivation) -> int: """ Calculate the output dimension for the activation function. @@ -702,16 +705,15 @@ class FusedMoEPermuteExpertsUnpermute(ABC): Args: N: The intermediate size (width of w1/w3 weights). - activation: The activation function name. + activation: The activation function enum. Returns: The output dimension after activation. """ - is_no_mul = activation.endswith("_no_mul") - return N if is_no_mul else N // 2 + return N if not activation.is_gated else N // 2 def activation( - self, activation: str, output: torch.Tensor, input: torch.Tensor + self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor ) -> None: apply_moe_activation(activation, output, input) @@ -732,7 +734,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -892,7 +894,7 @@ class FusedMoEModularKernel(torch.nn.Module): global_num_experts: int, local_num_experts: int, expert_tokens_meta: ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Allocate temporary and output buffers for the fused experts op. 
@@ -1135,7 +1137,7 @@ class FusedMoEModularKernel(torch.nn.Module): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, local_num_experts: int, expert_map: torch.Tensor | None, @@ -1309,7 +1311,7 @@ class FusedMoEModularKernel(torch.nn.Module): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, global_num_experts: int = -1, expert_map: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, @@ -1326,7 +1328,7 @@ class FusedMoEModularKernel(torch.nn.Module): - topk_weights (torch.Tensor): The topk weights applied at the end of the layer. - topk_ids (torch.Tensor): A map of row to expert id. - - activation (str): The activation function to apply after the first + - activation (MoEActivation): The activation function to apply after the first MoE layer. - global_num_experts (int): The total number of experts in the global expert space. 
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 535abc420..def1ec9dc 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -7,6 +7,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm._aiter_ops import rocm_aiter_ops +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, FusedMoEParallelConfig, @@ -184,7 +185,7 @@ def rocm_aiter_fused_experts( w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str = "silu", + activation: MoEActivation = MoEActivation.SILU, apply_router_weight_on_input: bool = False, expert_map: torch.Tensor | None = None, quant_config: FusedMoEQuantConfig | None = None, @@ -196,9 +197,13 @@ def rocm_aiter_fused_experts( if quant_config is None: quant_config = FUSED_MOE_UNQUANTIZED_CONFIG - activation_method = ( - ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU - ) + if activation == MoEActivation.SILU: + activation_method = ActivationMethod.SILU + elif activation == MoEActivation.GELU: + activation_method = ActivationMethod.GELU + else: + raise ValueError(f"Unsupported activation: {activation}") + # All AITER Fused MoE kernels are expecting the following datatypes topk_weights = topk_weights.to(torch.float32) topk_ids = topk_ids.to(torch.int32) @@ -322,8 +327,8 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute): return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "gelu"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [MoEActivation.SILU, MoEActivation.GELU] @staticmethod def _supports_parallel_config(moe_parallel_config: 
FusedMoEParallelConfig) -> bool: @@ -347,7 +352,7 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # Workspaces are managed internally by AITER. workspace1 = (0,) @@ -363,7 +368,7 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py index f537f2f99..21a3d05f4 100644 --- a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py @@ -5,6 +5,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, @@ -45,7 +46,7 @@ class TritonOrCutlassExperts(FallbackExperts): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # Small batch fallback for sm100. 
if self.is_sm100 and M <= 8: diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 7e41269dc..a3f2f59c5 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -4,6 +4,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, @@ -45,7 +46,7 @@ class TritonOrDeepGemmExperts(FallbackExperts): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 074b8154a..61e06fa60 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -4,6 +4,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -64,7 +65,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): ) @staticmethod - def _supports_activation(activation: str) -> bool: + def _supports_activation(activation: MoEActivation) -> bool: raise NotImplementedError( "TrtLlmGenExperts is not yet used by an Oracle. " "This method should not be called." 
@@ -95,7 +96,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: # The workspaces for this implementation are managed by flashinfer. workspace1 = (0,) @@ -111,7 +112,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 7d5ca876b..a1d4f46aa 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -4,7 +4,6 @@ import functools from math import prod import torch -import torch.nn.functional as F from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( @@ -341,65 +340,6 @@ def _validate_scale_shape( assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" -def activation_without_mul(activation: str) -> str: - return activation + "_no_mul" - - -RELU2_NO_MUL: str = activation_without_mul("relu2") -SILU_NO_MUL: str = activation_without_mul("silu") -GELU_NO_MUL: str = activation_without_mul("gelu") - - -def apply_moe_activation( - activation: str, - output: torch.Tensor, - input: torch.Tensor, -) -> torch.Tensor: - """ - Apply MoE activation function. 
- - For *_and_mul activations (silu, gelu, swigluoai): - - Expects output.size(-1) * 2 == input.size(-1) - - For *_no_mul activations (silu_no_mul, gelu_no_mul, relu2_no_mul): - - Expects output.size(-1) == input.size(-1) - """ - is_no_mul = activation.endswith("_no_mul") - if is_no_mul: - assert output.size(-1) == input.size(-1), ( - f"{activation} expects equal sizes: {output.size(-1)} vs {input.size(-1)}" - ) - else: - assert output.size(-1) * 2 == input.size(-1), ( - f"{activation} expects 2x ratio: {output.size(-1) * 2} vs {input.size(-1)}" - ) - - # Activations with gated multiplication (gate × activation(up)) - if activation == "silu": - torch.ops._C.silu_and_mul(output, input) - elif activation == "gelu": - torch.ops._C.gelu_and_mul(output, input) - elif activation == "swigluoai": - torch.ops._C.swigluoai_and_mul(output, input) - elif activation == "swiglustep": - from vllm.model_executor.layers.activation import swiglustep_and_mul_triton - - swiglustep_and_mul_triton(output, input) - - # Activations without gated multiplication - elif activation == SILU_NO_MUL: - output.copy_(F.silu(input)) - elif activation == GELU_NO_MUL: - output.copy_(F.gelu(input)) - elif activation == RELU2_NO_MUL: - F.relu(input, inplace=True) - torch.square(input, out=output) - else: - raise ValueError(f"Unsupported FusedMoe activation: {activation}") - - return output - - # Torch custom ops can't deal with outputs aliasing inputs so we need to # disable inplace for torch >= 2.9. 
# See https://github.com/vllm-project/vllm/issues/26378 diff --git a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py index a20679ea6..e6f8b8efa 100644 --- a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py @@ -3,6 +3,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -55,8 +56,12 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): return False @staticmethod - def _supports_activation(activation: str) -> bool: - return activation in ["silu", "gelu", "swigluoai"] + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [ + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + ] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: @@ -92,7 +97,7 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): global_num_experts: int, local_num_experts: int, expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: str, + activation: MoEActivation, ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: workspace1 = (0,) workspace2 = (0,) @@ -107,7 +112,7 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - activation: str, + activation: MoEActivation, global_num_experts: int, expert_map: torch.Tensor | None, a1q_scale: torch.Tensor | None, @@ -129,7 +134,7 @@ class XPUExperts(mk.FusedMoEPermuteExpertsUnpermute): topk_weights=topk_weights, topk_ids=topk_ids, n_experts_per_token=topk, - activation=activation, + activation=activation.value, num_experts=self.moe_config.num_local_experts, ep_rank=self.moe_config.ep_rank, ep_size=self.moe_config.ep_size, diff --git 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 690ff0454..0fecc7bbc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoeWeightScaleSupported, UnquantizedFusedMoEMethod, ) +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, @@ -622,7 +623,9 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.is_monolithic - assert layer.activation == "silu", "Only SiLU activation is supported." + assert layer.activation == MoEActivation.SILU, ( + f"Only SiLU activation is supported, not {layer.activation}." + ) assert ( self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM and not layer.enable_eplb @@ -649,7 +652,9 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic - assert layer.activation == "silu", "Only SiLU activation is supported." + assert layer.activation == MoEActivation.SILU, ( + f"Only SiLU activation is supported, not {layer.activation}." 
+ ) # EPLB path if self.nvfp4_backend == NvFp4MoeBackend.FLASHINFER_TRTLLM: @@ -1025,7 +1030,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.is_monolithic assert self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM - assert layer.activation == "silu" + assert layer.activation == MoEActivation.SILU, ( + f"Only SiLU activation is supported, not {layer.activation}." + ) if self.block_quant: import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 @@ -2271,19 +2278,21 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): router_logits: torch.Tensor, ) -> torch.Tensor: assert not layer.enable_eplb, "EPLB not supported for W4A8-int MoE yet." - assert layer.activation in ("silu", "swigluoai", "swiglu"), ( - "Only SiLU/SwiGLUGU/SwiGLUUG are supported." - ) + assert layer.activation in ( + MoEActivation.SILU, + MoEActivation.SWIGLUOAI, + MoEActivation.SWIGLUSTEP, + ), "Only SiLU/SwiGLUGU/SwiGLUUG are supported." 
assert layer.expert_map is None, """expert_map/EP not implemented for CPU dyn-4bit MoE.""" - def _act_kind(s: str) -> int: + def _act_kind(s: MoEActivation) -> int: # 0 = SwiGLU_Gu (SiLU(g)*u), 1 = SwiGLU_Ug (SiLU(u)*g), 2 = SiLU - if s == "swiglu": + if s == MoEActivation.SWIGLUSTEP: return 0 - if s == "swigluoai": + if s == MoEActivation.SWIGLUOAI: return 1 - if s == "silu": + if s == MoEActivation.SILU: return 2 raise ValueError(f"Unknown activation '{s}'") diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 279f97dd6..cd589b315 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported, + MoEActivation, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -965,7 +966,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): # TODO(rob): convert this to MK. 
if layer.enable_eplb: raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.") - assert layer.activation == "silu", ( + assert layer.activation == MoEActivation.SILU, ( f"Expected 'silu' activation but got {layer.activation}" ) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index f7d995598..88023349e 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -12,6 +12,10 @@ from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import ( + MoEActivation, + apply_moe_activation, +) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, @@ -246,16 +250,13 @@ def _fused_moe_gguf( qweight_type2: int, activation: str, ) -> torch.Tensor: + activation_enum = MoEActivation.from_str(activation) + def act(x: torch.Tensor): d = x.shape[-1] // 2 output_shape = x.shape[:-1] + (d,) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - if activation == "silu": - torch.ops._C.silu_and_mul(out, x) - elif activation == "gelu": - torch.ops._C.gelu_and_mul(out, x) - else: - raise ValueError(f"Unsupported activation: {activation}") + apply_moe_activation(activation_enum, out, x) return out # lazy import to avoid triggering triton import in CPU backend @@ -637,7 +638,6 @@ class GGUFMoEMethod(FusedMoEMethodBase): topk_ids: torch.Tensor, shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert layer.activation == "silu", "Only SiLU activation is supported." 
if layer.apply_router_weight_on_input: raise NotImplementedError( "Apply router weight on input is not supported for" @@ -652,7 +652,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): topk_ids, layer.w13_qweight_type.weight_type, layer.w2_qweight_type.weight_type, - layer.activation, + layer.activation.value, ) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 570317ad3..e0322a46f 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -10,6 +10,7 @@ from torch.nn.parameter import Parameter import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, @@ -936,7 +937,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # TODO(rob): this validation should happen at kernel selection # time in the oracle rather than here. - assert layer.activation == "silu", ( + assert layer.activation == MoEActivation.SILU, ( f"Expected 'silu' activation but got {layer.activation}" ) assert not layer.renormalize @@ -965,7 +966,10 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): # TODO(rob): this validation should happen at kernel selection # time in the oracle rather than here. 
if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: - assert layer.activation in ("silu", "relu2_no_mul"), ( + assert layer.activation in ( + MoEActivation.SILU, + MoEActivation.RELU2_NO_MUL, + ), ( "Expected activation to be in ('silu', 'relu2_no_mul')," f"but got {layer.activation}" ) diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 4365d1693..f5c679840 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -6,6 +6,7 @@ from typing import Any import torch from vllm.distributed import get_tensor_model_parallel_rank, get_tp_group +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, int4_w4a16_moe_quant_config, @@ -371,7 +372,9 @@ class MoeWNA16Method(FusedMoEMethodBase): ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - assert layer.activation == "silu", "Only SiLU activation is supported." + assert layer.activation == MoEActivation.SILU, ( + f"Only SiLU activation is supported, not {layer.activation}." 
+ ) return fused_experts( x, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5cd6d5d79..5c6837e7a 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoE, FusedMoEConfig, FusedMoEMethodBase, + MoEActivation, ) from vllm.model_executor.layers.fused_moe import modular_kernel as mk from vllm.model_executor.layers.fused_moe.config import ( @@ -1141,8 +1142,9 @@ class XpuMxfp4MoEMethod(Mxfp4MoEMethod): x: torch.Tensor, router_logits: torch.Tensor, ) -> torch.Tensor: - assert layer.activation == "swigluoai", ( - "Only swiglu_oai activation is supported for XPU MXFP4 MoE" + assert layer.activation == MoEActivation.SWIGLUOAI, ( + "Only swiglu_oai activation is supported for " + f"XPU MXFP4 MoE, not {layer.activation}." ) from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 7faa4fcc9..555b94c1c 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe import ( FusedMoEConfig, FusedMoEMethodBase, FusedMoeWeightScaleSupported, + MoEActivation, ) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, @@ -438,7 +439,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): expert_map=layer.expert_map, ) elif self.use_marlin: - assert layer.activation == "silu", ( + assert layer.activation == MoEActivation.SILU, ( f"{layer.activation} not supported for Marlin MoE." 
) return fused_marlin_moe( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index bbe206800..9d9fd31ad 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -9,6 +9,7 @@ import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEParallelConfig, @@ -64,9 +65,9 @@ def _supports_quant_scheme( return (weight_key, activation_key) in SUPPORTED_W_A -def _supports_activation(activation: str) -> bool: +def _supports_activation(activation: MoEActivation) -> bool: """Supports silu activation only.""" - return activation in ["silu"] + return activation in [MoEActivation.SILU] def _supports_routing_method( @@ -267,7 +268,7 @@ def flashinfer_trtllm_fp4_moe( x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], router_logits: torch.Tensor, top_k: int, - activation: str, + activation: MoEActivation, global_num_experts: int, num_expert_group: int | None, topk_group: int | None, @@ -297,7 +298,7 @@ def flashinfer_trtllm_fp4_moe( from vllm.model_executor.models.llama4 import Llama4MoE # https://github.com/flashinfer-ai/flashinfer/blob/f0277fd1bff90e309e5c19cab36c5dae056d685d/flashinfer/fused_moe/core.py#L2404 - assert activation == "silu", ( + assert activation == MoEActivation.SILU, ( "Only SiLU activation is supported for FlashInfer TRTLLM FP4 MoE. " f"{activation} found instead." 
) @@ -365,7 +366,7 @@ def flashinfer_trtllm_fp4_routed_moe( topk_ids: torch.Tensor, topk_weights: torch.Tensor, top_k: int, - activation: str, + activation: MoEActivation, global_num_experts: int, ) -> torch.Tensor: """ @@ -387,7 +388,7 @@ def flashinfer_trtllm_fp4_routed_moe( import flashinfer # https://github.com/flashinfer-ai/flashinfer/blob/f0277fd1bff90e309e5c19cab36c5dae056d685d/flashinfer/fused_moe/core.py#L2535 - assert activation == "silu", ( + assert activation == MoEActivation.SILU, ( "Only SiLU activation is supported for FlashInfer TRTLLM FP4 Routed MoE. " f"{activation} found instead." ) diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index e9ecf0547..9dbfc6eca 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -6,6 +6,7 @@ from typing import Any import torch from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.import_utils import has_triton_kernels @@ -88,7 +89,7 @@ def _can_support_mxfp4( e_score_correction_bias: torch.Tensor | None = None, apply_router_weight_on_input: bool = False, scoring_func: str = "softmax", - activation: str = "swigluoai", + activation: MoEActivation = MoEActivation.SWIGLUOAI, expert_load_view: torch.Tensor | None = None, logical_to_physical_map: torch.Tensor | None = None, logical_replica_count: torch.Tensor | None = None, @@ -101,7 +102,7 @@ def _can_support_mxfp4( or e_score_correction_bias or apply_router_weight_on_input or scoring_func != "softmax" - or activation != "swigluoai" + or activation != MoEActivation.SWIGLUOAI or expert_load_view or logical_to_physical_map or logical_replica_count diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py 
index a935071fc..06141013c 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -33,8 +33,11 @@ from vllm.distributed.communication_op import tensor_model_parallel_all_gather from vllm.distributed.parallel_state import get_pp_group from vllm.model_executor.layers.activation import ReLUSquaredActivation from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.fused_moe import FusedMoE, SharedFusedMoE -from vllm.model_executor.layers.fused_moe.utils import activation_without_mul +from vllm.model_executor.layers.fused_moe import ( + FusedMoE, + SharedFusedMoE, + activation_without_mul, +) from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, -- GitLab From ec12d39d44739bee408ec1473acc09e75daf1a5d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 11 Feb 2026 22:08:19 -0500 Subject: [PATCH 0123/1166] [Bugfix] Fix MTP accuracy for GLM-5 (#34385) Signed-off-by: mgoin --- vllm/v1/spec_decode/eagle.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index b5532d652..a6e7995bc 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1506,6 +1506,24 @@ class SpecDecodeBaseProposer: del self.model.lm_head self.model.lm_head = target_language_model.lm_head + # MTP models call compute_logits via shared_head.head (a + # ParallelLMHead inside each MTP layer), not self.model.lm_head. + # If the checkpoint omits a copy of the lm_head weights at the + # MTP layer path, shared_head.head stays uninitialised and + # produces NaN logits. Always share it explicitly. 
+ inner = getattr(self.model, "model", None) + layers = getattr(inner, "layers", None) if inner else None + if layers is not None: + items = layers.values() if isinstance(layers, nn.ModuleDict) else layers + for layer in items: + sh = getattr(layer, "shared_head", None) + if sh is not None and hasattr(sh, "head"): + del sh.head + sh.head = target_language_model.lm_head + logger.info( + "Shared target model lm_head with MTP shared_head.head." + ) + @torch.inference_mode() def dummy_run( self, -- GitLab From e1d97c38f8689da0b11da0fac54cc277c237d5c4 Mon Sep 17 00:00:00 2001 From: Runkai Tao <129432511+RunkaiTao@users.noreply.github.com> Date: Wed, 11 Feb 2026 22:30:57 -0500 Subject: [PATCH 0124/1166] [Bug Fix] Fix `naive_block_assignment` always defaulting to False due to arg misalignment (#33848) Signed-off-by: Runkai Tao --- vllm/lora/layers/fused_moe.py | 2 +- vllm/lora/punica_wrapper/punica_base.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 4d4e053cf..e3d9894de 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -219,7 +219,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.max_loras, self.adapter_enabled, expert_map, - naive_block_assignment, + naive_block_assignment=naive_block_assignment, ) moe_state_dict["sorted_token_ids_lora"] = sorted_token_ids_lora diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index fdcf6c0cb..facbd681a 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -458,6 +458,7 @@ class PunicaWrapperBase(PunicaWrapperABC): adapter_enabled: torch.Tensor, expert_map: torch.Tensor | None = None, pad_sorted_ids: bool = False, + naive_block_assignment: bool = False, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ Aligns tokens and experts into block-sized chunks for LoRA-based -- GitLab From 
ced2a92f40ed56148a6f4496239b55a65f854081 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Feb 2026 11:33:15 +0800 Subject: [PATCH 0125/1166] [Refactor] Move validation to params definitions (#34362) Signed-off-by: DarkLight1337 --- vllm/pooling_params.py | 17 +- vllm/sampling_params.py | 238 ++++++++++++++++++++++++++++ vllm/v1/engine/input_processor.py | 254 +++--------------------------- 3 files changed, 264 insertions(+), 245 deletions(-) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 2251cceef..75d441d74 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -72,7 +72,7 @@ class PoolingParams( """Returns a deep copy of the PoolingParams instance.""" return deepcopy(self) - def verify(self, model_config: "ModelConfig") -> None: + def verify(self, model_config: ModelConfig) -> None: # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": @@ -87,12 +87,7 @@ class PoolingParams( self._set_default_parameters(model_config) self._verify_valid_parameters() - def _merge_default_parameters( - self, model_config: "ModelConfig | None" = None - ) -> None: - if model_config is None: - return - + def _merge_default_parameters(self, model_config: ModelConfig) -> None: pooler_config = model_config.pooler_config if pooler_config is None: return @@ -119,7 +114,9 @@ class PoolingParams( self._verify_step_pooling(pooler_config, valid_parameters) def _verify_step_pooling( - self, pooler_config: "PoolerConfig", valid_parameters: list[str] + self, + pooler_config: PoolerConfig, + valid_parameters: list[str], ): step_pooling_parameters = ["step_tag_id", "returned_token_ids"] if pooler_config.tok_pooling_type != "STEP": @@ -142,12 +139,12 @@ class PoolingParams( if getattr(self, k, None) is None: setattr(self, k, getattr(pooler_config, k)) - def _set_default_parameters(self, model_config: "ModelConfig | None"): + def _set_default_parameters(self, model_config: ModelConfig): if 
self.task in ["embed", "token_embed"]: if self.use_activation is None: self.use_activation = True - if self.dimensions is not None and model_config is not None: + if self.dimensions is not None: if not model_config.is_matryoshka: raise ValueError( f'Model "{model_config.served_model_name}" does not ' diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 1d097852e..dd354190f 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -3,6 +3,7 @@ """Sampling parameters for text generation.""" import copy +import json from dataclasses import field from enum import Enum, IntEnum from functools import cached_property @@ -11,6 +12,7 @@ from typing import Annotated, Any import msgspec from pydantic.dataclasses import dataclass +from vllm.config import ModelConfig, SpeculativeConfig, StructuredOutputsConfig from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger from vllm.logits_process import LogitsProcessor @@ -453,6 +455,11 @@ class SamplingParams( parameter="prompt_logprobs", value=self.prompt_logprobs, ) + if self.logits_processors: + # TODO: Remove `logits_processors` attribute + raise ValueError( + "vLLM V1 does not support per request user-provided logits processors." 
+ ) if self.truncate_prompt_tokens is not None and ( self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1 ): @@ -589,6 +596,237 @@ class SamplingParams( ) return copy.deepcopy(self, memo=logit_processor_refs) + def verify( + self, + model_config: ModelConfig, + speculative_config: SpeculativeConfig | None, + structured_outputs_config: StructuredOutputsConfig | None, + tokenizer: TokenizerLike | None, + ) -> None: + self._validate_logprobs(model_config) + self._validate_logit_bias(model_config) + self._validate_allowed_token_ids(tokenizer) + self._validate_spec_decode(speculative_config) + self._validate_structured_outputs(structured_outputs_config, tokenizer) + + def _validate_logprobs(self, model_config: ModelConfig) -> None: + max_logprobs = model_config.max_logprobs + if max_logprobs == -1: + max_logprobs = model_config.get_vocab_size() + + # Validate sample logprobs. + if num_logprobs := self.logprobs: + if num_logprobs == -1: + num_logprobs = model_config.get_vocab_size() + if num_logprobs > max_logprobs: + raise VLLMValidationError( + f"Requested sample logprobs of {num_logprobs}, " + f"which is greater than max allowed: {max_logprobs}", + parameter="logprobs", + value=num_logprobs, + ) + + # Validate prompt logprobs. 
+ if num_prompt_logprobs := self.prompt_logprobs: + if num_prompt_logprobs == -1: + num_prompt_logprobs = model_config.get_vocab_size() + if num_prompt_logprobs > max_logprobs: + raise VLLMValidationError( + f"Requested prompt logprobs of {num_prompt_logprobs}, " + f"which is greater than max allowed: {max_logprobs}", + parameter="prompt_logprobs", + value=num_prompt_logprobs, + ) + + def _validate_logit_bias(self, model_config: ModelConfig) -> None: + """Validate logit_bias token IDs are within vocabulary range.""" + if not self.logit_bias: + return + + vocab_size = model_config.get_vocab_size() + invalid_token_ids = [ + token_id + for token_id in self.logit_bias + if token_id < 0 or token_id >= vocab_size + ] + + if invalid_token_ids: + raise VLLMValidationError( + f"token_id(s) {invalid_token_ids} in logit_bias contain " + f"out-of-vocab token ids. Vocabulary size: {vocab_size}", + parameter="logit_bias", + value=invalid_token_ids, + ) + + def _validate_allowed_token_ids(self, tokenizer: TokenizerLike | None) -> None: + allowed_token_ids = self.allowed_token_ids + if allowed_token_ids is None: + return + + if len(allowed_token_ids) == 0: + raise VLLMValidationError( + "allowed_token_ids is not None and empty!", + parameter="allowed_token_ids", + value=allowed_token_ids, + ) + + if tokenizer is not None: + vocab_size = len(tokenizer) + invalid_token_ids = [ + token_id + for token_id in allowed_token_ids + if token_id < 0 or token_id >= vocab_size + ] + if invalid_token_ids: + raise VLLMValidationError( + "allowed_token_ids contains out-of-vocab token id!", + parameter="allowed_token_ids", + value=invalid_token_ids, + ) + + def _validate_spec_decode( + self, + speculative_config: SpeculativeConfig | None, + ) -> None: + if speculative_config is None: + return + + # Some sampling parameters are not yet compatible with spec decoding. 
+ if self.min_tokens > 1 or self.min_p > _SAMPLING_EPS or self.logit_bias: + raise ValueError( + "The min_tokens, min_p, and logit_bias sampling parameters " + "are not yet supported with speculative decoding." + ) + + def _validate_structured_outputs( + self, + structured_outputs_config: StructuredOutputsConfig | None, + tokenizer: TokenizerLike | None, + ) -> None: + if structured_outputs_config is None or self.structured_outputs is None: + return + + if tokenizer is None: + raise ValueError( + "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 + ) + + backend = structured_outputs_config.backend + if _backend := self.structured_outputs._backend: + # Request-level backend selection is not supported. + # The values may differ if `params` is reused and was set + # to a specific backend based on `auto` behavior in a previous + # request. We remember that it was set as a result of `auto` + # using the `_backend_was_auto` field set in the params. + if backend != _backend and not ( + backend == "auto" and self.structured_outputs._backend_was_auto + ): + raise ValueError( + "Request-level structured output backend selection is not " + f"supported. The request specified '{_backend}', but vLLM " + f"was initialised with '{backend}'. This error can be " + "resolved by removing '_backend' from the request." 
+ ) + else: + self.structured_outputs._backend = backend + + # Request content validation + if ( + isinstance(self.structured_outputs.choice, list) + and not self.structured_outputs.choice + ): + # It is invalid for choice to be an empty list + raise ValueError( + f"Choice '{self.structured_outputs.choice}' cannot be an empty list" # noqa: E501 + ) + # Reject empty string grammar early to avoid engine-side crashes + if ( + isinstance(self.structured_outputs.grammar, str) + and self.structured_outputs.grammar.strip() == "" + ): + raise ValueError("structured_outputs.grammar cannot be an empty string") + + from vllm.tokenizers.mistral import MistralTokenizer + from vllm.v1.structured_output.backend_guidance import ( + has_guidance_unsupported_json_features, + validate_guidance_grammar, + ) + from vllm.v1.structured_output.backend_lm_format_enforcer import ( + validate_structured_output_request_lm_format_enforcer, + ) + from vllm.v1.structured_output.backend_outlines import ( + validate_structured_output_request_outlines, + ) + from vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar + + if backend.startswith("xgrammar"): + # xgrammar with no fallback + validate_xgrammar_grammar(self) + elif backend.startswith("guidance"): + # TODO: ideally we would have the LLTokenizer here as Lark syntax + # allows <|special_token|> and similar, see + # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens + # Without tokenizer these are disallowed in grammars. + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'guidance' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." 
+ ) + validate_guidance_grammar(self, tokenizer=None) + elif backend == "outlines": + # outlines backend + validate_structured_output_request_outlines(self) + elif backend == "lm-format-enforcer": + # lm format enforcer backend + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'lm-format-enforcer' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." + ) + validate_structured_output_request_lm_format_enforcer(self) + else: + # NOTE: backend must be "auto" here, because we have + # checked supported_backends above. + # In this mode, we set opinionated defaults based on what we think + # will satisfy the most use cases without having to worry about + # this setting. We include fallback behavior here, but not with any + # other setting where a specific backend was specified. + try: + validate_xgrammar_grammar(self) + self.structured_outputs._backend = "xgrammar" + except ValueError: + # The request either failed validation + # or includes some jsonschema feature(s) that + # are not supported in xgrammar. + + # Check if schema has features unsupported by guidance + so_params = self.structured_outputs + skip_guidance = False + if so_params.json: + if isinstance(so_params.json, str): + schema = json.loads(so_params.json) + else: + schema = so_params.json + skip_guidance = has_guidance_unsupported_json_features(schema) + + if isinstance(tokenizer, MistralTokenizer) or skip_guidance: + # Fall back to outlines if the tokenizer is Mistral + # or if schema contains features unsupported by guidance + validate_structured_output_request_outlines(self) + self.structured_outputs._backend = "outlines" + else: + # Fall back to guidance by default. 
+ validate_guidance_grammar(self, tokenizer=None) + self.structured_outputs._backend = "guidance" + # Remember that this backend was set automatically + self.structured_outputs._backend_was_auto = True + + # Run post-init validation. This is also important to ensure subsequent + # roundtrip serialization/deserialization won't fail. + self.structured_outputs.__post_init__() + def __repr__(self) -> str: return ( f"SamplingParams(n={self.n}, " diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 0e52e2d20..17f4c6dec 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -6,7 +6,6 @@ from collections.abc import Mapping from typing import Any, Literal, cast from vllm.config import VllmConfig -from vllm.exceptions import VLLMValidationError from vllm.inputs.data import ( ProcessorInputs, PromptType, @@ -30,25 +29,13 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.renderers import BaseRenderer from vllm.renderers.inputs import DictPrompt, TokPrompt -from vllm.sampling_params import _SAMPLING_EPS, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.tokenizers import TokenizerLike -from vllm.tokenizers.mistral import MistralTokenizer from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid from vllm.utils.torch_utils import set_default_torch_num_threads from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats -from vllm.v1.structured_output.backend_guidance import ( - has_guidance_unsupported_json_features, - validate_guidance_grammar, -) -from vllm.v1.structured_output.backend_lm_format_enforcer import ( - validate_structured_output_request_lm_format_enforcer, -) -from vllm.v1.structured_output.backend_outlines import ( - validate_structured_output_request_outlines, -) -from 
vllm.v1.structured_output.backend_xgrammar import validate_xgrammar_grammar logger = init_logger(__name__) @@ -64,6 +51,7 @@ class InputProcessor: self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config self.structured_outputs_config = vllm_config.structured_outputs_config self.observability_config = vllm_config.observability_config @@ -101,101 +89,6 @@ class InputProcessor: def renderer(self) -> BaseRenderer: return self.input_preprocessor.renderer - def _validate_logprobs( - self, - params: SamplingParams, - ) -> None: - max_logprobs = self.model_config.max_logprobs - if max_logprobs == -1: - max_logprobs = self.model_config.get_vocab_size() - - # Validate sample logprobs. - if params.logprobs: - num_logprobs = params.logprobs - if num_logprobs == -1: - num_logprobs = self.model_config.get_vocab_size() - if num_logprobs > max_logprobs: - raise VLLMValidationError( - f"Requested sample logprobs of {num_logprobs}, " - f"which is greater than max allowed: {max_logprobs}", - parameter="logprobs", - value=num_logprobs, - ) - - # Validate prompt logprobs. 
- if params.prompt_logprobs: - num_prompt_logprobs = params.prompt_logprobs - if num_prompt_logprobs == -1: - num_prompt_logprobs = self.model_config.get_vocab_size() - if num_prompt_logprobs > max_logprobs: - raise VLLMValidationError( - f"Requested prompt logprobs of {num_prompt_logprobs}, " - f"which is greater than max allowed: {max_logprobs}", - parameter="prompt_logprobs", - value=num_prompt_logprobs, - ) - - def _validate_sampling_params( - self, - params: SamplingParams, - ) -> None: - self._validate_structured_output(params) - self._validate_logit_bias(params) - - if params.allowed_token_ids is None: - return - if not params.allowed_token_ids: - raise ValueError("allowed_token_ids is not None and empty!") - if self.tokenizer is None: - # When skip_tokenizer_init=True, we can't validate token IDs - # Skip validation and let the model handle invalid tokens - return - vocab_size = len(self.tokenizer) - if not all(0 <= tid < vocab_size for tid in params.allowed_token_ids): - raise ValueError("allowed_token_ids contains out-of-vocab token id!") - - def _validate_logit_bias( - self, - params: SamplingParams, - ) -> None: - """Validate logit_bias token IDs are within vocabulary range.""" - if not params.logit_bias: - return - - vocab_size = self.model_config.get_vocab_size() - invalid_token_ids = [] - - for token_id in params.logit_bias: - if token_id < 0 or token_id >= vocab_size: - invalid_token_ids.append(token_id) - - if invalid_token_ids: - raise VLLMValidationError( - f"token_id(s) {invalid_token_ids} in logit_bias contain " - f"out-of-vocab token ids. Vocabulary size: {vocab_size}", - parameter="logit_bias", - value=invalid_token_ids, - ) - - def _validate_supported_sampling_params( - self, - params: SamplingParams, - ) -> None: - # Logits processors not supported. - if params.logits_processors: - raise ValueError( - "vLLM V1 does not support per request user-provided logits processors." 
- ) - - # Some sampling parameters are not yet compatible with spec decoding. - if self.vllm_config.speculative_config is not None and ( - params.min_tokens > 1 or params.min_p > _SAMPLING_EPS or params.logit_bias - ): - raise ValueError( - "The min_tokens, min_p, and logit_bias sampling parameters " - "are not yet supported with speculative decoding." - ) - def _validate_params( self, params: SamplingParams | PoolingParams, @@ -203,11 +96,15 @@ class InputProcessor: # is passed to all `process_inputs` calls supported_tasks: tuple[SupportedTask, ...] | None, ): - """ - Validate supported SamplingParam. - Should raise ValueError if unsupported for API Server. - """ - if isinstance(params, PoolingParams): + """Raise `ValueError` if SamplingParams or PoolingParams is not valid.""" + if isinstance(params, SamplingParams): + params.verify( + self.model_config, + self.speculative_config, + self.structured_outputs_config, + self.tokenizer, + ) + elif isinstance(params, PoolingParams): if supported_tasks is None: raise RuntimeError("`supported_tasks` must be passed for pooling") @@ -233,12 +130,11 @@ class InputProcessor: ) params.verify(self.model_config) - - return - - self._validate_logprobs(params) - self._validate_sampling_params(params) - self._validate_supported_sampling_params(params) + else: + raise TypeError( + f"params must be either SamplingParams or PoolingParams, " + f"but got {type(params).__name__}" + ) def _parse_mm_items(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: mm_processor = self.input_preprocessor._get_mm_processor() @@ -334,120 +230,6 @@ class InputProcessor: "[lora_path]` to use the LoRA tokenizer." 
) - def _validate_structured_output(self, params: SamplingParams) -> None: - if not params.structured_outputs or not self.structured_outputs_config: - return - - if self.model_config.skip_tokenizer_init and params.structured_outputs: - raise ValueError( - "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 - ) - - backend = self.structured_outputs_config.backend - if _backend := params.structured_outputs._backend: - # Request-level backend selection is not supported. - # The values may differ if `params` is reused and was set - # to a specific backend based on `auto` behavior in a previous - # request. We remember that it was set as a result of `auto` - # using the `_backend_was_auto` field set in the params. - if backend != _backend and not ( - backend == "auto" and params.structured_outputs._backend_was_auto - ): - raise ValueError( - "Request-level structured output backend selection is not " - f"supported. The request specified '{_backend}', but vLLM " - f"was initialised with '{backend}'. This error can be " - "resolved by removing '_backend' from the request." 
- ) - else: - params.structured_outputs._backend = backend - - # Request content validation - if ( - isinstance(params.structured_outputs.choice, list) - and not params.structured_outputs.choice - ): - # It is invalid for choice to be an empty list - raise ValueError( - f"Choice '{params.structured_outputs.choice}' cannot be an empty list" # noqa: E501 - ) - # Reject empty string grammar early to avoid engine-side crashes - if ( - isinstance(params.structured_outputs.grammar, str) - and params.structured_outputs.grammar.strip() == "" - ): - raise ValueError("structured_outputs.grammar cannot be an empty string") - - if backend.startswith("xgrammar"): - # xgrammar with no fallback - validate_xgrammar_grammar(params) - elif backend.startswith("guidance"): - # TODO: ideally we would have the LLTokenizer here as Lark syntax - # allows <|special_token|> and similar, see - # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens - # Without tokenizer these are disallowed in grammars. - if isinstance(self.tokenizer, MistralTokenizer): - raise ValueError( - "Mistral tokenizer is not supported for the 'guidance' " - "structured output backend. Please use ['xgrammar', 'outlines'] " - "backends or tokenizer_mode='hf' instead." - ) - validate_guidance_grammar(params, tokenizer=None) - elif backend == "outlines": - # outlines backend - validate_structured_output_request_outlines(params) - elif backend == "lm-format-enforcer": - # lm format enforcer backend - if isinstance(self.tokenizer, MistralTokenizer): - raise ValueError( - "Mistral tokenizer is not supported for the 'lm-format-enforcer' " - "structured output backend. Please use ['xgrammar', 'outlines'] " - "backends or tokenizer_mode='hf' instead." - ) - validate_structured_output_request_lm_format_enforcer(params) - else: - # NOTE: backend must be "auto" here, because we have - # checked supported_backends above. 
- # In this mode, we set opinionated defaults based on what we think - # will satisfy the most use cases without having to worry about - # this setting. We include fallback behavior here, but not with any - # other setting where a specific backend was specified. - try: - validate_xgrammar_grammar(params) - params.structured_outputs._backend = "xgrammar" - except ValueError: - # The request either failed validation - # or includes some jsonschema feature(s) that - # are not supported in xgrammar. - - # Check if schema has features unsupported by guidance - so_params = params.structured_outputs - skip_guidance = False - if so_params.json: - if isinstance(so_params.json, str): - import json - - schema = json.loads(so_params.json) - else: - schema = so_params.json - skip_guidance = has_guidance_unsupported_json_features(schema) - - if isinstance(self.tokenizer, MistralTokenizer) or skip_guidance: - # Fall back to outlines if the tokenizer is Mistral - # or if schema contains features unsupported by guidance - validate_structured_output_request_outlines(params) - params.structured_outputs._backend = "outlines" - else: - # Fall back to guidance by default. - validate_guidance_grammar(params, tokenizer=None) - params.structured_outputs._backend = "guidance" - # Remember that this backend was set automatically - params.structured_outputs._backend_was_auto = True - - # Run post-init validation. This is also important to ensure subsequent - # roundtrip serialization/deserialization won't fail. 
- params.structured_outputs.__post_init__() - def _extract_singleton_mm_data( self, prompt: SingletonPrompt ) -> MultiModalDataDict | None: @@ -618,8 +400,10 @@ class InputProcessor: prompt_token_ids, prompt_embeds ) sampling_params.max_tokens = self.model_config.max_model_len - seq_len + sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id + self.generation_config_fields, + None if self.tokenizer is None else self.tokenizer.eos_token_id, ) if self.tokenizer is not None: sampling_params.update_from_tokenizer(self.tokenizer) -- GitLab From b96f7314b451c01d2c727a93636c023b07adf732 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Feb 2026 11:38:11 +0800 Subject: [PATCH 0126/1166] [Refactor] Pass Renderer to Input Processor (#34329) Signed-off-by: DarkLight1337 --- .../openai/test_serving_responses.py | 5 ++- .../models/language/generation/test_hybrid.py | 6 ++-- .../pooling/test_auto_prefix_cache_support.py | 9 ++++-- tests/v1/e2e/test_pooling_chunked_prefill.py | 3 +- tests/v1/sample/test_logprobs.py | 3 +- vllm/config/pooler.py | 2 +- vllm/engine/protocol.py | 7 ++--- vllm/entrypoints/llm.py | 7 +++-- .../openai/chat_completion/serving.py | 6 ++-- vllm/entrypoints/openai/completion/serving.py | 3 +- vllm/entrypoints/openai/engine/serving.py | 31 ++++++++++--------- vllm/entrypoints/openai/models/serving.py | 14 ++++----- vllm/entrypoints/openai/responses/serving.py | 12 ++++--- vllm/entrypoints/pooling/embed/serving.py | 20 ++++-------- vllm/entrypoints/serve/tokenize/serving.py | 2 +- vllm/inputs/preprocess.py | 5 +-- vllm/v1/engine/async_llm.py | 27 ++++++++-------- vllm/v1/engine/input_processor.py | 16 +++++----- vllm/v1/engine/llm_engine.py | 31 +++++++++---------- vllm/v1/engine/output_processor.py | 4 ++- 20 files changed, 107 insertions(+), 106 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index ba0c2c876..ff0da632e 100644 --- 
a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -125,6 +125,7 @@ class TestInitializeToolSessions: engine_client = MagicMock() model_config = MagicMock() + model_config.max_model_len = 100 model_config.hf_config.model_type = "test" model_config.get_diff_sampling_param.return_value = {} engine_client.model_config = model_config @@ -212,6 +213,7 @@ class TestValidateGeneratorInput: engine_client = MagicMock() model_config = MagicMock() + model_config.max_model_len = 100 model_config.hf_config.model_type = "test" model_config.get_diff_sampling_param.return_value = {} engine_client.model_config = model_config @@ -231,9 +233,6 @@ class TestValidateGeneratorInput: chat_template_content_format="auto", ) - # Set max_model_len for testing - instance.max_model_len = 100 - return instance def test_validate_generator_input(self, serving_responses_instance): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e853f65db..524cf5b92 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -507,7 +507,8 @@ def test_apc_single_prompt_block_align_alignment( vllm_runner_kwargs["enable_prefix_caching"] = True with vllm_runner(**vllm_runner_kwargs) as vllm_model: # Retrieve the default mamba state block size - mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size + vllm_config = vllm_model.llm.llm_engine.vllm_config + mamba_block_size = vllm_config.cache_config.mamba_block_size # In case the hybrid model does not have the # "mamba_block_size" assume a fixed constant @@ -660,7 +661,8 @@ def test_apc_multiple_prompts_block_align_alignment( vllm_runner_kwargs["enable_prefix_caching"] = True with vllm_runner(**vllm_runner_kwargs) as vllm_model: # Retrieve the default mamba state block size - mamba_block_size = vllm_model.llm.llm_engine.cache_config.mamba_block_size + vllm_config = 
vllm_model.llm.llm_engine.vllm_config + mamba_block_size = vllm_config.cache_config.mamba_block_size # In case the hybrid model does not have the # "mamba_block_size" assume a fixed constant diff --git a/tests/models/language/pooling/test_auto_prefix_cache_support.py b/tests/models/language/pooling/test_auto_prefix_cache_support.py index 3795f2a5d..e176936de 100644 --- a/tests/models/language/pooling/test_auto_prefix_cache_support.py +++ b/tests/models/language/pooling/test_auto_prefix_cache_support.py @@ -25,7 +25,8 @@ def test_classify_models( with vllm_runner( model, max_model_len=512, dtype=dtype, enable_prefix_caching=True ) as vllm_model: - cache_config = vllm_model.llm.llm_engine.cache_config + vllm_config = vllm_model.llm.llm_engine.vllm_config + cache_config = vllm_config.cache_config assert cache_config.enable_prefix_caching # First Run @@ -74,7 +75,8 @@ def test_embed_models( max_model_len=None, enable_prefix_caching=True, ) as vllm_model: - cache_config = vllm_model.llm.llm_engine.cache_config + vllm_config = vllm_model.llm.llm_engine.vllm_config + cache_config = vllm_config.cache_config assert cache_config.enable_prefix_caching # First Run @@ -106,5 +108,6 @@ def test_non_causal_models( hf_runner, vllm_runner, example_prompts, model: str, dtype: str ) -> None: with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: - cache_config = vllm_model.llm.llm_engine.cache_config + vllm_config = vllm_model.llm.llm_engine.vllm_config + cache_config = vllm_config.cache_config assert not cache_config.enable_prefix_caching diff --git a/tests/v1/e2e/test_pooling_chunked_prefill.py b/tests/v1/e2e/test_pooling_chunked_prefill.py index a196e3599..976e4d173 100644 --- a/tests/v1/e2e/test_pooling_chunked_prefill.py +++ b/tests/v1/e2e/test_pooling_chunked_prefill.py @@ -161,7 +161,8 @@ def test_pooling_prefix_cache(vllm_runner, monkeypatch): assert chunks[0] <= prompt1_len assert chunks[0] < prompt2_len - cache_config = llm.get_llm().llm_engine.cache_config 
+ vllm_config = llm.get_llm().llm_engine.vllm_config + cache_config = vllm_config.cache_config print(f"{cache_config=}") # Prefixes are cached in blocks assert (prompt2_len - chunks[0]) % cache_config.block_size == 0 diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 3c7ed77a8..7466e3619 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -311,7 +311,8 @@ def test_get_logprobs_and_prompt_logprobs( temperature: "temperature" sampling parameter example_prompts: example prompt fixture """ - do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching + vllm_config = vllm_model.llm.llm_engine.vllm_config + do_apc = vllm_config.cache_config.enable_prefix_caching if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): # Skip some test-cases to save time. pytest.skip() diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index 75cdc90fe..841260e27 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -54,7 +54,7 @@ class PoolerConfig: Reduce the dimensions of embeddings if model support matryoshka representation. Defaults to None. """ - enable_chunked_processing: bool | None = None + enable_chunked_processing: bool = False """ Whether to enable chunked processing for long inputs that exceed the model's maximum position embeddings. When enabled, long inputs will be split into diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d942b7f5f..0f2e62c59 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -31,12 +31,9 @@ class EngineClient(ABC): vllm_config: VllmConfig model_config: ModelConfig - input_processor: InputProcessor + renderer: BaseRenderer io_processor: IOProcessor | None - - @property - @abstractmethod - def renderer(self) -> BaseRenderer: ... 
+ input_processor: InputProcessor @property @abstractmethod diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2b4ed8695..ab0b46821 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -356,8 +356,9 @@ class LLM: self.supported_tasks = supported_tasks self.model_config = self.llm_engine.model_config - self.input_processor = self.llm_engine.input_processor + self.renderer = self.llm_engine.renderer self.io_processor = self.llm_engine.io_processor + self.input_processor = self.llm_engine.input_processor # Cache for __repr__ to avoid repeated collective_rpc calls self._cached_repr: str | None = None @@ -816,7 +817,7 @@ class LLM: A list of `TokensPrompts` objects containing the tokenized prompt after chat template interpolation, and the raw multi-modal inputs. """ - renderer = self.llm_engine.renderer + renderer = self.renderer model_config = self.model_config parsed_prompts = [ @@ -858,7 +859,7 @@ class LLM: A list of `TokensPrompts` objects containing the tokenized prompt after chat template interpolation, and the raw multi-modal inputs. """ - renderer = self.llm_engine.renderer + renderer = self.renderer chat_params = ChatParams( chat_template=chat_template, diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index adcd488a0..761ae9a50 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -239,8 +239,7 @@ class OpenAIServingChat(OpenAIServing): raise self.engine_client.dead_error try: - renderer = self.engine_client.renderer - tokenizer = renderer.tokenizer + tokenizer = self.renderer.tokenizer tool_parser = self.tool_parser @@ -375,6 +374,7 @@ class OpenAIServingChat(OpenAIServing): data_parallel_rank = self._get_data_parallel_rank(raw_request) # Schedule the request and get the result generator. 
+ max_model_len = self.model_config.max_model_len generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): @@ -387,7 +387,7 @@ class OpenAIServingChat(OpenAIServing): ) max_tokens = get_max_tokens( - self.max_model_len, + max_model_len, request.max_completion_tokens if request.max_completion_tokens is not None else request.max_tokens, diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index beb3c2c53..0353625fe 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -157,13 +157,14 @@ class OpenAIServingCompletion(OpenAIServing): data_parallel_rank = self._get_data_parallel_rank(raw_request) # Schedule the request and get the result generator. + max_model_len = self.model_config.max_model_len generators: list[AsyncGenerator[RequestOutput, None]] = [] try: for i, engine_prompt in enumerate(engine_prompts): prompt_text = self._extract_prompt_text(engine_prompt) max_tokens = get_max_tokens( - self.max_model_len, + max_model_len, request.max_tokens, self._extract_prompt_len(engine_prompt), self.default_sampling_params, diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 5ee5b531e..d39decaa7 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -242,11 +242,10 @@ class OpenAIServing: self.log_error_stack = log_error_stack - self.input_processor = self.models.input_processor - self.io_processor = self.models.io_processor - self.renderer = self.models.renderer - self.model_config = self.models.model_config - self.max_model_len = self.model_config.max_model_len + self.model_config = engine_client.model_config + self.renderer = engine_client.renderer + self.io_processor = engine_client.io_processor + self.input_processor = engine_client.input_processor async def beam_search( self, @@ -537,7 +536,7 @@ 
class OpenAIServing: if ( truncate_prompt_tokens is not None - and truncate_prompt_tokens > self.max_model_len + and truncate_prompt_tokens > self.model_config.max_model_len ): return self.create_error_response( "truncate_prompt_tokens value is " @@ -844,6 +843,7 @@ class OpenAIServing: input_text: str, ) -> TokensPrompt: token_num = len(input_ids) + max_model_len = self.model_config.max_model_len # Note: EmbeddingRequest, ClassificationRequest, # and ScoreRequest doesn't have max_tokens @@ -862,7 +862,7 @@ class OpenAIServing: ): # Note: input length can be up to the entire model context length # since these requests don't generate tokens. - if token_num > self.max_model_len: + if token_num > max_model_len: operations: dict[type[AnyRequest], str] = { ScoreDataRequest: "score", ScoreTextRequest: "score", @@ -873,7 +873,7 @@ class OpenAIServing: operation = operations.get(type(request), "embedding generation") raise VLLMValidationError( f"This model's maximum context length is " - f"{self.max_model_len} tokens. However, you requested " + f"{max_model_len} tokens. However, you requested " f"{token_num} tokens in the input for {operation}. " f"Please reduce the length of the input.", parameter="input_tokens", @@ -898,22 +898,22 @@ class OpenAIServing: # Note: input length can be up to model context length - 1 for # completion-like requests. - if token_num >= self.max_model_len: + if token_num >= max_model_len: raise VLLMValidationError( f"This model's maximum context length is " - f"{self.max_model_len} tokens. However, your request has " + f"{max_model_len} tokens. However, your request has " f"{token_num} input tokens. Please reduce the length of " "the input messages.", parameter="input_tokens", value=token_num, ) - if max_tokens is not None and token_num + max_tokens > self.max_model_len: + if max_tokens is not None and token_num + max_tokens > max_model_len: raise VLLMValidationError( "'max_tokens' or 'max_completion_tokens' is too large: " f"{max_tokens}. 
This model's maximum context length is " - f"{self.max_model_len} tokens and your request has " - f"{token_num} input tokens ({max_tokens} > {self.max_model_len}" + f"{max_model_len} tokens and your request has " + f"{token_num} input tokens ({max_tokens} > {max_model_len}" f" - {token_num}).", parameter="max_tokens", value=max_tokens, @@ -1089,6 +1089,7 @@ class OpenAIServing: priority: int = 0, trace_headers: Mapping[str, str] | None = None, ): + max_model_len = self.model_config.max_model_len prompt_text = self._extract_prompt_text(engine_prompt) orig_priority = priority @@ -1148,7 +1149,7 @@ class OpenAIServing: token_ids = context.render_for_completion() engine_prompt = TokensPrompt(prompt_token_ids=token_ids) - sampling_params.max_tokens = self.max_model_len - len(token_ids) + sampling_params.max_tokens = max_model_len - len(token_ids) elif isinstance(context, ParsableContext): engine_prompts = await self._render_next_turn( context.request, @@ -1162,7 +1163,7 @@ class OpenAIServing: prompt_text = self._extract_prompt_text(engine_prompt) sampling_params.max_tokens = get_max_tokens( - self.max_model_len, + max_model_len, context.request.max_output_tokens, self._extract_prompt_len(engine_prompt), self.default_sampling_params, # type: ignore diff --git a/vllm/entrypoints/openai/models/serving.py b/vllm/entrypoints/openai/models/serving.py index ba32787ac..e99d8f7ac 100644 --- a/vllm/entrypoints/openai/models/serving.py +++ b/vllm/entrypoints/openai/models/serving.py @@ -59,11 +59,10 @@ class OpenAIServingModels: ) self.lora_resolver_lock: dict[str, Lock] = defaultdict(Lock) - self.input_processor = self.engine_client.input_processor - self.io_processor = self.engine_client.io_processor - self.renderer = self.engine_client.renderer self.model_config = self.engine_client.model_config - self.max_model_len = self.model_config.max_model_len + self.renderer = self.engine_client.renderer + self.io_processor = self.engine_client.io_processor + self.input_processor = 
self.engine_client.input_processor async def init_static_loras(self): """Loads all static LoRA modules. @@ -96,12 +95,13 @@ class OpenAIServingModels: return self.base_model_paths[0].name async def show_available_models(self) -> ModelList: - """Show available models. This includes the base model and all - adapters""" + """Show available models. This includes the base model and all adapters.""" + max_model_len = self.model_config.max_model_len + model_cards = [ ModelCard( id=base_model.name, - max_model_len=self.max_model_len, + max_model_len=max_model_len, root=base_model.model_path, permission=[ModelPermission()], ) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 2af7f578e..0d9ef135a 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -296,10 +296,12 @@ class OpenAIServingResponses(OpenAIServing): ) -> ErrorResponse | None: """Add validations to the input to the generator here.""" prompt_len = self._extract_prompt_len(engine_prompt) - if self.max_model_len <= prompt_len: + max_model_len = self.model_config.max_model_len + + if prompt_len >= max_model_len: error_message = ( f"The engine prompt length {prompt_len} " - f"exceeds the max_model_len {self.max_model_len}. " + f"exceeds the max_model_len {max_model_len}. " "Please reduce prompt." ) return self.create_error_response( @@ -414,6 +416,7 @@ class OpenAIServingResponses(OpenAIServing): raw_request.state.request_metadata = request_metadata # Schedule the request and get the result generator. 
+ max_model_len = self.model_config.max_model_len generators: list[AsyncGenerator[ConversationContext, None]] = [] builtin_tool_list: list[str] = [] @@ -431,8 +434,7 @@ class OpenAIServingResponses(OpenAIServing): assert len(builtin_tool_list) == 0 available_tools = [] try: - renderer = self.engine_client.renderer - tokenizer = renderer.get_tokenizer() + tokenizer = self.renderer.get_tokenizer() for engine_prompt in engine_prompts: maybe_error = self._validate_generator_input(engine_prompt) @@ -440,7 +442,7 @@ class OpenAIServingResponses(OpenAIServing): return maybe_error default_max_tokens = get_max_tokens( - self.max_model_len, + max_model_len, request.max_output_tokens, self._extract_prompt_len(engine_prompt), self.default_sampling_params, diff --git a/vllm/entrypoints/pooling/embed/serving.py b/vllm/entrypoints/pooling/embed/serving.py index f06ed9ad7..cd7c4f772 100644 --- a/vllm/entrypoints/pooling/embed/serving.py +++ b/vllm/entrypoints/pooling/embed/serving.py @@ -69,16 +69,8 @@ class OpenAIServingEmbedding(OpenAIServing): self.trust_request_chat_template = trust_request_chat_template pooler_config = self.model_config.pooler_config - - # Avoid repeated attribute lookups - self.supports_chunked_processing = bool( - pooler_config and pooler_config.enable_chunked_processing - ) - self.max_embed_len = ( - pooler_config.max_embed_len - if pooler_config and pooler_config.max_embed_len - else None - ) + assert pooler_config is not None + self.pooler_config = pooler_config async def _preprocess( self, @@ -240,7 +232,7 @@ class OpenAIServingEmbedding(OpenAIServing): """Check if chunked processing should be used for this request.""" return ( isinstance(request, (EmbeddingCompletionRequest, EmbeddingChatRequest)) - and self.supports_chunked_processing + and self.pooler_config.enable_chunked_processing ) async def _process_chunked_request( @@ -310,14 +302,14 @@ class OpenAIServingEmbedding(OpenAIServing): max_pos_embeddings = self._get_max_position_embeddings() # 
Determine the effective max length for validation - if self.max_embed_len is not None: + if self.pooler_config.max_embed_len: # Use max_embed_len for validation instead of max_model_len length_type = "maximum embedding input length" - max_length_value = self.max_embed_len + max_length_value = self.pooler_config.max_embed_len else: # Fall back to max_model_len validation (original behavior) length_type = "maximum context length" - max_length_value = self.max_model_len + max_length_value = self.model_config.max_model_len validation_error_msg = ( "This model's {length_type} is {max_length_value} tokens. " diff --git a/vllm/entrypoints/serve/tokenize/serving.py b/vllm/entrypoints/serve/tokenize/serving.py index 64a2741ac..3d29ff809 100644 --- a/vllm/entrypoints/serve/tokenize/serving.py +++ b/vllm/entrypoints/serve/tokenize/serving.py @@ -117,7 +117,7 @@ class OpenAIServingTokenization(OpenAIServing): tokens=input_ids, token_strs=token_strs, count=len(input_ids), - max_model_len=self.max_model_len, + max_model_len=self.model_config.max_model_len, ) async def create_detokenize( diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 1d085cabb..b2cdccbed 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -16,7 +16,7 @@ from vllm.multimodal.inputs import ( MultiModalUUIDDict, ) from vllm.multimodal.processing import BaseMultiModalProcessor -from vllm.renderers import renderer_from_config +from vllm.renderers import BaseRenderer, renderer_from_config from vllm.renderers.inputs import ( DecoderDictPrompt, DecoderOnlyDictPrompt, @@ -56,6 +56,7 @@ class InputPreprocessor: self, model_config: ModelConfig, observability_config: ObservabilityConfig | None = None, + renderer: BaseRenderer | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: @@ -63,7 +64,7 @@ class InputPreprocessor: self.model_config = model_config self.observability_config = 
observability_config - self.renderer = renderer_from_config(model_config) + self.renderer = renderer or renderer_from_config(model_config) self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 072d2a164..2d608b11a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams -from vllm.renderers import BaseRenderer, merge_kwargs +from vllm.renderers import merge_kwargs, renderer_from_config from vllm.renderers.inputs import DictPrompt, TokPrompt from vllm.renderers.inputs.preprocess import extract_prompt_components from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -110,9 +110,10 @@ class AsyncLLM(EngineClient): # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() - self.model_config = vllm_config.model_config self.vllm_config = vllm_config + self.model_config = vllm_config.model_config self.observability_config = vllm_config.observability_config + tracing_endpoint = self.observability_config.otlp_traces_endpoint if tracing_endpoint is not None: init_tracer("vllm.llm_engine", tracing_endpoint) @@ -131,20 +132,22 @@ class AsyncLLM(EngineClient): "enabling logging without default stat loggers." ) - self.input_processor = InputProcessor(self.vllm_config) + self.renderer = renderer = renderer_from_config(self.model_config) self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, ) - # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). + # Convert TokPrompt --> EngineCoreRequest. 
+ self.input_processor = InputProcessor(self.vllm_config, renderer) + + # Converts EngineCoreOutputs --> RequestOutput. self.output_processor = OutputProcessor( - self.tokenizer, + renderer.tokenizer, log_stats=self.log_stats, stream_interval=self.vllm_config.scheduler_config.stream_interval, + tracing_enabled=tracing_endpoint is not None, ) - if tracing_endpoint is not None: - self.output_processor.tracing_enabled = True # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_async_mp_client( @@ -891,17 +894,13 @@ class AsyncLLM(EngineClient): @property def tokenizer(self) -> TokenizerLike | None: - return self.input_processor.tokenizer + return self.renderer.tokenizer def get_tokenizer(self) -> TokenizerLike: - return self.input_processor.get_tokenizer() - - @property - def renderer(self) -> BaseRenderer: - return self.input_processor.renderer + return self.renderer.get_tokenizer() async def is_tracing_enabled(self) -> bool: - return self.observability_config.otlp_traces_endpoint is not None # type: ignore + return self.observability_config.otlp_traces_endpoint is not None async def do_log_stats(self) -> None: if self.logger_manager: diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 17f4c6dec..8bd4b509a 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -27,7 +27,7 @@ from vllm.multimodal.parse import ModalityDataItems, MultiModalDataItems from vllm.multimodal.processing.context import set_request_id from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams -from vllm.renderers import BaseRenderer +from vllm.renderers import BaseRenderer, renderer_from_config from vllm.renderers.inputs import DictPrompt, TokPrompt from vllm.sampling_params import SamplingParams from vllm.tasks import POOLING_TASKS, SupportedTask @@ -44,6 +44,8 @@ class InputProcessor: def __init__( self, vllm_config: VllmConfig, + 
renderer: BaseRenderer | None = None, + *, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config @@ -57,6 +59,7 @@ class InputProcessor: self.generation_config_fields = model_config.try_get_generation_config() + self.renderer = renderer or renderer_from_config(model_config) self.mm_registry = mm_registry self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config) @@ -74,20 +77,17 @@ class InputProcessor: self.input_preprocessor = InputPreprocessor( model_config, self.observability_config, - mm_registry, + renderer=renderer, + mm_registry=mm_registry, mm_processor_cache=self.mm_processor_cache, ) @property def tokenizer(self) -> TokenizerLike | None: - return self.input_preprocessor.tokenizer + return self.renderer.tokenizer def get_tokenizer(self) -> TokenizerLike: - return self.input_preprocessor.get_tokenizer() - - @property - def renderer(self) -> BaseRenderer: - return self.input_preprocessor.renderer + return self.renderer.get_tokenizer() def _validate_params( self, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 294c9ff62..815236b94 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -21,7 +21,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams -from vllm.renderers import BaseRenderer +from vllm.renderers import renderer_from_config from vllm.renderers.inputs import DictPrompt, TokPrompt from vllm.renderers.inputs.preprocess import extract_prompt_components from vllm.sampling_params import SamplingParams @@ -62,9 +62,12 @@ class LLMEngine: multiprocess_mode: bool = False, ) -> None: self.vllm_config = vllm_config - self.observability_config = vllm_config.observability_config self.model_config = vllm_config.model_config - self.cache_config = 
vllm_config.cache_config + self.observability_config = vllm_config.observability_config + + tracing_endpoint = self.observability_config.otlp_traces_endpoint + if tracing_endpoint is not None: + init_tracer("vllm.llm_engine", tracing_endpoint) self.log_stats = log_stats @@ -87,22 +90,22 @@ class LLMEngine: self.dp_group = None self.should_execute_dummy_batch = False - self.input_processor = InputProcessor(self.vllm_config) + self.renderer = renderer = renderer_from_config(self.model_config) self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, ) - # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). + # Convert TokPrompt --> EngineCoreRequest. + self.input_processor = InputProcessor(self.vllm_config, renderer) + + # Converts EngineCoreOutputs --> RequestOutput. self.output_processor = OutputProcessor( - self.tokenizer, + renderer.tokenizer, log_stats=self.log_stats, stream_interval=self.vllm_config.scheduler_config.stream_interval, + tracing_enabled=tracing_endpoint is not None, ) - endpoint = self.observability_config.otlp_traces_endpoint - if endpoint is not None: - init_tracer("vllm.llm_engine", endpoint) - self.output_processor.tracing_enabled = True # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( @@ -365,14 +368,10 @@ class LLMEngine: @property def tokenizer(self) -> TokenizerLike | None: - return self.input_processor.tokenizer + return self.renderer.tokenizer def get_tokenizer(self) -> TokenizerLike: - return self.input_processor.get_tokenizer() - - @property - def renderer(self) -> BaseRenderer: - return self.input_processor.renderer + return self.renderer.get_tokenizer() def do_log_stats(self) -> None: """Log stats if logging is enabled.""" diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 58c73fbc6..de94a0e5d 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py 
@@ -417,8 +417,10 @@ class OutputProcessor: def __init__( self, tokenizer: TokenizerLike | None, + *, log_stats: bool, stream_interval: int = 1, + tracing_enabled: bool = False, ): self.log_stats = log_stats self.tokenizer = tokenizer @@ -427,7 +429,7 @@ class OutputProcessor: self.parent_requests: dict[str, ParentRequest] = {} self.external_req_ids: defaultdict[str, list[str]] = defaultdict(list) self.lora_states = LoRARequestStates(log_stats) - self.tracing_enabled: bool = False + self.tracing_enabled = tracing_enabled self._requests_drained = asyncio.Event() self._requests_drained.set() -- GitLab From 136b0bfa59377ed2bbd3b3716036a96267cfe80b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 11 Feb 2026 23:44:03 -0700 Subject: [PATCH 0127/1166] [BugFix] Fix DP chunking (#34379) Signed-off-by: Lucas Wilkinson Signed-off-by: Bill Nell Co-authored-by: Bill Nell --- .../layers/fused_moe/runner/default_moe_runner.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index b265cbb41..e68d35b31 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -421,7 +421,7 @@ class DefaultMoERunner(MoERunner): layer: torch.nn.Module, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor, - shared_input: torch.Tensor | None, + full_shared_input: torch.Tensor | None, has_separate_shared_experts: bool, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.batched_hidden_states is not None @@ -449,6 +449,11 @@ class DefaultMoERunner(MoERunner): chunk_size = chunk_end - chunk_start hidden_states = full_hidden_states[chunk_start:chunk_end, :] router_logits = full_router_logits[chunk_start:chunk_end, :] + shared_input = ( + full_shared_input[chunk_start:chunk_end, :] + if full_shared_input is 
not None + else None + ) assert self.batched_hidden_states is not None assert self.batched_router_logits is not None @@ -476,8 +481,13 @@ class DefaultMoERunner(MoERunner): staged_hidden_states.copy_(hidden_states, non_blocking=True) staged_router_logits.copy_(router_logits, non_blocking=True) + shared_input = ( + shared_input if shared_input is not None else staged_hidden_states + ) + # Matrix multiply. if self.quant_method.is_monolithic: + assert has_separate_shared_experts or self.shared_experts is None final_hidden_states = self.quant_method.apply_monolithic( layer=layer, x=staged_hidden_states, @@ -501,7 +511,7 @@ class DefaultMoERunner(MoERunner): assert not isinstance(final_hidden_states, tuple) assert self.shared_experts is not None - shared_output = self.shared_experts(staged_hidden_states) + shared_output = self.shared_experts(shared_input) final_hidden_states = ( shared_output, -- GitLab From 80f2ba6ea6cbda0da56da65cee8402e7b5bf2aa0 Mon Sep 17 00:00:00 2001 From: Yichuan Wang <73766326+yichuan-w@users.noreply.github.com> Date: Wed, 11 Feb 2026 22:50:23 -0800 Subject: [PATCH 0128/1166] Fix DeepSeek-OCR tensor validation for all size variants (#34085) Co-authored-by: Cursor --- vllm/model_executor/models/deepseek_ocr.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 146b05002..8293d2ece 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -448,7 +448,16 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports if pixel_values is None or torch.sum(pixel_values).item() == 0: return None - base_size = self.vision_config.image_size + # Use actual tensor spatial dim instead of hardcoded + # vision_config.image_size (1024). 
The vision encoders (SAM & CLIP) + # support arbitrary resolutions via pos-encoding interpolation, + # so Tiny/Small/Base/Large variants all work with the same weights. + base_size = pixel_values.shape[-1] + if images_crop is not None and images_crop.numel() > 0: + image_size = images_crop.shape[-1] + else: + image_size = base_size + return DeepseekOCRImagePixelInputs( type="pixel_values", data=pixel_values, @@ -456,6 +465,7 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, Supports images_spatial_crop=images_spatial_crop, resolve_bindings={ "base_size": base_size, + "image_size": image_size, }, ) -- GitLab From e9cd6911321f7671de218d0c778c5400d7f1d1a6 Mon Sep 17 00:00:00 2001 From: Kyle Sayers Date: Thu, 12 Feb 2026 02:15:16 -0500 Subject: [PATCH 0129/1166] [Bugfix] Fix Sparse24 Compressed Tensors models (#33446) Signed-off-by: Kyle Sayers Co-authored-by: Michael Goin --- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 6 ++--- .../compressed_tensors/compressed_tensors.py | 25 ++++++++++--------- .../model_loader/weight_utils.py | 1 + 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index 38b929be4..dbed5fa4e 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -6,11 +6,11 @@ #include "cutlass_extensions/common.hpp" bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability) { - // sparse CUTLASS kernels need at least + // sparse CUTLASS kernels need exactly hopper and are not forward compatible // CUDA 12.2 and SM90 (Hopper) #if defined CUDA_VERSION - return CUDA_VERSION >= 12020 && cuda_device_capability >= 90; + return CUDA_VERSION >= 12020 && cuda_device_capability == 90; #endif return false; @@ -98,7 +98,7 @@ std::vector cutlass_sparse_compress(torch::Tensor const& a) { TORCH_CHECK_NOT_IMPLEMENTED( false, - "No compiled cutlass_sparse_compress for a compute 
capability less than " + "No compiled cutlass_sparse_compress for a compute capability equal to " "CUDA device capability: ", version_num); } diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index df3d733b7..9de2228b7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -207,18 +207,19 @@ class CompressedTensorsConfig(QuantizationConfig): # because Attention quantization on its own is not supported by vLLM. # It is coupled with KV-cache quantization, and if scales are present in the # checkpoint, they will be used properly. - grps_without_attn_quant = {} - for k, v in config["config_groups"].items(): - # e.g. LlamaAttention, Qwen3Attention, etc. - if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"): - logger.warning( - "Skipping CompressedTensors config group for %s. Attention quant " - "is coupled with KV-cache quantization in vLLM.", - v["targets"][0], - ) - continue - grps_without_attn_quant[k] = v - config["config_groups"] = grps_without_attn_quant + if "config_groups" in config: + grps_without_attn_quant = {} + for k, v in config["config_groups"].items(): + # e.g. LlamaAttention, Qwen3Attention, etc. + if len(v["targets"]) == 1 and v["targets"][0].endswith("Attention"): + logger.warning( + "Skipping CompressedTensors config group for %s. 
Attention " + "quant is coupled with KV-cache quantization in vLLM.", + v["targets"][0], + ) + continue + grps_without_attn_quant[k] = v + config["config_groups"] = grps_without_attn_quant ignore: list[str] = cast(list[str], config.get("ignore", [])) quant_format = cast(str, config.get("format")) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 7025efd1c..43ea6f285 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -261,6 +261,7 @@ def get_quant_config( if ( hf_quant_config is not None and hf_quant_config.get("quant_method") == "compressed-tensors" + and "config_groups" in hf_quant_config ): if hf_text_config is not None: n_heads = getattr(hf_text_config, "num_attention_heads", None) -- GitLab From 386bfe5d08103f570a3aa03055372cbd33cf41ca Mon Sep 17 00:00:00 2001 From: AllenDou Date: Thu, 12 Feb 2026 15:26:49 +0800 Subject: [PATCH 0130/1166] [bugfix] refactor FunASR's _get_data_parser (#34397) Signed-off-by: zixiao Co-authored-by: zixiao --- vllm/model_executor/models/funasr.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index 3e4a6131c..dff439262 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -714,10 +714,6 @@ class FunASRProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> Qwen3Config: return self.ctx.get_hf_config(Qwen3Config) - @property - def skip_prompt_length_check(self) -> bool: - return True # Because the encoder prompt is padded - def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": 1} @@ -727,6 +723,13 @@ class FunASRProcessingInfo(BaseProcessingInfo): assert isinstance(feature_extractor, FunASRFeatureExtractor) return feature_extractor + def get_data_parser(self) -> MultiModalDataParser: + feature_extractor = 
self.get_feature_extractor() + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + ) + def get_target_channels(self) -> int: return 1 @@ -765,13 +768,6 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]): class FunASRMultiModalProcessor(BaseMultiModalProcessor[FunASRProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser( - target_sr=feature_extractor.sampling_rate, - target_channels=self.info.get_target_channels(), - ) - def _call_hf_processor( self, prompt: str, -- GitLab From 55a1a9563a7f8600cdc336e76d2074cef8ffe8e5 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 12 Feb 2026 00:04:44 -0800 Subject: [PATCH 0131/1166] Vllm CPU benchmark suite improvement (#34128) Signed-off-by: louie-tsai --- .../scripts/compare-json-results.py | 445 +++++++++++++++--- .../scripts/run-performance-benchmarks.sh | 133 ++++-- .../tests/serving-tests-cpu-embed.json | 41 ++ .../tests/serving-tests-cpu-text.json | 283 +++++++++++ .../tests/serving-tests-cpu.json | 130 ----- docs/getting_started/installation/cpu.md | 24 +- 6 files changed, 802 insertions(+), 254 deletions(-) create mode 100644 .buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json create mode 100644 .buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json diff --git a/.buildkite/performance-benchmarks/scripts/compare-json-results.py b/.buildkite/performance-benchmarks/scripts/compare-json-results.py index b3d0a2d3b..ead097411 100644 --- a/.buildkite/performance-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/performance-benchmarks/scripts/compare-json-results.py @@ -9,8 +9,10 @@ import json import os from dataclasses import dataclass from importlib import util +from pathlib import Path import pandas as pd +import regex as re pd.options.display.float_format = "{:.2f}".format 
plotly_found = util.find_spec("plotly.express") is not None @@ -275,6 +277,131 @@ def _apply_two_decimals( return styler.format({c: "{:.2f}" for c in num_cols}, na_rep="") +# ----------------------------- +# Export helpers (Excel + CSV) +# ----------------------------- +def _sanitize_sheet_name(name: str) -> str: + """ + Excel sheet constraints: + - max 31 chars + - cannot contain: : \ / ? * [ ] + - cannot be empty + """ + name = "sheet" if name is None else str(name) + name = re.sub(r"[:\\/?*\[\]]", "_", name) + name = name.strip().strip("'") + name = re.sub(r"\s+", " ", name) + if not name: + name = "sheet" + return name[:31] + + +def _group_to_sheet_base(group_cols: list[str], gkey_tuple) -> str: + d = dict(zip(group_cols, gkey_tuple)) + model = d.get("Model", "model") + model_short = str(model).split("/")[-1] + ilen = d.get("Input Len", "") + olen = d.get("Output Len", "") + lens = f"_{ilen}x{olen}" if ilen != "" and olen != "" else "" + return _sanitize_sheet_name(f"{model_short}{lens}") + + +def _write_tables_to_excel_sheet( + writer: pd.ExcelWriter, sheet: str, blocks: list[tuple[str, pd.DataFrame]] +): + startrow = 0 + for title, df in blocks: + pd.DataFrame([[title]]).to_excel( + writer, sheet_name=sheet, index=False, header=False, startrow=startrow + ) + startrow += 1 + df.to_excel(writer, sheet_name=sheet, index=False, startrow=startrow) + startrow += len(df) + 3 + + +def _safe_filename(s: str) -> str: + s = re.sub(r"[^\w\-.]+", "_", str(s).strip()) + return s[:180] if len(s) > 180 else s + + +# ----------------------------- +# vLLM environment export helper +# ----------------------------- +def _parse_vllm_env_txt(env_path: Path) -> pd.DataFrame: + """Parse vllm_env.txt into a flat table (Section, Key, Value). + + Supports: + - section headers as standalone lines (no ':' or '=') + - key-value lines like 'OS: Ubuntu ...' 
+ - env var lines like 'HF_HOME=/data/hf' + """ + lines = env_path.read_text(encoding="utf-8", errors="replace").splitlines() + section = "General" + rows: list[dict] = [] + + def set_section(s: str): + nonlocal section + s = (s or "").strip() + if s: + section = s + + for raw in lines: + stripped = raw.strip() + if not stripped: + continue + # divider lines like ===== + if set(stripped) <= {"="}: + continue + + # section header heuristic: short standalone line + if ":" not in stripped and "=" not in stripped and len(stripped) <= 64: + if stripped.lower().startswith("collecting environment information"): + continue + set_section(stripped) + continue + + # env var style: KEY=VALUE (and not a URL with :) + if "=" in stripped and ":" not in stripped: + k, v = stripped.split("=", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + # key: value + if ":" in stripped: + k, v = stripped.split(":", 1) + k = k.strip() + v = v.strip() + if k: + rows.append({"Section": section, "Key": k, "Value": v}) + continue + + return pd.DataFrame(rows, columns=["Section", "Key", "Value"]) + + +def _load_env_df_for_inputs(args, files: list[str]) -> pd.DataFrame | None: + """Load vllm_env.txt next to the *original* input JSON file. + + Note: when only one -f is provided, the script may split JSON into ./splits/..., + but vllm_env.txt typically lives next to the original benchmark_results.json. 
+ """ + base_dir: Path | None = None + if getattr(args, "file", None): + base_dir = Path(args.file[0]).resolve().parent + elif files: + base_dir = Path(files[0]).resolve().parent + if base_dir is None: + return None + + env_path = base_dir / "vllm_env.txt" + if not env_path.exists(): + return None + df = _parse_vllm_env_txt(env_path) + return df + + # ----------------------------- # Valid max concurrency summary helpers # ----------------------------- @@ -428,7 +555,6 @@ def build_valid_max_concurrency_summary_html( summary_df = pd.DataFrame(rows) - # --- Coerce numeric columns so Styler doesn't miss them due to object dtype --- for c in summary_df.columns: if c == "Configuration": continue @@ -436,12 +562,10 @@ def build_valid_max_concurrency_summary_html( both_col = f"Max {conc_col} (Both)" - # --- Strict 2-decimal formatting for ALL non-Configuration columns --- formatters = {} for c in summary_df.columns: if c == "Configuration": continue - # default argument binds per-column formatter correctly formatters[c] = lambda v: "" if pd.isna(v) else f"{float(v):.2f}" styler = summary_df.style.format(formatters) @@ -460,6 +584,95 @@ def build_valid_max_concurrency_summary_html( return title + styler.to_html(table_attributes='border="1" class="dataframe"') +def build_valid_max_concurrency_summary_df( + tput_group_df: pd.DataFrame | None, + ttft_group_df: pd.DataFrame | None, + tpot_group_df: pd.DataFrame | None, + conc_col: str, + args, +) -> pd.DataFrame | None: + if ttft_group_df is None and tpot_group_df is None: + return None + + ttft_cols = ( + _config_value_columns(ttft_group_df, conc_col) + if ttft_group_df is not None + else [] + ) + tpot_cols = ( + _config_value_columns(tpot_group_df, conc_col) + if tpot_group_df is not None + else [] + ) + tput_cols = ( + _config_value_columns(tput_group_df, conc_col) + if tput_group_df is not None + else [] + ) + + if ttft_group_df is not None and tpot_group_df is not None: + cfg_cols = [c for c in ttft_cols if c in 
tpot_cols] + if tput_group_df is not None: + cfg_cols = [c for c in cfg_cols if c in tput_cols] or cfg_cols + else: + cfg_cols = ttft_cols or tpot_cols + + if not cfg_cols: + cfg_cols = sorted(set(ttft_cols) | set(tpot_cols) | set(tput_cols), key=str) + + rows = [] + for cfg in cfg_cols: + ttft_max = ( + _max_concurrency_ok(ttft_group_df, conc_col, cfg, args.ttft_max_ms) + if ttft_group_df is not None + else pd.NA + ) + tpot_max = ( + _max_concurrency_ok(tpot_group_df, conc_col, cfg, args.tpot_max_ms) + if tpot_group_df is not None + else pd.NA + ) + both = ( + pd.NA + if (pd.isna(ttft_max) or pd.isna(tpot_max)) + else min(ttft_max, tpot_max) + ) + + tput_at_both = ( + _value_at_concurrency(tput_group_df, conc_col, cfg, both) + if tput_group_df is not None + else pd.NA + ) + ttft_at_both = ( + _value_at_concurrency(ttft_group_df, conc_col, cfg, both) + if ttft_group_df is not None + else pd.NA + ) + tpot_at_both = ( + _value_at_concurrency(tpot_group_df, conc_col, cfg, both) + if tpot_group_df is not None + else pd.NA + ) + + rows.append( + { + "Configuration": cfg, + f"Max {conc_col} (TTFT ≤ {args.ttft_max_ms:g} ms)": ttft_max, + f"Max {conc_col} (TPOT ≤ {args.tpot_max_ms:g} ms)": tpot_max, + f"Max {conc_col} (Both)": both, + "Output Tput @ Both (tok/s)": tput_at_both, + "TTFT @ Both (ms)": ttft_at_both, + "TPOT @ Both (ms)": tpot_at_both, + } + ) + + df = pd.DataFrame(rows) + for c in df.columns: + if c != "Configuration": + df[c] = pd.to_numeric(df[c], errors="coerce") + return df + + # ----------------------------- # Plot helper # ----------------------------- @@ -537,6 +750,21 @@ def build_parser() -> argparse.ArgumentParser: default=100.0, help="Reference limit for TPOT plots (ms)", ) + + # ---- NEW: export options ---- + parser.add_argument( + "--excel-out", + type=str, + default="perf_comparison.xlsx", + help="Write one sheet per (Model, Dataset, Input Len, Output Len).", + ) + parser.add_argument( + "--csv-out-dir", + type=str, + default="", + help="If 
set, write per-group per-metric CSVs into this directory.", + ) + return parser @@ -657,7 +885,6 @@ def maybe_write_plot( markers=True, ) - # Ensure plot hover + y tick labels are also 2 decimals. fig.update_traces(hovertemplate="%{y:.2f}") fig.update_yaxes(tickformat=".2f") @@ -730,87 +957,151 @@ def write_report_group_first( for metric_label, (df, _) in metric_cache.items() } - with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: - main_fh.write('\n') - for gkey in group_keys: - gkey_tuple = normalize_group_key(gkey) - suffix = build_group_suffix(group_cols_canonical, gkey_tuple) - sub_path = group_filename(gkey_tuple) - group_header = ( - '
' - f"{_html.escape(suffix)}" - "
\n" - ) + csv_dir = Path(args.csv_out_dir) if args.csv_out_dir else None + if csv_dir: + csv_dir.mkdir(parents=True, exist_ok=True) + + excel_path = args.excel_out or "perf_comparison.xlsx" + with pd.ExcelWriter(excel_path, engine="openpyxl") as xw: + # ---- Environment sheet (first) ---- + env_sheet = _sanitize_sheet_name("Environment") + env_df = _load_env_df_for_inputs(args, files) + if env_df is None or env_df.empty: + pd.DataFrame( + [ + { + "Section": "Environment", + "Key": "vllm_env.txt", + "Value": "NOT FOUND (or empty)", + } + ] + ).to_excel(xw, sheet_name=env_sheet, index=False) + else: + env_df.to_excel(xw, sheet_name=env_sheet, index=False) + with open("perf_comparison.html", "w", encoding="utf-8") as main_fh: + main_fh.write('\n') + for gkey in group_keys: + gkey_tuple = normalize_group_key(gkey) + suffix = build_group_suffix(group_cols_canonical, gkey_tuple) + sub_path = group_filename(gkey_tuple) + group_header = ( + '
' + f"{_html.escape(suffix)}" + "
\n" + ) - main_fh.write(group_header) - with open(sub_path, "w", encoding="utf-8") as sub_fh: - sub_fh.write('\n') - sub_fh.write(group_header) - tput_group_df = None - ttft_group_df = None - tpot_group_df = None - conc_col = args.xaxis - - for metric_label in plan.data_cols: - gb = metric_groupbys[metric_label] - df_sorted, raw_data_cols = metric_cache[metric_label] - - try: - group_df = gb.get_group(gkey) - except KeyError: - missing = ( - '
' - f"{_html.escape(metric_label)} — missing for this group" - "
\n" + main_fh.write(group_header) + + sheet = _group_to_sheet_base(group_cols_canonical, gkey_tuple) + sheet_base = sheet + dedup_i = 1 + while sheet in xw.sheets: + dedup_i += 1 + sheet = _sanitize_sheet_name(f"{sheet_base}_{dedup_i}") + + excel_blocks: list[tuple[str, pd.DataFrame]] = [] + + with open(sub_path, "w", encoding="utf-8") as sub_fh: + sub_fh.write('\n') + sub_fh.write(group_header) + tput_group_df = None + ttft_group_df = None + tpot_group_df = None + conc_col = args.xaxis + + for metric_label in plan.data_cols: + gb = metric_groupbys[metric_label] + df_sorted, raw_data_cols = metric_cache[metric_label] + + try: + group_df = gb.get_group(gkey) + except KeyError: + missing = ( + '
' + f"{_html.escape(metric_label)} — missing for this group" + "
\n" + ) + main_fh.write(missing) + sub_fh.write(missing) + continue + + if conc_col not in group_df.columns: + conc_col = _find_concurrency_col(group_df) + + mn = metric_label.lower().strip() + if "tok/s" in mn: + tput_group_df = group_df + elif "ttft" in mn: + ttft_group_df = group_df + elif mn in ("p99", "median") or "tpot" in mn: + tpot_group_df = group_df + + display_group = group_df.drop( + columns=group_cols_canonical, errors="ignore" ) - main_fh.write(missing) - sub_fh.write(missing) - continue - - if conc_col not in group_df.columns: - conc_col = _find_concurrency_col(group_df) - - mn = metric_label.lower().strip() - if "tok/s" in mn: - tput_group_df = group_df - elif "ttft" in mn: - ttft_group_df = group_df - elif mn in ("p99", "median") or "tpot" in mn: - tpot_group_df = group_df - - display_group = group_df.drop( - columns=group_cols_canonical, errors="ignore" - ) + html = render_metric_table_html( + display_group, metric_label, suffix, args + ) + main_fh.write(html) + sub_fh.write(html) + + maybe_write_plot( + main_fh, + sub_fh, + group_df=group_df, + raw_data_cols=raw_data_cols, + metric_label=metric_label, + y_axis_col=y_axis_col, + args=args, + ) - html = render_metric_table_html( - display_group, metric_label, suffix, args + excel_blocks.append( + (metric_label, display_group.reset_index(drop=True)) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__{metric_label}".replace(" ", "_").replace( + "/", "_" + ) + ) + display_group.to_csv(csv_dir / f"{fn}.csv", index=False) + + summary_html = build_valid_max_concurrency_summary_html( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, + args=args, ) - main_fh.write(html) - sub_fh.write(html) - - maybe_write_plot( - main_fh, - sub_fh, - group_df=group_df, - raw_data_cols=raw_data_cols, - metric_label=metric_label, - y_axis_col=y_axis_col, + if summary_html: + main_fh.write(summary_html) + sub_fh.write(summary_html) + + summary_df = 
build_valid_max_concurrency_summary_df( + tput_group_df=tput_group_df, + ttft_group_df=ttft_group_df, + tpot_group_df=tpot_group_df, + conc_col=conc_col, args=args, ) + if summary_df is not None: + excel_blocks.append( + ("Valid Max Concurrency Summary", summary_df) + ) + if csv_dir: + fn = _safe_filename( + f"{sheet}__Valid_Max_Concurrency_Summary" + ) + summary_df.to_csv(csv_dir / f"{fn}.csv", index=False) - summary_html = build_valid_max_concurrency_summary_html( - tput_group_df=tput_group_df, - ttft_group_df=ttft_group_df, - tpot_group_df=tpot_group_df, - conc_col=conc_col, - args=args, - ) - if summary_html: - main_fh.write(summary_html) - sub_fh.write(summary_html) + _write_tables_to_excel_sheet(xw, sheet, excel_blocks) + + print(f"Wrote Excel: {excel_path}") + if csv_dir: + print(f"Wrote CSVs under: {csv_dir}") def main(): diff --git a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh index d62c01bc7..7dabcf517 100755 --- a/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh @@ -1,6 +1,4 @@ #!/bin/bash - -# This script should be run inside the CI process # This script assumes that we are already inside the vllm/ directory # Benchmarking results will be available inside vllm/benchmarks/results/ @@ -9,6 +7,11 @@ set -x set -o pipefail +# Environment-driven debug controls (like ON_CPU=1) +DRY_RUN="${DRY_RUN:-0}" +MODEL_FILTER="${MODEL_FILTER:-}" +DTYPE_FILTER="${DTYPE_FILTER:-}" + check_gpus() { if command -v nvidia-smi; then # check the number of GPUs and GPU type. 
@@ -112,13 +115,12 @@ json2envs() { } wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes local timeout_val="1200" timeout "$timeout_val" bash -c ' - until curl -X POST localhost:8000/v1/completions; do + until curl -sf http://localhost:8000/v1/models >/dev/null; do sleep 1 - done' && return 0 || return 1 + done + ' } kill_processes_launched_by_current_bash() { @@ -252,37 +254,16 @@ run_benchmark_tests() { done } -run_latency_tests() { - run_benchmark_tests "latency" "$1" -} - -run_startup_tests() { - run_benchmark_tests "startup" "$1" -} - -run_throughput_tests() { - run_benchmark_tests "throughput" "$1" -} - -run_serving_tests() { - # run serving tests using `vllm bench serve` command - # $1: a json file specifying serving test cases - # - # Supported JSON formats: - # 1) Plain format: top-level array - # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # - # 2) Default parameters field + plain format tests - # { - # "defaults": { ... }, - # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] - # } - - local serving_test_file - serving_test_file=$1 +run_latency_tests() { run_benchmark_tests "latency" "$1"; } +run_startup_tests() { run_benchmark_tests "startup" "$1"; } +run_throughput_tests() { run_benchmark_tests "throughput" "$1"; } - # Iterate over serving tests - jq -c ' +merge_serving_tests_stream() { + # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode. + # This helper does NOT modify JSON; it only filters the stream in dry-run mode. 
+ local serving_test_file="$1" + # shellcheck disable=SC2016 + local merged=' if type == "array" then # Plain format: test cases array .[] @@ -304,7 +285,50 @@ run_serving_tests() { else error("Unsupported serving test file format: must be array or object with .tests") end - ' "$serving_test_file" | while read -r params; do + ' + + jq -c "$merged" "$serving_test_file" | \ + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" ' + select((($model|length)==0) + or ((.server_parameters.model // "") == $model) + or ((.client_parameters.model // "") == $model)) + | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype)) + ' + else + cat + fi +} + +run_serving_tests() { + # run serving tests using `vllm bench serve` command + # $1: a json file specifying serving test cases + # + # Supported JSON formats: + # 1) Plain format: top-level array + # [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # + # 2) Default parameters field + plain format tests + # { + # "defaults": { ... }, + # "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ] + # } + + local serving_test_file + serving_test_file=$1 + + # In dry-run mode, if filters are provided but no tests match, fail fast. + if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then + local count + count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ') + if [[ "$count" -eq 0 ]]; then + echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2 + return 0 + fi + fi + + # Iterate over serving tests (merged + optional filtered stream) + merge_serving_tests_stream "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') if [[ ! 
"$test_name" =~ ^serving_ ]]; then @@ -373,7 +397,7 @@ run_serving_tests() { echo "Server command: $server_command" # support remote vllm server client_remote_args="" - if [[ -z "${REMOTE_HOST}" ]]; then + if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then bash -c "$server_command" & server_pid=$! # wait until the server is alive @@ -384,6 +408,9 @@ run_serving_tests() { echo "" echo "vLLM failed to start within the timeout period." fi + elif [[ "${DRY_RUN:-0}" == "1" ]]; then + # dry-run: don't start server + echo "Dry Run." else server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" if [[ ${REMOTE_PORT} ]]; then @@ -402,9 +429,7 @@ run_serving_tests() { for qps in $qps_list; do # remove the surrounding single quote from qps if [[ "$qps" == *"inf"* ]]; then - echo "qps was $qps" qps="inf" - echo "now qps is $qps" fi # iterate over different max_concurrency @@ -425,7 +450,9 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - bash -c "$client_command" + if [[ "${DRY_RUN:-0}" != "1" ]]; then + bash -c "$client_command" + fi # record the benchmarking commands jq_output=$(jq -n \ @@ -443,12 +470,15 @@ run_serving_tests() { done # clean up - kill -9 $server_pid - kill_gpu_processes + if [[ "${DRY_RUN:-0}" != "1" ]]; then + kill -9 $server_pid + kill_gpu_processes + fi done } main() { + local ARCH ARCH='' if [[ "$ON_CPU" == "1" ]]; then @@ -458,7 +488,13 @@ main() { check_gpus ARCH="$arch_suffix" fi - check_hf_token + + # DRY_RUN does not execute vLLM; do not require HF_TOKEN. 
+ if [[ "${DRY_RUN:-0}" != "1" ]]; then + check_hf_token + else + echo "DRY_RUN=1 -> skip HF_TOKEN validation" + fi # dependencies (which wget && which curl) || (apt-get update && apt-get install -y wget curl) @@ -479,11 +515,16 @@ main() { # dump vllm info via vllm collect-env env_output=$(vllm collect-env) - echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" # benchmarking - run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $? + + if [[ "${DRY_RUN:-0}" == "1" ]]; then + echo "DRY_RUN=1 -> skip latency/startup/throughput suites" + exit 0 + fi + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}" run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json new file mode 100644 index 000000000..6d3455c47 --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json @@ -0,0 +1,41 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [ + 32, + 64, + 128 + ], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "dtype": "bfloat16", + "model": "jinaai/jina-embeddings-v3", + "trust_remote_code": "" + }, + "client_parameters": { + "model": "jinaai/jina-embeddings-v3", + "backend": "openai-embeddings", + "endpoint": "/v1/embeddings", + "dataset_name": "sharegpt", + "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": 
"serving_jina_embed_v3_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": {} + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json new file mode 100644 index 000000000..25ed7415e --- /dev/null +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json @@ -0,0 +1,283 @@ +{ + "defaults": { + "qps_list": [ + "inf" + ], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "ignore-eos": "", + "num_prompts": 200 + } + }, + "tests": [ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json" + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_128", + "server_parameters": { + 
"tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp4_random_128_2048", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 1 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 2 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_tp4_random_2048_128", + "server_parameters": { + "tensor_parallel_size": 4 + }, + "client_parameters": { + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp1_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp2_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 2 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "tensor_parallel_size": 4 + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_llama3B_tp1_random_128_128", + "server_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "meta-llama/Llama-3.2-3B-Instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_granite2B_tp1_random_128_128", + "server_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "ibm-granite/granite-3.2-2b-instruct", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen1.7B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-1.7B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-1.7B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen4B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-4B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + 
"model": "Qwen/Qwen3-4B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_qwen8B_tp1_random_128_128", + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_glm9B_tp1_random_128_128", + "server_parameters": { + "model": "zai-org/glm-4-9b-hf", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "zai-org/glm-4-9b-hf", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + }, + { + "test_name": "serving_gemma7B_tp1_random_128_128", + "server_parameters": { + "model": "google/gemma-7b", + "tensor_parallel_size": 1 + }, + "client_parameters": { + "model": "google/gemma-7b", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128 + } + } + ] +} diff --git a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json index 25ed7415e..e34ddcb6d 100644 --- a/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/performance-benchmarks/tests/serving-tests-cpu.json @@ -148,136 +148,6 @@ "random-input-len": 2048, "random-output-len": 128 } - }, - { - "test_name": "serving_llama8B_int4_tp1_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp2_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 2 - }, - "client_parameters": { - "model": 
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama8B_int4_tp4_random_128_128", - "server_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "tensor_parallel_size": 4 - }, - "client_parameters": { - "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_llama3B_tp1_random_128_128", - "server_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "meta-llama/Llama-3.2-3B-Instruct", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_granite2B_tp1_random_128_128", - "server_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "ibm-granite/granite-3.2-2b-instruct", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen1.7B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-1.7B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-1.7B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen4B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-4B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-4B", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_qwen8B_tp1_random_128_128", - "server_parameters": { - "model": "Qwen/Qwen3-8B", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "Qwen/Qwen3-8B", - "dataset_name": "random", - "random-input-len": 128, - 
"random-output-len": 128 - } - }, - { - "test_name": "serving_glm9B_tp1_random_128_128", - "server_parameters": { - "model": "zai-org/glm-4-9b-hf", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "zai-org/glm-4-9b-hf", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } - }, - { - "test_name": "serving_gemma7B_tp1_random_128_128", - "server_parameters": { - "model": "google/gemma-7b", - "tensor_parallel_size": 1 - }, - "client_parameters": { - "model": "google/gemma-7b", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128 - } } ] } diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index aaa9b28ab..431de0d6a 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -176,7 +176,7 @@ For the full and up-to-date list of models validated on CPU platforms, please se ### How to find benchmark configuration examples for supported CPU models? -For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in [cpu test cases](../../../.buildkite/performance-benchmarks/tests/serving-tests-cpu.json) +For any model listed under [Supported Models on CPU](../../models/hardware_supported_models/cpu.md), optimized runtime configurations are provided in the vLLM Benchmark Suite’s CPU test cases, defined in cpu test cases as serving-tests-cpu.json. Full test cases for Text-only models, Multi-Modal models and Embedded models are in cpu Text-Only test cases as serving-tests-cpu-text.json, cpu Multi-Modal test cases as serving-tests-cpu-multimodal.json and cpu Embedded test cases as serving-tests-cpu-embed.json. 
For details on how these optimized configurations are determined, see: [performance-benchmark-details](../../../.buildkite/performance-benchmarks/README.md#performance-benchmark-details). To benchmark the supported models using these optimized settings, follow the steps in [running vLLM Benchmark Suite manually](../../benchmarking/dashboard.md#manually-trigger-the-benchmark) and run the Benchmark Suite on a CPU environment. @@ -199,6 +199,28 @@ lscpu | grep "NUMA node(s):" | awk '{print $3}' For performance reference, users may also consult the [vLLM Performance Dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm&deviceName=cpu) , which publishes default-model CPU results produced using the same Benchmark Suite. +#### Dry-Run + +For users who only need the optimized runtime configurations without running the benchmark, a Dry-Run mode is provided. +When the environment variable DRY_RUN=1 is passed to run-performance-benchmarks.sh, +all commands are generated under `./benchmarks/results/` without being executed. + +```bash +ON_CPU=1 DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + +By providing a different JSON file, users can get runtime configurations for other kinds of models, such as embedding models. + +```bash +ON_CPU=1 SERVING_JSON=serving-tests-cpu-embed.json DRY_RUN=1 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + +When MODEL_FILTER and DTYPE_FILTER are provided, only the commands for the matching model ID and data type are generated. + +```bash +ON_CPU=1 SERVING_JSON=serving-tests-cpu-text.json DRY_RUN=1 MODEL_FILTER=meta-llama/Llama-3.1-8B-Instruct DTYPE_FILTER=bfloat16 bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh +``` + ### How to decide `VLLM_CPU_OMP_THREADS_BIND`? - Default `auto` thread-binding is recommended for most cases.
Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to the same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If you have any performance problems or unexpected binding behaviours, please try to bind threads as following. -- GitLab From f5897613fb270fb478cda362868713cc338f6be9 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com> Date: Thu, 12 Feb 2026 03:22:06 -0500 Subject: [PATCH 0132/1166] Fix Mistral config remap to accept compressed-tensors quantization #34028 (#34104) Signed-off-by: baonudesifeizhai --- vllm/transformers_utils/configs/mistral.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index 1a0e25021..aea990b07 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -198,6 +198,14 @@ def _remap_mistral_quantization_args(config: dict) -> dict: "quant_method": "fp8", "activation_scheme": "dynamic" if is_dynamic else "static", } + elif ( + str(quantization.get("quant_method", "")).lower().replace("_", "-") + == "compressed-tensors" + ): + # Pass through compressed-tensors config, while normalizing + # quant_method to the canonical community spelling. 
+ quantization["quant_method"] = "compressed-tensors" + config["quantization_config"] = quantization else: raise ValueError(f"Found unknown quantization='{quantization}' in config") -- GitLab From fb455ed547a63e97e15deccfc493f8eef7a2da5c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 12 Feb 2026 20:44:28 +0800 Subject: [PATCH 0133/1166] [V0 Deprecation] Remove code related to per-request logits processors (#34400) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_chat_error.py | 1 - .../openai/test_completion_error.py | 1 - .../entrypoints/openai/test_lora_resolvers.py | 1 - tests/entrypoints/openai/test_serving_chat.py | 1 - tests/v1/sample/test_sampling_params_e2e.py | 14 ------- vllm/config/model.py | 5 --- vllm/engine/arg_utils.py | 13 +----- .../openai/chat_completion/protocol.py | 22 ++-------- .../openai/chat_completion/serving.py | 9 ---- .../entrypoints/openai/completion/protocol.py | 19 --------- vllm/entrypoints/openai/completion/serving.py | 9 ---- vllm/sampling_params.py | 42 +++++-------------- 12 files changed, 15 insertions(+), 122 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 8a2894154..760ec8acb 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -45,7 +45,6 @@ class MockModelConfig: multimodal_config = MultiModalConfig() hf_config = MockHFConfig() hf_text_config = MockHFConfig() - logits_processor_pattern = None logits_processors: list[str] | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index bbf97534f..800bf75f0 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ b/tests/entrypoints/openai/test_completion_error.py @@ -44,7 +44,6 @@ class MockModelConfig: tokenizer_revision = None multimodal_config = MultiModalConfig() 
hf_config = MockHFConfig() - logits_processor_pattern = None logits_processors: list[str] | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index db7fbe2f8..56fe31556 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -45,7 +45,6 @@ class MockModelConfig: multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None - logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" allowed_media_domains: list[str] | None = None diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index ef9d944ab..b57f00ab7 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -521,7 +521,6 @@ class MockModelConfig: hf_config = MockHFConfig() hf_text_config = MockHFConfig() logits_processors: list[str] | None = None - logits_processor_pattern = None diff_sampling_param: dict | None = None allowed_local_media_path: str = "" allowed_media_domains: list[str] | None = None diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index a75a37bef..fff953323 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -144,20 +144,6 @@ def test_bad_words(llm): assert not contains_bad_word(new_text, new_tokens, bad_words_2) -def test_logits_processor(llm): - """Check that we reject logits processor.""" - - # This sample logits processor gives infinite score to the i-th token, - # where i is the length of the input sequence. - # We therefore expect the output token sequence to be [0, 1, 2, ...] 
- def pick_ith(token_ids, logits): - logits[len(token_ids)] = float("inf") - return logits - - with pytest.raises(ValueError): - _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) - - def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" diff --git a/vllm/config/model.py b/vllm/config/model.py index 5fd7d2d73..0a5ff385f 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -252,10 +252,6 @@ class ModelConfig: hf_overrides: HfOverrides = field(default_factory=dict) """If a dictionary, contains arguments to be forwarded to the Hugging Face config. If a callable, it is called to update the HuggingFace config.""" - logits_processor_pattern: str | None = None - """Optional regex pattern specifying valid logits processor qualified names - that can be passed with the `logits_processors` extra completion argument. - Defaults to `None`, which allows no processors.""" generation_config: str = "auto" """The folder path to the generation config. Defaults to `"auto"`, the generation config will be loaded from model path. 
If set to `"vllm"`, no @@ -342,7 +338,6 @@ class ModelConfig: "config_format", "hf_token", "hf_overrides", - "logits_processor_pattern", "override_attention_dtype", "logits_processors", "io_processor_plugin", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2d1e2feb9..84176e207 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -508,8 +508,6 @@ class EngineArgs: reasoning_parser: str = StructuredOutputsConfig.reasoning_parser reasoning_parser_plugin: str | None = None - logits_processor_pattern: str | None = ModelConfig.logits_processor_pattern - speculative_config: dict[str, Any] | None = None show_hidden_metrics_for_version: str | None = ( @@ -710,9 +708,6 @@ class EngineArgs: ) model_group.add_argument("--hf-overrides", **model_kwargs["hf_overrides"]) model_group.add_argument("--pooler-config", **model_kwargs["pooler_config"]) - model_group.add_argument( - "--logits-processor-pattern", **model_kwargs["logits_processor_pattern"] - ) model_group.add_argument( "--generation-config", **model_kwargs["generation_config"] ) @@ -1320,7 +1315,6 @@ class EngineArgs: mm_encoder_tp_mode=self.mm_encoder_tp_mode, mm_encoder_attn_backend=self.mm_encoder_attn_backend, pooler_config=self.pooler_config, - logits_processor_pattern=self.logits_processor_pattern, generation_config=self.generation_config, override_generation_config=self.override_generation_config, enable_sleep_mode=self.enable_sleep_mode, @@ -1429,7 +1423,7 @@ class EngineArgs: self.model_weights = model_config.model_weights self.tokenizer = model_config.tokenizer - self._check_feature_supported(model_config) + self._check_feature_supported() self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( usage_context, model_config @@ -1831,11 +1825,8 @@ class EngineArgs: return config - def _check_feature_supported(self, model_config: ModelConfig): + def _check_feature_supported(self): """Raise an error if the 
feature is not supported.""" - if self.logits_processor_pattern != EngineArgs.logits_processor_pattern: - _raise_unsupported_error(feature_name="--logits-processor-pattern") - # No Concurrent Partial Prefills so far. if ( self.max_num_partial_prefills != SchedulerConfig.max_num_partial_prefills diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py index d905a59af..71e59152a 100644 --- a/vllm/entrypoints/openai/chat_completion/protocol.py +++ b/vllm/entrypoints/openai/chat_completion/protocol.py @@ -26,13 +26,11 @@ from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, FunctionDefinition, LegacyStructuralTagResponseFormat, - LogitsProcessors, OpenAIBaseModel, StreamOptions, StructuralTagResponseFormat, ToolCall, UsageInfo, - get_logits_processors, ) from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger @@ -293,19 +291,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "through out the inference process and return in response." ), ) - logits_processors: LogitsProcessors | None = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." - ), - ) + return_tokens_as_token_ids: bool | None = Field( default=None, description=( @@ -324,6 +310,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "need to map generated text back to input tokens." ), ) + cache_salt: str | None = Field( default=None, description=( @@ -335,6 +322,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "to 256 bit)." 
), ) + kv_transfer_params: dict[str, Any] | None = Field( default=None, description="KVTransfer parameters used for disaggregated serving.", @@ -417,7 +405,6 @@ class ChatCompletionRequest(OpenAIBaseModel): def to_sampling_params( self, max_tokens: int, - logits_processor_pattern: str | None, default_sampling_params: dict, ) -> SamplingParams: # Default parameters @@ -502,9 +489,6 @@ class ChatCompletionRequest(OpenAIBaseModel): min_tokens=self.min_tokens, skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, - logits_processors=get_logits_processors( - self.logits_processors, logits_processor_pattern - ), include_stop_str_in_output=self.include_stop_str_in_output, truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 761ae9a50..7b54e6daf 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -86,7 +86,6 @@ from vllm.tool_parsers import ToolParser from vllm.tool_parsers.mistral_tool_parser import MistralToolCall from vllm.tool_parsers.utils import partial_json_loads from vllm.utils.collection_utils import as_list -from vllm.v1.sample.logits_processor import validate_logits_processors_parameters logger = init_logger(__name__) @@ -130,9 +129,6 @@ class OpenAIServingChat(OpenAIServing): self.enable_log_outputs = enable_log_outputs self.enable_log_deltas = enable_log_deltas - # set up logits processors - self.logits_processors = self.model_config.logits_processors - # set up reasoning parser self.reasoning_parser_cls = ParserManager.get_reasoning_parser( reasoning_parser_name=reasoning_parser @@ -403,13 +399,8 @@ class OpenAIServingChat(OpenAIServing): else: sampling_params = request.to_sampling_params( max_tokens, - self.model_config.logits_processor_pattern, self.default_sampling_params, 
) - validate_logits_processors_parameters( - self.logits_processors, - sampling_params, - ) self._log_inputs( sub_request_id, diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py index aab733082..904c9eca4 100644 --- a/vllm/entrypoints/openai/completion/protocol.py +++ b/vllm/entrypoints/openai/completion/protocol.py @@ -15,12 +15,10 @@ from vllm.config import ModelConfig from vllm.entrypoints.openai.engine.protocol import ( AnyResponseFormat, LegacyStructuralTagResponseFormat, - LogitsProcessors, OpenAIBaseModel, StreamOptions, StructuralTagResponseFormat, UsageInfo, - get_logits_processors, ) from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger @@ -117,19 +115,6 @@ class CompletionRequest(OpenAIBaseModel): "through out the inference process and return in response." ), ) - logits_processors: LogitsProcessors | None = Field( - default=None, - description=( - "A list of either qualified names of logits processors, or " - "constructor objects, to apply when sampling. A constructor is " - "a JSON object with a required 'qualname' field specifying the " - "qualified name of the processor class/factory, and optional " - "'args' and 'kwargs' fields containing positional and keyword " - "arguments. For example: {'qualname': " - "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': " - "{'param': 'value'}}." 
- ), - ) return_tokens_as_token_ids: bool | None = Field( default=None, @@ -221,7 +206,6 @@ class CompletionRequest(OpenAIBaseModel): def to_sampling_params( self, max_tokens: int, - logits_processor_pattern: str | None, default_sampling_params: dict | None = None, ) -> SamplingParams: if default_sampling_params is None: @@ -312,9 +296,6 @@ class CompletionRequest(OpenAIBaseModel): skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - logits_processors=get_logits_processors( - self.logits_processors, logits_processor_pattern - ), truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py index 0353625fe..994cc094a 100644 --- a/vllm/entrypoints/openai/completion/serving.py +++ b/vllm/entrypoints/openai/completion/serving.py @@ -42,7 +42,6 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import merge_async_iterators from vllm.utils.collection_utils import as_list -from vllm.v1.sample.logits_processor import validate_logits_processors_parameters logger = init_logger(__name__) @@ -67,9 +66,6 @@ class OpenAIServingCompletion(OpenAIServing): log_error_stack=log_error_stack, ) - # set up logits processors - self.logits_processors = self.model_config.logits_processors - self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage @@ -178,13 +174,8 @@ class OpenAIServingCompletion(OpenAIServing): else: sampling_params = request.to_sampling_params( max_tokens, - self.model_config.logits_processor_pattern, self.default_sampling_params, ) - validate_logits_processors_parameters( - self.logits_processors, - sampling_params, - ) request_id_item = 
f"{request_id}-{i}" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dd354190f..5603e5dc4 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -15,7 +15,6 @@ from pydantic.dataclasses import dataclass from vllm.config import ModelConfig, SpeculativeConfig, StructuredOutputsConfig from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger -from vllm.logits_process import LogitsProcessor from vllm.tokenizers import TokenizerLike from vllm.v1.serial_utils import PydanticMsgspecMixin @@ -207,11 +206,6 @@ class SamplingParams( """Whether to skip special tokens in the output.""" spaces_between_special_tokens: bool = True """Whether to add spaces between special tokens in the output.""" - # `list[LogitsProcessor] | None` type. We use Any here because - # `list[LogitsProcessor] | None` type is not supported by msgspec. - logits_processors: Any | None = None - """Functions that modify logits based on previously generated tokens, and - optionally prompt tokens as a first argument.""" include_stop_str_in_output: bool = False """Whether to include the stop strings in output text.""" truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None @@ -277,7 +271,6 @@ class SamplingParams( detokenize: bool = True, skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, - logits_processors: list[LogitsProcessor] | None = None, truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None, output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE, structured_outputs: StructuredOutputsParams | None = None, @@ -318,7 +311,6 @@ class SamplingParams( detokenize=detokenize, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens, - logits_processors=logits_processors, truncate_prompt_tokens=truncate_prompt_tokens, output_kind=output_kind, structured_outputs=structured_outputs, @@ -455,11 +447,6 @@ class SamplingParams( 
parameter="prompt_logprobs", value=self.prompt_logprobs, ) - if self.logits_processors: - # TODO: Remove `logits_processors` attribute - raise ValueError( - "vLLM V1 does not support per request user-provided logits processors." - ) if self.truncate_prompt_tokens is not None and ( self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1 ): @@ -573,28 +560,11 @@ class SamplingParams( return self._bad_words_token_ids def clone(self) -> "SamplingParams": - """Deep copy, but maybe not the LogitsProcessor objects. - - LogitsProcessor objects may contain an arbitrary, nontrivial amount of - data that is expensive to copy. However, if not copied, the processor - needs to support parallel decoding for multiple sequences - See https://github.com/vllm-project/vllm/issues/3087 - - If skip_clone is True, uses shallow copy instead of deep copy. - """ - + """If skip_clone is True, uses shallow copy instead of deep copy.""" if self.skip_clone: return copy.copy(self) - logit_processor_refs = ( - None - if self.logits_processors is None - else { - id(lp): lp.clone() if hasattr(lp, "clone") else lp - for lp in self.logits_processors - } - ) - return copy.deepcopy(self, memo=logit_processor_refs) + return copy.deepcopy(self) def verify( self, @@ -605,6 +575,7 @@ class SamplingParams( ) -> None: self._validate_logprobs(model_config) self._validate_logit_bias(model_config) + self._validate_logits_processors(model_config) self._validate_allowed_token_ids(tokenizer) self._validate_spec_decode(speculative_config) self._validate_structured_outputs(structured_outputs_config, tokenizer) @@ -658,6 +629,13 @@ class SamplingParams( value=invalid_token_ids, ) + def _validate_logits_processors(self, model_config: ModelConfig) -> None: + from vllm.v1.sample.logits_processor import ( + validate_logits_processors_parameters, + ) + + validate_logits_processors_parameters(model_config.logits_processors, self) + def _validate_allowed_token_ids(self, tokenizer: TokenizerLike | None) -> None: 
allowed_token_ids = self.allowed_token_ids if allowed_token_ids is None: -- GitLab From 8a798be929d62a6467fd079c03c83632f8231b11 Mon Sep 17 00:00:00 2001 From: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com> Date: Thu, 12 Feb 2026 07:06:33 -0600 Subject: [PATCH 0134/1166] [ROCm] Enable MXFP4 MoE weight pre-shuffling on gfx950 and update aiter (#34192) Signed-off-by: Doug Lehr Co-authored-by: Doug Lehr Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: tjtanaavllm --- docker/Dockerfile.rocm_base | 6 +++--- .../model_executor/layers/quantization/quark/quark_moe.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 948f8dc56..c6e972e89 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,5 +1,5 @@ ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete -ARG TRITON_BRANCH="f332c492" +ARG TRITON_BRANCH="57c693b6" ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG PYTORCH_BRANCH="89075173" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" @@ -9,7 +9,7 @@ ARG PYTORCH_AUDIO_BRANCH="v2.9.0" ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git" ARG FA_BRANCH="0e60e394" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="6af8b687" +ARG AITER_BRANCH="v0.1.10.post2" ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG MORI_BRANCH="2d02c6a9" ARG MORI_REPO="https://github.com/ROCm/mori.git" @@ -239,7 +239,7 @@ RUN pip install pyyaml && cd aiter \ export HIP_CLANG_PATH=/opt/sccache-wrappers \ && sccache --show-stats; \ fi \ - && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \ + && GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \ && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \ && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install diff 
--git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 555b94c1c..66db09505 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -933,7 +933,15 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer.w2_weight.view(self.fp4_dtype), requires_grad=layer.w2_weight.requires_grad, ) + # Pre-shuffle weight + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + layer.w13_weight.is_shuffled = True + layer.w2_weight.is_shuffled = True torch.cuda.empty_cache() def get_fused_moe_quant_config( -- GitLab From dea63512bb9bdf7521d591546c52138d9d79e8ce Mon Sep 17 00:00:00 2001 From: danisereb Date: Thu, 12 Feb 2026 16:09:55 +0200 Subject: [PATCH 0135/1166] Add config file for fused MoE for Nemotron (TP4, B200) (#34411) Signed-off-by: Daniel Serebrenik --- .../E=512,N=672,device_name=NVIDIA_B200.json | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=512,N=672,device_name=NVIDIA_B200.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=512,N=672,device_name=NVIDIA_B200.json b/vllm/model_executor/layers/fused_moe/configs/E=512,N=672,device_name=NVIDIA_B200.json new file mode 100644 index 000000000..ac46a8afb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=512,N=672,device_name=NVIDIA_B200.json @@ -0,0 +1,59 @@ +{ + "triton_version": "3.6.0", + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + 
"64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "512": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "1024": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + } +} -- GitLab From 7b5a8b4a9dd6eb26057e3c8e0fa07db0d89f6d54 Mon Sep 17 00:00:00 2001 From: Aaron Hao Date: Thu, 12 Feb 2026 08:19:13 -0800 Subject: [PATCH 0136/1166] [BUG] Reset running requests when clearing cache for pause/resume (#34382) Signed-off-by: hao-aaron --- vllm/v1/engine/async_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2d608b11a..d6ef94880 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -793,7 +793,7 @@ class AsyncLLM(EngineClient): # Clear cache if clear_cache: - await self.reset_prefix_cache() + await self.reset_prefix_cache(reset_running_requests=True) await self.reset_mm_cache() await self.reset_encoder_cache() -- GitLab From 334c715e0f4f4de2d3de90bd0b9bba59df143eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 12 Feb 2026 18:01:51 +0100 Subject: [PATCH 0137/1166] [Docs] Spec decoding docs warning removal (#34439) Signed-off-by: NickLucche --- docs/features/spec_decode/README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docs/features/spec_decode/README.md b/docs/features/spec_decode/README.md index 0d19ef839..0cc77ad4b 100644 --- a/docs/features/spec_decode/README.md +++ 
b/docs/features/spec_decode/README.md @@ -1,10 +1,5 @@ # Speculative Decoding -!!! warning - Please note that speculative decoding in vLLM is not yet optimized and does - not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. - The work to optimize it is ongoing and can be followed here: - !!! warning Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -- GitLab From f2c47886fdbabfeae7ddad871ee7889ee472d026 Mon Sep 17 00:00:00 2001 From: Matthew Bonanni Date: Thu, 12 Feb 2026 12:21:54 -0500 Subject: [PATCH 0138/1166] [Attention] Add FlashInfer Sparse MLA backend (#33451) Signed-off-by: Matthew Bonanni Signed-off-by: Lucas Wilkinson Co-authored-by: Lucas Wilkinson Co-authored-by: Lucas Wilkinson --- benchmarks/attention_benchmarks/benchmark.py | 47 ++- benchmarks/attention_benchmarks/common.py | 68 +++- .../configs/mla_decode.yaml | 29 +- .../configs/mla_mixed_batch.yaml | 8 +- .../configs/mla_prefill.yaml | 62 +++ .../configs/reorder_threshold.yaml | 11 +- .../configs/speculative_decode.yaml | 15 +- .../configs/standard_attention.yaml | 8 +- benchmarks/attention_benchmarks/mla_runner.py | 217 +++++++---- benchmarks/attention_benchmarks/runner.py | 51 ++- docs/design/attention_backends.md | 2 + .../v1/attention/test_sparse_mla_backends.py | 250 +++++++++---- .../generate_attention_backend_docs.py | 44 ++- .../layers/attention/mla_attention.py | 1 + vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 51 ++- vllm/platforms/interface.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/xpu.py | 1 + .../backends/mla/flashinfer_mla_sparse.py | 353 ++++++++++++++++++ .../attention/backends/mla/flashmla_sparse.py | 164 +------- .../v1/attention/backends/mla/sparse_utils.py | 191 ++++++++++ vllm/v1/attention/backends/registry.py | 4 + vllm/v1/attention/selector.py | 7 +- 24 files changed, 1180 insertions(+), 407 deletions(-) create mode 100644 benchmarks/attention_benchmarks/configs/mla_prefill.yaml 
create mode 100644 vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py create mode 100644 vllm/v1/attention/backends/mla/sparse_utils.py diff --git a/benchmarks/attention_benchmarks/benchmark.py b/benchmarks/attention_benchmarks/benchmark.py index ba11fca74..de56cbac8 100644 --- a/benchmarks/attention_benchmarks/benchmark.py +++ b/benchmarks/attention_benchmarks/benchmark.py @@ -43,6 +43,7 @@ from common import ( ModelParameterSweep, ParameterSweep, ResultsFormatter, + batch_spec_sort_key, is_mla_backend, ) @@ -218,10 +219,13 @@ def run_model_parameter_sweep( by_param_and_spec[key].append(r) break - # Sort by param value then spec + # Sort by param value then spec (batch_size, q_len, kv_len) sorted_keys = sorted( by_param_and_spec.keys(), - key=lambda x: (int(x[0]) if x[0].isdigit() else x[0], x[1]), + key=lambda x: ( + int(x[0]) if x[0].isdigit() else x[0], + batch_spec_sort_key(x[1]), + ), ) current_param_value = None @@ -330,7 +334,7 @@ def run_parameter_sweep( by_spec[spec] = [] by_spec[spec].append(r) - for spec in sorted(by_spec.keys()): + for spec in sorted(by_spec.keys(), key=batch_spec_sort_key): results = by_spec[spec] best = min(results, key=lambda r: r.mean_time) console.print( @@ -496,15 +500,18 @@ def main(): if "description" in yaml_config: console.print(f"[dim]{yaml_config['description']}[/]") - # Override args with YAML values - # (YAML takes precedence unless CLI arg was explicitly set) - # Backend(s) - if "backend" in yaml_config: - args.backend = yaml_config["backend"] - args.backends = None - elif "backends" in yaml_config: - args.backends = yaml_config["backends"] - args.backend = None + # Override args with YAML values, but CLI args take precedence + # Check if CLI provided backends (they would be non-None and not default) + cli_backends_provided = args.backends is not None or args.backend is not None + + # Backend(s) - only use YAML if CLI didn't specify + if not cli_backends_provided: + if "backend" in yaml_config: + args.backend = 
yaml_config["backend"] + args.backends = None + elif "backends" in yaml_config: + args.backends = yaml_config["backends"] + args.backend = None # Check for special modes if "mode" in yaml_config: @@ -544,13 +551,15 @@ def main(): args.num_kv_heads = model.get("num_kv_heads", args.num_kv_heads) args.block_size = model.get("block_size", args.block_size) - # Benchmark settings - if "benchmark" in yaml_config: - bench = yaml_config["benchmark"] - args.device = bench.get("device", args.device) - args.repeats = bench.get("repeats", args.repeats) - args.warmup_iters = bench.get("warmup_iters", args.warmup_iters) - args.profile_memory = bench.get("profile_memory", args.profile_memory) + # Benchmark settings (top-level keys) + if "device" in yaml_config: + args.device = yaml_config["device"] + if "repeats" in yaml_config: + args.repeats = yaml_config["repeats"] + if "warmup_iters" in yaml_config: + args.warmup_iters = yaml_config["warmup_iters"] + if "profile_memory" in yaml_config: + args.profile_memory = yaml_config["profile_memory"] # Parameter sweep configuration if "parameter_sweep" in yaml_config: diff --git a/benchmarks/attention_benchmarks/common.py b/benchmarks/attention_benchmarks/common.py index 190b2f977..1de8bb0a5 100644 --- a/benchmarks/attention_benchmarks/common.py +++ b/benchmarks/attention_benchmarks/common.py @@ -16,13 +16,32 @@ from batch_spec import get_batch_type, parse_batch_spec from rich.console import Console from rich.table import Table + +def batch_spec_sort_key(spec: str) -> tuple[int, int, int]: + """ + Extract sorting key from batch spec: (batch_size, max_q_len, max_kv_len). + + This ensures results are sorted by batch size first, then query length, + then sequence length, rather than alphabetically. 
+ """ + try: + requests = parse_batch_spec(spec) + batch_size = len(requests) + max_q_len = max(r.q_len for r in requests) if requests else 0 + max_kv_len = max(r.kv_len for r in requests) if requests else 0 + return (batch_size, max_q_len, max_kv_len) + except Exception: + # Fallback for unparseable specs + return (0, 0, 0) + + # Mock classes for vLLM attention infrastructure class MockHfConfig: """Mock HuggingFace config that satisfies vLLM's requirements.""" - def __init__(self, mla_dims: dict): + def __init__(self, mla_dims: dict, index_topk: int | None = None): self.num_attention_heads = mla_dims["num_q_heads"] self.num_key_value_heads = mla_dims["num_kv_heads"] self.hidden_size = mla_dims["head_dim"] * mla_dims["num_q_heads"] @@ -33,6 +52,8 @@ class MockHfConfig: self.qk_rope_head_dim = mla_dims["qk_rope_head_dim"] self.v_head_dim = mla_dims["v_head_dim"] self.qk_head_dim = mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"] + if index_topk is not None: + self.index_topk = index_topk def get_text_config(self): return self @@ -83,6 +104,38 @@ class MockKVBProj: return (result,) # Return as tuple to match ColumnParallelLinear API +class MockIndexer: + """Mock Indexer for sparse MLA backends. + + Provides topk_indices_buffer that sparse MLA backends use to determine + which KV cache slots to attend to for each token. 
+ """ + + def __init__( + self, + max_num_tokens: int, + topk_tokens: int, + device: torch.device, + ): + self.topk_tokens = topk_tokens + self.topk_indices_buffer = torch.zeros( + (max_num_tokens, topk_tokens), + dtype=torch.int32, + device=device, + ) + + def fill_random_indices(self, num_tokens: int, max_kv_len: int): + """Fill topk_indices_buffer with random valid indices for benchmarking.""" + indices = torch.randint( + 0, + max_kv_len, + (num_tokens, self.topk_tokens), + dtype=torch.int32, + device=self.topk_indices_buffer.device, + ) + self.topk_indices_buffer[:num_tokens] = indices + + class MockLayer(AttentionLayerBase): """Mock attention layer with scale parameters and impl. @@ -327,6 +380,9 @@ class ResultsFormatter: specs_order.append(spec) by_spec[spec][r.config.backend] = r + # Sort specs by (batch_size, q_len, kv_len) instead of alphabetically + specs_order = sorted(by_spec.keys(), key=batch_spec_sort_key) + # Create shortened backend names for display def shorten_backend_name(name: str) -> str: """Shorten long backend names for table display.""" @@ -493,10 +549,11 @@ def get_attention_scale(head_dim: int) -> float: def is_mla_backend(backend: str) -> bool: """ - Check if backend is an MLA backend using the backend's is_mla() property. + Check if backend is an MLA backend using the AttentionBackendEnum. 
Args: - backend: Backend name (e.g., "CUTLASS_MLA", "FLASHINFER_MLA") + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASHMLA_SPARSE") Returns: True if the backend is an MLA backend, False otherwise @@ -504,7 +561,8 @@ def is_mla_backend(backend: str) -> bool: from vllm.v1.attention.backends.registry import AttentionBackendEnum try: - backend_class = AttentionBackendEnum[backend.upper()].get_class() + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() return backend_class.is_mla() - except (KeyError, ValueError, ImportError): + except (KeyError, ValueError, ImportError, AttributeError): return False diff --git a/benchmarks/attention_benchmarks/configs/mla_decode.yaml b/benchmarks/attention_benchmarks/configs/mla_decode.yaml index aaf4eec9b..d758654db 100644 --- a/benchmarks/attention_benchmarks/configs/mla_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_decode.yaml @@ -3,7 +3,7 @@ model: name: "deepseek-v3" num_layers: 60 - num_q_heads: 128 + num_q_heads: 128 # Base value, can be swept for TP simulation num_kv_heads: 1 # MLA uses single latent KV head_dim: 576 kv_lora_rank: 512 @@ -12,6 +12,13 @@ model: v_head_dim: 128 block_size: 128 # CUTLASS MLA and FlashAttn MLA use 128 +# Model parameter sweep: simulate tensor parallelism by varying num_q_heads +# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads +model_parameter_sweep: + param_name: "num_q_heads" + values: [128, 64, 32, 16] + label_format: "{backend}_{value}h" + batch_specs: # Small batches, varying sequence lengths - "16q1s512" # 16 requests, 512 KV cache @@ -34,28 +41,30 @@ batch_specs: # Very large batches - "128q1s1k" # 128 requests, 1k KV cache - "128q1s2k" # 128 requests, 2k KV cache + - "128q1s4k" # 128 requests, 4k KV cache + - "128q1s8k" # 128 requests, 8k KV cache # Long context - "32q1s16k" # 32 requests, 16k KV cache - "32q1s32k" # 32 requests, 32k KV cache backends: - - cutlass_mla - - flashinfer_mla - - 
flashattn_mla # Hopper only - - flashmla # Hopper only + - CUTLASS_MLA + - FLASHINFER_MLA + - FLASH_ATTN_MLA # Hopper only + - FLASHMLA # Hopper only device: "cuda:0" -repeats: 5 -warmup_iters: 3 +repeats: 100 +warmup_iters: 10 profile_memory: true # Backend-specific tuning -cutlass_mla: +CUTLASS_MLA: num_kv_splits: auto # or specific value like 4, 8, 16 -flashattn_mla: +FLASH_ATTN_MLA: reorder_batch_threshold: 512 -flashmla: +FLASHMLA: reorder_batch_threshold: 1 diff --git a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml index ad3c0dced..b555d90cb 100644 --- a/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml +++ b/benchmarks/attention_benchmarks/configs/mla_mixed_batch.yaml @@ -45,10 +45,10 @@ batch_specs: - "4q4k_60q1s4k" # 4 prefill + 60 decode backends: - - cutlass_mla - - flashinfer_mla - - flashattn_mla # Hopper only - - flashmla # Hopper only + - CUTLASS_MLA + - FLASHINFER_MLA + - FLASH_ATTN_MLA # Hopper only + - FLASHMLA # Hopper only device: "cuda:0" repeats: 5 diff --git a/benchmarks/attention_benchmarks/configs/mla_prefill.yaml b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml new file mode 100644 index 000000000..ef6b2cb07 --- /dev/null +++ b/benchmarks/attention_benchmarks/configs/mla_prefill.yaml @@ -0,0 +1,62 @@ +# MLA prefill-only benchmark configuration for sparse backends + +model: + name: "deepseek-v3" + num_layers: 60 + num_q_heads: 128 + num_kv_heads: 1 + head_dim: 576 + kv_lora_rank: 512 + qk_nope_head_dim: 128 + qk_rope_head_dim: 64 + v_head_dim: 128 + block_size: 128 + +# Model parameter sweep: simulate tensor parallelism by varying num_q_heads +# TP=1: 128 heads, TP=2: 64 heads, TP=4: 32 heads, TP=8: 16 heads +model_parameter_sweep: + param_name: "num_q_heads" + values: [128, 64, 32, 16] + label_format: "{backend}_{value}h" + +batch_specs: + # Pure prefill + - "1q512" + - "1q1k" + - "1q2k" + - "1q4k" + - "1q8k" + + # Batched pure prefill + - 
"2q512" + - "2q1k" + - "2q2k" + - "2q4k" + - "2q8k" + - "4q512" + - "4q1k" + - "4q2k" + - "4q4k" + - "4q8k" + - "8q512" + - "8q1k" + - "8q2k" + - "8q4k" + - "8q8k" + + # Extend + - "1q512s4k" + - "1q512s8k" + - "1q1ks8k" + - "1q2ks8k" + - "1q2ks16k" + - "1q4ks16k" + +backends: + - FLASHMLA_SPARSE + - FLASHINFER_MLA_SPARSE + +device: "cuda:0" +repeats: 10 +warmup_iters: 3 +profile_memory: true diff --git a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml index 1ea0a12b5..0d76ef0a3 100644 --- a/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml +++ b/benchmarks/attention_benchmarks/configs/reorder_threshold.yaml @@ -6,7 +6,7 @@ description: "Decode vs Prefill pipeline crossover analysis" # Test FlashAttn MLA -backend: flashattn_mla +backend: FLASH_ATTN_MLA # Mode: decode_vs_prefill comparison (special sweep mode) # For each batch spec, we'll test both decode and prefill pipelines @@ -62,11 +62,10 @@ model: block_size: 128 # Benchmark settings -benchmark: - device: "cuda:0" - repeats: 15 # More repeats for spec decode variance - warmup_iters: 5 - profile_memory: false +device: "cuda:0" +repeats: 15 # More repeats for spec decode variance +warmup_iters: 5 +profile_memory: false # Output output: diff --git a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml index 56d2428fe..47b6d3604 100644 --- a/benchmarks/attention_benchmarks/configs/speculative_decode.yaml +++ b/benchmarks/attention_benchmarks/configs/speculative_decode.yaml @@ -41,18 +41,17 @@ batch_specs: # Backends that support query length > 1 backends: - - flashattn_mla # reorder_batch_threshold = 512 - - flashmla # reorder_batch_threshold = 1 (tunable) + - FLASH_ATTN_MLA # reorder_batch_threshold = 512 + - FLASHMLA # reorder_batch_threshold = 1 (tunable) # FlashInfer-MLA also supports uniform spec-as-decode but with different mechanism -# - 
flashinfer_mla +# - FLASHINFER_MLA # Benchmark settings -benchmark: - device: "cuda:0" - repeats: 10 # More repeats for statistical significance - warmup_iters: 5 - profile_memory: false +device: "cuda:0" +repeats: 10 # More repeats for statistical significance +warmup_iters: 5 +profile_memory: false # Test these threshold values for optimization parameter_sweep: diff --git a/benchmarks/attention_benchmarks/configs/standard_attention.yaml b/benchmarks/attention_benchmarks/configs/standard_attention.yaml index 591db6837..deb5a4b27 100644 --- a/benchmarks/attention_benchmarks/configs/standard_attention.yaml +++ b/benchmarks/attention_benchmarks/configs/standard_attention.yaml @@ -36,11 +36,11 @@ batch_specs: - "q1ks2k" # 1k query, 2k sequence - "2q1ks4k" # 2 requests: 1k query, 4k sequence -# Available backends: flash, triton, flashinfer +# Available backends: FLASH_ATTN, TRITON_ATTN, FLASHINFER backends: - - flash - - triton - - flashinfer + - FLASH_ATTN + - TRITON_ATTN + - FLASHINFER device: "cuda:0" repeats: 5 diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 2c6c3aaac..ffcfa4572 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -8,14 +8,13 @@ This module provides helpers for running MLA backends without needing full VllmConfig integration. """ -import importlib - import numpy as np import torch from batch_spec import parse_batch_spec from common import ( BenchmarkResult, MockHfConfig, + MockIndexer, MockKVBProj, MockLayer, setup_mla_dims, @@ -62,6 +61,7 @@ def create_minimal_vllm_config( block_size: int = 128, max_num_seqs: int = 256, mla_dims: dict | None = None, + index_topk: int | None = None, ) -> VllmConfig: """ Create minimal VllmConfig for MLA benchmarks. @@ -73,6 +73,8 @@ def create_minimal_vllm_config( max_num_seqs: Maximum number of sequences mla_dims: Optional custom MLA dimensions dict. 
If not provided, uses setup_mla_dims(model_name) + index_topk: Optional topk value for sparse MLA backends. If provided, + the config will include index_topk for sparse attention. Returns: VllmConfig for benchmarking @@ -82,7 +84,7 @@ def create_minimal_vllm_config( mla_dims = setup_mla_dims(model_name) # Create mock HF config first (avoids downloading from HuggingFace) - mock_hf_config = MockHfConfig(mla_dims) + mock_hf_config = MockHfConfig(mla_dims, index_topk=index_topk) # Create a temporary minimal config.json to avoid HF downloads # This ensures consistent ModelConfig construction without network access @@ -120,16 +122,12 @@ def create_minimal_vllm_config( seed=0, max_model_len=32768, quantization=None, - quantization_param_path=None, enforce_eager=False, - max_context_len_to_capture=None, - max_seq_len_to_capture=8192, max_logprobs=20, disable_sliding_window=False, skip_tokenizer_init=True, served_model_name=None, limit_mm_per_prompt=None, - use_async_output_proc=True, config_format="auto", ) finally: @@ -180,56 +178,65 @@ def create_minimal_vllm_config( # ============================================================================ -# Backend name to class name prefix mapping -_BACKEND_NAME_MAP = { - "flashattn_mla": "FlashAttnMLA", - "flashmla": "FlashMLA", - "flashinfer_mla": "FlashInferMLA", - "cutlass_mla": "CutlassMLA", -} - -# Special properties that differ from defaults +# Backend-specific properties that can't be inferred from the backend class +# Keys are AttentionBackendEnum names (uppercase) _BACKEND_PROPERTIES = { - "flashmla": { + "FLASHMLA": { "query_format": "concat", # Single concatenated tensor (vs tuple) - "block_size": 64, # FlashMLA uses fixed block size }, - "flashinfer_mla": { - "block_size": 64, # FlashInfer MLA only supports 32 or 64 + "FLASHMLA_SPARSE": { + "query_format": "concat", # Single concatenated tensor (vs tuple) }, } def _get_backend_config(backend: str) -> dict: """ - Get backend configuration using naming conventions. 
- - All MLA backends follow the pattern: - - Module: vllm.v1.attention.backends.mla.{backend} - - Impl: {Name}Impl - - Metadata: {Name}Metadata (or MLACommonMetadata) - - DecodeMetadata: {Name}DecodeMetadata (or MLACommonDecodeMetadata) - - MetadataBuilder: {Name}MetadataBuilder + Get backend configuration from AttentionBackendEnum. + + Uses the registry to get the backend class and extract configuration + from its methods (get_impl_cls, get_builder_cls, is_sparse, etc.). + + Args: + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASHMLA_SPARSE") + + Returns: + Dict with backend configuration """ - if backend not in _BACKEND_NAME_MAP: - raise ValueError(f"Unknown backend: {backend}") + from vllm.v1.attention.backends.registry import AttentionBackendEnum - name = _BACKEND_NAME_MAP[backend] + try: + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() + except (KeyError, ValueError) as e: + valid_backends = [e.name for e in AttentionBackendEnum if e.name != "CUSTOM"] + raise ValueError( + f"Unknown backend: {backend}. 
" + f"Valid MLA backends: {[b for b in valid_backends if 'MLA' in b]}" + ) from e + + # Get block size from backend class + block_sizes = backend_class.get_supported_kernel_block_sizes() + # Use first supported block size (backends typically support one for MLA) + block_size = block_sizes[0] if block_sizes else None + if hasattr(block_size, "value"): + # Handle MultipleOf enum + block_size = None + + # Check if sparse via class method if available + is_sparse = getattr(backend_class, "is_sparse", lambda: False)() + + # Get properties that can't be inferred props = _BACKEND_PROPERTIES.get(backend, {}) - # Check if backend uses common metadata (FlashInfer, CUTLASS) - uses_common = backend in ("flashinfer_mla", "cutlass_mla") - return { - "module": f"vllm.v1.attention.backends.mla.{backend}", - "impl_class": f"{name}Impl", - "metadata_class": "MLACommonMetadata" if uses_common else f"{name}Metadata", - "decode_metadata_class": "MLACommonDecodeMetadata" - if uses_common - else f"{name}DecodeMetadata", - "builder_class": f"{name}MetadataBuilder", + "backend_class": backend_class, + "impl_class": backend_class.get_impl_cls(), + "builder_class": backend_class.get_builder_cls(), "query_format": props.get("query_format", "tuple"), - "block_size": props.get("block_size", None), + "block_size": block_size, + "is_sparse": is_sparse, } @@ -447,22 +454,26 @@ def _create_backend_impl( mla_dims: dict, vllm_config: VllmConfig, device: torch.device, + max_num_tokens: int = 8192, + index_topk: int | None = None, ): """ Create backend implementation instance. 
Args: - backend_cfg: Backend configuration dict + backend_cfg: Backend configuration dict from _get_backend_config() mla_dims: MLA dimension configuration vllm_config: VllmConfig instance device: Target device + max_num_tokens: Maximum number of tokens for sparse indexer buffer + index_topk: Topk value for sparse MLA backends Returns: - Tuple of (impl, layer, builder_instance) + Tuple of (impl, layer, builder_instance, indexer) """ - # Import backend classes - backend_module = importlib.import_module(backend_cfg["module"]) - impl_class = getattr(backend_module, backend_cfg["impl_class"]) + # Get classes from backend config (already resolved by _get_backend_config) + impl_class = backend_cfg["impl_class"] + builder_class = backend_cfg["builder_class"] # Calculate scale scale = 1.0 / np.sqrt(mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"]) @@ -474,26 +485,44 @@ def _create_backend_impl( v_head_dim=mla_dims["v_head_dim"], ) + # Create indexer for sparse backends + indexer = None + if backend_cfg.get("is_sparse", False): + if index_topk is None: + index_topk = 2048 # Default topk for sparse MLA + indexer = MockIndexer( + max_num_tokens=max_num_tokens, + topk_tokens=index_topk, + device=device, + ) + + # Build impl kwargs + impl_kwargs = { + "num_heads": mla_dims["num_q_heads"], + "head_size": mla_dims["head_dim"], + "scale": scale, + "num_kv_heads": mla_dims["num_kv_heads"], + "alibi_slopes": None, + "sliding_window": None, + "kv_cache_dtype": "auto", + "logits_soft_cap": None, + "attn_type": "decoder", + "kv_sharing_target_layer_name": None, + "q_lora_rank": None, + "kv_lora_rank": mla_dims["kv_lora_rank"], + "qk_nope_head_dim": mla_dims["qk_nope_head_dim"], + "qk_rope_head_dim": mla_dims["qk_rope_head_dim"], + "qk_head_dim": mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], + "v_head_dim": mla_dims["v_head_dim"], + "kv_b_proj": mock_kv_b_proj, + } + + # Add indexer for sparse backends + if indexer is not None: + impl_kwargs["indexer"] = indexer 
+ # Create impl - impl = impl_class( - num_heads=mla_dims["num_q_heads"], - head_size=mla_dims["head_dim"], - scale=scale, - num_kv_heads=mla_dims["num_kv_heads"], - alibi_slopes=None, - sliding_window=None, - kv_cache_dtype="auto", - logits_soft_cap=None, - attn_type="decoder", - kv_sharing_target_layer_name=None, - q_lora_rank=None, - kv_lora_rank=mla_dims["kv_lora_rank"], - qk_nope_head_dim=mla_dims["qk_nope_head_dim"], - qk_rope_head_dim=mla_dims["qk_rope_head_dim"], - qk_head_dim=mla_dims["qk_nope_head_dim"] + mla_dims["qk_rope_head_dim"], - v_head_dim=mla_dims["v_head_dim"], - kv_b_proj=mock_kv_b_proj, - ) + impl = impl_class(**impl_kwargs) # Initialize DCP attributes if not hasattr(impl, "dcp_world_size") or impl.dcp_world_size in (None, -1): @@ -515,9 +544,7 @@ def _create_backend_impl( # Create builder instance if needed builder_instance = None - if backend_cfg["builder_class"]: - builder_class = getattr(backend_module, backend_cfg["builder_class"]) - + if builder_class: # Populate static_forward_context so builder can find the layer # MockLayer inherits from AttentionLayerBase, so isinstance checks pass vllm_config.compilation_config.static_forward_context = {"placeholder": layer} @@ -529,7 +556,7 @@ def _create_backend_impl( device=device, ) - return impl, layer, builder_instance + return impl, layer, builder_instance, indexer # ============================================================================ @@ -594,6 +621,7 @@ def _run_single_benchmark( backend_cfg: dict, mla_dims: dict, device: torch.device, + indexer=None, ) -> BenchmarkResult: """ Run a single benchmark iteration. 
@@ -606,6 +634,7 @@ def _run_single_benchmark( backend_cfg: Backend configuration dict mla_dims: MLA dimension configuration device: Target device + indexer: Optional MockIndexer for sparse backends Returns: BenchmarkResult with timing statistics @@ -613,7 +642,9 @@ def _run_single_benchmark( # Parse batch spec requests = parse_batch_spec(config.batch_spec) q_lens = [r.q_len for r in requests] + kv_lens = [r.kv_len for r in requests] total_q = sum(q_lens) + max_kv_len = max(kv_lens) # Determine block size block_size = backend_cfg["block_size"] or config.block_size @@ -641,8 +672,16 @@ def _run_single_benchmark( torch.bfloat16, ) - # Determine which forward method to use based on metadata - if metadata.decode is not None: + # Fill indexer with random indices for sparse backends + is_sparse = backend_cfg.get("is_sparse", False) + if is_sparse and indexer is not None: + indexer.fill_random_indices(total_q, max_kv_len) + + # Determine which forward method to use + if is_sparse: + # Sparse backends use forward_mqa + forward_fn = lambda: impl.forward_mqa(decode_inputs, kv_cache, metadata, layer) + elif metadata.decode is not None: forward_fn = lambda: impl._forward_decode( decode_inputs, kv_cache, metadata, layer ) @@ -693,11 +732,13 @@ def _run_single_benchmark( def _run_mla_benchmark_batched( backend: str, configs_with_params: list[tuple], # [(config, threshold, num_splits), ...] + index_topk: int = 2048, ) -> list[BenchmarkResult]: """ Unified batched MLA benchmark runner for all backends. - Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse This function reuses backend initialization across multiple benchmarks to avoid setup/teardown overhead. 
@@ -707,6 +748,7 @@ def _run_mla_benchmark_batched( configs_with_params: List of (config, threshold, num_splits) tuples - threshold: reorder_batch_threshold (FlashAttn/FlashMLA only) - num_splits: num_kv_splits (CUTLASS only) + index_topk: Topk value for sparse MLA backends (default 2048) Returns: List of BenchmarkResult objects @@ -730,19 +772,27 @@ def _run_mla_benchmark_batched( if mla_dims is None: mla_dims = setup_mla_dims("deepseek-v3") + # Determine if this is a sparse backend + is_sparse = backend_cfg.get("is_sparse", False) + # Create and set vLLM config for MLA (reused across all benchmarks) vllm_config = create_minimal_vllm_config( model_name="deepseek-v3", # Used only for model path block_size=block_size, mla_dims=mla_dims, # Use custom dims from config or default + index_topk=index_topk if is_sparse else None, ) results = [] with set_current_vllm_config(vllm_config): - # Create backend impl, layer, and builder (reused across benchmarks) - impl, layer, builder_instance = _create_backend_impl( - backend_cfg, mla_dims, vllm_config, device + # Create backend impl, layer, builder, and indexer (reused across benchmarks) + impl, layer, builder_instance, indexer = _create_backend_impl( + backend_cfg, + mla_dims, + vllm_config, + device, + index_topk=index_topk if is_sparse else None, ) # Run each benchmark with the shared impl @@ -768,6 +818,7 @@ def _run_mla_benchmark_batched( backend_cfg, mla_dims, device, + indexer=indexer, ) results.append(result) @@ -793,20 +844,24 @@ def run_mla_benchmark( config, reorder_batch_threshold: int | None = None, num_kv_splits: int | None = None, + index_topk: int = 2048, ) -> BenchmarkResult | list[BenchmarkResult]: """ Unified MLA benchmark runner for all backends. - Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla + Works for: flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse Always uses batched execution internally for optimal performance. 
Args: - backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla) + backend: Backend name (flashattn_mla, flashmla, flashinfer_mla, cutlass_mla, + flashinfer_mla_sparse, flashmla_sparse) config: BenchmarkConfig or list of (BenchmarkConfig, param) tuples reorder_batch_threshold: Threshold override for FlashAttn/FlashMLA (single config mode only) num_kv_splits: Number of KV splits for CUTLASS (single config mode only) + index_topk: Topk value for sparse MLA backends (default 2048) Returns: BenchmarkResult (single mode) or list of BenchmarkResult (batched mode) @@ -816,9 +871,9 @@ def run_mla_benchmark( # Already in batched format if len(config) > 0 and isinstance(config[0], tuple): # Format: [(cfg, param), ...] where param is threshold or num_splits - if backend in ("flashattn_mla", "flashmla"): + if backend in ("flashattn_mla", "flashmla", "flashmla_sparse"): configs_with_params = [(cfg, param, None) for cfg, param in config] - else: # cutlass_mla or flashinfer_mla + else: # cutlass_mla, flashinfer_mla, or sparse backends configs_with_params = [(cfg, None, param) for cfg, param in config] else: # Format: [cfg, ...] 
- just configs @@ -830,7 +885,7 @@ def run_mla_benchmark( return_single = True # Use unified batched execution - results = _run_mla_benchmark_batched(backend, configs_with_params) + results = _run_mla_benchmark_batched(backend, configs_with_params, index_topk) # Return single result or list based on input return results[0] if return_single else results diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 79bfca681..6457a599a 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -40,29 +40,29 @@ from vllm.v1.kv_cache_interface import FullAttentionSpec # ============================================================================ -_BACKEND_CONFIG = { - "flash": { - "module": "vllm.v1.attention.backends.flash_attn", - "backend_class": "FlashAttentionBackend", - }, - "triton": { - "module": "vllm.v1.attention.backends.triton_attn", - "backend_class": "TritonAttentionBackend", - }, - "flashinfer": { - "module": "vllm.v1.attention.backends.flashinfer", - "backend_class": "FlashInferBackend", - }, -} +def _get_backend_config(backend: str) -> dict: + """ + Get backend configuration from AttentionBackendEnum. + Args: + backend: Backend name matching AttentionBackendEnum exactly + (e.g., "FLASH_ATTN", "TRITON_ATTN", "FLASHINFER") -def _get_backend_config(backend: str) -> dict: - if backend not in _BACKEND_CONFIG: + Returns: + Dict with backend_class + """ + from vllm.v1.attention.backends.registry import AttentionBackendEnum + + try: + backend_enum = AttentionBackendEnum[backend] + backend_class = backend_enum.get_class() + except (KeyError, ValueError) as e: + valid_backends = [b.name for b in AttentionBackendEnum if b.name != "CUSTOM"] raise ValueError( - f"Unknown backend: {backend}. " - f"Available: {', '.join(_BACKEND_CONFIG.keys())}" - ) - return _BACKEND_CONFIG[backend] + f"Unknown backend: {backend}. 
Valid backends: {valid_backends}" + ) from e + + return {"backend_class": backend_class} @contextmanager @@ -205,10 +205,7 @@ def _create_backend_impl( dtype: torch.dtype, ): """Create backend implementation instance.""" - import importlib - - backend_module = importlib.import_module(backend_cfg["module"]) - backend_class = getattr(backend_module, backend_cfg["backend_class"]) + backend_class = backend_cfg["backend_class"] scale = get_attention_scale(config.head_dim) @@ -247,7 +244,7 @@ def _create_metadata_builder( # Flashinfer needs get_per_layer_parameters mocked since we don't have # real model layers registered - if backend_name == "flashinfer": + if backend_name == "FLASHINFER": import unittest.mock from vllm.v1.attention.backends.utils import PerLayerParameters @@ -438,7 +435,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: """ Run standard attention benchmark with real kernels. - Supports: flash, triton, flashinfer + Supports: FLASH_ATTN, TRITON_ATTN, FLASHINFER Args: config: Benchmark configuration @@ -453,7 +450,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: requests = parse_batch_spec(config.batch_spec) - if config.backend == "flashinfer": + if config.backend == "FLASHINFER": requests = reorder_for_flashinfer(requests) q_lens = [r.q_len for r in requests] diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md index b551e31db..3244ce7cc 100644 --- a/docs/design/attention_backends.md +++ b/docs/design/attention_backends.md @@ -128,6 +128,7 @@ Priority is **1 = highest** (tried first). | 4 | `FLASHMLA` | | 5 | `TRITON_MLA` | | 6 | `FLASHMLA_SPARSE` | +| 7 | `FLASHINFER_MLA_SPARSE` | **Ampere/Hopper (SM 8.x-9.x):** @@ -204,6 +205,7 @@ configuration. 
|---------|--------|-----------|-------------|------------|------|--------|-----------|-----|-----------------|--------------| | `CUTLASS_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 128 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 10.x | | `FLASHINFER_MLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 32, 64 | Any | ❌ | ❌ | ❌ | ❌ | Decoder | 10.x | +| `FLASHINFER_MLA_SPARSE` | fp16, bf16 | `auto`, `bfloat16` | 32, 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 10.x | | `FLASHMLA` | fp16, bf16 | `auto`, `bfloat16`, `fp8`, `fp8_e4m3` | 64 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x-10.x | | `FLASHMLA_SPARSE` | bf16 | `auto`, `bfloat16`, `fp8_ds_mla` | 64 | 576 | ❌ | ✅ | ❌ | ❌ | Decoder | 9.x-10.x | | `FLASH_ATTN_MLA` | fp16, bf16 | `auto`, `bfloat16` | %16 | Any | ❌ | ❌ | ❌ | ✅ | Decoder | 9.x | diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index e4ffd12ca..fe9ca8289 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -1,11 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for the FlashMLA sparse backend utilities.""" +"""Unit tests for the sparse MLA backends and utilities.""" import math from types import MethodType, SimpleNamespace -import numpy as np import pytest import torch @@ -25,6 +24,9 @@ from vllm.config import set_current_vllm_config from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backends.mla.flashinfer_mla_sparse import ( + FlashInferMLASparseBackend, +) from vllm.v1.attention.backends.mla.flashmla_sparse import ( FlashMLASparseBackend, triton_convert_req_index_to_global_index, @@ -156,31 +158,47 @@ def _quantize_dequantize_fp8_ds_mla( return dequant_kv_c, dequant_k_pe +@pytest.mark.parametrize( + "backend_cls", + 
[FlashMLASparseBackend, FlashInferMLASparseBackend], + ids=["FlashMLA", "FlashInfer"], +) @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys())) -@pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"]) +@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_ds_mla"]) @pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4]) -@pytest.mark.skipif( - torch.cuda.get_device_capability() < (9, 0), - reason="FlashMLASparseBackend requires CUDA 9.0 or higher", -) +@pytest.mark.parametrize("block_size", [32, 64]) def test_sparse_backend_decode_correctness( default_vllm_config, dist_init, + backend_cls, batch_name, kv_cache_dtype, tensor_parallel_size, + block_size, workspace_init, ): - if current_platform.is_rocm(): - pytest.skip("ROCm does not support fp8_ds_mla data type for kv cache.") + if kv_cache_dtype not in backend_cls.supported_kv_cache_dtypes: + pytest.skip(f"{backend_cls.get_name()} does not support {kv_cache_dtype}") - if not torch.cuda.is_available(): - pytest.skip("CUDA is required for sparse MLA decode test") + supported_block_sizes = backend_cls.get_supported_kernel_block_sizes() + if block_size not in supported_block_sizes: + pytest.skip( + f"{backend_cls.get_name()} does not support block_size={block_size}" + ) - device = torch.device("cuda") - dtype = torch.bfloat16 + if backend_cls == FlashMLASparseBackend: + ok, reason = flashmla.is_flashmla_sparse_supported() + if not ok: + pytest.skip(reason) + elif backend_cls == FlashInferMLASparseBackend: + if not current_platform.has_device_capability(100): + pytest.skip("FlashInferMLASparseBackend requires SM 10.0 or higher") batch_spec = SPARSE_BACKEND_BATCH_SPECS[batch_name] + use_fp8_ds_mla_quantization = kv_cache_dtype == "fp8_ds_mla" + + device = torch.device("cuda") + dtype = torch.bfloat16 # Model hyper-parameters (kept intentionally small for the unit test) total_num_heads = 128 @@ -192,11 +210,10 @@ def test_sparse_backend_decode_correctness( 
qk_rope_head_dim = 64 v_head_dim = 128 head_size = kv_lora_rank + qk_rope_head_dim - topk_tokens = 2048 + topk_tokens = 128 max_seqlen = max(batch_spec.seq_lens) total_cache_tokens = sum(batch_spec.seq_lens) - block_size = 64 # Note: We use TP=1 to avoid multi-GPU requirements in CI. # The test simulates head partitioning via mocked methods below. @@ -247,11 +264,55 @@ def test_sparse_backend_decode_correctness( seq_lens = batch_spec.seq_lens query_lens = batch_spec.query_lens + # Pre-compute positions and sparse indices for all tokens. + # We need these BEFORE computing the reference to use sparse attention masks. + total_query_tokens = sum(query_lens) + positions = [] + for i in range(batch_spec.batch_size): + s_len = seq_lens[i] + q_len = query_lens[i] + ctx_len = s_len - q_len + for q_idx in range(q_len): + positions.append(ctx_len + q_idx) + + # Create sparse indices with UNIQUE per-token offsets to catch bugs where + # the kernel uses wrong indices for some tokens (e.g., due to incorrect + # tensor shapes like [1, num_tokens, ...] instead of [num_tokens, 1, ...]). + # Also include -1 masked indices to verify the kernel handles them correctly. 
+ sparse_indices = torch.empty( + total_query_tokens, topk_tokens, dtype=torch.int32, device=device + ) + for tok_idx in range(total_query_tokens): + max_valid_idx = positions[tok_idx] + offset = tok_idx * 7 # Prime number for varied offsets + # Use only half the topk indices as valid, mask the rest with -1 + # This tests that the kernel correctly ignores -1 indices + num_valid = min(topk_tokens // 2, max_valid_idx + 1) + if num_valid > 0: + valid_range = torch.arange(num_valid, device=device, dtype=torch.int32) + tok_indices = (valid_range + offset) % (max_valid_idx + 1) + # Pad with -1 for the remaining positions + tok_indices = torch.cat( + [ + tok_indices, + torch.full( + (topk_tokens - num_valid,), -1, device=device, dtype=torch.int32 + ), + ] + ) + else: + tok_indices = torch.full( + (topk_tokens,), -1, device=device, dtype=torch.int32 + ) + tok_indices[0] = 0 # At least one valid index + sparse_indices[tok_idx] = tok_indices + all_q_vllm, all_kv_c_vllm, all_k_pe_vllm = [], [], [] kv_c_contexts, k_pe_contexts = [], [] reference_outputs = [] kv_cache_scale = torch.tensor(1.0, dtype=torch.float32, device=device) + global_token_idx = 0 for i in range(batch_spec.batch_size): s_len = seq_lens[i] @@ -268,40 +329,53 @@ def test_sparse_backend_decode_correctness( kv_c_full = torch.rand(s_len, kv_lora_rank, dtype=dtype, device=device) k_pe_full = torch.rand(s_len, 1, qk_rope_head_dim, dtype=dtype, device=device) - # SM100 (Blackwell) uses float -> e8m0 -> bf16 scale conversion - # which truncates scales to powers of 2. Simulate this in reference. 
- is_sm100 = torch.cuda.get_device_capability()[0] >= 10 - kv_c_full, k_pe_full = _quantize_dequantize_fp8_ds_mla( - kv_c_full, - k_pe_full.squeeze(1), - block_size=vllm_config.cache_config.block_size, - scale=kv_cache_scale, - simulate_sm100_e8m0_scales=is_sm100, - ) + if use_fp8_ds_mla_quantization: + is_sm100 = torch.cuda.get_device_capability()[0] >= 10 + kv_c_full, k_pe_squeezed = _quantize_dequantize_fp8_ds_mla( + kv_c_full, + k_pe_full.squeeze(1), + block_size=block_size, + scale=kv_cache_scale, + simulate_sm100_e8m0_scales=is_sm100, + ) + k_pe_full = k_pe_squeezed.unsqueeze(1) q_nope, q_pe = q_c.split([qk_nope_head_dim, qk_rope_head_dim], dim=-1) ql_nope = torch.einsum("qnh,lnh->qnl", q_nope, W_UK) q_mqa = torch.cat([ql_nope, q_pe], dim=-1) - k_mqa = torch.cat([kv_c_full, k_pe_full], dim=-1) - k_mqa = k_mqa.unsqueeze(1).expand(-1, num_heads, -1) - v_mqa = kv_c_full.unsqueeze(1).expand(-1, num_heads, -1) + k_mqa = torch.cat([kv_c_full, k_pe_full.squeeze(1)], dim=-1) + v_mqa = kv_c_full - attn_mask = torch.ones(q_len, s_len, dtype=torch.bool, device=device) - causal_mask = torch.tril(torch.ones(q_len, q_len, device=device)) - attn_mask[:, ctx_len:] = causal_mask + # Compute sparse SDPA reference per query token using its sparse indices + for q_idx in range(q_len): + tok_sparse_idx = sparse_indices[global_token_idx] + valid_mask = tok_sparse_idx >= 0 + valid_indices = tok_sparse_idx[valid_mask].long() - q_sdpa_in = q_mqa.unsqueeze(0).transpose(1, 2) - k_sdpa_in = k_mqa.unsqueeze(0).transpose(1, 2) - v_sdpa_in = v_mqa.unsqueeze(0).transpose(1, 2) + q_tok = q_mqa[q_idx : q_idx + 1] # [1, num_heads, head_dim] + k_sparse = k_mqa[valid_indices] # [num_valid, head_dim] + v_sparse = v_mqa[valid_indices] # [num_valid, kv_lora_rank] - sdpa_out = torch.nn.functional.scaled_dot_product_attention( - q_sdpa_in, k_sdpa_in, v_sdpa_in, attn_mask=attn_mask, scale=scale - ) - sdpa_out = sdpa_out.transpose(1, 2).squeeze(0) + k_sparse = k_sparse.unsqueeze(1).expand(-1, num_heads, 
-1) + v_sparse = v_sparse.unsqueeze(1).expand(-1, num_heads, -1) + + # SDPA: [1, num_heads, 1, head_dim] x [1, num_heads, num_valid, head_dim] + q_sdpa_in = q_tok.unsqueeze(0).transpose(1, 2) + k_sdpa_in = k_sparse.unsqueeze(0).transpose(1, 2) + v_sdpa_in = v_sparse.unsqueeze(0).transpose(1, 2) + + sdpa_out = torch.nn.functional.scaled_dot_product_attention( + q_sdpa_in, k_sdpa_in, v_sdpa_in, scale=scale + ) + sdpa_out = sdpa_out.transpose(1, 2).squeeze( + 0 + ) # [1, num_heads, kv_lora_rank] - sdpa_out = torch.einsum("qnl,lnv->qnv", sdpa_out, W_UV) - reference_outputs.append(sdpa_out.flatten(start_dim=-2)) + sdpa_out = torch.einsum("qnl,lnv->qnv", sdpa_out, W_UV) + reference_outputs.append(sdpa_out.flatten(start_dim=-2)) + + global_token_idx += 1 all_q_vllm.append(q_c) all_kv_c_vllm.append(kv_c_full[ctx_len:]) @@ -334,42 +408,18 @@ def test_sparse_backend_decode_correctness( num_blocks=vllm_config.cache_config.num_gpu_blocks, common_attn_metadata=common_attn_metadata, randomize_blocks=False, - kv_cache_dtype=vllm_config.cache_config.cache_dtype, + kv_cache_dtype=kv_cache_dtype if use_fp8_ds_mla_quantization else "auto", scale=kv_cache_scale, ) - builder_cls = FlashMLASparseBackend.get_builder_cls() + builder_cls = backend_cls.get_builder_cls() builder = builder_cls(kv_cache_spec, ["placeholder"], vllm_config, device) metadata = builder.build( common_prefix_len=0, common_attn_metadata=common_attn_metadata ) - starts = np.asarray(common_attn_metadata.query_start_loc_cpu, dtype=np.int32) - seg_lengths = np.diff(starts) - positions = np.arange(starts[-1], dtype=np.int32) - np.repeat( - starts[:-1], seg_lengths - ) - seq_lengths = np.asarray(common_attn_metadata.seq_lens.cpu(), dtype=np.int32) - prefix_lengths = seq_lengths - seg_lengths - positions += np.repeat(prefix_lengths, seg_lengths) - - pos_gpu = torch.as_tensor(positions, device=device, dtype=torch.int32) - topk = metadata.topk_tokens - debug_indices = torch.arange(topk, device=device, 
dtype=torch.int32).unsqueeze(0) - token_positions = pos_gpu.unsqueeze(1) - causal_mask = debug_indices <= token_positions - debug_indices = torch.where( - causal_mask, debug_indices, torch.full_like(debug_indices, -1) - ) - - # FlashMLASparseImpl now reads top-k indices from the indexer-provided - # buffer, so emulate that contract with a simple namespace mock. - debug_indices = debug_indices.expand(metadata.num_actual_tokens, -1).clone() - mock_indexer = SimpleNamespace(topk_indices_buffer=debug_indices) - - ok, reason = flashmla.is_flashmla_sparse_supported() - if not ok: - pytest.skip(reason) + # Use the pre-computed sparse_indices for the mock indexer + mock_indexer = SimpleNamespace(topk_indices_buffer=sparse_indices) kv_b_proj_weight = torch.cat([W_UK, W_UV], dim=-1) kv_b_proj_weight = kv_b_proj_weight.view( @@ -383,7 +433,7 @@ def test_sparse_backend_decode_correctness( ).to(device=device, dtype=dtype) mock_kv_b_proj.weight = torch.nn.Parameter(kv_b_proj_weight.T.contiguous()) - impl_cls = FlashMLASparseBackend.get_impl_cls() + impl_cls = backend_cls.get_impl_cls() with set_current_vllm_config(vllm_config): impl = impl_cls( num_heads=num_heads, @@ -441,7 +491,7 @@ def test_sparse_backend_decode_correctness( # FP8 quantization introduces some error, but should be within reasonable bounds # BF16 (auto) should be very accurate, FP8 allows slightly more tolerance - if kv_cache_dtype == "fp8_ds_mla": + if kv_cache_dtype.startswith("fp8"): torch.testing.assert_close(backend_output, sdpa_reference, rtol=0.05, atol=0.05) else: torch.testing.assert_close(backend_output, sdpa_reference, rtol=0.01, atol=0.01) @@ -636,3 +686,63 @@ def test_triton_convert_req_index_to_global_index_with_prefill_workspace(block_s def test_split_prefill_chunks(seq_lens, max_buf, expected): out = split_prefill_chunks(seq_lens, max_buf) assert out == expected + + +def test_triton_convert_returns_valid_counts(): + """Test that return_valid_counts correctly counts non-negative indices.""" + 
device = torch.device("cuda") + num_tokens = 8 + num_requests = 2 + max_blocks_per_req = 10 + block_size = 64 + num_topk_tokens = 128 + + req_id = torch.tensor([0, 0, 0, 0, 1, 1, 1, 1], dtype=torch.int32, device=device) + block_table = torch.arange( + num_requests * max_blocks_per_req, dtype=torch.int32, device=device + ).view(num_requests, max_blocks_per_req) + + # Create token indices with varying numbers of valid entries + # Token 0: 64 valid, 64 invalid (-1) + # Token 1: 32 valid, 96 invalid + # Token 2: 128 valid (all) + # Token 3: 1 valid, 127 invalid + # etc. + token_indices = torch.full( + (num_tokens, num_topk_tokens), -1, dtype=torch.int32, device=device + ) + expected_valid = [] + for i in range(num_tokens): + num_valid = [64, 32, 128, 1, 64, 32, 128, 1][i] + token_indices[i, :num_valid] = torch.arange( + num_valid, dtype=torch.int32, device=device + ) % (block_size * max_blocks_per_req) + expected_valid.append(num_valid) + + expected_valid_tensor = torch.tensor( + expected_valid, dtype=torch.int32, device=device + ) + + # Test with return_valid_counts=True + result, valid_counts = triton_convert_req_index_to_global_index( + req_id, + block_table, + token_indices, + BLOCK_SIZE=block_size, + NUM_TOPK_TOKENS=num_topk_tokens, + return_valid_counts=True, + ) + + torch.testing.assert_close(valid_counts, expected_valid_tensor, rtol=0, atol=0) + + # Test that return_valid_counts=False returns only the indices + result_only = triton_convert_req_index_to_global_index( + req_id, + block_table, + token_indices, + BLOCK_SIZE=block_size, + NUM_TOPK_TOKENS=num_topk_tokens, + return_valid_counts=False, + ) + assert isinstance(result_only, torch.Tensor) + torch.testing.assert_close(result_only, result, rtol=0, atol=0) diff --git a/tools/pre_commit/generate_attention_backend_docs.py b/tools/pre_commit/generate_attention_backend_docs.py index eb68deb1b..3aca49f94 100644 --- a/tools/pre_commit/generate_attention_backend_docs.py +++ 
b/tools/pre_commit/generate_attention_backend_docs.py @@ -901,10 +901,50 @@ def parse_cuda_priority_lists() -> dict[str, list[str]]: def _get_backends_from_return(stmts: list) -> list[str]: - """Extract backend names from return statements in a list of statements.""" + """Extract backend names from return statements in a list of statements. + + Handles starred unpacking (e.g. ``*sparse_backends``) by resolving the + variable from assignments found in the same statement list. When the + variable is conditionally assigned (inside an ``if/else``), the ``else`` + branch value is used as the representative default. + """ + # Collect variable assignments so we can resolve starred expressions. + # For conditional assignments, last-written (else branch) wins. + var_assigns: dict[str, list[str]] = {} + for stmt in stmts: + if isinstance(stmt, ast.Assign) and isinstance(stmt.value, ast.List): + for target in stmt.targets: + if isinstance(target, ast.Name): + var_assigns[target.id] = [ + e.attr for e in stmt.value.elts if isinstance(e, ast.Attribute) + ] + elif isinstance(stmt, ast.If): + for branch in (stmt.body, stmt.orelse): + for branch_stmt in branch: + if isinstance(branch_stmt, ast.Assign) and isinstance( + branch_stmt.value, ast.List + ): + for target in branch_stmt.targets: + if isinstance(target, ast.Name): + var_assigns[target.id] = [ + e.attr + for e in branch_stmt.value.elts + if isinstance(e, ast.Attribute) + ] + for stmt in stmts: if isinstance(stmt, ast.Return) and isinstance(stmt.value, ast.List): - return [e.attr for e in stmt.value.elts if isinstance(e, ast.Attribute)] + backends: list[str] = [] + for e in stmt.value.elts: + if isinstance(e, ast.Attribute): + backends.append(e.attr) + elif ( + isinstance(e, ast.Starred) + and isinstance(e.value, ast.Name) + and e.value.id in var_assigns + ): + backends.extend(var_assigns[e.value.id]) + return backends return [] diff --git a/vllm/model_executor/layers/attention/mla_attention.py 
b/vllm/model_executor/layers/attention/mla_attention.py index c44bf1f16..98ff02e9d 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -334,6 +334,7 @@ class MLAAttention(nn.Module, AttentionLayerBase): block_size, use_mla=True, use_sparse=use_sparse, + num_heads=self.num_heads, ) if ( diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 3edc83b15..b3d6b0ed6 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -129,6 +129,7 @@ class CpuPlatform(Platform): cls, selected_backend: "AttentionBackendEnum", attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> str: if selected_backend and selected_backend != AttentionBackendEnum.CPU_ATTN: logger.info("Cannot use %s backend on CPU.", selected_backend) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0c0bd7db3..b7efe24dc 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -45,17 +45,29 @@ torch.backends.cuda.enable_cudnn_sdp(False) def _get_backend_priorities( use_mla: bool, device_capability: DeviceCapability, + num_heads: int | None = None, ) -> list[AttentionBackendEnum]: """Get backend priorities with lazy import to avoid circular dependency.""" if use_mla: if device_capability.major == 10: + # Prefer FlashInfer at low head counts (FlashMLA uses padding) + if num_heads is not None and num_heads <= 16: + sparse_backends = [ + AttentionBackendEnum.FLASHINFER_MLA_SPARSE, + AttentionBackendEnum.FLASHMLA_SPARSE, + ] + else: + sparse_backends = [ + AttentionBackendEnum.FLASHMLA_SPARSE, + AttentionBackendEnum.FLASHINFER_MLA_SPARSE, + ] return [ AttentionBackendEnum.FLASHINFER_MLA, AttentionBackendEnum.CUTLASS_MLA, AttentionBackendEnum.FLASH_ATTN_MLA, AttentionBackendEnum.FLASHMLA, AttentionBackendEnum.TRITON_MLA, - AttentionBackendEnum.FLASHMLA_SPARSE, + *sparse_backends, ] else: return [ @@ -182,6 +194,8 @@ class CudaPlatformBase(Platform): use_flashmla = 
False use_cutlass_mla = False use_flashinfer_mla = False + use_flashmla_sparse = False + use_flashinfer_mla_sparse = False from vllm.v1.attention.ops.flashmla import is_flashmla_dense_supported @@ -217,6 +231,10 @@ class CudaPlatformBase(Platform): use_flashmla = backend == AttentionBackendEnum.FLASHMLA use_cutlass_mla = backend == AttentionBackendEnum.CUTLASS_MLA use_flashinfer_mla = backend == AttentionBackendEnum.FLASHINFER_MLA + use_flashmla_sparse = backend == AttentionBackendEnum.FLASHMLA_SPARSE + use_flashinfer_mla_sparse = ( + backend == AttentionBackendEnum.FLASHINFER_MLA_SPARSE + ) if ( use_flashmla @@ -242,12 +260,24 @@ class CudaPlatformBase(Platform): "Forcing kv cache block size to 64 for FlashInferMLA backend." ) - # TODO(Chen): remove this hacky code - if use_sparse and cache_config.block_size != 64: - cache_config.block_size = 64 - logger.info( - "Forcing kv cache block size to 64 for FlashMLASparse backend." - ) + if use_sparse: + if not (use_flashmla_sparse or use_flashinfer_mla_sparse): + use_flashmla_sparse = True + + if use_flashmla_sparse and cache_config.block_size != 64: + cache_config.block_size = 64 + logger.info( + "Forcing kv cache block size to 64 for FlashMLASparse backend." + ) + elif use_flashinfer_mla_sparse and cache_config.block_size not in ( + 32, + 64, + ): + cache_config.block_size = 64 + logger.info( + "Forcing kv cache block size to 64 for FlashInferMLASparse " + "backend." 
+ ) scheduler_config = vllm_config.scheduler_config # Note: model_config may be None during testing @@ -276,6 +306,7 @@ class CudaPlatformBase(Platform): cls, device_capability: DeviceCapability, attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> tuple[ list[tuple["AttentionBackendEnum", int]], dict["AttentionBackendEnum", list[str]], @@ -284,7 +315,9 @@ class CudaPlatformBase(Platform): invalid_reasons = {} backend_priorities = _get_backend_priorities( - attn_selector_config.use_mla, device_capability + attn_selector_config.use_mla, + device_capability, + num_heads, ) for priority, backend in enumerate(backend_priorities): try: @@ -307,6 +340,7 @@ class CudaPlatformBase(Platform): cls, selected_backend: "AttentionBackendEnum", attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> str: device_capability = cls.get_device_capability() assert device_capability is not None @@ -336,6 +370,7 @@ class CudaPlatformBase(Platform): valid_backends_priorities, invalid_reasons = cls.get_valid_backends( device_capability=device_capability, attn_selector_config=attn_selector_config, + num_heads=num_heads, ) reasons_str = ( "{" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 27f5ea517..4595b599b 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -233,6 +233,7 @@ class Platform: cls, selected_backend: "AttentionBackendEnum", attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> str: """Get the attention backend class of a device.""" return "" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index b463c80a1..808d21400 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -265,6 +265,7 @@ class RocmPlatform(Platform): cls, selected_backend: "AttentionBackendEnum", attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> str: from vllm._aiter_ops import rocm_aiter_ops diff --git 
a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 3a0ea8b12..8daa2d47f 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -48,6 +48,7 @@ class XPUPlatform(Platform): cls, selected_backend: "AttentionBackendEnum", attn_selector_config: "AttentionSelectorConfig", + num_heads: int | None = None, ) -> str: from vllm.v1.attention.backends.utils import set_kv_cache_layout diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py new file mode 100644 index 000000000..21a0d99c2 --- /dev/null +++ b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py @@ -0,0 +1,353 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""FlashInfer MLA Sparse Attention Backend. + +This backend uses the FlashInfer TRT-LLM MLA kernel with sparse_mla_top_k +for models like DeepSeek-V3.2 that use index-based sparse attention. + +For sparse MLA: +- block_tables shape changes from [batch_size, max_num_blocks] (dense) + to [batch_size, q_len_per_request, sparse_mla_top_k] (sparse) +- The sparse indices represent physical cache slot positions to attend to +- sparse_mla_top_k parameter must be set to the topk value +""" + +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar + +import numpy as np +import torch +from flashinfer.decode import trtllm_batch_decode_with_kv_cache_mla + +from vllm.config import VllmConfig +from vllm.config.cache import CacheDType +from vllm.logger import init_logger +from vllm.model_executor.layers.attention.mla_attention import ( + get_mla_dims, +) +from vllm.platforms.interface import DeviceCapability +from vllm.v1.attention.backend import ( + AttentionBackend, + AttentionCGSupport, + AttentionLayer, + AttentionMetadata, + AttentionMetadataBuilder, + AttentionType, + CommonAttentionMetadata, + MultipleOf, + SparseMLAAttentionImpl, +) +from vllm.v1.attention.backends.mla.sparse_utils import ( + 
triton_convert_req_index_to_global_index, +) +from vllm.v1.attention.backends.utils import KVCacheLayoutType +from vllm.v1.kv_cache_interface import AttentionSpec + +if TYPE_CHECKING: + from vllm.model_executor.models.deepseek_v2 import Indexer + +logger = init_logger(__name__) + +FLASHINFER_MLA_SPARSE_WORKSPACE_BUFFER_SIZE = 128 * 1024 * 1024 + + +class FlashInferMLASparseBackend(AttentionBackend): + """FlashInfer MLA backend with sparse attention support. + + This backend uses the FlashInfer TRT-LLM MLA kernel with sparse_mla_top_k + for models like DeepSeek-V3.2 that use index-based sparse attention. + """ + + accept_output_buffer: bool = True + supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] + supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ + "auto", + "bfloat16", + ] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [32, 64] + + @staticmethod + def get_name() -> str: + return "FLASHINFER_MLA_SPARSE" + + @staticmethod + def get_impl_cls() -> type["FlashInferMLASparseImpl"]: + return FlashInferMLASparseImpl + + @staticmethod + def get_builder_cls() -> type["FlashInferMLASparseMetadataBuilder"]: + return FlashInferMLASparseMetadataBuilder + + @classmethod + def get_supported_head_sizes(cls) -> list[int]: + return [576] + + @classmethod + def is_mla(cls) -> bool: + return True + + @classmethod + def is_sparse(cls) -> bool: + return True + + @classmethod + def supports_compute_capability(cls, capability: DeviceCapability) -> bool: + # FlashInfer sparse MLA targets Blackwell (SM 10.x) + return capability.major == 10 + + @classmethod + def supports_combination( + cls, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: CacheDType | None, + block_size: int, + use_mla: bool, + has_sink: bool, + use_sparse: bool, + device_capability: DeviceCapability, + ) -> str | None: + # FlashInfer MLA sparse kernel requires qk_nope_head_dim == 128 + from vllm.config import 
get_current_vllm_config + + vllm_config = get_current_vllm_config() + if vllm_config.model_config is not None: + hf_text_config = vllm_config.model_config.hf_text_config + qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1) + if qk_nope_head_dim != 128: + return ( + f"FlashInfer MLA Sparse kernel requires qk_nope_head_dim == 128, " + f"but got {qk_nope_head_dim}" + ) + # Check for index_topk which indicates sparse model + if not hasattr(hf_text_config, "index_topk"): + return "FlashInfer MLA Sparse requires model with index_topk config" + return None + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, # assumed to be 1 for MLA + head_size: int, + cache_dtype_str: str = "auto", + ) -> tuple[int, ...]: + return (num_blocks, block_size, head_size) + + @classmethod + def get_required_kv_cache_layout(cls) -> "KVCacheLayoutType | None": + return "HND" + + +@dataclass +class FlashInferMLASparseMetadata(AttentionMetadata): + """Attention metadata for FlashInfer MLA Sparse backend.""" + + num_reqs: int + max_query_len: int + max_seq_len: int + num_actual_tokens: int + + # Query start locations + query_start_loc: torch.Tensor + slot_mapping: torch.Tensor + block_table: torch.Tensor + req_id_per_token: torch.Tensor + + # Sequence lengths for all requests (context + query) + seq_lens: torch.Tensor + + # Sparse-specific + block_size: int = 64 + topk_tokens: int = 2048 + + +class FlashInferMLASparseMetadataBuilder( + AttentionMetadataBuilder[FlashInferMLASparseMetadata] +): + """Builder for FlashInfer MLA Sparse attention metadata.""" + + _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ) -> None: + self.vllm_config = vllm_config + self.layer_names = layer_names + self.kv_cache_spec = kv_cache_spec + self.model_config = vllm_config.model_config + 
self.device = device + + self.mla_dims = get_mla_dims(self.model_config) + self.topk_tokens = vllm_config.model_config.hf_config.index_topk + + self.req_id_per_token_buffer = torch.empty( + (vllm_config.scheduler_config.max_num_batched_tokens,), + dtype=torch.int32, + device=device, + ) + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> FlashInferMLASparseMetadata: + cm = common_attn_metadata + num_tokens = cm.num_actual_tokens + + # Build req_id_per_token mapping + starts = np.asarray(cm.query_start_loc_cpu, dtype=np.int32) + seg_lengths = np.diff(starts) + req_id_per_token = np.repeat( + np.arange(seg_lengths.shape[0], dtype=np.int32), seg_lengths + ) + + # Zero-fill for cudagraphs + self.req_id_per_token_buffer.fill_(0) + self.req_id_per_token_buffer[: req_id_per_token.shape[0]].copy_( + torch.from_numpy(req_id_per_token), non_blocking=True + ) + req_id_per_token_tensor = self.req_id_per_token_buffer[:num_tokens] + + return FlashInferMLASparseMetadata( + num_reqs=cm.num_reqs, + max_query_len=cm.max_query_len, + max_seq_len=cm.max_seq_len, + num_actual_tokens=cm.num_actual_tokens, + query_start_loc=cm.query_start_loc, + slot_mapping=cm.slot_mapping, + block_table=cm.block_table_tensor, + req_id_per_token=req_id_per_token_tensor, + seq_lens=cm.seq_lens, + block_size=self.kv_cache_spec.block_size, + topk_tokens=self.topk_tokens, + ) + + +# Global workspace buffer (lazily initialized) +_fi_sparse_workspace: torch.Tensor | None = None + + +def _get_workspace_buffer(device: torch.device) -> torch.Tensor: + global _fi_sparse_workspace + if _fi_sparse_workspace is None: + _fi_sparse_workspace = torch.zeros( + FLASHINFER_MLA_SPARSE_WORKSPACE_BUFFER_SIZE, + dtype=torch.uint8, + device=device, + ) + return _fi_sparse_workspace + + +class FlashInferMLASparseImpl(SparseMLAAttentionImpl[FlashInferMLASparseMetadata]): + """FlashInfer MLA Sparse implementation. 
+ + Uses the TRT-LLM MLA kernel with sparse_mla_top_k parameter for + sparse attention computation. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: list[float] | None, + sliding_window: int | None, + kv_cache_dtype: str, + logits_soft_cap: float | None, + attn_type: str, + kv_sharing_target_layer_name: str | None, + # MLA Specific Arguments + topk_indice_buffer: torch.Tensor | None = None, + indexer: "Indexer | None" = None, + **mla_args, + ) -> None: + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] + if any(unsupported_features): + raise NotImplementedError( + "FlashInferMLASparseImpl does not support one of the following: " + "alibi_slopes, sliding_window, logits_soft_cap" + ) + + if attn_type != AttentionType.DECODER: + raise NotImplementedError( + "Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashInferMLASparseImpl" + ) + + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + + # MLA-specific dimensions + self.kv_lora_rank: int = mla_args["kv_lora_rank"] + self.qk_nope_head_dim: int = mla_args["qk_nope_head_dim"] + self.qk_rope_head_dim: int = mla_args["qk_rope_head_dim"] + + assert indexer is not None, "Indexer required for sparse MLA" + self.topk_indices_buffer: torch.Tensor | None = indexer.topk_indices_buffer + + self._workspace_buffer: torch.Tensor | None = None + self.bmm1_scale: float | None = None + self.bmm2_scale: float | None = None + + def forward_mqa( + self, + q: torch.Tensor | tuple[torch.Tensor, torch.Tensor], + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: FlashInferMLASparseMetadata, + layer: AttentionLayer, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + if isinstance(q, tuple): + q = torch.cat(q, dim=-1) + + num_actual_toks = q.shape[0] + + assert self.topk_indices_buffer is not 
None + topk_indices = self.topk_indices_buffer[:num_actual_toks] + + topk_indices_physical, seq_lens = triton_convert_req_index_to_global_index( + attn_metadata.req_id_per_token[:num_actual_toks], + attn_metadata.block_table, + topk_indices, + BLOCK_SIZE=attn_metadata.block_size, + NUM_TOPK_TOKENS=topk_indices.shape[1], + return_valid_counts=True, + ) + + if self._workspace_buffer is None: + self._workspace_buffer = _get_workspace_buffer(q.device) + + if self.bmm1_scale is None: + self.bmm1_scale = layer._q_scale_float * layer._k_scale_float * self.scale + if self.bmm2_scale is None: + self.bmm2_scale = layer._v_scale_float + + o = trtllm_batch_decode_with_kv_cache_mla( + query=q.unsqueeze(1), + kv_cache=kv_c_and_k_pe_cache.unsqueeze(1), + workspace_buffer=self._workspace_buffer, + qk_nope_head_dim=self.qk_nope_head_dim, + kv_lora_rank=self.kv_lora_rank, + qk_rope_head_dim=self.qk_rope_head_dim, + block_tables=topk_indices_physical.unsqueeze(1), + seq_lens=seq_lens, + max_seq_len=attn_metadata.topk_tokens, + bmm1_scale=self.bmm1_scale, + bmm2_scale=self.bmm2_scale, + sparse_mla_top_k=attn_metadata.topk_tokens, + ) + return o.view(-1, o.shape[-2], o.shape[-1]), None diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 80e402a4d..799c77d73 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -15,7 +15,6 @@ from vllm.model_executor.layers.attention.mla_attention import ( ) from vllm.platforms import current_platform from vllm.platforms.interface import DeviceCapability -from vllm.triton_utils import tl, triton from vllm.v1.attention.backend import ( AttentionBackend, AttentionCGSupport, @@ -26,6 +25,9 @@ from vllm.v1.attention.backend import ( MultipleOf, SparseMLAAttentionImpl, ) +from vllm.v1.attention.backends.mla.sparse_utils import ( + triton_convert_req_index_to_global_index, +) from vllm.v1.attention.backends.utils import ( 
reshape_attn_output_for_spec_decode, reshape_query_for_spec_decode, @@ -203,166 +205,6 @@ class FlashMLASparseMetadata(AttentionMetadata): fp8_use_mixed_batch: bool = False -# Kernel with prefill workspace support -@triton.jit -def _convert_req_index_to_global_index_kernel( - req_id_ptr, # int32 [num_tokens] - block_table_ptr, # int32 [num_requests, max_num_blocks_per_req] - token_indices_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] - out_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] - prefill_request_id_ptr, # int32 [num_tokens], -1 for decode, >=0 for prefill - workspace_starts_ptr, # int32 [num_prefill_reqs+1] or nullptr - # shapes (compile-time where possible) - max_num_blocks_per_req: tl.constexpr, - BLOCK_SIZE: tl.constexpr, - BLOCK_N: tl.constexpr, # tile width along columns - HAS_PREFILL: tl.constexpr, - # strides (in elements) - bt_stride0, - bt_stride1, - ti_stride0, - ti_stride1, - out_stride0, - out_stride1, -): - # program_id(0) -> token_id (row) - # program_id(1) -> tile index along columns - token_id = tl.program_id(0) - tile_id = tl.program_id(1) - - # Each program covers BLOCK_N consecutive columns - indice_id = tile_id * BLOCK_N + tl.arange(0, BLOCK_N) - - # Load request id for this token (no mask: grid is exact) - req = tl.load(req_id_ptr + token_id) - - # Load token indices for this tile - ti_ptr = token_indices_ptr + token_id * ti_stride0 + indice_id * ti_stride1 - tok = tl.load(ti_ptr) # int32 - - # Only token == -1 should propagate as -1 - is_invalid_tok = tok < 0 - is_prefill = False - if HAS_PREFILL: - prefill_req_id = tl.load(prefill_request_id_ptr + token_id) - is_prefill = prefill_req_id >= 0 - # Compute block id and in-block offset - block_id = tok // BLOCK_SIZE - inblock_off = tok % BLOCK_SIZE - - # Guard block_table access - valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) - bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 - is_invalid_tok |= ~valid_block - base = tl.load(bt_ptr, mask=valid_block & 
~is_prefill, other=0) - out_val = base * BLOCK_SIZE + inblock_off - - # Override with prefill output if prefill is enabled - if HAS_PREFILL: - workspace_start = tl.load( - workspace_starts_ptr + prefill_req_id, mask=is_prefill, other=0 - ) - prefill_out = workspace_start + tok - out_val = tl.where(is_prefill, prefill_out, out_val) - out_val = tl.where(is_invalid_tok, -1, out_val) - - # Store results - out_ptr_ij = out_ptr + token_id * out_stride0 + indice_id * out_stride1 - tl.store(out_ptr_ij, out_val) - - -def triton_convert_req_index_to_global_index( - req_id: torch.Tensor, # int32 [num_tokens] - block_table: torch.Tensor, # int32 [num_requests, max_num_blocks_per_req] - token_indices: torch.Tensor, # int32 [num_tokens, NUM_TOPK_TOKENS] - BLOCK_SIZE: int = 64, - NUM_TOPK_TOKENS: int = 2048, - BLOCK_N: int = 128, # tile width along columns - HAS_PREFILL_WORKSPACE: bool = False, - prefill_workspace_request_ids: torch.Tensor | None = None, - prefill_workspace_starts: torch.Tensor | None = None, -): - """ - out[token_id, indice_id] = - block_table[req_id[token_id], - token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE - + token_indices[token_id, indice_id] % BLOCK_SIZE - - Only when token_indices[token_id, indice_id] == -1 do we output -1. - For safety, we also output -1 if the derived block_id would be - out-of-bounds. - - When HAS_PREFILL_WORKSPACE is True, prefill tokens are mapped to workspace offsets - instead of global cache slots. prefill_workspace_request_ids and - prefill_workspace_starts must be provided. 
- - prefill_workspace_request_ids: int32 [num_tokens], -1 for decode else - prefill request index (maps to prefill_workspace_starts) - prefill_workspace_starts: int32 [num_prefills], 0-indexed workspace - starts for each prefill request - """ - assert req_id.dtype == torch.int32 - assert block_table.dtype == torch.int32 - assert token_indices.dtype == torch.int32 - assert token_indices.shape[1] == NUM_TOPK_TOKENS - assert NUM_TOPK_TOKENS % BLOCK_N == 0, ( - f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible by BLOCK_N ({BLOCK_N})" - ) - - if HAS_PREFILL_WORKSPACE: - assert prefill_workspace_request_ids is not None - assert prefill_workspace_starts is not None - assert prefill_workspace_request_ids.dtype == torch.int32 - assert prefill_workspace_starts.dtype == torch.int32 - - num_tokens = req_id.shape[0] - max_num_blocks_per_req = block_table.shape[1] - tiles_per_row = NUM_TOPK_TOKENS // BLOCK_N - - # Ensure contiguous tensors on the same device - req_id_c = req_id.contiguous() - block_table_c = block_table.contiguous() - token_indices_c = token_indices.contiguous() - out = torch.empty_like(token_indices_c) - - # Strides in elements - bt_stride0, bt_stride1 = block_table_c.stride() - ti_stride0, ti_stride1 = token_indices_c.stride() - out_stride0, out_stride1 = out.stride() - - # Prepare prefill pointers - if HAS_PREFILL_WORKSPACE: - assert prefill_workspace_request_ids is not None # for mypy - assert prefill_workspace_starts is not None # for mypy - assert prefill_workspace_request_ids.is_contiguous() - assert prefill_workspace_starts.is_contiguous() - - # Exact 2D grid: tokens × column tiles - grid = (num_tokens, tiles_per_row) - - _convert_req_index_to_global_index_kernel[grid]( - req_id_c, - block_table_c, - token_indices_c, - out, - prefill_workspace_request_ids, - prefill_workspace_starts, - # shapes / constexprs - max_num_blocks_per_req, - BLOCK_SIZE, - BLOCK_N, - HAS_PREFILL_WORKSPACE, - # strides - bt_stride0, - bt_stride1, - ti_stride0, - ti_stride1, 
- out_stride0, - out_stride1, - ) - return out - - def get_prefill_workspace_size(max_model_len: int): # NOTE(Lucas): 5 is a magic number for controlling the prefill buffer size. # May be tuned later. diff --git a/vllm/v1/attention/backends/mla/sparse_utils.py b/vllm/v1/attention/backends/mla/sparse_utils.py new file mode 100644 index 000000000..e4bd0cf42 --- /dev/null +++ b/vllm/v1/attention/backends/mla/sparse_utils.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions for sparse MLA backends.""" + +import torch + +from vllm.triton_utils import tl, triton + + +# Kernel with prefill workspace support and valid count tracking +@triton.jit +def _convert_req_index_to_global_index_kernel( + req_id_ptr, # int32 [num_tokens] + block_table_ptr, # int32 [num_requests, max_num_blocks_per_req] + token_indices_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] + out_ptr, # int32 [num_tokens, NUM_TOPK_TOKENS] + valid_count_ptr, # int32 [num_tokens] - output valid count per row + prefill_request_id_ptr, # int32 [num_tokens], -1 for decode, >=0 for prefill + workspace_starts_ptr, # int32 [num_prefill_reqs+1] or nullptr + # shapes (compile-time where possible) + max_num_blocks_per_req: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + BLOCK_N: tl.constexpr, # tile width along columns + HAS_PREFILL: tl.constexpr, + COUNT_VALID: tl.constexpr, # whether to count valid indices + # strides (in elements) + bt_stride0, + bt_stride1, + ti_stride0, + ti_stride1, + out_stride0, + out_stride1, +): + # program_id(0) -> token_id (row) + # program_id(1) -> tile index along columns + token_id = tl.program_id(0) + tile_id = tl.program_id(1) + + # Each program covers BLOCK_N consecutive columns + indice_id = tile_id * BLOCK_N + tl.arange(0, BLOCK_N) + + # Load request id for this token (no mask: grid is exact) + req = tl.load(req_id_ptr + token_id) + + # Load token indices for this tile + ti_ptr = 
token_indices_ptr + token_id * ti_stride0 + indice_id * ti_stride1 + tok = tl.load(ti_ptr) # int32 + + # Only token == -1 should propagate as -1 + is_invalid_tok = tok < 0 + is_prefill = False + if HAS_PREFILL: + prefill_req_id = tl.load(prefill_request_id_ptr + token_id) + is_prefill = prefill_req_id >= 0 + # Compute block id and in-block offset + block_id = tok // BLOCK_SIZE + inblock_off = tok % BLOCK_SIZE + + # Guard block_table access + valid_block = (block_id < max_num_blocks_per_req) & (block_id >= 0) + bt_ptr = block_table_ptr + req * bt_stride0 + block_id * bt_stride1 + is_invalid_tok |= ~valid_block + base = tl.load(bt_ptr, mask=valid_block & ~is_prefill, other=0) + out_val = base * BLOCK_SIZE + inblock_off + + # Override with prefill output if prefill is enabled + if HAS_PREFILL: + workspace_start = tl.load( + workspace_starts_ptr + prefill_req_id, mask=is_prefill, other=0 + ) + prefill_out = workspace_start + tok + out_val = tl.where(is_prefill, prefill_out, out_val) + out_val = tl.where(is_invalid_tok, -1, out_val) + + # Store results + out_ptr_ij = out_ptr + token_id * out_stride0 + indice_id * out_stride1 + tl.store(out_ptr_ij, out_val) + + # Count valid indices in this tile and atomically add to row total + if COUNT_VALID: + tile_valid_count = tl.sum((~is_invalid_tok).to(tl.int32)) + tl.atomic_add(valid_count_ptr + token_id, tile_valid_count) + + +def triton_convert_req_index_to_global_index( + req_id: torch.Tensor, # int32 [num_tokens] + block_table: torch.Tensor, # int32 [num_requests, max_num_blocks_per_req] + token_indices: torch.Tensor, # int32 [num_tokens, NUM_TOPK_TOKENS] + BLOCK_SIZE: int = 64, + NUM_TOPK_TOKENS: int = 2048, + BLOCK_N: int = 128, # tile width along columns + HAS_PREFILL_WORKSPACE: bool = False, + prefill_workspace_request_ids: torch.Tensor | None = None, + prefill_workspace_starts: torch.Tensor | None = None, + return_valid_counts: bool = False, +) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """ + out[token_id, 
indice_id] = + block_table[req_id[token_id], + token_indices[token_id, indice_id] // BLOCK_SIZE] * BLOCK_SIZE + + token_indices[token_id, indice_id] % BLOCK_SIZE + + Only when token_indices[token_id, indice_id] == -1 do we output -1. + For safety, we also output -1 if the derived block_id would be + out-of-bounds. + + When HAS_PREFILL_WORKSPACE is True, prefill tokens are mapped to workspace offsets + instead of global cache slots. prefill_workspace_request_ids and + prefill_workspace_starts must be provided. + + prefill_workspace_request_ids: int32 [num_tokens], -1 for decode else + prefill request index (maps to prefill_workspace_starts) + prefill_workspace_starts: int32 [num_prefills], 0-indexed workspace + starts for each prefill request + + When return_valid_counts is True, also returns the count of valid (non -1) + indices per row, computed during the same kernel pass (no extra overhead). + """ + assert req_id.dtype == torch.int32 + assert block_table.dtype == torch.int32 + assert token_indices.dtype == torch.int32 + assert token_indices.shape[1] == NUM_TOPK_TOKENS + assert NUM_TOPK_TOKENS % BLOCK_N == 0, ( + f"NUM_TOPK_TOKENS ({NUM_TOPK_TOKENS}) must be divisible by BLOCK_N ({BLOCK_N})" + ) + + if HAS_PREFILL_WORKSPACE: + assert prefill_workspace_request_ids is not None + assert prefill_workspace_starts is not None + assert prefill_workspace_request_ids.dtype == torch.int32 + assert prefill_workspace_starts.dtype == torch.int32 + + num_tokens = req_id.shape[0] + max_num_blocks_per_req = block_table.shape[1] + tiles_per_row = NUM_TOPK_TOKENS // BLOCK_N + + # Ensure contiguous tensors on the same device + req_id_c = req_id.contiguous() + block_table_c = block_table.contiguous() + token_indices_c = token_indices.contiguous() + out = torch.empty_like(token_indices_c) + + # Allocate valid count buffer if needed (must be zero-initialized for atomics) + valid_counts: torch.Tensor | None = None + if return_valid_counts: + valid_counts = torch.zeros( + num_tokens, 
dtype=torch.int32, device=token_indices.device + ) + + # Strides in elements + bt_stride0, bt_stride1 = block_table_c.stride() + ti_stride0, ti_stride1 = token_indices_c.stride() + out_stride0, out_stride1 = out.stride() + + # Prepare prefill pointers + if HAS_PREFILL_WORKSPACE: + assert prefill_workspace_request_ids is not None # for mypy + assert prefill_workspace_starts is not None # for mypy + assert prefill_workspace_request_ids.is_contiguous() + assert prefill_workspace_starts.is_contiguous() + + # Exact 2D grid: tokens × column tiles + grid = (num_tokens, tiles_per_row) + + _convert_req_index_to_global_index_kernel[grid]( + req_id_c, + block_table_c, + token_indices_c, + out, + valid_counts, + prefill_workspace_request_ids, + prefill_workspace_starts, + # shapes / constexprs + max_num_blocks_per_req, + BLOCK_SIZE, + BLOCK_N, + HAS_PREFILL_WORKSPACE, + return_valid_counts, + # strides + bt_stride0, + bt_stride1, + ti_stride0, + ti_stride1, + out_stride0, + out_stride1, + ) + + if return_valid_counts: + assert valid_counts is not None + return out, valid_counts + return out diff --git a/vllm/v1/attention/backends/registry.py b/vllm/v1/attention/backends/registry.py index 2a80bbd94..8e60551e2 100644 --- a/vllm/v1/attention/backends/registry.py +++ b/vllm/v1/attention/backends/registry.py @@ -62,6 +62,10 @@ class AttentionBackendEnum(Enum, metaclass=_AttentionBackendEnumMeta): FLASHINFER_MLA = ( "vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend" ) + FLASHINFER_MLA_SPARSE = ( + "vllm.v1.attention.backends.mla.flashinfer_mla_sparse." 
+ "FlashInferMLASparseBackend" + ) TRITON_MLA = "vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend" CUTLASS_MLA = "vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend" FLASHMLA = "vllm.v1.attention.backends.mla.flashmla.FlashMLABackend" diff --git a/vllm/v1/attention/selector.py b/vllm/v1/attention/selector.py index e364c3235..9580c1d5f 100644 --- a/vllm/v1/attention/selector.py +++ b/vllm/v1/attention/selector.py @@ -53,6 +53,7 @@ def get_attn_backend( use_sparse: bool = False, use_mm_prefix: bool = False, attn_type: str | None = None, + num_heads: int | None = None, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -66,7 +67,6 @@ def get_attn_backend( from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() - backend_enum = vllm_config.attention_config.backend attn_selector_config = AttentionSelectorConfig( head_size=head_size, @@ -81,8 +81,9 @@ def get_attn_backend( ) return _cached_get_attn_backend( - backend=backend_enum, + backend=vllm_config.attention_config.backend, attn_selector_config=attn_selector_config, + num_heads=num_heads, ) @@ -90,12 +91,14 @@ def get_attn_backend( def _cached_get_attn_backend( backend, attn_selector_config: AttentionSelectorConfig, + num_heads: int | None = None, ) -> type[AttentionBackend]: from vllm.platforms import current_platform attention_cls = current_platform.get_attn_backend_cls( backend, attn_selector_config=attn_selector_config, + num_heads=num_heads, ) if not attention_cls: raise ValueError( -- GitLab From 679ca5d8d346ede84c9cbba5d6a8789723c295c0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 12 Feb 2026 18:29:42 +0100 Subject: [PATCH 0139/1166] Fix MoE for the Transformers modelling backend (#34436) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers/moe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/vllm/model_executor/models/transformers/moe.py b/vllm/model_executor/models/transformers/moe.py index c636da211..320bbab08 100644 --- a/vllm/model_executor/models/transformers/moe.py +++ b/vllm/model_executor/models/transformers/moe.py @@ -45,7 +45,6 @@ class TransformersFusedMoE(FusedMoE): # --8<-- [end:transformers_fused_moe] def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) self._topk_ids: torch.Tensor = None def custom_routing_function(hidden_states, gating_output, topk, renormalize): @@ -63,7 +62,8 @@ class TransformersFusedMoE(FusedMoE): (topk_ids,) = dist_group.all_gatherv([topk_ids], 0, sizes) return topk_weights, topk_ids - self.custom_routing_function = custom_routing_function + kwargs["custom_routing_function"] = custom_routing_function + super().__init__(*args, **kwargs) def forward( self, @@ -94,7 +94,7 @@ def transformers_moe_forward( self = forward_context.no_compile_layers[layer_name] self._topk_ids = topk_ids # Clone hidden_states because it will be mutated in-place in FusedMoE - return self.forward_impl(hidden_states.clone(), topk_weights) + return self.runner.forward(hidden_states.clone(), topk_weights) def transformers_moe_forward_fake( -- GitLab From becbe2480871573f9464e4941b179c1c21f2c786 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 13 Feb 2026 01:40:01 +0800 Subject: [PATCH 0140/1166] [Bugfix] Remove broken raw url GGUF model loading support (#34433) Signed-off-by: Isotr0py --- tests/models/test_gguf_download.py | 19 ------------------- .../model_loader/gguf_loader.py | 7 +------ 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/tests/models/test_gguf_download.py b/tests/models/test_gguf_download.py index b1674cdf7..e9ca35afd 100644 --- a/tests/models/test_gguf_download.py +++ b/tests/models/test_gguf_download.py @@ -113,25 +113,6 @@ class TestGGUFModelLoader: assert result == "/path/to/model.gguf" mock_isfile.assert_called_once_with("/path/to/model.gguf") - 
@patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download") - @patch("os.path.isfile", return_value=False) - def test_prepare_weights_https_url(self, mock_isfile, mock_hf_download): - """Test _prepare_weights with HTTPS URL.""" - load_config = LoadConfig(load_format="gguf") - loader = GGUFModelLoader(load_config) - - mock_hf_download.return_value = "/downloaded/model.gguf" - - # Create a simple mock ModelConfig with only the model attribute - model_config = MagicMock() - model_config.model = "https://huggingface.co/model.gguf" - - result = loader._prepare_weights(model_config) - assert result == "/downloaded/model.gguf" - mock_hf_download.assert_called_once_with( - url="https://huggingface.co/model.gguf" - ) - @patch("vllm.model_executor.model_loader.gguf_loader.hf_hub_download") @patch("os.path.isfile", return_value=False) def test_prepare_weights_repo_filename(self, mock_isfile, mock_hf_download): diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index e1fb99a5a..25fa3ba03 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -49,11 +49,6 @@ class GGUFModelLoader(BaseModelLoader): model_name_or_path = model_config.model if os.path.isfile(model_name_or_path): return model_name_or_path - # for raw HTTPS link - if model_name_or_path.startswith( - ("http://", "https://") - ) and model_name_or_path.endswith(".gguf"): - return hf_hub_download(url=model_name_or_path) # repo id/filename.gguf if "/" in model_name_or_path and model_name_or_path.endswith(".gguf"): repo_id, filename = model_name_or_path.rsplit("/", 1) @@ -71,7 +66,7 @@ class GGUFModelLoader(BaseModelLoader): raise ValueError( f"Unrecognised GGUF reference: {model_name_or_path} " - "(expected local file, raw URL, /.gguf, " + "(expected local file, /.gguf, " "or :)" ) -- GitLab From 766e1678210d797757dcfe28f05184a251685dfe Mon Sep 17 00:00:00 2001 From: xuebwang-amd Date: Fri, 
13 Feb 2026 01:40:19 +0800 Subject: [PATCH 0141/1166] [ROCm][quantization] improve OCP weight quant parser robust (#34431) Signed-off-by: xuebwang-amd Co-authored-by: TJian --- vllm/model_executor/layers/quantization/quark/quark.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 2e75a3de5..36f20c89f 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -337,6 +337,13 @@ class QuarkConfig(QuantizationConfig): ) return False + if isinstance(weight_quant, list): + logger.debug( + "Quark model's weight quantization is incompatible with OCP_MX format: " + "weight_quant is a list (e.g. fp8_w4a8), OCP_MX requires a single dict." + ) + return False + # Input and weight qscheme needs to be per group. if weight_quant.get("qscheme") != "per_group": logger.debug( -- GitLab From 1100a97621ebbf226488268f47d0252b789276e6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 12 Feb 2026 18:43:24 +0100 Subject: [PATCH 0142/1166] [Voxstral Realtime] Enable tests (#33803) Signed-off-by: Patrick von Platen --- tests/entrypoints/openai/test_realtime_validation.py | 12 +----------- .../multimodal/generation/test_voxtral_realtime.py | 2 -- tests/models/multimodal/processing/test_common.py | 7 +++++++ tests/models/registry.py | 9 ++++----- vllm/model_executor/models/voxtral.py | 10 ++++++++++ 5 files changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/entrypoints/openai/test_realtime_validation.py b/tests/entrypoints/openai/test_realtime_validation.py index 946843e0b..af15b7099 100644 --- a/tests/entrypoints/openai/test_realtime_validation.py +++ b/tests/entrypoints/openai/test_realtime_validation.py @@ -27,15 +27,6 @@ MISTRAL_FORMAT_ARGS = [ MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602" -def _audio_to_base64_pcm16(path: str, target_sr: int = 16000) -> str: - 
"""Load audio file, convert to PCM16 @ target sample rate, base64 encode.""" - audio, _ = librosa.load(path, sr=target_sr, mono=True) - # Convert float32 [-1, 1] to int16 [-32768, 32767] - audio_int16 = (audio * 32767).astype(np.int16) - audio_bytes = audio_int16.tobytes() - return base64.b64encode(audio_bytes).decode("utf-8") - - def _get_websocket_url(server: RemoteOpenAIServer) -> str: """Convert HTTP URL to WebSocket URL for realtime endpoint.""" http_url = server.url_root @@ -74,12 +65,11 @@ def mary_had_lamb_audio_chunks() -> list[str]: @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -@pytest.mark.skip(reason="Voxtral streaming is not yet public") async def test_multi_chunk_streaming( model_name, mary_had_lamb_audio_chunks, rocm_aiter_fa_attention ): """Test streaming multiple audio chunks before committing.""" - server_args = ["--enforce-eager"] + server_args = ["--enforce-eager", "--max-model-len", "2048"] if model_name.startswith("mistralai"): server_args += MISTRAL_FORMAT_ARGS diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py index d162f80ff..96f60bb5c 100644 --- a/tests/models/multimodal/generation/test_voxtral_realtime.py +++ b/tests/models/multimodal/generation/test_voxtral_realtime.py @@ -73,7 +73,6 @@ def async_engine() -> AsyncLLM: return AsyncLLM.from_engine_args(engine_args) -@pytest.mark.skip(reason="Voxtral streaming is not yet public") def test_voxtral_realtime_forward(audio_assets, tokenizer, engine): audio_config = tokenizer.instruct_tokenizer.tokenizer.audio @@ -218,7 +217,6 @@ class RealTimeAudioInput: @pytest.mark.asyncio -@pytest.mark.skip(reason="Voxtral streaming is not yet public") async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine): sampling_params = SamplingParams(temperature=0.0, max_tokens=1) diff --git a/tests/models/multimodal/processing/test_common.py 
b/tests/models/multimodal/processing/test_common.py index 4c99c9bad..f1344ed86 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -441,6 +441,13 @@ def test_processing_correctness( "Qwen-VL tokenizer requires downloading a font file from " "servers that often refuse connections in CI" ) + if model_id == "mistralai/Voxtral-Mini-4B-Realtime-2602": + pytest.skip( + "Voxtral Realtime doesn't make use of any place-holder" + "tokens and hence cannot pass the processing " + "correctness test as is. Let's revisit adapting this " + "test once more realtime models exist." + ) if model_id == "internlm/Intern-S1-Pro": # FIXME(Isotr0py): Fix later. pytest.skip("Tokenization issue. Fix later") diff --git a/tests/models/registry.py b/tests/models/registry.py index 21188bf39..dcd1fa8ed 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1031,13 +1031,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { ), "VoxtralForConditionalGeneration": _HfExamplesInfo( "mistralai/Voxtral-Mini-3B-2507", - # disable this temporarily until we support HF format - is_available_online=False, + tokenizer_mode="mistral", ), "VoxtralRealtimeGeneration": _HfExamplesInfo( - "", - # disable this temporarily until we support HF format - is_available_online=False, + "mistralai/Voxtral-Mini-4B-Realtime-2602", + enforce_eager=True, + tokenizer_mode="mistral", ), # [Encoder-decoder] "NemotronParseForConditionalGeneration": _HfExamplesInfo( diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 715d6aa25..2dbfe0a95 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -54,6 +54,7 @@ from vllm.multimodal.processing.processor import ( BaseMultiModalProcessor, BaseProcessingInfo, MultiModalProcessingInfo, + PlaceholderFeaturesInfo, PromptReplacement, PromptUpdate, ) @@ -283,6 +284,15 @@ class 
VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]) ) -> Mapping[str, MultiModalFieldConfig]: return dict(audio_arrays=MultiModalFieldConfig.batched("audio")) + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], + mm_item_counts: Mapping[str, int], + ) -> None: + # mistral_common's tokenizer's does not follow HF's placeholder norms + # skip validation here + ... + def _get_prompt_updates( self, mm_items: MultiModalDataItems, -- GitLab From 6c0baee61025f258c6d56830d0150feab34c45ab Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 12 Feb 2026 18:46:43 +0100 Subject: [PATCH 0143/1166] [Voxtral Realtime] Refactor & Improve buffering logic (#34428) Signed-off-by: Patrick von Platen Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- requirements/common.txt | 4 +- requirements/nightly_torch_test.txt | 2 +- requirements/test.in | 2 +- requirements/test.txt | 2 +- .../generation/test_voxtral_realtime.py | 128 ++-------- vllm/model_executor/models/voxtral.py | 4 +- .../model_executor/models/voxtral_realtime.py | 231 +++++++++--------- 7 files changed, 135 insertions(+), 238 deletions(-) diff --git a/requirements/common.txt b/requirements/common.txt index 297447cf2..ef320c5e2 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -31,7 +31,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.17.0 -mistral_common[image] >= 1.9.0 +mistral_common[image] >= 1.9.1 opencv-python-headless >= 4.13.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 @@ -52,4 +52,4 @@ anthropic >= 0.71.0 model-hosting-container-standards >= 0.1.13, < 1.0.0 mcp grpcio -grpcio-reflection \ No newline at end of file +grpcio-reflection diff --git a/requirements/nightly_torch_test.txt 
b/requirements/nightly_torch_test.txt index a45634d0c..cc5ea519a 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -23,7 +23,7 @@ jiwer # required for audio tests timm # required for internvl test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.9.0 # required for voxtral test +mistral_common[image,audio] >= 1.9.1 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test diff --git a/requirements/test.in b/requirements/test.in index 8a97c0e88..1c43d4446 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -30,7 +30,7 @@ torchaudio==2.10.0 torchvision==0.25.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.9.0 # required for voxtral test +mistral_common[image,audio] >= 1.9.1 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.13.0 # required for video test diff --git a/requirements/test.txt b/requirements/test.txt index fbe3228d2..f2ab8037a 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -499,7 +499,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.9.0 +mistral-common==1.9.1 # via -r requirements/test.in mlflow==2.22.0 # via terratorch diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py index 96f60bb5c..2b769e3ed 100644 --- a/tests/models/multimodal/generation/test_voxtral_realtime.py +++ b/tests/models/multimodal/generation/test_voxtral_realtime.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project -import asyncio from dataclasses import asdict import pytest @@ -10,14 +9,13 @@ from mistral_common.protocol.transcription.request import ( StreamingMode, TranscriptionRequest, ) -from mistral_common.tokens.tokenizers.audio import AudioConfig from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.inputs.data import TokensPrompt -from vllm.v1.engine.async_llm import AsyncLLM, StreamingInput +from vllm.model_executor.models.voxtral_realtime import VoxtralRealtimeBuffer +from vllm.v1.engine.async_llm import AsyncLLM MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602" ENGINE_CONFIG = dict( @@ -114,136 +112,40 @@ def test_voxtral_realtime_forward(audio_assets, tokenizer, engine): assert texts == EXPECTED_TEXT -class RealTimeAudioInput: - """ - This class is used to stream an audio file just as - if it would be streamed in real-time. 
- """ - - def __init__(self, tokenizer: MistralTokenizer) -> None: - self._tokenizer = tokenizer - self._config: AudioConfig = ( - self._tokenizer.instruct_tokenizer.audio_encoder.audio_config - ) - - self._look_ahead_in_ms = self._config.streaming_look_ahead_ms - self._look_back_in_ms = self._config.streaming_look_back_ms - - self._sampling_rate = self._config.sampling_rate - - self._audio: Audio | None = None - - # mutable objects - self._start = 0 - - n_left_pad_samples = ( - self._config.raw_audio_length_per_tok * self._config.n_left_pad_tokens - ) - self._end = self.streaming_delay + n_left_pad_samples + self.streaming_size - self._queue: asyncio.Queue[StreamingInput | None] = asyncio.Queue() - - @classmethod - async def create(cls, audio: Audio, tokenizer: MistralTokenizer): - self = cls(tokenizer) - - # we're doing "OFFLINE" encoding here to right & left pad the audio since - # we have access to the whole audio - # if we'd do an actual online realtime streaming application we - # should instead pass `StreamingMode.ONLINE` - req = TranscriptionRequest( - streaming=StreamingMode.OFFLINE, - audio=RawAudio.from_audio(audio), - language=None, - ) - audio_enc = self._tokenizer.encode_transcription(req) - self._audio = audio_enc.audios[0] - - # add first request - await self.add_tokens(audio_enc.tokens) - - return self - - @property - def look_ahead(self) -> int: - return self._get_len_in_samples(self._look_ahead_in_ms) - - @property - def look_back(self) -> int: - return self._get_len_in_samples(self._look_back_in_ms) - - @property - def streaming_delay(self) -> int: - return self._get_len_in_samples(self._config.transcription_delay_ms) - - @property - def streaming_size(self) -> int: - stream_size_in_ms = 1000 / self._config.frame_rate - return self._get_len_in_samples(stream_size_in_ms) - - def _get_len_in_samples(self, len_in_ms: float) -> int: - _len_in_s = self._sampling_rate * len_in_ms / 1000 - assert _len_in_s.is_integer(), _len_in_s - len_in_s = 
int(_len_in_s) - - return len_in_s - - async def add_tokens(self, tokens: list[int]) -> None: - assert self._audio is not None - if self._start >= len(self._audio.audio_array): - self.stop() - return - - _end = self._end + self.look_ahead - _start = max(0, self._start - self.look_back) - - multi_modal_data = {"audio": (self._audio.audio_array[_start:_end], None)} - - prompt = TokensPrompt( - prompt_token_ids=tokens, multi_modal_data=multi_modal_data - ) - - await self._queue.put(StreamingInput(prompt)) - - # increase - self._start = self._end - self._end = self._end + self.streaming_size - - def stop(self): - self._queue.put_nowait(None) - - async def generator(self): - while (item := await self._queue.get()) is not None: - yield item - - @pytest.mark.asyncio async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine): sampling_params = SamplingParams(temperature=0.0, max_tokens=1) + audio_config = tokenizer.instruct_tokenizer.audio_encoder.audio_config output_tokens_list = [] for i, audio_asset in enumerate(audio_assets): output_tokens = [] audio = Audio.from_file(audio_asset.get_local_path(), strict=False) - streaming_input = await RealTimeAudioInput.create( - audio=audio, tokenizer=tokenizer + + req = TranscriptionRequest( + streaming=StreamingMode.OFFLINE, + audio=RawAudio.from_audio(audio), + language=None, ) + audio_enc = tokenizer.encode_transcription(req) + + buffer = VoxtralRealtimeBuffer(audio_config, audio_enc.tokens) + await buffer.append_audio(audio_enc.audios[0].audio_array) + await buffer.append_audio(None) request_id = f"session-{i}" async for resp in async_engine.generate( - prompt=streaming_input.generator(), + prompt=buffer.get_input_stream(), sampling_params=sampling_params, request_id=request_id, ): tokens = resp.outputs[0].token_ids[-1:] - output_tokens.extend(tokens) - await streaming_input.add_tokens(tokens) + await buffer.append_tokens(tokens) output_tokens_list.append(output_tokens) texts = 
[tokenizer.decode(output_tokens) for output_tokens in output_tokens_list] - texts[1] = texts[1].replace("a base hit", "OBS").replace("oh my", "oh, my") - assert texts == EXPECTED_TEXT diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 2dbfe0a95..cc9856f28 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -155,9 +155,7 @@ class VoxtralProcessorAdapter: assert audio.ndim == 1 if not self._audio_processor.audio_config.is_streaming: - audio = self._audio_processor.pad( - audio, self.sampling_rate, is_online_streaming=False - ) + audio = self._audio_processor.pad(audio, self.sampling_rate) audio_tokens = [self.begin_audio_token_id] + [ self.audio_token_id diff --git a/vllm/model_executor/models/voxtral_realtime.py b/vllm/model_executor/models/voxtral_realtime.py index 6c4d20d35..81406c66b 100644 --- a/vllm/model_executor/models/voxtral_realtime.py +++ b/vllm/model_executor/models/voxtral_realtime.py @@ -3,7 +3,7 @@ import asyncio import math -from collections.abc import AsyncGenerator, Mapping +from collections.abc import AsyncGenerator, Iterable, Iterator, Mapping from typing import Literal import numpy as np @@ -18,7 +18,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioConfig from vllm.compilation.decorators import support_torch_compile from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.envs import VLLM_ENGINE_ITERATION_TIMEOUT_S -from vllm.inputs.data import PromptType, TokensPrompt +from vllm.inputs.data import PromptType, StreamingInput, TokensPrompt from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsRealtime from vllm.model_executor.models.voxtral import ( @@ -47,8 +47,6 @@ from .utils import ( logger = init_logger(__name__) -_PRE_ALLOCATE_BUFFER_SIZE_IN_S = 30 - class VoxtralRealtimeMultiModalProcessor(VoxtralMultiModalProcessor): def __init__( @@ -130,84 
+128,81 @@ def _expand_tensor(input_tensor: torch.Tensor, scaling: int) -> torch.Tensor: class VoxtralRealtimeBuffer: - def __init__(self, config: AudioConfig) -> None: + def __init__(self, config: AudioConfig, prompt_tokens: list[int]) -> None: self._config = config - self._look_ahead_in_ms = config.streaming_look_ahead_ms - self._look_back_in_ms = config.streaming_look_back_ms - - self._sampling_rate = self._config.sampling_rate - - self._look_ahead = self._get_len_in_samples(self._look_ahead_in_ms) - self._look_back = self._get_len_in_samples(self._look_back_in_ms) - self._streaming_size = self._get_len_in_samples(1000 / self._config.frame_rate) - - # mutable objects - streaming_delay = self._get_len_in_samples(self._config.transcription_delay_ms) - self._start = 0 - self._end = streaming_delay + self._streaming_size - - # always pre-allocate 30 second buffers - self._buffer_size = _PRE_ALLOCATE_BUFFER_SIZE_IN_S * self._sampling_rate - self._buffer: np.ndarray = np.empty(self._buffer_size, dtype=np.float32) - self._filled_buffer_len = 0 - - @property - def start_idx(self): - return max(self._start - self._look_back, 0) - - @property - def end_idx(self): - return self._end + self._look_ahead - - @property - def is_audio_complete(self) -> bool: - return self._filled_buffer_len >= self.end_idx - - def _get_len_in_samples(self, len_in_ms: float) -> int: - _len_in_s = self._sampling_rate * len_in_ms / 1000 - assert _len_in_s.is_integer(), _len_in_s - len_in_s = int(_len_in_s) - - return len_in_s - - def _allocate_new_buffer(self) -> None: - # allocate new buffer - new_buffer = np.empty(self._buffer_size, dtype=np.float32) - left_to_copy = max(self._filled_buffer_len - self.start_idx, 0) - - if left_to_copy > 0: - new_buffer[:left_to_copy] = self._buffer[ - self.start_idx : self._filled_buffer_len - ] - - del self._buffer - self._buffer = new_buffer - - self._filled_buffer_len = left_to_copy - self._start = self._look_back - self._end = self._start + 
self._streaming_size - - def write_audio(self, audio: np.ndarray) -> None: - put_end_idx = self._filled_buffer_len + len(audio) - - if put_end_idx > self._buffer_size: - self._allocate_new_buffer() - - self._buffer[self._filled_buffer_len : self._filled_buffer_len + len(audio)] = ( - audio - ) - self._filled_buffer_len += len(audio) - - def read_audio(self) -> np.ndarray | None: - if not self.is_audio_complete: - return None + _look_ahead_in_ms = self._config.streaming_look_ahead_ms + _look_back_in_ms = self._config.streaming_look_back_ms + self._look_ahead_in_samples = self._ms_to_samples(_look_ahead_in_ms) + self._look_back_in_samples = self._ms_to_samples(_look_back_in_ms) + + # None signals the end + self._audio_queue: asyncio.Queue[np.ndarray | None] = asyncio.Queue() + self._leftover: np.ndarray | None = None + self._token_queue: asyncio.Queue[int] = asyncio.Queue() + + self._initial_end = len(prompt_tokens) * self._config.raw_audio_length_per_tok + for token in prompt_tokens: + self._token_queue.put_nowait(token) + + def _generate_frame_size_and_num_tokens(self) -> Iterator[tuple[int, int]]: + streaming_step_size = self._ms_to_samples(1000 / self._config.frame_rate) + start = 0 + end = self._initial_end + while True: + frame_start = max(start - self._look_back_in_samples, 0) + frame_end = end + self._look_ahead_in_samples + frame_size = frame_end - frame_start + num_tokens = (end - start) / self._config.raw_audio_length_per_tok + assert num_tokens.is_integer() + yield frame_size, int(num_tokens) + start = end + end += streaming_step_size + + def _ms_to_samples(self, ms: float) -> int: + len_ = self._config.sampling_rate * ms / 1000 + assert len_.is_integer(), len_ + return int(len_) + + async def append_audio(self, audio_array: np.ndarray | None) -> None: + await self._audio_queue.put(audio_array) + + async def append_tokens(self, tokens: Iterable[int]) -> None: + for token in tokens: + await self._token_queue.put(token) + + async def get_input_stream(self) 
-> AsyncGenerator[StreamingInput]: + for frame_size, num_tokens in self._generate_frame_size_and_num_tokens(): + next_tokens = [await self._token_queue.get() for _ in range(num_tokens)] + + audio_arrays: list[np.ndarray] = ( + [self._leftover] if self._leftover is not None else [] + ) + while sum(len(arr) for arr in audio_arrays) < frame_size: + arr = await self._audio_queue.get() + if arr is None: + return + audio_arrays.append(arr) + + audio_array = np.concatenate(audio_arrays) + frame = audio_array[:frame_size] + + # The current stride took look_ahead_in_samples audio of the next sample + # In addition the next sample will take look_back_in_samples audio of + # the current sample => So let's put both of this into the leftover + stride = ( + frame_size - self._look_ahead_in_samples - self._look_back_in_samples + ) + assert stride > 0, f"{stride=} must be positive" - audio = self._buffer[self.start_idx : self.end_idx] - self._start = self._end - self._end += self._streaming_size + self._leftover = audio_array[stride:] - return audio + yield StreamingInput( + TokensPrompt( + prompt_token_ids=next_tokens, + multi_modal_data={"audio": (frame, None)}, + ) + ) @MULTIMODAL_REGISTRY.register_processor( @@ -234,7 +229,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim ) audio_config = self.tokenizer.instruct.audio_encoder.audio_config - self.n_delay_tokens = audio_config.num_delay_tokens + self.n_delay_tokens = audio_config.get_num_delay_tokens() # for realtime transcription @classmethod @@ -248,45 +243,47 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim audio_encoder = tokenizer.instruct.audio_encoder config = audio_encoder.audio_config - buffer = VoxtralRealtimeBuffer(config) - is_first_yield = True - - async for audio in audio_stream: - buffer.write_audio(audio) - - while (new_audio := buffer.read_audio()) is not None: - if is_first_yield: - # make sure that input_stream is empty - assert 
input_stream.empty() - - audio = Audio(new_audio, config.sampling_rate, format="wav") - - request = TranscriptionRequest( - streaming=StreamingMode.ONLINE, - audio=RawAudio.from_audio(audio), - language=None, - ) - # mistral tokenizer takes care - # of preparing the first prompt inputs - # and does some left-silence padding - # for improved performance - audio_enc = tokenizer.mistral.encode_transcription(request) - - token_ids = audio_enc.tokens - new_audio = audio_enc.audios[0].audio_array - - is_first_yield = False - else: - # pop last element from input_stream - all_outputs = await asyncio.wait_for( - input_stream.get(), timeout=VLLM_ENGINE_ITERATION_TIMEOUT_S - ) - token_ids = all_outputs[-1:] - - multi_modal_data = {"audio": (new_audio, None)} - yield TokensPrompt( - prompt_token_ids=token_ids, multi_modal_data=multi_modal_data + # Get prompt tokens (streaming prefix tokens) without encoding audio + prompt_tokens = ( + tokenizer.instruct.start() + audio_encoder.encode_streaming_tokens() + ) + + # Get left/right padding audio + left_pad, right_pad = audio_encoder.get_padding_audio() + + buffer = VoxtralRealtimeBuffer(config, prompt_tokens) + + # Feed audio with padding into buffer in background + async def feed_audio(): + yielded_first_chunk = False + async for audio_chunk in audio_stream: + if not yielded_first_chunk: + yielded_first_chunk = True + # Prepend left padding before first real audio + await buffer.append_audio(left_pad.audio_array) + await buffer.append_audio(audio_chunk) + # Append right padding at the end + await buffer.append_audio(right_pad.audio_array) + await buffer.append_audio(None) # signal end + + # Feed output tokens back into buffer in background + async def feed_tokens(): + while True: + all_outputs = await asyncio.wait_for( + input_stream.get(), + timeout=VLLM_ENGINE_ITERATION_TIMEOUT_S, ) + await buffer.append_tokens(all_outputs[-1:]) + + audio_task = asyncio.create_task(feed_audio()) + token_task = asyncio.create_task(feed_tokens()) 
+ + try: + async for streaming_input in buffer.get_input_stream(): + yield streaming_input.prompt + finally: + audio_task.cancel() + token_task.cancel() @property def audio_config(self): -- GitLab From 4c078fa546016eacab87f833ff625463421f7d29 Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 12 Feb 2026 12:47:34 -0600 Subject: [PATCH 0144/1166] [ROCm][CI] Pin TorchCodec to v0.10.0 for ROCm compatibility (#34447) Signed-off-by: Andreas Karatzas --- tools/install_torchcodec_rocm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/install_torchcodec_rocm.sh b/tools/install_torchcodec_rocm.sh index f4a255473..6cb3b39fd 100755 --- a/tools/install_torchcodec_rocm.sh +++ b/tools/install_torchcodec_rocm.sh @@ -7,7 +7,8 @@ set -e TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}" -TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-main}" +# Pin to a specific release for reproducibility; update as needed. +TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}" echo "=== TorchCodec Installation Script ===" -- GitLab From 6d4e27ce29bac0e4cd4975cddf5b0dacc6cb727a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 12 Feb 2026 15:08:06 -0500 Subject: [PATCH 0145/1166] [Bugfix] Enforce DeepGEMM when using sparse_attn_indexer on CUDA (#34374) Signed-off-by: mgoin --- vllm/model_executor/layers/sparse_attn_indexer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/layers/sparse_attn_indexer.py b/vllm/model_executor/layers/sparse_attn_indexer.py index 538860ca6..826caa5d3 100644 --- a/vllm/model_executor/layers/sparse_attn_indexer.py +++ b/vllm/model_executor/layers/sparse_attn_indexer.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits +from vllm.utils.import_utils import has_deep_gemm from vllm.utils.torch_utils import 
direct_register_custom_op from vllm.v1.attention.backends.mla.indexer import ( DeepseekV32IndexerMetadata, @@ -277,6 +278,10 @@ class SparseAttnIndexer(CustomOp): self.max_model_len = max_model_len self.max_total_seq_len = max_total_seq_len self.topk_indices_buffer = topk_indices_buffer + if current_platform.is_cuda() and not has_deep_gemm(): + raise RuntimeError( + "Sparse Attention Indexer CUDA op requires DeepGEMM to be installed." + ) def forward_native( self, -- GitLab From fac4e96940d9f2ac8dde8fc864b4c76cbdfd0e2d Mon Sep 17 00:00:00 2001 From: Hashem Hashemi <159079214+amd-hhashemi@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:26:36 -0800 Subject: [PATCH 0146/1166] small adjustment to wvSplitKrc (#34410) Signed-off-by: Hashem Hashemi --- csrc/rocm/skinny_gemms.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index ecd94cacc..976874e6f 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -1568,8 +1568,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) { #endif unsigned int kOff = k + (thrd * A_CHUNK); - unsigned int kOffcp = - k_str + kOff; // min__(K - A_CHUNK, k_str + kOff); + unsigned int kOffcp = min__(K - A_CHUNK, k_str + kOff); for (unsigned int n = 0; n < N; n += CHUNKK * sprdN) { __builtin_amdgcn_global_load_lds( (int*)(&A[min__( -- GitLab From f120bd42d3daf733425d7feaaeffc2a23ba71c17 Mon Sep 17 00:00:00 2001 From: amitz-nv <203509407+amitz-nv@users.noreply.github.com> Date: Thu, 12 Feb 2026 23:06:58 +0200 Subject: [PATCH 0147/1166] [Kernel] Support Flashinfer trtllm fused MoE non gated FP8 & NVFP4 (#33506) Signed-off-by: amitz-nv <203509407+amitz-nv@users.noreply.github.com> --- tests/kernels/moe/test_flashinfer.py | 58 +++++++++--- .../layers/fused_moe/flashinfer_trtllm_moe.py | 14 +-- .../layers/quantization/modelopt.py | 7 +- .../quantization/utils/flashinfer_fp4_moe.py | 70 ++++++++++---- .../quantization/utils/flashinfer_utils.py | 93 
++++++++++++++++++- 5 files changed, 197 insertions(+), 45 deletions(-) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index 9c31d9325..d524b5667 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -71,7 +71,8 @@ def quant_fp8_per_tensor_batches(a): for i in range(num_batches): a_fp8, a_global_sf = input_to_float8(a[i]) - a_global_sf = 1.0 / a_global_sf + if a_global_sf.numel() == 1: + a_global_sf = a_global_sf.view(1, 1) a_quant.append(a_fp8) a_scales.append(a_global_sf) @@ -81,6 +82,20 @@ def quant_fp8_per_tensor_batches(a): return result_a_quant, result_a_scales +def check_accuracy(ref_output, actual_output, atol=0.1, rtol=0.85, percent=0.925): + close = torch.isclose(ref_output, actual_output, atol=atol, rtol=rtol) + match_ratio = close.float().mean() + assert match_ratio >= percent, ( + f"Match ratio {match_ratio:.4f} is below the threshold {percent:.4f}" + ) + + mismatch_percent = 1.0 - match_ratio.item() + assert mismatch_percent <= 1 - percent, ( + f"Mismatch percentage {mismatch_percent:.4f} is above the threshold " + f"{1 - percent:.4f}" + ) + + @dataclass class TestData: hidden_states: torch.Tensor @@ -104,14 +119,16 @@ class TestData: is_gated = activation.is_gated hidden_states = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10 - w13 = torch.randn( - (e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16 + w13 = ( + torch.randn( + (e, (2 * n) if is_gated else n, k), device="cuda", dtype=torch.bfloat16 + ) + / 10 ) - w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) + w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10 # Scale to fp8 _, a1_scale = input_to_float8(hidden_states) - a1_scale = 1.0 / a1_scale a2_scale = torch.scalar_tensor(1.0).to(device="cuda").to(dtype=torch.float32) w13_quantized, w13_weight_scale = quant_fp8_per_tensor_batches(w13) w2_quantized, w2_weight_scale = 
quant_fp8_per_tensor_batches(w2) @@ -124,14 +141,16 @@ class TestData: layer.w2_input_scale = a2_scale layer.w13_weight_scale = w13_weight_scale layer.w2_weight_scale = w2_weight_scale + layer.activation = activation # Setup dummy config. layer.moe_parallel_config = mk.FusedMoEParallelConfig.make_no_parallel() # flashinfer expects swapped rows for w13 - layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) + if is_gated: + layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) if is_trtllm: rotate_weights_for_fi_trtllm_fp8_per_tensor_moe( - layer.w13_weight, layer.w2_weight + layer.w13_weight, layer.w2_weight, is_gated ) register_scales_for_trtllm_fp8_per_tensor_moe( layer, @@ -162,12 +181,14 @@ class TestData: @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("activation", [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL]) def test_flashinfer_per_tensor_moe_fp8_no_graph( m: int, n: int, k: int, e: int, topk: int, + activation: MoEActivation, monkeypatch, ): if not current_platform.has_device_capability(100): @@ -175,7 +196,9 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): - td = TestData.make_moe_tensors_8bit(m, k, n, e, is_trtllm=True) + td = TestData.make_moe_tensors_8bit( + m, k, n, e, is_trtllm=True, activation=activation + ) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) topk_weights, topk_ids = Llama4MoE.custom_routing_function( @@ -200,7 +223,7 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( topk_weights=topk_weights, topk_ids=topk_ids, inplace=False, - activation=MoEActivation.SILU, + activation=activation, global_num_experts=e, expert_map=None, apply_router_weight_on_input=True, @@ -219,7 +242,13 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( apply_router_weight_on_input=True, ) - 
torch.testing.assert_close(output, flashinfer_output, atol=5.5e-2, rtol=1e-2) + check_accuracy( + ref_output=output, + actual_output=flashinfer_output, + atol=0.1, + rtol=0.85, + percent=0.925, + ) @pytest.mark.parametrize("m,n,k", MNK_FACTORS) @@ -320,8 +349,13 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( expert_map=None, apply_router_weight_on_input=True, ) - torch.testing.assert_close( - output, flashinfer_cutlass_output, atol=5.5e-2, rtol=1e-2 + + check_accuracy( + ref_output=output, + actual_output=flashinfer_cutlass_output, + atol=0.1, + rtol=0.85, + percent=0.925, ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index a50ad6722..b2d571dd8 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -35,8 +35,8 @@ def _supports_current_device() -> bool: def _supports_no_act_and_mul() -> bool: - """Does not support non-gated MoE (i.e. Nanotron-Mini).""" - return False + """Supports non-gated MoE.""" + return True def _supports_quant_scheme( @@ -52,8 +52,7 @@ def _supports_quant_scheme( def _supports_activation(activation: MoEActivation) -> bool: - """Supports silu activation only.""" - return activation == MoEActivation.SILU + return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] def _supports_routing_method( @@ -74,6 +73,7 @@ def _supports_routing_method( elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym): # NOTE(dbari): as above, potentially allow others here. 
return routing_method in [ + RoutingMethodType.DeepSeekV3, RoutingMethodType.Llama4, RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, @@ -291,6 +291,7 @@ def fi_trtllm_fp8_per_tensor_moe( local_num_experts: int, use_routing_scales_on_input: bool, routing_method_type: int, + activation_type: int, routed_scaling_factor: float = 1.0, ) -> torch.Tensor: num_expert_group = num_expert_group if num_expert_group is not None else 0 @@ -326,9 +327,9 @@ def fi_trtllm_fp8_per_tensor_moe( routed_scaling_factor=routed_scaling_factor, use_routing_scales_on_input=use_routing_scales_on_input, routing_method_type=routing_method_type, - # TODO: Required for flashinfer==0.6.3, remove with update + # TODO: enum type Required for flashinfer==0.6.3, remove with update # https://github.com/flashinfer-ai/flashinfer/pull/2508 - activation_type=ActivationType.Swiglu, + activation_type=ActivationType(activation_type), ) @@ -351,6 +352,7 @@ def fi_trtllm_fp8_per_tensor_moe_fake( local_num_experts: int, use_routing_scales_on_input: bool, routing_method_type: int, + activation_type: int, routed_scaling_factor: float = 1.0, ) -> torch.Tensor: return torch.empty_like(hidden_states) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index e0322a46f..9af815ee9 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -937,10 +937,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) # TODO(rob): this validation should happen at kernel selection # time in the oracle rather than here. - assert layer.activation == MoEActivation.SILU, ( - f"Expected 'silu' activation but got {layer.activation}" + SUPPORTED_ACTIVATIONS = [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] + assert layer.activation in SUPPORTED_ACTIVATIONS, ( + f"Only {SUPPORTED_ACTIVATIONS} activations are supported for FlashInfer " + f"TRTLLM FP4 MoE, {layer.activation} found instead." 
) - assert not layer.renormalize return apply_fi_trtllm_fp8_per_tensor_moe( layer=layer, hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 9d9fd31ad..ea84406ba 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -15,6 +15,10 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEParallelConfig, RoutingMethodType, ) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + activation_to_flashinfer_int, + align_fp4_moe_weights_for_fi, +) from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( swizzle_blockscale, ) @@ -50,8 +54,8 @@ def _supports_current_device() -> bool: def _supports_no_act_and_mul() -> bool: - """Does not support non-gated MoE (i.e. Nemotron-Nano).""" - return False + """Supports non-gated MoE.""" + return True def _supports_quant_scheme( @@ -66,8 +70,7 @@ def _supports_quant_scheme( def _supports_activation(activation: MoEActivation) -> bool: - """Supports silu activation only.""" - return activation in [MoEActivation.SILU] + return activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] def _supports_routing_method( @@ -150,6 +153,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( hidden_size, intermediate_size, num_experts, + is_gated_activation: bool, ): from flashinfer import nvfp4_block_scale_interleave from flashinfer.fused_moe.core import ( @@ -160,15 +164,18 @@ def prepare_static_weights_for_trtllm_fp4_moe( _cache_permute_indices: dict[torch.Size, torch.Tensor] = {} """Prepare quantized weights for kernel (done offline with weights).""" epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + gemm1_intermediate_size = ( + 2 * intermediate_size if is_gated_activation else intermediate_size + ) # Convert quantized weights to proper formats 
gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape( - num_experts, 2 * intermediate_size, hidden_size // 2 + num_experts, gemm1_intermediate_size, hidden_size // 2 ) # packed fp4 gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view( torch.float8_e4m3fn ).reshape( - num_experts, 2 * intermediate_size, hidden_size // 16 + num_experts, gemm1_intermediate_size, hidden_size // 16 ) # fp8 scaling factors gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape( @@ -191,6 +198,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( _cache_permute_indices, gemm1_weights_fp4[i].view(torch.uint8), epilogue_tile_m, + is_gated_act_gemm=is_gated_activation, ) gemm1_weights_fp4_shuffled.append( gemm1_weights_fp4[i] @@ -203,6 +211,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( gemm1_scales_linear_fp4[i].view(torch.uint8), epilogue_tile_m, num_elts_per_sf=16, + is_gated_act_gemm=is_gated_activation, ) gemm1_scales_fp4_shuffled.append( nvfp4_block_scale_interleave( @@ -246,7 +255,7 @@ def prepare_static_weights_for_trtllm_fp4_moe( gemm1_scales_fp4_shuffled = ( torch.stack(gemm1_scales_fp4_shuffled) .view(torch.float8_e4m3fn) - .reshape(num_experts, 2 * intermediate_size, hidden_size // 16) + .reshape(num_experts, gemm1_intermediate_size, hidden_size // 16) ) gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled) @@ -297,10 +306,10 @@ def flashinfer_trtllm_fp4_moe( from vllm.model_executor.models.llama4 import Llama4MoE - # https://github.com/flashinfer-ai/flashinfer/blob/f0277fd1bff90e309e5c19cab36c5dae056d685d/flashinfer/fused_moe/core.py#L2404 - assert activation == MoEActivation.SILU, ( - "Only SiLU activation is supported for FlashInfer TRTLLM FP4 MoE. " - f"{activation} found instead." 
+ SUPPORTED_ACTIVATIONS = [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] + assert activation in SUPPORTED_ACTIVATIONS, ( + f"Only {SUPPORTED_ACTIVATIONS} activations are supported for FlashInfer " + f"TRTLLM FP4 MoE, {activation} found instead." ) # Quantize input to FP4 @@ -325,6 +334,9 @@ def flashinfer_trtllm_fp4_moe( else router_logits ) + # Determine activation type + activation_type = activation_to_flashinfer_int(layer.activation) + # Call TRT-LLM FP4 block-scale MoE kernel out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( routing_logits=router_logits, @@ -355,6 +367,7 @@ def flashinfer_trtllm_fp4_moe( routed_scaling_factor=None, routing_method_type=routing_method_type, do_finalize=True, + activation_type=activation_type, )[0] return out @@ -479,10 +492,16 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass( ] # Reorder [w1, w3] to [w3, w1] for FI NVFP4 MoE kernels. - if is_act_and_mul and backend in [ - NvFp4MoeBackend.FLASHINFER_CUTLASS, - NvFp4MoeBackend.FLASHINFER_TRTLLM, - ]: + is_gated = layer.activation.is_gated + if ( + is_gated + and is_act_and_mul + and backend + in [ + NvFp4MoeBackend.FLASHINFER_CUTLASS, + NvFp4MoeBackend.FLASHINFER_TRTLLM, + ] + ): w13, w13_scale = reorder_w1w3_to_w3w1(w13, w13_scale) # For some FI kernels, the input scales are shared by all experts. @@ -495,19 +514,32 @@ def prepare_nvfp4_moe_layer_for_fi_or_cutlass( # Shuffle weights and scales for FI TRTLLM NVFP4 MoE kernels. if backend == NvFp4MoeBackend.FLASHINFER_TRTLLM: + # Align weights for FI NVFP4 MoE kernels. 
+ min_alignment = 16 if is_gated else 128 + w13, w13_scale, w2, w2_scale, padded_intermediate = ( + align_fp4_moe_weights_for_fi( + w13, w13_scale, w2, w2_scale, is_act_and_mul, min_alignment + ) + ) + layer.intermediate_size_per_partition = padded_intermediate + w13, w13_scale, w2, w2_scale = prepare_static_weights_for_trtllm_fp4_moe( w13, w2, w13_scale, w2_scale, - w2.size(-2), # hidden_size - w13.size(-2) // 2, # intermediate_size - w13.size(0), # num_experts + hidden_size=w2.size(-2), + intermediate_size=w13.size(-2) // 2 if is_gated else w13.size(-2), + num_experts=w13.size(0), + is_gated_activation=is_gated, ) # We do not need to make this a parameter, because # it is not used during the weight (re)-loading process. - layer.g1_scale_c = a13_scale * w13_scale_2 / a2_scale + if is_gated: + layer.g1_scale_c = a13_scale * w13_scale_2 / a2_scale + else: + layer.g1_scale_c = torch.ones_like(a13_scale) / a2_scale layer.a1_gscale = 1.0 / a13_scale layer.g1_alphas = a13_scale * w13_scale_2 layer.g2_alphas = a2_scale * w2_scale_2 diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 56c90aa86..42fae9ee9 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -6,6 +6,7 @@ import torch from vllm import envs from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.platforms import current_platform from vllm.utils.math_utils import round_up @@ -18,6 +19,20 @@ class FlashinferMoeBackend(Enum): CUTEDSL = "CUTEDSL" +def activation_to_flashinfer_int(activation: MoEActivation) -> int: + from flashinfer.fused_moe.core import ActivationType + + # silu and gelu are mapped to their gated versions SwiGLU and GeGLU respectively + ACTIVATION_TO_FI_ACTIVATION = { + MoEActivation.SILU_NO_MUL: ActivationType.Silu, + 
MoEActivation.GELU_NO_MUL: ActivationType.Gelu, + MoEActivation.SILU: ActivationType.Swiglu, + MoEActivation.GELU: ActivationType.Geglu, + MoEActivation.RELU2_NO_MUL: ActivationType.Relu2, + } + return ACTIVATION_TO_FI_ACTIVATION[activation].value + + def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: return ( x.reshape(-1, 2, x.shape[-2] // 2, x.shape[-1]).flip(dims=[1]).reshape(x.shape) @@ -25,7 +40,7 @@ def swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: def rotate_weights_for_fi_trtllm_fp8_per_tensor_moe( - gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor + gemm1_weights: torch.Tensor, gemm2_weights: torch.Tensor, is_gated_activation: bool ): """Shuffle weights for for FI TRT-LLM Format""" from flashinfer import reorder_rows_for_gated_act_gemm, shuffle_matrix_a @@ -40,6 +55,8 @@ def rotate_weights_for_fi_trtllm_fp8_per_tensor_moe( for i in range(num_experts): gemm1_weights_fp8_interleaved.append( reorder_rows_for_gated_act_gemm(gemm1_weights[i]) + if is_gated_activation + else gemm1_weights[i] ) # Stack weights and scales for all experts @@ -86,7 +103,13 @@ def register_scales_for_trtllm_fp8_per_tensor_moe( ) layer.w2_input_scale_inv = 1.0 / w2_input_scale layer.output1_scales_gate_scalar = g1_alphas - layer.output1_scales_scalar = g1_alphas * layer.w2_input_scale_inv + + if layer.activation.is_gated: + layer.output1_scales_scalar = g1_alphas * layer.w2_input_scale_inv + else: + layer.output1_scales_scalar = ( + torch.ones_like(g1_alphas) * layer.w2_input_scale_inv + ) layer.output2_scales_scalar = g2_alphas @@ -125,6 +148,7 @@ def apply_fi_trtllm_fp8_per_tensor_moe( assert layer.custom_routing_function is None, ( "Custom routing function is only supported for Llama4" ) + activation_type = activation_to_flashinfer_int(layer.activation) return torch.ops.vllm.fi_trtllm_fp8_per_tensor_moe( routing_logits=router_logits, @@ -145,6 +169,7 @@ def apply_fi_trtllm_fp8_per_tensor_moe( local_num_experts=layer.local_num_experts, 
use_routing_scales_on_input=apply_router_weight_on_input, routing_method_type=layer.routing_method_type, + activation_type=activation_type, ) @@ -274,8 +299,64 @@ def convert_moe_weights_to_flashinfer_trtllm_block_layout( return w13_weights_shuffled_tensor, w2_weights_shuffled_tensor +def align_fp4_moe_weights_for_fi( + w13: torch.Tensor, + w13_scale: torch.Tensor, + w2: torch.Tensor, + w2_scale: torch.Tensor, + is_act_and_mul: bool, + min_alignment: int = 16, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int]: + """Pad intermediate size so FlashInfer kernels' alignment constraints hold. + + Some FlashInfer FP4 MoE kernels require the intermediate size + used for GEMM to be divisible by a small alignment value. When this is + not satisfied (e.g. with certain tensor-parallel sizes), we pad the + gate/up and down projection weights along the intermediate dim. + """ + + # Current local intermediate size (per partition) is the K dimension of + # the down projection. + num_experts, hidden_size, intermediate = w2.shape + intermediate *= 2 # because of packed FP4 + + padded_intermediate = round_up(intermediate, min_alignment) + + if padded_intermediate == intermediate: + return w13, w13_scale, w2, w2_scale, intermediate + + logger.info_once( + "Padding intermediate size from %d to %d for up/down projection weights.", + intermediate, + padded_intermediate, + scope="local", + ) + + up_mult = 2 if is_act_and_mul else 1 + padded_gate_up_dim = up_mult * padded_intermediate + + # Pad w13 and w2 along its intermediate dimension. 
+ padded_w13 = w13.new_zeros((num_experts, padded_gate_up_dim, hidden_size // 2)) + padded_w13[:, : w13.shape[1], :] = w13 + + padded_w2 = w2.new_zeros((num_experts, hidden_size, padded_intermediate // 2)) + padded_w2[:, :, : w2.shape[2]] = w2 + + padded_w13_scale = w13_scale.new_zeros( + (num_experts, padded_gate_up_dim, hidden_size // 16) + ) + padded_w13_scale[:, : w13_scale.shape[1], :] = w13_scale + + padded_w2_scale = w2_scale.new_zeros( + (num_experts, hidden_size, padded_intermediate // 16) + ) + padded_w2_scale[:, :, : w2_scale.shape[2]] = w2_scale + + return padded_w13, padded_w13_scale, padded_w2, padded_w2_scale, padded_intermediate + + def align_fp8_moe_weights_for_fi( - w13: torch.Tensor, w2: torch.Tensor, is_act_and_mul: bool + w13: torch.Tensor, w2: torch.Tensor, is_act_and_mul: bool, min_alignment: int = 16 ) -> tuple[torch.Tensor, torch.Tensor, int]: """Pad intermediate size so FlashInfer kernels' alignment constraints hold. @@ -289,7 +370,6 @@ def align_fp8_moe_weights_for_fi( # the down projection. num_experts, hidden_size, intermediate = w2.shape - min_alignment = 16 padded_intermediate = round_up(intermediate, min_alignment) if padded_intermediate == intermediate: @@ -342,11 +422,14 @@ def prepare_fp8_moe_layer_for_fi( # Some FI MoE kernels require internal alignment of 16 # for the gate-up proj. Pad the weights to respect this. 
+ is_gated = layer.activation.is_gated if not block_quant: + min_alignment = 16 if is_gated else 128 w13, w2, new_intermediate = align_fp8_moe_weights_for_fi( w13, w2, layer.moe_config.is_act_and_mul, + min_alignment, ) layer.intermediate_size_per_partition = new_intermediate @@ -363,7 +446,7 @@ def prepare_fp8_moe_layer_for_fi( assert w13_input_scale is not None assert w2_input_scale is not None - rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(w13, w2) + rotate_weights_for_fi_trtllm_fp8_per_tensor_moe(w13, w2, is_gated) register_scales_for_trtllm_fp8_per_tensor_moe( layer, w13_scale=w13_scale, -- GitLab From 9ea1f598ce48da3054d073e74b9e51e8d0de945a Mon Sep 17 00:00:00 2001 From: "Mengtao (Martin) Yuan" Date: Thu, 12 Feb 2026 16:14:43 -0800 Subject: [PATCH 0148/1166] Use paged_attention_v1 for sliding window decode in rocm_aiter_fa (#34378) Signed-off-by: Martin Yuan Co-authored-by: Martin Yuan --- vllm/v1/attention/backends/rocm_aiter_fa.py | 31 ++------------------- 1 file changed, 2 insertions(+), 29 deletions(-) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 4be650f93..d479f8abc 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -1075,35 +1075,6 @@ class AiterFlashAttentionImpl(AttentionImpl): assert not rocm_aiter_ops.is_shuffle_kv_cache_enabled(), ( "Sliding window with shuffle layout is not supported yet." 
) - from aiter.ops.triton.unified_attention import ( - unified_attention, - ) - - descale_shape = ( - attn_metadata.query_start_loc[:num_decodes].shape[0] - 1, - key_cache.shape[2], - ) - unified_attention( - q=query[:num_decode_tokens], - k=key_cache, - v=value_cache, - out=output[:num_decode_tokens], - cu_seqlens_q=attn_metadata.query_start_loc[:num_decodes], - max_seqlen_q=1, # optimize this - seqused_k=attn_metadata.seq_lens[:num_decodes], - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=attn_metadata.block_table[:num_decodes], - softcap=self.logits_soft_cap, - q_descale=None, - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - return - assert attn_metadata.decode_metadata is not None if rocm_aiter_ops.is_shuffle_kv_cache_enabled(): num_blocks, block_size, num_kv_heads, head_size = key_cache.shape @@ -1172,6 +1143,8 @@ class AiterFlashAttentionImpl(AttentionImpl): layer._v_scale, None, _PARTITION_SIZE_ROCM, + 1, + self.sliding_window[0] + 1, ) else: raise NotImplementedError( -- GitLab From be7370daf3596da71776375b9aba6dd712646fdc Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Thu, 12 Feb 2026 19:15:48 -0500 Subject: [PATCH 0149/1166] [Frontend] Enable generic structured_outputs for responses API (#33709) Signed-off-by: Alec Solder Co-authored-by: Alec Solder --- .../openai/responses/test_sampling_params.py | 51 +++++++++++++++++-- vllm/entrypoints/openai/responses/protocol.py | 14 +++-- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/openai/responses/test_sampling_params.py b/tests/entrypoints/openai/responses/test_sampling_params.py index b8d1aa664..87910271d 100644 --- a/tests/entrypoints/openai/responses/test_sampling_params.py +++ b/tests/entrypoints/openai/responses/test_sampling_params.py @@ -4,8 +4,17 @@ 
"""Unit tests for ResponsesRequest.to_sampling_params() parameter mapping.""" import pytest +import torch +from openai.types.responses.response_format_text_json_schema_config import ( + ResponseFormatTextJSONSchemaConfig, +) +from pydantic import ValidationError -from vllm.entrypoints.openai.responses.protocol import ResponsesRequest +from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponseTextConfig, +) +from vllm.sampling_params import StructuredOutputsParams class TestResponsesRequestSamplingParams: @@ -76,9 +85,6 @@ class TestResponsesRequestSamplingParams: def test_seed_bounds_validation(self): """Test that seed values outside torch.long bounds are rejected.""" - import torch - from pydantic import ValidationError - # Test seed below minimum with pytest.raises(ValidationError) as exc_info: ResponsesRequest( @@ -111,3 +117,40 @@ class TestResponsesRequestSamplingParams: seed=torch.iinfo(torch.long).max, ) assert request_max.seed == torch.iinfo(torch.long).max + + def test_structured_outputs_passed_through(self): + """Test that structured_outputs field is passed to SamplingParams.""" + structured_outputs = StructuredOutputsParams(grammar="root ::= 'hello'") + request = ResponsesRequest( + model="test-model", + input="test input", + structured_outputs=structured_outputs, + ) + + sampling_params = request.to_sampling_params(default_max_tokens=1000) + + assert sampling_params.structured_outputs is not None + assert sampling_params.structured_outputs.grammar == "root ::= 'hello'" + + def test_structured_outputs_and_json_schema_conflict(self): + """Test that specifying both structured_outputs and json_schema raises.""" + structured_outputs = StructuredOutputsParams(grammar="root ::= 'hello'") + text_config = ResponseTextConfig() + text_config.format = ResponseFormatTextJSONSchemaConfig( + type="json_schema", + name="test", + schema={"type": "object"}, + ) + request = ResponsesRequest( + model="test-model", + input="test input", + 
structured_outputs=structured_outputs, + text=text_config, + ) + + with pytest.raises(ValueError) as exc_info: + request.to_sampling_params(default_max_tokens=1000) + + assert "Cannot specify both structured_outputs and text.format" in str( + exc_info.value + ) diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index 9a471852b..2b62d7dca 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -233,6 +233,10 @@ class ResponsesRequest(OpenAIBaseModel): # this cannot be used in conjunction with previous_response_id # TODO: consider supporting non harmony messages as well previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None + structured_outputs: StructuredOutputsParams | None = Field( + default=None, + description="Additional kwargs for structured outputs", + ) repetition_penalty: float | None = None seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) @@ -319,8 +323,14 @@ class ResponsesRequest(OpenAIBaseModel): stop_token_ids = default_sampling_params.get("stop_token_ids") # Structured output - structured_outputs = None + structured_outputs = self.structured_outputs + + # Also check text.format for OpenAI-style json_schema if self.text is not None and self.text.format is not None: + if structured_outputs is not None: + raise ValueError( + "Cannot specify both structured_outputs and text.format" + ) response_format = self.text.format if ( response_format.type == "json_schema" @@ -329,8 +339,6 @@ class ResponsesRequest(OpenAIBaseModel): structured_outputs = StructuredOutputsParams( json=response_format.schema_ ) - elif response_format.type == "json_object": - raise NotImplementedError("json_object is not supported") stop = self.stop if self.stop else [] if isinstance(stop, str): -- GitLab From aa181c923bf83b6f8c4ce5613492a6b410c0c535 Mon Sep 17 00:00:00 2001 From: Jaewon <52840625+jaewonlee-fb@users.noreply.github.com> 
Date: Thu, 12 Feb 2026 16:16:25 -0800 Subject: [PATCH 0150/1166] [Core] Add sleep level 0 mode with enqueue/wait pattern (#33195) Signed-off-by: Jaewon Lee Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com> --- vllm/entrypoints/llm.py | 129 +++++++++++++++++++++++++++++----- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/core.py | 52 ++++++++++++-- vllm/v1/engine/core_client.py | 3 +- vllm/v1/engine/llm_engine.py | 6 +- 5 files changed, 167 insertions(+), 26 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index ab0b46821..9cb40448b 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -458,6 +458,93 @@ class LLM: return self.engine_class.validate_outputs(outputs, RequestOutput) + def enqueue( + self, + prompts: PromptType | Sequence[PromptType], + sampling_params: SamplingParams | Sequence[SamplingParams] | None = None, + lora_request: list[LoRARequest] | LoRARequest | None = None, + priority: list[int] | None = None, + use_tqdm: bool | Callable[..., tqdm] = True, + tokenization_kwargs: dict[str, Any] | None = None, + ) -> list[str]: + """Enqueue prompts for generation without waiting for completion. + + This method adds requests to the engine queue but does not start + processing them. Use wait_for_completion() to process the queued + requests and get results. + + Args: + prompts: The prompts to the LLM. See generate() for details. + sampling_params: The sampling parameters for text generation. + lora_request: LoRA request to use for generation, if any. + priority: The priority of the requests, if any. + use_tqdm: If True, shows a tqdm progress bar while adding requests. + tokenization_kwargs: Overrides for `tokenizer.encode`. + + Returns: + A list of request IDs for the enqueued requests. 
+ """ + model_config = self.model_config + runner_type = model_config.runner_type + if runner_type != "generate": + raise ValueError("LLM.enqueue() is only supported for generative models.") + + if sampling_params is None: + sampling_params = self.get_default_sampling_params() + + # Use the same preprocessing as _run_completion + seq_prompts = prompt_to_seq(prompts) + seq_params = self._params_to_seq(sampling_params, len(seq_prompts)) + + if any(param.truncate_prompt_tokens is not None for param in seq_params): + engine_prompts: Sequence[DictPrompt | TokPrompt] = [ + engine_prompt + for prompt, param in zip(seq_prompts, seq_params) + for engine_prompt in self._preprocess_completion( + [prompt], + tokenization_kwargs=merge_kwargs( + tokenization_kwargs, + dict(truncate_prompt_tokens=param.truncate_prompt_tokens), + ), + ) + ] + else: + engine_prompts = self._preprocess_completion( + seq_prompts, + tokenization_kwargs=tokenization_kwargs, + ) + + request_ids = self._validate_and_add_requests( + prompts=engine_prompts, + params=seq_params, + use_tqdm=use_tqdm, + lora_request=self._get_modality_specific_lora_reqs( + engine_prompts, lora_request + ), + tokenization_kwargs=tokenization_kwargs, + priority=priority, + ) + + return request_ids + + def wait_for_completion( + self, + use_tqdm: bool | Callable[..., tqdm] = True, + ) -> list[RequestOutput]: + """Wait for all enqueued requests to complete and return results. + + This method processes all requests currently in the engine queue + and returns their outputs. Use after enqueue() to get results. + + Args: + use_tqdm: If True, shows a tqdm progress bar. + + Returns: + A list of RequestOutput objects for all completed requests. 
+ """ + outputs = self._run_engine(use_tqdm=use_tqdm) + return self.engine_class.validate_outputs(outputs, RequestOutput) + def _get_modality_specific_lora_reqs( self, prompts: Sequence[DictPrompt | TokPrompt], @@ -1618,19 +1705,22 @@ class LLM: during the sleep period, before `wake_up` is called. Args: - level: The sleep level. Level 1 sleep will offload the model - weights and discard the kv cache. The content of kv cache - is forgotten. Level 1 sleep is good for sleeping and waking - up the engine to run the same model again. The model weights - are backed up in CPU memory. Please make sure there's enough - CPU memory to store the model weights. Level 2 sleep will - discard both the model weights and the kv cache. The content - of both the model weights and kv cache is forgotten. Level 2 - sleep is good for sleeping and waking up the engine to run a - different model or update the model, where previous model - weights are not needed. It reduces CPU memory pressure. + level: The sleep level. + - Level 0: Pause scheduling but continue accepting requests. + Requests are queued but not processed. + - Level 1: Offload model weights to CPU, discard KV cache. + The content of kv cache is forgotten. Good for + sleeping and waking up the engine to run the same + model again. Please make sure there's enough CPU + memory to store the model weights. + - Level 2: Discard all GPU memory (weights + KV cache). + Good for sleeping and waking up the engine to run + a different model or update the model, where + previous model weights are not needed. It reduces + CPU memory pressure. """ - self.reset_prefix_cache() + if level > 0: + self.reset_prefix_cache() self.llm_engine.sleep(level=level) def wake_up(self, tags: list[str] | None = None): @@ -1641,9 +1731,10 @@ class LLM: Args: tags: An optional list of tags to reallocate the engine memory for specific memory allocations. Values must be in - `("weights", "kv_cache")`. If None, all memory is reallocated. 
- wake_up should be called with all tags (or None) before the - engine is used again. + `("weights", "kv_cache", "scheduling")`. If None, all memory + is reallocated. wake_up should be called with all tags + (or None) before the engine is used again. + Use tags=["scheduling"] to resume from level 0 sleep. """ self.llm_engine.wake_up(tags) @@ -1810,7 +1901,7 @@ class LLM: lora_request: Sequence[LoRARequest | None] | LoRARequest | None, tokenization_kwargs: dict[str, Any] | None = None, priority: list[int] | None = None, - ) -> None: + ) -> list[str]: num_requests = len(prompts) seq_params = self._params_to_seq(params, num_requests) seq_lora_requests = self._lora_request_to_seq(lora_request, num_requests) @@ -1844,6 +1935,8 @@ class LLM: self.llm_engine.abort_request(added_request_ids, internal=True) raise e + return added_request_ids + def _add_request( self, prompt: PromptType | DictPrompt | TokPrompt, @@ -1895,7 +1988,9 @@ class LLM: return engine_request.request_id def _run_engine( - self, *, use_tqdm: bool | Callable[..., tqdm] = True + self, + *, + use_tqdm: bool | Callable[..., tqdm] = True, ) -> list[RequestOutput | PoolingRequestOutput]: # Initialize tqdm. 
if use_tqdm: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index d6ef94880..44853ec88 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -938,7 +938,8 @@ class AsyncLLM(EngineClient): await self.engine_core.reset_encoder_cache_async() async def sleep(self, level: int = 1) -> None: - await self.reset_prefix_cache() + if level > 0: + await self.reset_prefix_cache() await self.engine_core.sleep_async(level) if self.logger_manager is not None: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1d64b82f7..afa59d52d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -614,13 +614,43 @@ class EngineCore: self.model_executor.reset_encoder_cache() def sleep(self, level: int = 1): - self.model_executor.sleep(level) + """Put the engine to sleep at the specified level. + + Args: + level: Sleep level. + - Level 0: Pause scheduling only. Requests are still accepted + but not processed. No GPU memory changes. + - Level 1: Offload model weights to CPU, discard KV cache. + - Level 2: Discard all GPU memory. + """ + if level == 0: + # Level 0: Just pause scheduling, don't touch GPU + self.pause_scheduler() + else: + # Level 1+: Delegate to executor for GPU memory management + self.model_executor.sleep(level) def wake_up(self, tags: list[str] | None = None): - self.model_executor.wake_up(tags) + """Wake up the engine from sleep. + + Args: + tags: Tags to wake up. Use ["scheduling"] for level 0 wake up. 
+ """ + if tags is not None and "scheduling" in tags: + # Level 0 wake up: Resume scheduling + self.resume_scheduler() + # Remove "scheduling" from tags if there are other tags to process + remaining_tags = [t for t in tags if t != "scheduling"] + if remaining_tags: + self.model_executor.wake_up(remaining_tags) + else: + # Full wake up + self.resume_scheduler() + self.model_executor.wake_up(tags) def is_sleeping(self) -> bool: - return self.model_executor.is_sleeping + """Check if engine is sleeping at any level.""" + return self._scheduler_paused or self.model_executor.is_sleeping def execute_dummy_batch(self): self.model_executor.execute_dummy_batch() @@ -1023,7 +1053,13 @@ class EngineCoreProc(EngineCore): # 1) Poll the input queue until there is work to do. self._process_input_queue() # 2) Step the engine core and return the outputs. - self._process_engine_step() + # Skip if scheduling is paused (level 0 sleep) + if not self._scheduler_paused: + self._process_engine_step() + else: + # When scheduling is paused, still need to check for wake up + # by processing any utility requests that might resume scheduling + pass def _process_input_queue(self): """Exits when an engine step needs to be performed.""" @@ -1031,7 +1067,7 @@ class EngineCoreProc(EngineCore): waited = False while ( not self.engines_running - and not self.scheduler.has_requests() + and (not self.scheduler.has_requests() or self._scheduler_paused) and not self.batch_queue and not self._scheduler_paused ): @@ -1414,11 +1450,15 @@ class DPEngineCoreProc(EngineCoreProc): # 1) Poll the input queue until there is work to do. self._process_input_queue() + # Skip processing if scheduling is paused (level 0 sleep) + if self._scheduler_paused: + continue + # 2) Step the engine core. 
executed = self._process_engine_step() self._maybe_publish_request_counts() - local_unfinished_reqs = self.scheduler.has_unfinished_requests() + if not executed: if not local_unfinished_reqs and not self.engines_running: # All engines are idle. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index deae0c83e..b31f1c406 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -194,7 +194,7 @@ class EngineCoreClient(ABC): raise NotImplementedError def dp_engines_running(self) -> bool: - """Returns True id data parallel engines are collectively in a + """Returns True if data parallel engines are collectively in a running state.""" raise NotImplementedError @@ -724,6 +724,7 @@ class SyncMPClient(MPClient): # it is forwarded to the outputs_queue so we can raise it # from this (run_output_handler) task to shut down the server. outputs = self.outputs_queue.get() + if isinstance(outputs, Exception): raise self._format_exception(outputs) from None if outputs.wave_complete is not None: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 815236b94..51f39c929 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -312,7 +312,11 @@ class LLMEngine: # 4) Record stats with record_function_or_nullcontext("llm_engine step: record_stats"): - if self.logger_manager is not None and outputs.scheduler_stats is not None: + if ( + self.logger_manager is not None + and outputs.scheduler_stats is not None + and len(outputs.outputs) > 0 + ): self.logger_manager.record( scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, -- GitLab From 4453ba8d9ec8e35d68084a118f35ce5c48b5dae6 Mon Sep 17 00:00:00 2001 From: Jaewon <52840625+jaewonlee-fb@users.noreply.github.com> Date: Thu, 12 Feb 2026 16:16:38 -0800 Subject: [PATCH 0151/1166] [Core] Profiler improvements and lazy initialization (#33198) Signed-off-by: Jaewon Lee Co-authored-by: Lu Fang 
<30275821+houseroad@users.noreply.github.com> --- vllm/distributed/utils.py | 40 +++++++++++++++++++++++ vllm/entrypoints/llm.py | 11 +++++-- vllm/v1/engine/async_llm.py | 4 +-- vllm/v1/engine/core.py | 4 +-- vllm/v1/engine/core_client.py | 20 +++++++----- vllm/v1/engine/llm_engine.py | 4 +-- vllm/v1/executor/abstract.py | 4 +-- vllm/v1/metrics/loggers.py | 4 +-- vllm/v1/worker/cpu_worker.py | 2 +- vllm/v1/worker/gpu_worker.py | 61 ++++++++++++++++++++++++++--------- 10 files changed, 117 insertions(+), 37 deletions(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 8df9d638a..17375259e 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -524,3 +524,43 @@ def stateless_destroy_torch_distributed_process_group(pg: ProcessGroup) -> None: """ pg.shutdown() _unregister_process_group(pg.group_name) + + +def get_worker_rank_suffix(global_rank: int | None = None) -> str: + """Generate a descriptive rank suffix for worker identification. + + Returns a string like 'dp0_pp0_tp0_dcp0_ep0_rank0' including all + parallel dimensions: DP, PP, TP, DCP, EP. + + Args: + global_rank: Optional global rank to append. If not provided, + only parallel dimension ranks are included. + + Returns: + A string suffix identifying the worker's position in the + distributed topology. 
+ """ + from vllm.distributed.parallel_state import ( + get_dcp_group, + get_dp_group, + get_ep_group, + get_pp_group, + get_tp_group, + ) + + try: + dp_rank = get_dp_group().rank_in_group + pp_rank = get_pp_group().rank_in_group + tp_rank = get_tp_group().rank_in_group + dcp_rank = get_dcp_group().rank_in_group + ep_rank = get_ep_group().rank_in_group + + suffix = f"dp{dp_rank}_pp{pp_rank}_tp{tp_rank}_dcp{dcp_rank}_ep{ep_rank}" + if global_rank is not None: + suffix = f"{suffix}_rank{global_rank}" + return suffix + except Exception: + # Fallback if parallel state not initialized + if global_rank is not None: + return f"rank{global_rank}" + return "" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9cb40448b..f54d9121c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1685,8 +1685,15 @@ class LLM: tokenization_kwargs=encode_kwargs, ) - def start_profile(self) -> None: - self.llm_engine.start_profile() + def start_profile(self, profile_prefix: str | None = None) -> None: + """Start profiling with optional custom trace prefix. + + Args: + profile_prefix: Optional prefix for the trace file names. If provided, + trace files will be named as "_dp_pp_tp". + If not provided, default naming will be used. 
+ """ + self.llm_engine.start_profile(profile_prefix) def stop_profile(self) -> None: self.llm_engine.stop_profile() diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 44853ec88..bab898da6 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -911,8 +911,8 @@ class AsyncLLM(EngineClient): if self.errored: raise self.dead_error - async def start_profile(self) -> None: - coros = [self.engine_core.profile_async(True)] + async def start_profile(self, profile_prefix: str | None = None) -> None: + coros = [self.engine_core.profile_async(True, profile_prefix)] if self.profiler is not None: coros.append(asyncio.to_thread(self.profiler.start)) await asyncio.gather(*coros) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index afa59d52d..7553c7332 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -568,8 +568,8 @@ class EngineCore: if self.scheduler: self.scheduler.shutdown() - def profile(self, is_start: bool = True): - self.model_executor.profile(is_start) + def profile(self, is_start: bool = True, profile_prefix: str | None = None): + self.model_executor.profile(is_start, profile_prefix) def reset_mm_cache(self): # NOTE: Since this is mainly for debugging, we don't attempt to diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index b31f1c406..e9187c4e8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -135,7 +135,7 @@ class EngineCoreClient(ABC): def add_request(self, request: EngineCoreRequest) -> None: raise NotImplementedError - def profile(self, is_start: bool = True) -> None: + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: raise NotImplementedError def reset_mm_cache(self) -> None: @@ -210,7 +210,9 @@ class EngineCoreClient(ABC): async def add_request_async(self, request: EngineCoreRequest) -> None: raise NotImplementedError - async def profile_async(self, is_start: bool = True) -> None: + async 
def profile_async( + self, is_start: bool = True, profile_prefix: str | None = None + ) -> None: raise NotImplementedError async def reset_mm_cache_async(self) -> None: @@ -295,8 +297,8 @@ class InprocClient(EngineCoreClient): def shutdown(self) -> None: self.engine_core.shutdown() - def profile(self, is_start: bool = True) -> None: - self.engine_core.profile(is_start) + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: + self.engine_core.profile(is_start, profile_prefix) def reset_mm_cache(self) -> None: self.engine_core.reset_mm_cache() @@ -765,8 +767,8 @@ class SyncMPClient(MPClient): if request_ids and not self.resources.engine_dead: self._send_input(EngineCoreRequestType.ABORT, request_ids) - def profile(self, is_start: bool = True) -> None: - self.call_utility("profile", is_start) + def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: + self.call_utility("profile", is_start, profile_prefix) def reset_mm_cache(self) -> None: self.call_utility("reset_mm_cache") @@ -987,8 +989,10 @@ class AsyncMPClient(MPClient): """Resume the scheduler after a pause.""" await self.call_utility_async("resume_scheduler") - async def profile_async(self, is_start: bool = True) -> None: - await self.call_utility_async("profile", is_start) + async def profile_async( + self, is_start: bool = True, profile_prefix: str | None = None + ) -> None: + await self.call_utility_async("profile", is_start, profile_prefix) async def reset_mm_cache_async(self) -> None: await self.call_utility_async("reset_mm_cache") diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 51f39c929..76aa8f438 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -326,8 +326,8 @@ class LLMEngine: return processed_outputs.request_outputs - def start_profile(self): - self.engine_core.profile(True) + def start_profile(self, profile_prefix: str | None = None): + self.engine_core.profile(True, 
profile_prefix) def stop_profile(self): self.engine_core.profile(False) diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 32fa87e9d..91bd019f8 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -238,8 +238,8 @@ class Executor(ABC): def max_concurrent_batches(self) -> int: return 1 - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start,)) + def profile(self, is_start: bool = True, profile_prefix: str | None = None): + self.collective_rpc("profile", args=(is_start, profile_prefix)) def save_sharded_state( self, diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 49b97e8f3..229b5742d 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1305,8 +1305,8 @@ class StatLoggerManager: ): if engine_idx is None: engine_idx = 0 - for logger in self.stat_loggers: - logger.record( + for stat_logger in self.stat_loggers: + stat_logger.record( scheduler_stats, iteration_stats, mm_cache_stats=mm_cache_stats, diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index 2fbcc9c44..752b692f8 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -212,7 +212,7 @@ class CPUWorker(Worker): ) return ",".join([str(x.id) for x in logical_cpu_list]) - def profile(self, is_start: bool = True): + def profile(self, is_start: bool = True, profile_prefix: str | None = None): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") if is_start: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 635402f3d..2507b7f20 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -103,20 +103,14 @@ class Worker(WorkerBase): ) # Torch/CUDA profiler. Enabled and configured through profiler_config. + # Profiler wrapper is created lazily in profile() when start is called, + # so we have all the information needed for proper trace naming. 
self.profiler: Any | None = None - profiler_config = vllm_config.profiler_config - if profiler_config.profiler == "torch": - worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - self.profiler = TorchProfilerWrapper( - profiler_config, - worker_name=worker_name, - local_rank=self.local_rank, - activities=["CPU", "CUDA"], - ) - elif profiler_config.profiler == "cuda": - self.profiler = CudaProfilerWrapper(profiler_config) - else: - self.profiler = None + self.profiler_config = vllm_config.profiler_config + + # Only validate profiler config is valid, don't instantiate yet + if self.profiler_config.profiler not in ("torch", "cuda", None): + raise ValueError(f"Unknown profiler type: {self.profiler_config.profiler}") self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER @@ -677,17 +671,52 @@ class Worker(WorkerBase): def take_draft_token_ids(self) -> DraftTokenIds | None: return self.model_runner.take_draft_token_ids() - def profile(self, is_start: bool = True): - if self.profiler is None: + def profile(self, is_start: bool = True, profile_prefix: str | None = None): + # Check if profiling is enabled + if self.profiler_config is None or self.profiler_config.profiler is None: raise RuntimeError( "Profiling is not enabled. Please set --profiler-config to enable " "profiling. 
Example: " "'--profiler-config.profiler=torch --profiler-config.torch_profiler_dir" "=YOUR_DIR_PATH_TO_DUMP_TRACE'" ) + if is_start: - self.profiler.start() + # Generate the trace name by combining prefix with comprehensive rank suffix + from vllm.distributed.utils import get_worker_rank_suffix + + rank_suffix = get_worker_rank_suffix(global_rank=self.rank) + + # Build the full trace name + if profile_prefix: + trace_name = f"{profile_prefix}_{rank_suffix}" + else: + trace_name = rank_suffix + + # Create the profiler wrapper only on the first start call + if self.profiler is None: + if self.profiler_config.profiler == "torch": + self.profiler = TorchProfilerWrapper( + self.profiler_config, + worker_name=trace_name, + local_rank=self.local_rank, + activities=["CPU", "CUDA"], + ) + logger.debug( + "Starting torch profiler with trace name: %s", trace_name + ) + elif self.profiler_config.profiler == "cuda": + self.profiler = CudaProfilerWrapper(self.profiler_config) + logger.debug("Starting CUDA profiler") + self.profiler.start() + else: + # Profiler already initialized. Restart profiling but keep + # the original trace name from the first initialization. 
+ self.profiler.start() else: + if self.profiler is None: + logger.warning("Profiler was not started, nothing to stop.") + return self.profiler.stop() def execute_dummy_batch(self) -> None: -- GitLab From 96161fe9785814bf1adcce49dfd3c47863a2ecac Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Thu, 12 Feb 2026 18:13:12 -0800 Subject: [PATCH 0152/1166] [Kernel] [Helion] [4/N] Add silu_mul_fp8 Helion kernel (#33373) Signed-off-by: Yanan Cao --- tests/kernels/helion/test_register.py | 10 +- tests/kernels/helion/test_silu_mul_fp8.py | 331 +++++++++++ vllm/kernels/helion/__init__.py | 1 + vllm/kernels/helion/config_manager.py | 3 - vllm/kernels/helion/configs/silu_mul_fp8.json | 550 ++++++++++++++++++ vllm/kernels/helion/ops/__init__.py | 11 + vllm/kernels/helion/ops/silu_mul_fp8.py | 100 ++++ 7 files changed, 1002 insertions(+), 4 deletions(-) create mode 100644 tests/kernels/helion/test_silu_mul_fp8.py create mode 100644 vllm/kernels/helion/configs/silu_mul_fp8.json create mode 100644 vllm/kernels/helion/ops/__init__.py create mode 100644 vllm/kernels/helion/ops/silu_mul_fp8.py diff --git a/tests/kernels/helion/test_register.py b/tests/kernels/helion/test_register.py index faac2765c..02b05be74 100644 --- a/tests/kernels/helion/test_register.py +++ b/tests/kernels/helion/test_register.py @@ -554,11 +554,19 @@ class TestKernelRegistry: """Test suite for kernel registry functionality.""" def setup_method(self): - """Clear the registry before each test.""" + """Save and clear the registry before each test.""" from vllm.kernels.helion.register import _REGISTERED_KERNELS + self._saved_registry = dict(_REGISTERED_KERNELS) _REGISTERED_KERNELS.clear() + def teardown_method(self): + """Restore the registry after each test.""" + from vllm.kernels.helion.register import _REGISTERED_KERNELS + + _REGISTERED_KERNELS.clear() + _REGISTERED_KERNELS.update(self._saved_registry) + def test_get_registered_kernels_returns_copy(self): """Test get_registered_kernels returns copy of 
registry.""" result1 = get_registered_kernels() diff --git a/tests/kernels/helion/test_silu_mul_fp8.py b/tests/kernels/helion/test_silu_mul_fp8.py new file mode 100644 index 000000000..da6405d6c --- /dev/null +++ b/tests/kernels/helion/test_silu_mul_fp8.py @@ -0,0 +1,331 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +import torch +import torch.nn.functional as F + +from vllm.utils.import_utils import has_helion + +if not has_helion(): + pytest.skip( + "Helion is not installed. Install with: pip install vllm[helion]", + allow_module_level=True, + ) + +from vllm.kernels.helion.config_manager import ConfigManager +from vllm.kernels.helion.ops.silu_mul_fp8 import ( + pick_silu_mul_fp8_config, + silu_mul_fp8, + silu_mul_fp8_baseline, +) + + +def skip_if_platform_unsupported(): + try: + from vllm.kernels.helion.utils import get_canonical_gpu_name + + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + platform = get_canonical_gpu_name() + + try: + config_manager = ConfigManager.get_instance() + except RuntimeError: + config_manager = ConfigManager() + + configs = config_manager.get_platform_configs("silu_mul_fp8", platform) + if len(configs) == 0: + pytest.skip("Current GPU platform not supported for silu_mul_fp8 kernel") + + except (ImportError, RuntimeError, KeyError): + pytest.skip("Error detecting platform support for silu_mul_fp8 kernel") + + +@pytest.fixture(autouse=True) +def reset_config_manager_singleton(): + ConfigManager.reset_instance() + ConfigManager() + yield + ConfigManager.reset_instance() + + +class TestSiluMulFp8ConfigPicker: + def test_config_picker_exact_match(self): + config_keys = [ + "intermediate_2048_batchsize_256", + "intermediate_4096_batchsize_256", + ] + + input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + args = (input_tensor, scale) + + 
selected_key = pick_silu_mul_fp8_config(args, config_keys) + assert selected_key == "intermediate_2048_batchsize_256" + + def test_config_picker_closest_match(self): + config_keys = [ + "intermediate_2048_batchsize_256", + "intermediate_4096_batchsize_256", + ] + # Use 7000 (intermediate_size=3500) which is closer to 4096 than 2048 + input_tensor = torch.randn(32, 7000, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + args = (input_tensor, scale) + + selected_key = pick_silu_mul_fp8_config(args, config_keys) + assert selected_key == "intermediate_4096_batchsize_256" + + def test_config_picker_fallback_to_default(self): + config_keys = ["default", "some_other_key"] + + input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + args = (input_tensor, scale) + + selected_key = pick_silu_mul_fp8_config(args, config_keys) + assert selected_key == "default" + + def test_config_picker_no_configs(self): + config_keys: list[str] = [] + + input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + args = (input_tensor, scale) + + selected_key = pick_silu_mul_fp8_config(args, config_keys) + assert selected_key is None + + @pytest.mark.parametrize("intermediate_size", [2048, 4096, 5120]) + def test_config_picker_different_sizes(self, intermediate_size): + config_keys = [ + "intermediate_2048_batchsize_256", + "intermediate_4096_batchsize_256", + "intermediate_5120_batchsize_256", + ] + + input_tensor = torch.randn( + 32, 2 * intermediate_size, dtype=torch.bfloat16, device="cuda" + ) + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + args = (input_tensor, scale) + + selected_key = pick_silu_mul_fp8_config(args, config_keys) + expected_key = f"intermediate_{intermediate_size}_batchsize_256" + assert selected_key == expected_key + + +class 
TestSiluMulFp8Correctness: + @pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) + @pytest.mark.parametrize("intermediate_size", [2048, 3000, 3500, 4096, 5000]) + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) + def test_silu_mul_fp8_correctness(self, batch_size, intermediate_size, dtype): + skip_if_platform_unsupported() + + input_size = 2 * intermediate_size + input_tensor = torch.randn(batch_size, input_size, dtype=dtype, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + reference_output = silu_mul_fp8_baseline(input_tensor, scale) + helion_output = silu_mul_fp8(input_tensor, scale) + + assert helion_output.shape == reference_output.shape + assert helion_output.dtype == torch.float8_e4m3fn + assert reference_output.dtype == torch.float8_e4m3fn + + ref_f32 = reference_output.to(torch.float32) + helion_f32 = helion_output.to(torch.float32) + # FP8 E4M3 has limited precision. Values near quantization boundaries + # can round differently due to intermediate precision differences. 
+ torch.testing.assert_close( + helion_f32, + ref_f32, + atol=0.05, + rtol=0.05, + msg=f"Mismatch at batch={batch_size}, size={intermediate_size}", + ) + + def test_silu_mul_fp8_shape_inference(self): + skip_if_platform_unsupported() + batch_size, input_size = 32, 8192 + intermediate_size = input_size // 2 + + input_tensor = torch.randn( + batch_size, input_size, dtype=torch.bfloat16, device="cuda" + ) + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + output = silu_mul_fp8(input_tensor, scale) + + expected_shape = (batch_size, intermediate_size) + assert output.shape == expected_shape + assert output.dtype == torch.float8_e4m3fn + + def test_silu_mul_fp8_scale_variations(self): + skip_if_platform_unsupported() + batch_size, input_size = 16, 4096 + + input_tensor = torch.randn( + batch_size, input_size, dtype=torch.bfloat16, device="cuda" + ) + + scales = [0.1, 0.5, 1.0, 2.0, 10.0] + + for scale_val in scales: + scale = torch.tensor([scale_val], dtype=torch.float32, device="cuda") + + reference_output = silu_mul_fp8_baseline(input_tensor, scale) + helion_output = silu_mul_fp8(input_tensor, scale) + ref_f32 = reference_output.to(torch.float32) + helion_f32 = helion_output.to(torch.float32) + + torch.testing.assert_close( + helion_f32, + ref_f32, + atol=0.05, + rtol=0.05, + msg=f"Mismatch for scale={scale_val}", + ) + + @pytest.mark.parametrize( + "shape", + [ + (1, 4096), + (16, 4096), + (128, 4096), + (1024, 4096), + (1, 8192), + (16, 8192), + (128, 8192), + ], + ) + def test_silu_mul_fp8_various_shapes(self, shape): + skip_if_platform_unsupported() + + input_tensor = torch.randn(*shape, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + reference_output = silu_mul_fp8_baseline(input_tensor, scale) + helion_output = silu_mul_fp8(input_tensor, scale) + + assert helion_output.shape == reference_output.shape + + ref_f32 = reference_output.to(torch.float32) + helion_f32 = 
helion_output.to(torch.float32) + + torch.testing.assert_close( + helion_f32, ref_f32, atol=0.05, rtol=0.05, msg=f"Mismatch for shape={shape}" + ) + + +def silu_mul_fp8_pytorch(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + """Pure PyTorch reference using F.silu. + + This matches vLLM's SiluAndMul.forward_native exactly: + F.silu(x[..., :d]) * x[..., d:] + """ + d = input.shape[-1] // 2 + result = F.silu(input[..., :d]) * input[..., d:] + return (result.to(torch.float32) / scale).to(torch.float8_e4m3fn) + + +class TestSiluMulFp8PytorchReference: + """Tests comparing Helion kernel against pure PyTorch implementation. + + Uses tighter tolerance since both use PyTorch's FP8 conversion + (same rounding mode), unlike the vLLM C++ baseline which uses + NVIDIA's hardware FP8 conversion with different rounding. + """ + + @pytest.mark.parametrize("batch_size", [1, 8, 32, 128, 256]) + @pytest.mark.parametrize("intermediate_size", [1024, 2048, 4096]) + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) + def test_silu_mul_fp8_vs_pytorch(self, batch_size, intermediate_size, dtype): + skip_if_platform_unsupported() + + input_tensor = torch.randn( + batch_size, 2 * intermediate_size, dtype=dtype, device="cuda" + ) + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + pytorch_output = silu_mul_fp8_pytorch(input_tensor, scale) + helion_output = silu_mul_fp8(input_tensor, scale) + + assert helion_output.shape == pytorch_output.shape + assert helion_output.dtype == torch.float8_e4m3fn + + pytorch_f32 = pytorch_output.to(torch.float32) + helion_f32 = helion_output.to(torch.float32) + + # Tolerance accounts for FP8 quantization boundary effects + torch.testing.assert_close( + helion_f32, + pytorch_f32, + atol=0.05, + rtol=0.05, + msg=( + f"Mismatch at batch={batch_size}, size={intermediate_size}, " + f"dtype={dtype}" + ), + ) + + @pytest.mark.parametrize( + "shape", + [ + (1, 2, 4096), # 3D input + (2, 4, 2048), # 3D input + (1, 1, 1, 
8192), # 4D input + ], + ) + def test_silu_mul_fp8_multidim_vs_pytorch(self, shape): + skip_if_platform_unsupported() + + input_tensor = torch.randn(*shape, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + + pytorch_output = silu_mul_fp8_pytorch(input_tensor, scale) + helion_output = silu_mul_fp8(input_tensor, scale) + + assert helion_output.shape == pytorch_output.shape + + pytorch_f32 = pytorch_output.to(torch.float32) + helion_f32 = helion_output.to(torch.float32) + + torch.testing.assert_close( + helion_f32, + pytorch_f32, + atol=0.05, + rtol=0.05, + msg=f"Mismatch for shape={shape}", + ) + + +class TestSiluMulFp8Integration: + def test_kernel_registration_integration(self): + from vllm.kernels.helion.register import get_registered_kernels + + registered_kernels = get_registered_kernels() + assert "silu_mul_fp8" in registered_kernels + + kernel_wrapper = registered_kernels["silu_mul_fp8"] + assert kernel_wrapper.op_name == "silu_mul_fp8" + assert kernel_wrapper._config_picker is not None + + def test_fake_impl_functionality(self): + skip_if_platform_unsupported() + from vllm.kernels.helion.register import get_registered_kernels + + input_tensor = torch.randn(32, 4096, dtype=torch.bfloat16, device="cuda") + scale = torch.tensor([0.5], dtype=torch.float32, device="cuda") + registered_kernels = get_registered_kernels() + kernel_wrapper = registered_kernels["silu_mul_fp8"] + fake_impl = kernel_wrapper._fake_impl + + fake_output = fake_impl(input_tensor, scale) + + expected_shape = (32, 2048) + assert fake_output.shape == expected_shape + assert fake_output.dtype == torch.float8_e4m3fn + assert fake_output.device == input_tensor.device diff --git a/vllm/kernels/helion/__init__.py b/vllm/kernels/helion/__init__.py index dfbf28b8d..2568baa20 100644 --- a/vllm/kernels/helion/__init__.py +++ b/vllm/kernels/helion/__init__.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
"""Helion integration for vLLM.""" +import vllm.kernels.helion.ops # noqa: F401 Auto-register all Helion ops from vllm.kernels.helion.config_manager import ( ConfigManager, ConfigSet, diff --git a/vllm/kernels/helion/config_manager.py b/vllm/kernels/helion/config_manager.py index 59d5bf430..63560761e 100644 --- a/vllm/kernels/helion/config_manager.py +++ b/vllm/kernels/helion/config_manager.py @@ -104,9 +104,6 @@ class ConfigSet: result[platform] = {} for config_key, config in config_keys_dict.items(): - # Convert helion.Config to dict using to_json() + json.loads() - import json - result[platform][config_key] = json.loads(config.to_json()) return result diff --git a/vllm/kernels/helion/configs/silu_mul_fp8.json b/vllm/kernels/helion/configs/silu_mul_fp8.json new file mode 100644 index 000000000..c26ca087d --- /dev/null +++ b/vllm/kernels/helion/configs/silu_mul_fp8.json @@ -0,0 +1,550 @@ +{ + "nvidia_h200": { + "intermediate_2048_batchsize_256": { + "block_sizes": [ + 64, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 2 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 32, + "num_stages": 1, + "indexing": [ + "pointer", + "tensor_descriptor", + "pointer", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + }, + "intermediate_4096_batchsize_256": { + "block_sizes": [ + 16, + 64 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "", + "" + ], + "num_warps": 2, + "num_stages": 1, + "indexing": [ + "pointer", + "pointer", + "pointer", + "pointer" + ], + "pid_type": "flat", + 
"range_warp_specializes": [] + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + } + }, + "nvidia_h100_pcie": { + "intermediate_2048_batchsize_256": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + }, + "intermediate_4096_batchsize_256": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 2 + ], + "range_num_stages": [ + 3 + ], + "range_multi_buffers": [ + false + ], + "range_flattens": [ + true + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "persistent_blocked", + "range_warp_specializes": [] + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 
0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + } + }, + "nvidia_h100_sxm5": { + "intermediate_2048_batchsize_256": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + }, + "intermediate_4096_batchsize_256": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 2 + ], + "range_num_stages": [ + 3 + ], + "range_multi_buffers": [ + false + ], + "range_flattens": [ + true + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "persistent_blocked", + "range_warp_specializes": [] + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + 
"tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + } + }, + "nvidia_h100": { + "intermediate_2048_batchsize_256": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + }, + "intermediate_4096_batchsize_256": { + "block_sizes": [ + 256, + 128 + ], + "loop_orders": [ + [ + 0, + 1 + ] + ], + "flatten_loops": [ + true + ], + "l2_groupings": [ + 1 + ], + "range_unroll_factors": [ + 2 + ], + "range_num_stages": [ + 3 + ], + "range_multi_buffers": [ + false + ], + "range_flattens": [ + true + ], + "load_eviction_policies": [ + "last", + "last", + "" + ], + "num_warps": 32, + "num_stages": 3, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "persistent_blocked", + "range_warp_specializes": [] + }, + "default": { + "block_sizes": [ + 1, + 512 + ], + "loop_orders": [ + [ + 1, + 0 + ] + ], + "flatten_loops": [ + false + ], + "l2_groupings": [ + 4 + ], + "range_unroll_factors": [ + 0 + ], + "range_num_stages": [ + 0 + ], + "range_multi_buffers": [ + null + ], + "range_flattens": [ + null + ], + "load_eviction_policies": [ + "", + "first", + "" + ], + "num_warps": 8, + "num_stages": 2, + "indexing": [ + "tensor_descriptor", + "tensor_descriptor", + "tensor_descriptor", + "pointer" + ], + "pid_type": "flat", + "range_warp_specializes": [] + } + } +} \ No newline at end of file diff --git a/vllm/kernels/helion/ops/__init__.py b/vllm/kernels/helion/ops/__init__.py new file mode 100644 index 
000000000..eacd483bb --- /dev/null +++ b/vllm/kernels/helion/ops/__init__.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Auto-import all Helion op modules to trigger kernel registration.""" + +import importlib +import pkgutil + +# Automatically import all submodules so that @register_kernel +# decorators execute and register ops with torch.ops.vllm_helion. +for _module_info in pkgutil.iter_modules(__path__): + importlib.import_module(f"{__name__}.{_module_info.name}") diff --git a/vllm/kernels/helion/ops/silu_mul_fp8.py b/vllm/kernels/helion/ops/silu_mul_fp8.py new file mode 100644 index 000000000..a45943b1a --- /dev/null +++ b/vllm/kernels/helion/ops/silu_mul_fp8.py @@ -0,0 +1,100 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any + +import torch + +from vllm.logger import init_logger +from vllm.utils.import_utils import has_helion + +if not has_helion(): + raise ImportError( + "silu_mul_fp8 Helion kernel requires helion to be installed. 
" + "Install it with: pip install helion" + ) + +import helion.language as hl + +from vllm.kernels.helion.register import register_kernel + +logger = init_logger(__name__) + + +@register_kernel # type: ignore[misc] +def silu_mul_fp8(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + original_shape = input.shape + two_d = hl.specialize(original_shape[-1]) + d = two_d // 2 + output_shape = original_shape[:-1] + (d,) + + input_2d = input.view(-1, original_shape[-1]) + m = input_2d.shape[0] + + # TODO(gmagogsfm): Support for more float8 subtypes (e4m3fnuz, e5m2) coming + out = torch.empty((m, d), device=input.device, dtype=torch.float8_e4m3fn) + + input_part_a = input_2d[:, :d] + input_part_b = input_2d[:, d:] + + assert scale.numel() == 1, "Scale must be a scalar Tensor" + + for tile_m, tile_n in hl.tile([m, d]): + a_vals = input_part_a[tile_m, tile_n] + silu_result = torch.nn.functional.silu(a_vals) + b_vals = input_part_b[tile_m, tile_n] + result = silu_result * b_vals + result_f32 = result.to(torch.float32) + scale_val = hl.load(scale, [0]) + inv_scale = 1.0 / scale_val + result_scaled = result_f32 * inv_scale + out[tile_m, tile_n] = result_scaled.to(out.dtype) + + return out.view(output_shape) + + +@silu_mul_fp8.register_config_picker # type: ignore[misc] +def pick_silu_mul_fp8_config( + args: tuple[Any, ...], config_keys: list[str] +) -> str | None: + if not config_keys: + return None + + input_tensor, scale = args + intermediate_size = input_tensor.shape[-1] // 2 + + # TODO(gmagosfm): Rerun autotuning to capture config for + # other batch sizes. 
+ target_key = f"intermediate_{intermediate_size}_batchsize_256" + if target_key in config_keys: + return target_key + + intermediate_sizes = [] + for key in config_keys: + if key.startswith("intermediate_") and "_batchsize_256" in key: + try: + size_str = key.split("_")[1] + size = int(size_str) + intermediate_sizes.append((abs(size - intermediate_size), key)) + except (ValueError, IndexError): + continue + + if intermediate_sizes: + _, best_key = min(intermediate_sizes) + logger.debug( + "No exact config for intermediate_size=%d, using closest match: %s", + intermediate_size, + best_key, + ) + return best_key + if "default" in config_keys: + return "default" + + return None + + +def silu_mul_fp8_baseline(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + output_shape = input.shape[:-1] + (input.shape[-1] // 2,) + out = torch.empty(output_shape, dtype=torch.float8_e4m3fn, device=input.device) + torch.ops._C.silu_and_mul_quant(out, input, scale) + return out -- GitLab From fc22cae4ac73288f0b3a4c6ef7cdc2521a5411ac Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 10:15:36 +0800 Subject: [PATCH 0153/1166] [CI/Build] Update video URLs for testing (#34446) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_video.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 65bda9e8b..70d234e89 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -13,13 +13,12 @@ from vllm.platforms import current_platform from ...utils import RemoteOpenAIServer MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" -MAXIMUM_VIDEOS = 4 +MAXIMUM_VIDEOS = 3 TEST_VIDEO_URLS = [ - "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", - "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4", - 
"http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4", - "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", + "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4", + "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi", + "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/Megamind.avi", ] -- GitLab From d707678dfb9a1f616d174022ebc74065d1011863 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Thu, 12 Feb 2026 18:18:03 -0800 Subject: [PATCH 0154/1166] Fix num_logprobs parameter description in sampler.py (#34451) Signed-off-by: Zhuohan Li --- vllm/v1/sample/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index c75b4f054..3840a7068 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -217,7 +217,7 @@ class Sampler(nn.Module): Args: logprobs: (num tokens) x (vocab) tensor - num_logprobs: minimum number of logprobs to + num_logprobs: maximum number of logprobs to retain per token token_ids: prompt tokens (if prompt logprobs) or sampled tokens (if sampled -- GitLab From 6f019e6e0a0cde34a33826bc08756480816448dd Mon Sep 17 00:00:00 2001 From: Harry Huang Date: Fri, 13 Feb 2026 10:18:07 +0800 Subject: [PATCH 0155/1166] [BugFix] Add block_size validation for mamba cache align mode (#34445) Signed-off-by: huanghaoyan.hhy --- vllm/config/vllm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index e9f6b37ab..0310e8aed 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1110,6 +1110,15 @@ class VllmConfig: self.scheduler_config.disable_hybrid_kv_cache_manager = False if self.cache_config.mamba_cache_mode == "align": + assert ( + self.cache_config.block_size + <= self.scheduler_config.max_num_batched_tokens + ), ( + "In Mamba cache align mode, block_size " + 
f"({self.cache_config.block_size}) must be <= " + "max_num_batched_tokens " + f"({self.scheduler_config.max_num_batched_tokens})." + ) if self.scheduler_config.long_prefill_token_threshold > 0: assert ( self.scheduler_config.long_prefill_token_threshold -- GitLab From 04ea31baabc6f5be6b0afd88541f569a4c771ab9 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:18:15 -0500 Subject: [PATCH 0156/1166] [Bugfix] Remove assert that's no longer valid (#34443) Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 1aa9e3a65..187464ce8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -37,7 +37,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): not self.moe_mk.supports_expert_map(), ) self.old_quant_method = old_quant_method - assert not self.old_quant_method.is_monolithic logger.debug("Swapping out %s", self.old_quant_method.__class__.__name__) @staticmethod -- GitLab From ea5ff3a1f60e1b9f01af17260608009c184e7ff0 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 10:18:24 +0800 Subject: [PATCH 0157/1166] [Refactor] Simplify BOS/EOS token handling (#34435) Signed-off-by: DarkLight1337 --- tests/detokenizer/test_min_tokens.py | 1 - ...stop_string_while_stop_model_terminates.py | 1 - tests/tokenizers_/test_detokenize.py | 1 - .../tool_parsers/test_step3p5_tool_parser.py | 2 +- tests/v1/core/test_kv_cache_utils.py | 6 ++- tests/v1/core/test_prefix_caching.py | 6 ++- .../v1/core/test_priority_scheduler_random.py | 8 ++-- tests/v1/core/test_scheduler.py | 13 +++---- tests/v1/core/utils.py | 5 ++- tests/v1/engine/test_engine_core.py | 1 - 
tests/v1/engine/test_engine_core_client.py | 1 - .../v1/engine/test_fast_incdec_prefix_err.py | 1 - tests/v1/engine/test_output_processor.py | 37 ++++++++----------- tests/v1/engine/test_parallel_sampling.py | 1 - tests/v1/engine/utils.py | 4 +- .../unit/test_decode_bench_connector.py | 6 ++- .../unit/test_lmcache_integration.py | 6 ++- .../unit/test_offloading_connector.py | 6 ++- tests/v1/kv_connector/unit/utils.py | 2 +- .../test_scheduler_streaming.py | 1 - .../test_backend_guidance.py | 4 +- vllm/inputs/preprocess.py | 25 ++----------- vllm/renderers/base.py | 21 +++++++++++ vllm/sampling_params.py | 17 ++++++--- vllm/v1/core/sched/utils.py | 2 +- vllm/v1/engine/__init__.py | 13 ++++++- vllm/v1/engine/input_processor.py | 5 +-- vllm/v1/request.py | 16 ++++++-- vllm/v1/structured_output/utils.py | 34 ++++------------- 29 files changed, 123 insertions(+), 123 deletions(-) diff --git a/tests/detokenizer/test_min_tokens.py b/tests/detokenizer/test_min_tokens.py index 1f8e94469..37cc3ca1b 100644 --- a/tests/detokenizer/test_min_tokens.py +++ b/tests/detokenizer/test_min_tokens.py @@ -39,7 +39,6 @@ def test_min_tokens_with_stop(min_tokens: int, stop: str, truth: str): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py index 5624332ef..44215cb72 100644 --- a/tests/detokenizer/test_stop_string_while_stop_model_terminates.py +++ b/tests/detokenizer/test_stop_string_while_stop_model_terminates.py @@ -35,7 +35,6 @@ def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/tokenizers_/test_detokenize.py b/tests/tokenizers_/test_detokenize.py index 
ad6c5fb41..2f173bec8 100644 --- a/tests/tokenizers_/test_detokenize.py +++ b/tests/tokenizers_/test_detokenize.py @@ -67,7 +67,6 @@ def _run_incremental_decode( mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/tool_parsers/test_step3p5_tool_parser.py b/tests/tool_parsers/test_step3p5_tool_parser.py index 6da1e0855..b3cb4e20f 100644 --- a/tests/tool_parsers/test_step3p5_tool_parser.py +++ b/tests/tool_parsers/test_step3p5_tool_parser.py @@ -1123,7 +1123,7 @@ rectangle # Encode all content tokens at once all_token_ids = step3p5_tokenizer.encode(model_output, add_special_tokens=False) - eos_token_id = getattr(step3p5_tokenizer, "eos_token_id", None) + eos_token_id = step3p5_tokenizer.eos_token_id # Include EOS token in delta_token_ids if available if eos_token_id is not None: diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index b91d59e46..ceb8ec424 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -84,13 +84,15 @@ def make_request( ) mm_features.append(mm_feature) + sampling_params = SamplingParams(max_tokens=17) + sampling_params.update_from_generation_config({}, eos_token_id=100) + return Request( request_id=request_id, prompt_token_ids=prompt_token_ids, mm_features=mm_features if mm_features else None, - sampling_params=SamplingParams(max_tokens=17), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=None, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e2c924a61..9a968a473 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -75,13 +75,15 @@ def make_request( ) mm_features.append(mm_feature) + sampling_params = SamplingParams(max_tokens=17, 
prompt_logprobs=prompt_logprobs) + sampling_params.update_from_generation_config({}, eos_token_id=100) + return Request( request_id=request_id, prompt_token_ids=prompt_token_ids, mm_features=mm_features if mm_features else None, - sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=lora_request, cache_salt=cache_salt, block_hasher=get_request_block_hasher(block_size, hash_fn), diff --git a/tests/v1/core/test_priority_scheduler_random.py b/tests/v1/core/test_priority_scheduler_random.py index cb4dfc046..1d03bd104 100644 --- a/tests/v1/core/test_priority_scheduler_random.py +++ b/tests/v1/core/test_priority_scheduler_random.py @@ -48,10 +48,9 @@ def _create_random_request( request_id = uuid.uuid4().hex - sampling_params = SamplingParams( - ignore_eos=False, - max_tokens=max_tokens, - ) + sampling_params = SamplingParams(ignore_eos=False, max_tokens=max_tokens) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + mm_features = [] for j, position in enumerate(mm_positions): identifier = f"{request_id}_hash_{j}" @@ -79,7 +78,6 @@ def _create_random_request( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_time, priority=priority, block_hasher=block_hasher, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 376b06a5e..0713aa8ab 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -469,8 +469,7 @@ def test_stop_via_update_from_output(): # Test case 4: Ignore EOS flag scheduler = create_scheduler(num_speculative_tokens=2) - requests = create_requests(num_requests=1, max_tokens=10) - requests[0].sampling_params.ignore_eos = True + requests = create_requests(num_requests=1, max_tokens=10, ignore_eos=True) requests[0].num_computed_tokens = requests[0].num_tokens 
scheduler.requests[requests[0].request_id] = requests[0] scheduler.running.append(requests[0]) @@ -515,12 +514,12 @@ def test_check_stop_min_tokens(): max_tokens=20, min_tokens=5, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) request = Request( request_id="0", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) # Simulate having generated 3 output tokens (less than min_tokens=5) request.append_output_token_ids([10, 11, EOS_TOKEN_ID]) # EOS token present @@ -551,12 +550,12 @@ def test_check_stop_min_tokens(): max_tokens=20, min_tokens=0, ) + sampling_params_no_min.update_from_generation_config({}, EOS_TOKEN_ID) request_no_min = Request( request_id="1", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params_no_min, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) request_no_min.append_output_token_ids([10, EOS_TOKEN_ID]) @@ -571,12 +570,12 @@ def test_check_stop_min_tokens(): min_tokens=5, stop_token_ids=[42], ) + sampling_params_stop.update_from_generation_config({}, EOS_TOKEN_ID) request_stop = Request( request_id="2", prompt_token_ids=[0, 1, 2], sampling_params=sampling_params_stop, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) # Only 3 output tokens, less than min_tokens=5, but has stop token request_stop.append_output_token_ids([10, 11, 42]) @@ -1877,6 +1876,7 @@ def create_requests_with_priority( stop_token_ids=stop_token_ids, prompt_logprobs=prompt_logprobs, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) requests = [] if mm_hashes_list is not None: @@ -1938,7 +1938,6 @@ def create_requests_with_priority( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, arrival_time=arrival_times[i], priority=priorities[i], block_hasher=block_hasher, @@ -2429,13 +2428,13 @@ def test_schedule_skip_tokenizer_init_structured_output_request(): max_tokens=16, 
structured_outputs=structured_outputs_params, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) request = Request( request_id="0", prompt_token_ids=[0, 1], mm_features=None, sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, ) scheduler.add_request(request) output = scheduler.schedule() diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 00eb61285..90c174adf 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -174,6 +174,7 @@ def create_requests( num_tokens: int = 10, mm_hashes_list: list[list[str]] | None = None, mm_positions: list[list[PlaceholderRange]] | None = None, + ignore_eos: bool = False, max_tokens: int = 16, stop_token_ids: list[int] | None = None, prompt_logprobs: int | None = None, @@ -188,11 +189,12 @@ def create_requests( block_hasher = get_request_block_hasher(block_size, sha256) sampling_params = SamplingParams( - ignore_eos=False, + ignore_eos=ignore_eos, max_tokens=max_tokens, stop_token_ids=stop_token_ids, prompt_logprobs=prompt_logprobs, ) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) requests = [] if mm_hashes_list is not None: @@ -250,7 +252,6 @@ def create_requests( sampling_params=sampling_params, pooling_params=None, mm_features=mm_features if mm_features else None, - eos_token_id=EOS_TOKEN_ID, block_hasher=block_hasher, ) requests.append(request) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 4f96ded7e..8d7377c28 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -54,7 +54,6 @@ def make_request() -> EngineCoreRequest: mm_features=None, sampling_params=SamplingParams(), pooling_params=None, - eos_token_id=None, arrival_time=time.time(), lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index ce0d70cc9..8f8a3cac9 100644 --- a/tests/v1/engine/test_engine_core_client.py 
+++ b/tests/v1/engine/test_engine_core_client.py @@ -69,7 +69,6 @@ def make_request( mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=time.time(), lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index 67a3b6b01..036a19b82 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -32,7 +32,6 @@ def test_fast_inc_detok_invalid_utf8_err_case(): mm_features=None, sampling_params=params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 7c78c5436..ece48e009 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -66,7 +66,6 @@ def test_incremental_detokenization( external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -487,7 +486,6 @@ def test_logprobs_processor( external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -663,6 +661,19 @@ def test_stop_token( prompt_string = dummy_test_vectors.prompt_strings[0] prompt_tokens = dummy_test_vectors.prompt_tokens[0] + sampling_params = SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=[], + stop_token_ids=stop_token_ids, + include_stop_str_in_output=include_stop_str_in_output, + logprobs=num_sample_logprobs, + prompt_logprobs=None, + ignore_eos=ignore_eos, + ) + sampling_params.update_from_generation_config({}, eos_token_id) + # Make request. 
request_id = "request-0" request = EngineCoreRequest( @@ -670,22 +681,11 @@ def test_stop_token( external_req_id=request_id + "-ext", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=eos_token_id, arrival_time=0, lora_request=None, cache_salt=None, data_parallel_rank=None, - sampling_params=SamplingParams( - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=[], - stop_token_ids=stop_token_ids, - include_stop_str_in_output=include_stop_str_in_output, - logprobs=num_sample_logprobs, - prompt_logprobs=None, - ignore_eos=ignore_eos, - ), + sampling_params=sampling_params, pooling_params=None, ) @@ -693,9 +693,8 @@ def test_stop_token( tokens_list=[generation_tokens], generated_logprobs_raw=[generation_logprobs] if do_logprobs else None, prompt_logprobs_raw=None, - eos_token_id=eos_token_id, - stop_token_ids=stop_token_ids, - ignore_eos=ignore_eos, + eos_token_id=sampling_params.eos_token_id, + stop_token_ids=sampling_params.stop_token_ids, request_ids=[request.request_id], ) @@ -775,7 +774,6 @@ def test_stop_string( external_req_id=request_id_list[idx], prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -907,7 +905,6 @@ def test_iteration_stats(dummy_test_vectors): external_req_id=f"request-{idx}-ext", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, cache_salt=None, @@ -994,7 +991,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors): external_req_id=f"request-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=lora_assignments[idx], cache_salt=None, @@ -1315,7 +1311,6 @@ def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors): external_req_id=f"external-{idx}", prompt_token_ids=prompt_tokens, mm_features=None, - eos_token_id=None, arrival_time=0, lora_request=None, 
cache_salt=None, diff --git a/tests/v1/engine/test_parallel_sampling.py b/tests/v1/engine/test_parallel_sampling.py index fe6f15df2..395867c06 100644 --- a/tests/v1/engine/test_parallel_sampling.py +++ b/tests/v1/engine/test_parallel_sampling.py @@ -76,7 +76,6 @@ def make_request(sampling_params: SamplingParams) -> EngineCoreRequest: mm_features=None, sampling_params=sampling_params, pooling_params=None, - eos_token_id=None, arrival_time=0.0, lora_request=None, cache_salt=None, diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index d14775668..de953a588 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -342,7 +342,6 @@ class MockEngineCore: prompt_logprobs_raw: list[LogprobsTensors] | None = None, eos_token_id: int | None = None, stop_token_ids: list[int] | None = None, - ignore_eos: bool = False, request_ids: list[str] | None = None, ) -> None: self.num_requests = len(tokens_list) @@ -355,7 +354,6 @@ class MockEngineCore: self.request_finished = [False for _ in range(self.num_requests)] self.eos_token_id = eos_token_id self.stop_token_ids = stop_token_ids - self.ignore_eos = ignore_eos self.request_ids = ( request_ids if request_ids is not None @@ -400,7 +398,7 @@ class MockEngineCore: if token_idx == len(token_ids) - 1: output.finish_reason = FinishReason.LENGTH self.request_finished[req_idx] = True - if not self.ignore_eos and new_token_id == self.eos_token_id: + if new_token_id == self.eos_token_id: output.finish_reason = FinishReason.STOP self.request_finished[req_idx] = True if new_token_id in (self.stop_token_ids or ()): diff --git a/tests/v1/kv_connector/unit/test_decode_bench_connector.py b/tests/v1/kv_connector/unit/test_decode_bench_connector.py index 93f4f8537..1d5343644 100644 --- a/tests/v1/kv_connector/unit/test_decode_bench_connector.py +++ b/tests/v1/kv_connector/unit/test_decode_bench_connector.py @@ -93,12 +93,14 @@ class DecodeBenchTestRunner: """Create a new request with given token IDs.""" self.req_id += 1 
+ sampling_params = SamplingParams(max_tokens=100) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + req = Request( request_id=str(self.req_id), prompt_token_ids=token_ids, - sampling_params=SamplingParams(max_tokens=100), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=self._block_hasher, ) diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py index cfe8d810c..57ddaa8bf 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_integration.py +++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py @@ -142,12 +142,14 @@ def test_request_interface(): from vllm.sampling_params import SamplingParams from vllm.v1.request import Request + sampling_params = SamplingParams(max_tokens=10) + sampling_params.update_from_generation_config({}, eos_token_id=100) + req = Request( request_id="test_request", prompt_token_ids=[1, 2, 3], - sampling_params=SamplingParams(max_tokens=10), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=100, lora_request=None, ) assumes(req, "mm_features", is_instance_of=(list, NoneType)) diff --git a/tests/v1/kv_connector/unit/test_offloading_connector.py b/tests/v1/kv_connector/unit/test_offloading_connector.py index 5b84202a5..cc89ed1dc 100644 --- a/tests/v1/kv_connector/unit/test_offloading_connector.py +++ b/tests/v1/kv_connector/unit/test_offloading_connector.py @@ -226,12 +226,14 @@ class RequestRunner: def new_request(self, token_ids: list[int]): self.req_id += 1 + sampling_params = SamplingParams(max_tokens=1000) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) + req = Request( request_id=str(self.req_id), prompt_token_ids=token_ids, - sampling_params=SamplingParams(max_tokens=1000), + sampling_params=sampling_params, pooling_params=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=self._block_hasher, ) diff --git a/tests/v1/kv_connector/unit/utils.py 
b/tests/v1/kv_connector/unit/utils.py index e754a0917..d843bd6ff 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -212,6 +212,7 @@ def create_request( max_tokens = 1 if do_remote_decode else max_tokens sampling_params = SamplingParams(max_tokens=max_tokens) + sampling_params.update_from_generation_config({}, EOS_TOKEN_ID) common_prefix = [1] * common_prefix_len if common_prefix_len > 0 else [] suffix = [i * request_id for i in range(num_tokens - common_prefix_len)] @@ -223,7 +224,6 @@ def create_request( sampling_params=sampling_params, pooling_params=None, mm_features=None, - eos_token_id=EOS_TOKEN_ID, block_hasher=get_request_block_hasher(block_size, hash_fn), ) req.kv_transfer_params = kv_transfer_params diff --git a/tests/v1/streaming_input/test_scheduler_streaming.py b/tests/v1/streaming_input/test_scheduler_streaming.py index f8d8c3cb3..fd9f6b17f 100644 --- a/tests/v1/streaming_input/test_scheduler_streaming.py +++ b/tests/v1/streaming_input/test_scheduler_streaming.py @@ -43,7 +43,6 @@ class DummyRequest(Request): stop_token_ids=[STOP_TOKEN], max_tokens=max_tokens ), pooling_params=None, - eos_token_id=None, mm_features=mm_features, resumable=resumable, ) diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 362f75c49..704ed8b9c 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -83,6 +83,7 @@ def test_grammar_bitmask_with_specdec(): ), ) sampling_params.structured_outputs._backend = "guidance" + sampling_params.update_from_generation_config({}, tokenizer.eos_token_id) my_req_id = f"my_req_id_{i}" request = Request( @@ -90,7 +91,6 @@ def test_grammar_bitmask_with_specdec(): prompt_token_ids=prompt[:i], sampling_params=sampling_params, pooling_params=None, - eos_token_id=tokenizer.eos_token_id, ) structured_output_manager.grammar_init(request) @@ -147,13 +147,13 @@ def 
test_grammar_init_async_and_sync(async_grammar): ), ) sampling_params.structured_outputs._backend = "guidance" + sampling_params.update_from_generation_config({}, tokenizer.eos_token_id) request = Request( "test_request", prompt_token_ids=prompt, sampling_params=sampling_params, pooling_params=None, - eos_token_id=tokenizer.eos_token_id, ) structured_output_manager.grammar_init(request) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2cdccbed..2699f70cb 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -77,24 +77,6 @@ class InputPreprocessor: def get_tokenizer(self) -> TokenizerLike: return self.renderer.get_tokenizer() - def get_bos_token_id(self) -> int | None: - if self.tokenizer is None: - logger.warning_once( - "Using None for BOS token id because tokenizer is not initialized" - ) - return None - - return self.tokenizer.bos_token_id - - def get_eos_token_id(self) -> int | None: - if self.tokenizer is None: - logger.warning_once( - "Using None for EOS token id because tokenizer is not initialized" - ) - return None - - return self.tokenizer.eos_token_id - def get_decoder_start_token_id(self) -> int: """ Obtain the decoder start token id employed by an encoder/decoder @@ -106,11 +88,10 @@ class InputPreprocessor: if dec_start_token_id is None: logger.warning_once( - "Falling back on for decoder start token " - "id because decoder start token id is not " - "available." + "Falling back on for decoder start token id " + "because decoder start token id is not available." 
) - dec_start_token_id = self.get_bos_token_id() + dec_start_token_id = self.renderer.get_bos_token_id() if dec_start_token_id is None: raise RuntimeError("Cannot find decoder start token id or ") diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index adf2ee552..0002bdf89 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -6,6 +6,7 @@ from collections.abc import Sequence from typing import TYPE_CHECKING, Any, overload from vllm.inputs import EmbedsPrompt, TextPrompt, TokensPrompt +from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike from vllm.utils.async_utils import AsyncMicrobatchTokenizer @@ -26,6 +27,8 @@ if TYPE_CHECKING: ConversationMessage, ) +logger = init_logger(__name__) + class BaseRenderer(ABC): @classmethod @@ -63,6 +66,24 @@ class BaseRenderer(ABC): return self._async_tokenizer + def get_bos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for BOS token id because tokenizer is not initialized" + ) + return None + + return self.tokenizer.bos_token_id + + def get_eos_token_id(self) -> int | None: + if self.tokenizer is None: + logger.warning_once( + "Using None for EOS token id because tokenizer is not initialized" + ) + return None + + return self.tokenizer.eos_token_id + # Step 1: Convert raw inputs to prompts def render_prompt( self, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5603e5dc4..520481c58 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -223,6 +223,7 @@ class SamplingParams( # The below fields are not supposed to be used as an input. # They are set in post_init. 
output_text_buffer_length: int = 0 + _eos_token_id: int | None = None _all_stop_token_ids: set[int] = msgspec.field(default_factory=set) # Fields used to construct logits processors @@ -477,24 +478,26 @@ class SamplingParams( def update_from_generation_config( self, generation_config: dict[str, Any], - model_eos_token_id: int | None = None, + eos_token_id: int | None = None, ) -> None: """Update if there are non-default values from generation_config""" + if not self.ignore_eos: + self._eos_token_id = eos_token_id - if model_eos_token_id is not None: + if eos_token_id is not None: # Add the eos token id into the sampling_params to support # min_tokens processing. - self._all_stop_token_ids.add(model_eos_token_id) + self._all_stop_token_ids.add(eos_token_id) # Update eos_token_id for generation if (eos_ids := generation_config.get("eos_token_id")) is not None: # it can be either int or list of int eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids) - if model_eos_token_id is not None: + if eos_token_id is not None: # We don't need to include the primary eos_token_id in # stop_token_ids since it's handled separately for stopping # purposes. 
- eos_ids.discard(model_eos_token_id) + eos_ids.discard(eos_token_id) if eos_ids: self._all_stop_token_ids.update(eos_ids) if not self.ignore_eos: @@ -550,6 +553,10 @@ class SamplingParams( return SamplingType.RANDOM_SEED return SamplingType.RANDOM + @property + def eos_token_id(self) -> int | None: + return self._eos_token_id + @property def all_stop_token_ids(self) -> set[int]: return self._all_stop_token_ids diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 631973188..22e3aefb6 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -47,7 +47,7 @@ def check_stop(request: Request, max_model_len: int) -> bool: return False last_token_id = request.output_token_ids[-1] - if not sampling_params.ignore_eos and last_token_id == request.eos_token_id: + if last_token_id == sampling_params.eos_token_id: request.status = RequestStatus.FINISHED_STOPPED return True diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d0b0370fb..1dd9f64f8 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -9,6 +9,7 @@ from typing import Any, Literal import msgspec import numpy as np import torch +from typing_extensions import deprecated from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalFeatureSpec @@ -63,7 +64,6 @@ class EngineCoreRequest( mm_features: list[MultiModalFeatureSpec] | None sampling_params: SamplingParams | None pooling_params: PoolingParams | None - eos_token_id: int | None arrival_time: float lora_request: LoRARequest | None cache_salt: str | None @@ -99,6 +99,17 @@ class EngineCoreRequest( assert self.pooling_params is not None return self.pooling_params + @property + @deprecated( + "EngineCoreRequest.eos_token_id will be removed in v0.18. " + "Please use EngineCoreRequest.sampling_params.eos_token_id instead." 
+ ) + def eos_token_id(self) -> int | None: + if self.sampling_params is None: + return None + + return self.sampling_params.eos_token_id + class EngineCoreEventType(enum.IntEnum): """The type of engine core request event.""" diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 8bd4b509a..4c105c87b 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -376,8 +376,6 @@ class InputProcessor: processed_inputs=processed_inputs, ) - eos_token_id = self.input_preprocessor.get_eos_token_id() - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) self._validate_model_inputs(encoder_inputs, decoder_inputs) @@ -403,7 +401,7 @@ class InputProcessor: sampling_params.update_from_generation_config( self.generation_config_fields, - None if self.tokenizer is None else self.tokenizer.eos_token_id, + self.renderer.get_eos_token_id(), ) if self.tokenizer is not None: sampling_params.update_from_tokenizer(self.tokenizer) @@ -446,7 +444,6 @@ class InputProcessor: mm_features=mm_features, sampling_params=sampling_params, pooling_params=pooling_params, - eos_token_id=eos_token_id, arrival_time=arrival_time, lora_request=lora_request, cache_salt=decoder_inputs.get("cache_salt"), diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 970b7e1eb..66ade0097 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any import torch +from typing_extensions import deprecated from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.pooling_params import PoolingParams @@ -62,7 +63,6 @@ class Request: prompt_token_ids: list[int] | None, sampling_params: SamplingParams | None, pooling_params: PoolingParams | None, - eos_token_id: int | None, client_index: int = 0, arrival_time: float | None = None, prompt_embeds: torch.Tensor | None = None, @@ -80,8 +80,6 @@ class Request: self.priority = priority 
self.sampling_params = sampling_params self.pooling_params = pooling_params - # Because of LoRA, the eos token id can be different for each request. - self.eos_token_id = eos_token_id self.lora_request = lora_request self.structured_output_request = StructuredOutputRequest.from_sampling_params( sampling_params @@ -176,6 +174,17 @@ class Request: # None entry in the queue means finished. self.streaming_queue: deque[StreamingUpdate | None] | None = None + @property + @deprecated( + "Request.eos_token_id will be removed in v0.18. " + "Please use Request.sampling_params.eos_token_id instead." + ) + def eos_token_id(self) -> int | None: + if self.sampling_params is None: + return None + + return self.sampling_params.eos_token_id + @classmethod def from_engine_core_request( cls, @@ -190,7 +199,6 @@ class Request: mm_features=request.mm_features, sampling_params=request.sampling_params, pooling_params=request.pooling_params, - eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, lora_request=request.lora_request, cache_salt=request.cache_salt, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 1419cdce1..aadd057b1 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -185,14 +185,13 @@ re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$") re_replacement_seq = re.compile(r"^.{0,6}�+.{0,6}$") -def _reduced_vocabulary( - tokenizer: TokenizerLike, eos_token_id: int -) -> dict[bytes, list[int]]: +def _reduced_vocabulary(tokenizer: TokenizerLike) -> dict[bytes, list[int]]: """Create a map from vocabulary tokens to lists of equivalent token ids. 
Returns: A Dict of token string -> equivalent token ids """ + eos_token_id = tokenizer.eos_token_id unicode_to_bytes = { v: k for k, v in convert_slow_tokenizer.bytes_to_unicode().items() @@ -260,30 +259,13 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary: if hasattr(tokenizer, "_outlines_vocabulary"): return tokenizer._outlines_vocabulary # type: ignore - try: - if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None: - eos_token_id = tokenizer.eos_token_id - else: - raise ValueError( - "Error during structured outputs setup for outlines: Tokenizer " - f"({type(tokenizer)}) has no `eos_token_id` property, but " - "`eos_token_id` is required for structured outputs to work properly." - ) - - reduced_vocab = _reduced_vocabulary( - tokenizer, - eos_token_id, # type: ignore - ) - vocabulary = OutlinesVocabulary(oc.Vocabulary(eos_token_id, reduced_vocab)) - tokenizer._outlines_vocabulary = vocabulary # type: ignore + reduced_vocab = _reduced_vocabulary(tokenizer) + vocabulary = OutlinesVocabulary( + oc.Vocabulary(tokenizer.eos_token_id, reduced_vocab) + ) + tokenizer._outlines_vocabulary = vocabulary # type: ignore - return vocabulary - except AttributeError as e: - raise ValueError( - "Cannot get the vocabulary of the tokenizer " - f"({type(tokenizer)}). The tokenizer should have a " - "get_vocab method." 
- ) from e + return vocabulary def grammar_is_likely_lark(grammar_str: str) -> bool: -- GitLab From 62788f99a4d0e483a6e9114e6708489b44b51a78 Mon Sep 17 00:00:00 2001 From: LoganJane <42287016+LoganJane@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:18:42 +0800 Subject: [PATCH 0158/1166] [Bugfix] Delete unused redundant code in Kimi-K2.5 (#34427) Signed-off-by: Isotr0py Co-authored-by: Isotr0py --- vllm/model_executor/models/kimi_k25.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py index bc6fffa3b..bb9f35bdb 100644 --- a/vllm/model_executor/models/kimi_k25.py +++ b/vllm/model_executor/models/kimi_k25.py @@ -11,7 +11,6 @@ This module defines: - KimiK25ForConditionalGeneration: Main model class """ -import copy from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass from typing import Annotated, Any, Literal @@ -378,10 +377,6 @@ class KimiK25ForConditionalGeneration( ) self.quant_config = quant_config - sub_vllm_config = copy.deepcopy(vllm_config) - sub_vllm_config.model_config.hf_config = ( - sub_vllm_config.model_config.hf_config.text_config - ) with self._mark_language_model(vllm_config): self.language_model = init_vllm_registered_model( vllm_config=vllm_config, -- GitLab From de13dd781f1bb18fb5bbaf4535389053d98780f8 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Thu, 12 Feb 2026 18:21:05 -0800 Subject: [PATCH 0159/1166] [Kernel] [Helion] [5/N] Add Helion Autotuning infrastructure (#34025) Signed-off-by: Yanan Cao --- scripts/autotune_helion_kernels.py | 430 ++++++++++++++++++++++++++ vllm/kernels/helion/config_manager.py | 51 ++- vllm/kernels/helion/register.py | 88 +++++- 3 files changed, 551 insertions(+), 18 deletions(-) create mode 100644 scripts/autotune_helion_kernels.py diff --git a/scripts/autotune_helion_kernels.py b/scripts/autotune_helion_kernels.py new file mode 100644 index 000000000..755ba3115 --- /dev/null +++ 
b/scripts/autotune_helion_kernels.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Autotune registered Helion kernels for optimal configurations. + +Usage: + # Autotune all registered kernels + python scripts/autotune_helion_kernels.py + + # Autotune specific kernel + python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8 + + # Autotune multiple kernels + python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8 rms_norm_fp8 + + # Force re-autotuning + python scripts/autotune_helion_kernels.py --force + + # List available kernels + python scripts/autotune_helion_kernels.py --list +""" + +import argparse +import sys +import time +from dataclasses import dataclass + +import torch + +try: + import helion + + from vllm.kernels.helion import ( + ConfigManager, + get_kernel_by_name, + get_registered_kernels, + ) + from vllm.kernels.helion.utils import get_canonical_gpu_name + from vllm.logger import init_logger + from vllm.utils.import_utils import has_helion +except ImportError as e: + print(f"Error importing vLLM: {e}") + print("Please ensure vLLM is installed and in your Python path") + sys.exit(1) + +logger = init_logger("vllm.scripts.autotune_helion_kernels") + + +@dataclass +class AutotuneResult: + status: str # "success" | "partial" | "error" | "skipped" + successful: int + failed: int + configs: dict[str, "helion.Config"] + message: str = "" + + +def list_kernels() -> None: + kernels = get_registered_kernels() + + if not kernels: + print("No Helion kernels found in registry.") + return + + print("Available Helion kernels:") + print("=" * 50) + + for name in sorted(kernels.keys()): + print(f" {name}") + + print(f"\nTotal: {len(kernels)} kernels") + + +def check_requirements() -> bool: + if not torch.cuda.is_available(): + logger.error("CUDA is not available. 
Helion autotuning requires GPU.") + return False + + if not has_helion(): + logger.error("Helion is not installed. Please install Helion package.") + return False + + return True + + +def autotune_kernel( + kernel_name: str, + platform: str, + config_manager: ConfigManager, + force: bool = False, + autotune_effort: str = "quick", +) -> AutotuneResult: + logger.debug( + "Starting autotune for kernel '%s' with effort='%s'", + kernel_name, + autotune_effort, + ) + kernel_wrapper = get_kernel_by_name(kernel_name) + if kernel_wrapper is None: + error_msg = f"Kernel '{kernel_name}' not found in registry" + logger.error(error_msg) + return AutotuneResult( + status="error", + message=error_msg, + successful=0, + failed=0, + configs={}, + ) + + try: + inputs_dict = kernel_wrapper.get_inputs() + except NotImplementedError: + error_msg = f"Kernel '{kernel_name}' has no input generator registered" + logger.error(error_msg) + return AutotuneResult( + status="error", + message=error_msg, + successful=0, + failed=0, + configs={}, + ) + + try: + logger.info( + "Autotuning kernel '%s' for platform '%s' with %d configs", + kernel_name, + platform, + len(inputs_dict), + ) + + configs_to_autotune = {} + if not force: + existing_configs = config_manager.get_platform_configs( + kernel_name, platform + ) + for config_key, inputs in inputs_dict.items(): + if config_key in existing_configs: + logger.debug( + "Config '%s' already exists for platform '%s', skipping", + config_key, + platform, + ) + else: + configs_to_autotune[config_key] = inputs + else: + logger.debug("Force mode enabled, will re-autotune all configs") + configs_to_autotune = inputs_dict + + if not configs_to_autotune: + logger.info( + "All configs already exist for kernel '%s' on platform '%s'. 
" + "Use --force to re-autotune.", + kernel_name, + platform, + ) + return AutotuneResult( + status="skipped", + message="All configs already exist", + successful=0, + failed=0, + configs={}, + ) + + total_start_time = time.time() + autotuned_configs = {} + failed_configs = [] + + for config_key, inputs in configs_to_autotune.items(): + logger.info("Autotuning config: %s", config_key) + logger.debug( + "Input shapes: %s", + [getattr(inp, "shape", type(inp).__name__) for inp in inputs], + ) + + try: + config_start_time = time.time() + config = kernel_wrapper.run_autotune(inputs, autotune_effort) + config_duration = time.time() - config_start_time + + # Save immediately for checkpointing + config_manager.save_configs(kernel_name, platform, {config_key: config}) + + autotuned_configs[config_key] = config + logger.debug("Config details: %s", config) + + logger.info( + "✓ Autotuned and saved config '%s' (%.2fs)", + config_key, + config_duration, + ) + + except (RuntimeError, ValueError, OSError) as e: + logger.exception( + "Failed to autotune config '%s': %s", + config_key, + e, + ) + failed_configs.append(config_key) + + total_duration = time.time() - total_start_time + successful = len(autotuned_configs) + failed = len(failed_configs) + + logger.info( + "Completed autotuning for kernel '%s': %d successful, %d failed (%.2fs)", + kernel_name, + successful, + failed, + total_duration, + ) + + status = "success" if failed == 0 else "partial" + return AutotuneResult( + status=status, + successful=successful, + failed=failed, + configs=autotuned_configs, + ) + + except (KeyError, RuntimeError, ValueError, OSError) as e: + error_msg = f"Unexpected error: {e}" + logger.exception("Failed to autotune kernel '%s': %s", kernel_name, e) + return AutotuneResult( + status="error", + message=error_msg, + successful=0, + failed=0, + configs={}, + ) + + +def summarize_results(results: dict[str, AutotuneResult]) -> bool: + logger.info("=" * 50) + logger.info("Autotuning Results 
Summary") + logger.info("=" * 50) + + total_successful = 0 + total_failed = 0 + success_kernels = [] + partial_kernels = [] + error_kernels = [] + skipped_kernels = [] + + for kernel_name, result in results.items(): + total_successful += result.successful + total_failed += result.failed + + if result.status == "success": + success_kernels.append(f"{kernel_name} ({result.successful} configs)") + logger.info("✓ %s: %d configs successful", kernel_name, result.successful) + elif result.status == "partial": + partial_kernels.append( + f"{kernel_name} ({result.successful} ok, {result.failed} failed)" + ) + logger.warning( + "⚠ %s: %d successful, %d failed", + kernel_name, + result.successful, + result.failed, + ) + elif result.status == "error": + error_kernels.append(f"{kernel_name}: {result.message or 'Unknown error'}") + logger.error("✗ %s: %s", kernel_name, result.message or "Unknown error") + elif result.status == "skipped": + skipped_kernels.append(f"{kernel_name}: {result.message or 'Skipped'}") + logger.info("- %s: %s", kernel_name, result.message or "Skipped") + + logger.info("=" * 50) + logger.info( + "Summary: %d total configs (%d successful, %d failed)", + total_successful + total_failed, + total_successful, + total_failed, + ) + logger.info( + "Kernels: %d success, %d partial, %d error, %d skipped", + len(success_kernels), + len(partial_kernels), + len(error_kernels), + len(skipped_kernels), + ) + + has_failures = bool(error_kernels or partial_kernels) + + if not has_failures: + if total_successful > 0: + logger.info("All configs autotuned successfully!") + else: + logger.info("No new configs were generated (all may already exist)") + + return not has_failures + + +def get_kernels_to_autotune(requested_kernels: list[str] | None) -> list[str]: + all_kernels = get_registered_kernels() + if not all_kernels: + logger.error("No Helion kernels found in registry") + sys.exit(1) + + if not requested_kernels: + return list(all_kernels.keys()) + + if 
len(requested_kernels) != len(set(requested_kernels)): + duplicates = [ + k for k in set(requested_kernels) if requested_kernels.count(k) > 1 + ] + logger.error("Duplicate kernel names in --kernels flag: %s", duplicates) + sys.exit(1) + + kernels_to_autotune = [] + missing_kernels = [] + + for kernel_name in requested_kernels: + if kernel_name in all_kernels: + kernels_to_autotune.append(kernel_name) + else: + missing_kernels.append(kernel_name) + + if missing_kernels: + logger.error("Kernel(s) not found: %s", missing_kernels) + logger.error("Available kernels: %s", list(all_kernels.keys())) + sys.exit(1) + + return kernels_to_autotune + + +def main(): + parser = argparse.ArgumentParser( + description="Autotune Helion kernels", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__.split("Usage:")[1] if "Usage:" in __doc__ else "", + ) + + parser.add_argument( + "--kernels", + nargs="+", + help="Kernel(s) to autotune (default: all kernels)", + ) + + parser.add_argument( + "--config-dir", + type=str, + help="Config directory for config files (default: vLLM helion configs dir)", + ) + + parser.add_argument( + "--list", + action="store_true", + help="List available Helion kernels and exit", + ) + + parser.add_argument( + "--force", + action="store_true", + help=( + "Force re-autotuning even if configs already exist for the " + "platform and config keys" + ), + ) + + parser.add_argument( + "--autotune-effort", + type=str, + default="quick", + help=( + "Helion autotune effort level: 'quick' (smaller search) or " + "'full' (full search budget) (default: quick)" + ), + ) + + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose logging", + ) + + args = parser.parse_args() + + import logging + + if args.verbose: + logging.getLogger("vllm").setLevel(logging.DEBUG) + logger.debug("Verbose mode enabled") + logger.debug("Arguments: %s", vars(args)) + else: + logging.getLogger("vllm").setLevel(logging.INFO) + + if args.list: + 
list_kernels() + return + + if not check_requirements(): + sys.exit(1) + + platform = get_canonical_gpu_name() + logger.info("Detected GPU platform: %s", platform) + + config_manager = ( + ConfigManager(args.config_dir) if args.config_dir else ConfigManager() + ) + + try: + config_manager.ensure_base_dir_writable() + except OSError as e: + logger.error("Failed to access config directory: %s", e) + sys.exit(1) + + kernels_to_autotune = get_kernels_to_autotune(args.kernels) + + logger.info( + "Will autotune %d kernel(s) for platform '%s': %s", + len(kernels_to_autotune), + platform, + kernels_to_autotune, + ) + + results = {} + for kernel_name in kernels_to_autotune: + result = autotune_kernel( + kernel_name, platform, config_manager, args.force, args.autotune_effort + ) + results[kernel_name] = result + + success = summarize_results(results) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/vllm/kernels/helion/config_manager.py b/vllm/kernels/helion/config_manager.py index 63560761e..3c53106ce 100644 --- a/vllm/kernels/helion/config_manager.py +++ b/vllm/kernels/helion/config_manager.py @@ -131,6 +131,27 @@ class ConfigSet: return config_set + def set_config( + self, platform: str, config_key: str, config: "helion.Config" + ) -> None: + platform = platform.lower() + if platform not in self._configs: + self._configs[platform] = {} + self._configs[platform][config_key] = config + logger.debug( + "Set config for kernel '%s': platform='%s', key='%s'", + self._kernel_name, + platform, + config_key, + ) + + def has_config(self, platform: str, config_key: str) -> bool: + platform = platform.lower() + platform_dict = self._configs.get(platform) + if platform_dict is None: + return False + return config_key in platform_dict + class ConfigManager: """File-level configuration management for Helion kernels (global singleton).""" @@ -142,7 +163,6 @@ class ConfigManager: resolved_base_dir = cls._resolve_base_dir(base_dir) if cls._instance is 
not None: - # Instance already exists - check for base_dir mismatch if cls._instance_base_dir != resolved_base_dir: raise ValueError( f"ConfigManager singleton already exists with base_dir " @@ -151,14 +171,12 @@ class ConfigManager: ) return cls._instance - # Create new instance instance = super().__new__(cls) cls._instance = instance cls._instance_base_dir = resolved_base_dir return instance def __init__(self, base_dir: str | Path | None = None): - # Only initialize if not already initialized if hasattr(self, "_base_dir"): return @@ -193,6 +211,17 @@ class ConfigManager: self._base_dir.mkdir(parents=True, exist_ok=True) return self._base_dir + def ensure_base_dir_writable(self) -> None: + self.ensure_base_dir_exists() + test_file = self._base_dir / ".write_test" + try: + test_file.write_text("test") + test_file.unlink() + except OSError as e: + raise OSError( + f"Config directory '{self._base_dir}' is not writable: {e}" + ) from e + def load_config_set(self, kernel_name: str) -> ConfigSet: config_path = self.get_config_file_path(kernel_name) if not config_path.exists(): @@ -226,3 +255,19 @@ class ConfigManager: logger.info("Saved config to: %s", config_path) return config_path + + def save_configs( + self, + kernel_name: str, + platform: str, + configs: dict[str, "helion.Config"], + ) -> Path: + """Save configs for a kernel/platform, merging with existing.""" + config_set = self.load_config_set(kernel_name) + for config_key, config in configs.items(): + config_set.set_config(platform, config_key, config) + return self.save_config_set(config_set) + + def config_exists(self, kernel_name: str, platform: str, config_key: str) -> bool: + config_set = self.load_config_set(kernel_name) + return config_set.has_config(platform, config_key) diff --git a/vllm/kernels/helion/register.py b/vllm/kernels/helion/register.py index b90110724..3114631dd 100644 --- a/vllm/kernels/helion/register.py +++ b/vllm/kernels/helion/register.py @@ -65,7 +65,6 @@ vllm_helion_lib = 
Library("vllm_helion", "FRAGMENT") # noqa def validate_helion_settings( helion_settings: "helion.Settings | None", op_name: str ) -> None: - """Validate that helion_settings doesn't contain conflicting options.""" if helion_settings is None: return @@ -93,6 +92,26 @@ def validate_helion_settings( ) +def create_helion_decorated_kernel( + raw_kernel_func: Callable, + helion_settings: "helion.Settings | None" = None, + extra_kwargs: dict[str, Any] | None = None, +) -> Any: + kernel_kwargs: dict[str, Any] = {} + if helion_settings: + kernel_kwargs.update(helion_settings.to_dict()) + + # Set static_shapes=False by default if user didn't explicitly set it + # This is needed for dynamic batch sizes and sequence lengths in vLLM + if kernel_kwargs.get("static_shapes") is not True: + kernel_kwargs["static_shapes"] = False + + if extra_kwargs: + kernel_kwargs.update(extra_kwargs) + + return helion.kernel(**kernel_kwargs)(raw_kernel_func) + + class PresetConfigSearch(BaseAutotuner): """Custom autotuner that uses a preset config selector instead of autotuning.""" @@ -198,26 +217,19 @@ class ConfiguredHelionKernel: key_computer = self._create_key_computer() config_selector = self._create_config_selector(key_computer) - kernel_kwargs = {} - if self.helion_settings: - kernel_kwargs.update(self.helion_settings.to_dict()) - - # Set static_shapes=False by default if user didn't explicitly set it to True - # This is needed for dynamic batch sizes and sequence lengths in vLLM - if kernel_kwargs.get("static_shapes") is not True: - kernel_kwargs["static_shapes"] = False - - kernel_kwargs["autotuner_fn"] = lambda _, args: PresetConfigSearch( - args, config_selector - ) - kernel_kwargs["key"] = key_computer + extra_kwargs = { + "autotuner_fn": lambda _, args: PresetConfigSearch(args, config_selector), + "key": key_computer, + } logger.debug( "Creating decorated kernel %s with custom autotuner on platform %s", self.op_name, self.platform, ) - return 
helion.kernel(**kernel_kwargs)(self.raw_kernel_func) + return create_helion_decorated_kernel( + self.raw_kernel_func, self.helion_settings, extra_kwargs + ) class HelionKernelWrapper: @@ -240,6 +252,7 @@ class HelionKernelWrapper: self._config_picker: ( Callable[[tuple[Any, ...], list[str]], str | None] | None ) = None + self._input_generator: Callable[[], dict[str, tuple[Any, ...]]] | None = None def __call__(self, *args, **kwargs): configured_op = self.get_configured_op() @@ -251,6 +264,51 @@ class HelionKernelWrapper: self._config_picker = picker_func return picker_func + def register_input_generator( + self, generator_func: Callable[[], dict[str, tuple[Any, ...]]] + ) -> Callable[[], dict[str, tuple[Any, ...]]]: + """ + Register a function to generate inputs for autotuning and benchmarking. + + Args: + generator_func: Function that returns dict[str, tuple] where: + - key: Configuration identifier (e.g., "4096", "hidden_4096") + - value: Tuple of arguments to pass to the kernel + + Returns: + The registered function (for decorator usage) + + Example: + @kernel_wrapper.register_input_generator + def generate_inputs(): + return { + "4096": (torch.randn(4096, device="cuda"), 0.5), + "8192": (torch.randn(8192, device="cuda"), 0.5), + } + """ + self._input_generator = generator_func + return generator_func + + def get_inputs(self) -> dict[str, tuple[Any, ...]]: + if self._input_generator is None: + raise NotImplementedError( + f"No input generator registered for kernel '{self.op_name}'. " + f"Use @{self.op_name}.register_input_generator to register one." 
+ ) + return self._input_generator() + + def run_autotune( + self, + inputs: tuple[Any, ...], + autotune_effort: str = "quick", + ) -> Config: + """Run autotuning for a single input configuration.""" + extra_kwargs = {"autotune_effort": autotune_effort} + autotune_kernel = create_helion_decorated_kernel( + self.raw_kernel_func, self.helion_settings, extra_kwargs + ) + return autotune_kernel.autotune(inputs) + def get_configured_op(self) -> Any: assert self._config_picker is not None, ( f"No config picker registered for kernel '{self.op_name}'. " -- GitLab From b86bf4417e3172b372ff20cccf4d30289a6db8ae Mon Sep 17 00:00:00 2001 From: Frank Wang <41319051+frankwang28@users.noreply.github.com> Date: Thu, 12 Feb 2026 18:21:19 -0800 Subject: [PATCH 0160/1166] [Bugfix] Fix Random Dataset Prefix Length Inaccuracy (#33907) Signed-off-by: frankwang28 Co-authored-by: Roger Wang --- vllm/benchmarks/datasets.py | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 86e080b55..36573a040 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -380,7 +380,7 @@ def gen_prompt_decode_to_target_len( max_retry: int = 10, add_special_tokens: bool = False, rng: np.random.Generator | None = None, -) -> tuple[str, list[int]]: +) -> tuple[str, list[int], int]: """ Ensure decoded-then-encoded prompt length matches the target token length. @@ -392,7 +392,9 @@ def gen_prompt_decode_to_target_len( [6880, 6881] -> ['Ġcalls', 'here'] -> [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] - Returns a tuple of the final prompt string and the adjusted token sequence. + Returns a tuple of the final prompt string, the adjusted token sequence, + and the token mismatch (final_len - target_token_len) if the retry budget + is exhausted. 
""" remain_num_try = max_retry token_mismatch = 0 @@ -499,7 +501,7 @@ class RandomDataset(BenchmarkDataset): allowed_tokens = np.array(list(set(all_tokens) - set(prohibited_tokens))) # Generate prefix once - prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len) + prefix_token_ids = self.get_prefix(tokenizer, allowed_tokens, prefix_len) requests = [] token_mismatch_total = 0 @@ -554,19 +556,36 @@ class RandomDataset(BenchmarkDataset): def get_prefix( self, + tokenizer: TokenizerLike, allowed_tokens: np.ndarray, prefix_len: int, ) -> list[int]: """ Get the prefix for the dataset. """ - return ( - allowed_tokens[ - self._rng.integers(0, len(allowed_tokens), size=prefix_len) - ].tolist() - if prefix_len > 0 - else [] + if prefix_len <= 0: + return [] + + prefix_tokens = allowed_tokens[ + self._rng.integers(0, len(allowed_tokens), size=prefix_len) + ].tolist() + _, adjusted_tokens, token_mismatch = gen_prompt_decode_to_target_len( + tokenizer=tokenizer, + token_sequence=prefix_tokens, + target_token_len=prefix_len, + add_special_tokens=False, + rng=self._rng, ) + if token_mismatch != 0: + sign = "more" if token_mismatch > 0 else "fewer" + logger.warning( + "Prefix tokenization produced %d %s tokens than expected " + "after decoding and re-encoding. 
This is expected due to " + "the imperfect nature of the sampling procedure", + abs(token_mismatch), + sign, + ) + return adjusted_tokens def get_sampling_params( self, @@ -1128,7 +1147,7 @@ class RandomMultiModalDataset(RandomDataset): "Sampling from %d out of %d (vocab size)", len(allowed_tokens), vocab_size ) # Generate prefix once - prefix_token_ids = self.get_prefix(allowed_tokens, prefix_len) + prefix_token_ids = self.get_prefix(tokenizer, allowed_tokens, prefix_len) # Add synthetic multimodal items to each request mm_requests = [] token_mismatch_total = 0 -- GitLab From bf37812ca77acf7f00c7761bdb0cf257d0e391a3 Mon Sep 17 00:00:00 2001 From: Harry Huang Date: Fri, 13 Feb 2026 10:21:52 +0800 Subject: [PATCH 0161/1166] [Hybrid] Fix and optimize block-aligned splitting in mamba cache align mode (#33706) Signed-off-by: huanghaoyan.hhy --- vllm/v1/core/sched/scheduler.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9546672de..f5482e656 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -281,27 +281,30 @@ class Scheduler(SchedulerInterface): assert num_external_computed_tokens == 0, ( "External KV connector is not verified yet" ) - # TODO: need check for resume requests - if request.num_output_tokens == 0: # prefill + num_computed_tokens = ( + request.num_computed_tokens + + num_new_local_computed_tokens + + num_external_computed_tokens + ) + # Perform block-aligned splitting at prefill phase, including: + # * non-resumed requests: num_computed_tokens < num_prompt_tokens + 0 + # * resumed requests: num_computed_tokens < ( + # num_prompt_tokens + num_output_tokens + # ) + # NOTE: Use `request.num_tokens - 1` to bypass normal decoding. 
+ if num_computed_tokens < max(request.num_prompt_tokens, request.num_tokens - 1): # To enable block-aligned caching of the Mamba state, `num_new_tokens` # must be a multiple of `block_size`. # As an exception, if `num_new_tokens` is less than `block_size`, the # state is simply not cached, requiring no special handling. # Additionally, when Eagle mode is enabled, FullAttn prunes the last # matching block. To prevent this from causing a Mamba cache miss, the - # last chunk must be larger than `block_size`. + # last chunk must be not smaller than `block_size`. block_size = self.cache_config.block_size - last_cache_position = ( - request.num_prompt_tokens - request.num_prompt_tokens % block_size - ) + last_cache_position = request.num_tokens - request.num_tokens % block_size # eagle prune if self.use_eagle: last_cache_position = max(last_cache_position - block_size, 0) - num_computed_tokens = ( - request.num_computed_tokens - + num_new_local_computed_tokens - + num_external_computed_tokens - ) num_computed_tokens_after_sched = num_computed_tokens + num_new_tokens if num_computed_tokens_after_sched < last_cache_position: # align to block_size -- GitLab From 94ed6cf6ea9b0097bbf738467b8fa27b77c2838a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 10:39:28 +0800 Subject: [PATCH 0162/1166] Add new sections to CODEOWNERS (#34309) Signed-off-by: DarkLight1337 --- .github/CODEOWNERS | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 2e7930785..9be9190c2 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,7 +2,9 @@ # for more info about CODEOWNERS file # This lists cover the "core" components of vLLM that require careful review -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn +/vllm/compilation @zou3519 @youkaichao @ProExpertProg +/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery +/vllm/lora 
@jeejeelee /vllm/model_executor/layers/attention @LucasWilkinson /vllm/model_executor/layers/fused_moe @mgoin @pavanimajety /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety @@ -11,18 +13,34 @@ /vllm/model_executor/layers/batch_invariant.py @yewentao256 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa /vllm/vllm_flash_attn @LucasWilkinson -/vllm/lora @jeejeelee -/vllm/reasoning @aarnphm @chaunceyjiang -/vllm/entrypoints @aarnphm @chaunceyjiang -/vllm/tool_parsers @aarnphm @chaunceyjiang -/vllm/compilation @zou3519 @youkaichao @ProExpertProg -/vllm/distributed/kv_transfer @NickLucche @ApostaC @orozery CMakeLists.txt @tlrmchlsmth @LucasWilkinson # Any change to the VllmConfig changes can have a large user-facing impact, # so spam a lot of people /vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg -/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 +/vllm/config/cache.py @heheda12345 + +# Entrypoints +/vllm/entrypoints/anthropic @mgoin @DarkLight1337 +/vllm/entrypoints/cli @hmellor @mgoin @DarkLight1337 @russellb +/vllm/entrypoints/mcp @heheda12345 +/vllm/entrypoints/openai @aarnphm @chaunceyjiang @DarkLight1337 @russellb +/vllm/entrypoints/openai/realtime @njhill +/vllm/entrypoints/openai/speech_to_text @NickLucche +/vllm/entrypoints/pooling @noooop +/vllm/entrypoints/sagemaker @DarkLight1337 +/vllm/entrypoints/serve @njhill +/vllm/entrypoints/*.py @njhill +/vllm/entrypoints/chat_utils.py @DarkLight1337 +/vllm/entrypoints/llm.py @DarkLight1337 + +# Input/Output Processing +/vllm/sampling_params.py @njhill @NickLucche +/vllm/pooling_params.py @noooop @DarkLight1337 +/vllm/tokenizers @DarkLight1337 @njhill +/vllm/renderers @DarkLight1337 @njhill +/vllm/reasoning @aarnphm @chaunceyjiang +/vllm/tool_parsers @aarnphm @chaunceyjiang # 
vLLM V1 /vllm/v1/attention @LucasWilkinson @@ -115,8 +133,8 @@ mkdocs.yaml @hmellor /vllm/model_executor/models/mixtral*.py @patrickvonplaten /vllm/model_executor/models/voxtral*.py @patrickvonplaten /vllm/model_executor/models/pixtral*.py @patrickvonplaten +/vllm/tokenizers/mistral.py @patrickvonplaten /vllm/transformers_utils/configs/mistral.py @patrickvonplaten -/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten # Kernels /vllm/v1/attention/ops/chunked_prefill_paged_decode.py @tdoublep @@ -152,9 +170,7 @@ mkdocs.yaml @hmellor /examples/pooling @noooop /tests/models/*/pooling* @noooop /tests/entrypoints/pooling @noooop -/vllm/entrypoints/pooling @noooop /vllm/config/pooler.py @noooop -/vllm/pooling_params.py @noooop /vllm/model_executor/layers/pooler @noooop # Security guide and policies -- GitLab From 6afa587d31e911c4be495f16916d45d98ebd600c Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Thu, 12 Feb 2026 21:27:53 -0600 Subject: [PATCH 0163/1166] [ROCm][CI] Fix serving tokens test failures (#34047) Signed-off-by: Andreas Karatzas --- .../entrypoints/openai/test_serving_tokens.py | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_serving_tokens.py b/tests/entrypoints/openai/test_serving_tokens.py index aa56dfd6b..6cd4fd7a1 100644 --- a/tests/entrypoints/openai/test_serving_tokens.py +++ b/tests/entrypoints/openai/test_serving_tokens.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + import httpx import pytest import pytest_asyncio @@ -46,6 +48,27 @@ def server(request): "--max-model-len", "1024", "--enforce-eager", + # On ROCm (e.g. MI355X/gfx950), bf16 GEMM results can differ by + # 1 ULP when the batch dimension (M) changes, because different M + # values cause the Tensile backend to select different tile + # configurations with different fp32 accumulation orders. 
With + # prefix caching, cache-miss prefills compute all tokens in one + # pass (large M) while cache-hit requests compute only the + # uncached suffix (small M), seeding a divergence that amplifies + # through the residual stream and flips argmax tokens. + # See: https://github.com/vllm-project/vllm/issues/33123 + # + # Either disable prefix caching entirely, or enable it with + # --deterministic-prefix-caching which forces cache-miss prefills + # to split at block boundaries so the suffix GEMM shape is always + # identical regardless of cache state. + # + # Option A: disable prefix caching + "--no-enable-prefix-caching", + # + # Option B: deterministic prefix caching + # "--enable-prefix-caching", + # "--deterministic-prefix-caching", ] extra_args = getattr(request, "param", None) @@ -56,7 +79,11 @@ def server(request): else [str(extra_args)] ) - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + envs = os.environ.copy() + # See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3888060787 + envs["VLLM_ROCM_USE_SKINNY_GEMM"] = "0" + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: yield remote_server -- GitLab From 372b2e762aeeb040e57a690f0aa0428775a1e239 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 12:47:01 +0800 Subject: [PATCH 0164/1166] [Bugfix] Standardize getting number of image patches/tokens (#34358) Signed-off-by: DarkLight1337 --- tests/kernels/core/test_mrope.py | 24 +---- .../multimodal/generation/test_common.py | 6 -- .../multimodal/processing/test_gemma3.py | 1 + .../multimodal/processing/test_idefics3.py | 12 ++- .../multimodal/processing/test_qwen2_vl.py | 1 + .../multimodal/processing/test_smolvlm.py | 12 ++- vllm/model_executor/models/cohere2_vision.py | 41 ++------- vllm/model_executor/models/ernie45_vl.py | 39 +++++--- vllm/model_executor/models/gemma3_mm.py | 92 ++++++++----------- vllm/model_executor/models/gemma3n_mm.py | 10 +- vllm/model_executor/models/h2ovl.py | 5 +- 
vllm/model_executor/models/hunyuan_vision.py | 34 ++++--- vllm/model_executor/models/idefics3.py | 64 +++++-------- vllm/model_executor/models/interns1.py | 26 +++--- vllm/model_executor/models/internvl.py | 5 +- vllm/model_executor/models/keye.py | 44 +++++---- vllm/model_executor/models/lfm2_vl.py | 65 ++++++++----- vllm/model_executor/models/molmo.py | 5 +- vllm/model_executor/models/molmo2.py | 32 ++++--- vllm/model_executor/models/ovis2_5.py | 5 +- vllm/model_executor/models/paddleocr_vl.py | 20 ++-- vllm/model_executor/models/phi3v.py | 5 +- vllm/model_executor/models/phi4mm.py | 18 ++-- vllm/model_executor/models/pixtral.py | 17 +--- vllm/model_executor/models/qwen2_vl.py | 33 ++++--- vllm/model_executor/models/qwen3_vl.py | 20 ++-- vllm/model_executor/models/skyworkr1v.py | 5 +- vllm/model_executor/models/smolvlm.py | 4 +- vllm/multimodal/processing/context.py | 7 +- 29 files changed, 320 insertions(+), 332 deletions(-) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index f12dc1865..29051b4a0 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -4,8 +4,6 @@ from typing import NamedTuple import pytest import torch -from packaging.version import Version -from transformers import __version__ as TRANSFORMERS_VERSION from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform @@ -46,31 +44,13 @@ class MRoPETestInfo(NamedTuple): marks: list[pytest.MarkDecorator] = [] -TRANSFORMERS_BASE_VERSION = Version(TRANSFORMERS_VERSION).base_version - MODELS_TO_TEST = [ MRoPETestInfo(model_name="zai-org/GLM-4.1V-9B-Thinking"), MRoPETestInfo(model_name="Qwen/Qwen2-VL-7B-Instruct"), MRoPETestInfo(model_name="Qwen/Qwen2-VL-72B-Instruct"), MRoPETestInfo(model_name="Qwen/Qwen2.5-VL-72B-Instruct"), - MRoPETestInfo( - model_name="Qwen/Qwen3-VL-4B-Instruct", - marks=[ - pytest.mark.skipif( - Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), - reason="Qwen3-VL 
only available after Transformers v4.57", - ) - ], - ), - MRoPETestInfo( - model_name="Qwen/Qwen3-VL-30B-A3B-Instruct", - marks=[ - pytest.mark.skipif( - Version(TRANSFORMERS_BASE_VERSION) < Version("4.57.0"), - reason="Qwen3-VL only available after Transformers v4.57", - ) - ], - ), + MRoPETestInfo(model_name="Qwen/Qwen3-VL-4B-Instruct"), + MRoPETestInfo(model_name="Qwen/Qwen3-VL-30B-A3B-Instruct"), ] num_tokens_list = [11, 8192] diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index d9b7a2821..2db9c531d 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -961,12 +961,6 @@ VLM_TEST_SETTINGS = { limit_mm_per_prompt={"image": 4}, ) ], - marks=[ - pytest.mark.skipif( - Version(TRANSFORMERS_VERSION) == Version("4.57.1"), - reason="This model is broken in Transformers v4.57.1", - ) - ], ), # regression test for https://github.com/vllm-project/vllm/issues/15122 "qwen2_5_vl-windows-attention": VLMTestInfo( diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py index 5a3271e07..a9c259c89 100644 --- a/tests/models/multimodal/processing/test_gemma3.py +++ b/tests/models/multimodal/processing/test_gemma3.py @@ -168,6 +168,7 @@ def test_get_image_size_with_most_features( image_width=max_image_size.width, image_height=max_image_size.height, processor=hf_processor, + mm_kwargs=hf_processor_mm_kwargs, ) prompt = "" diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index d88d37f0b..342075ccc 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -3,7 +3,9 @@ """Tests for Idefics3's multimodal preprocessing kwargs.""" import pytest +from packaging.version import Version from transformers import Idefics3Config +from transformers import __version__ as 
TRANSFORMERS_VERSION from vllm.multimodal import MULTIMODAL_REGISTRY @@ -11,6 +13,10 @@ from ....conftest import ImageTestAssets from ...utils import build_model_context +@pytest.mark.skipif( + Version(TRANSFORMERS_VERSION) < Version("5.2.0"), + reason="See https://github.com/huggingface/transformers/pull/43948", +) @pytest.mark.parametrize("model_id", ["HuggingFaceM4/Idefics3-8B-Llama3"]) @pytest.mark.parametrize( ("mm_processor_kwargs", "expected_toks_per_img"), @@ -63,7 +69,11 @@ def test_processor_override( # Ensure the placeholders format are correct hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) - hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) + hf_processed_inputs = hf_processor( + text=prompt, + images=mm_data["image"], + **processor.info.ctx.get_merged_mm_kwargs(hf_processor_mm_kwargs), + ) assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0] # Ensure we have the right number of placeholders per num_crops size diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index a0ecce5d8..11f9ac232 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -82,6 +82,7 @@ def test_get_image_size_with_most_features( image_width=max_image_size.width, image_height=max_image_size.height, image_processor=hf_processor.image_processor, + mm_kwargs=hf_processor_mm_kwargs, ) prompt = "<|vision_start|><|image_pad|><|vision_end|>" diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 102563154..e8ae56efd 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -3,7 +3,9 @@ """Tests for smolvlm's multimodal preprocessing kwargs.""" import pytest +from packaging.version import Version from transformers import SmolVLMConfig +from 
transformers import __version__ as TRANSFORMERS_VERSION from vllm.multimodal import MULTIMODAL_REGISTRY @@ -11,6 +13,10 @@ from ....conftest import ImageTestAssets from ...utils import build_model_context +@pytest.mark.skipif( + Version(TRANSFORMERS_VERSION) < Version("5.2.0"), + reason="See https://github.com/huggingface/transformers/pull/43948", +) @pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"]) @pytest.mark.parametrize( ("mm_processor_kwargs", "expected_toks_per_img"), @@ -63,7 +69,11 @@ def test_processor_override( # Ensure the placeholders format are correct hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) - hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) + hf_processed_inputs = hf_processor( + text=prompt, + images=mm_data["image"], + **processor.info.ctx.get_merged_mm_kwargs(hf_processor_mm_kwargs), + ) assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0] # Ensure we have the right number of placeholders per num_crops size diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 4aefd2ead..1bcdd41b3 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -11,7 +11,7 @@ from torch import nn from transformers import BatchFeature, PretrainedConfig from transformers.models.cohere2_vision import Cohere2VisionConfig from transformers.models.cohere2_vision.image_processing_cohere2_vision_fast import ( # noqa: E501 - get_optimal_tiled_canvas, + Cohere2VisionImageProcessorFast, ) from transformers.models.cohere2_vision.processing_cohere2_vision import ( Cohere2VisionProcessor, @@ -166,43 +166,20 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Cohere2VisionProcessor | None, + processor: Cohere2VisionProcessor, + mm_kwargs: Mapping[str, object], ) -> int: """ Calculate the number of image 
patches for a given image. Uses the HF processor to determine the actual number of patches. """ - if processor is None: - processor = self.get_hf_processor() - - image_processor = processor.image_processor + image_processor: Cohere2VisionImageProcessorFast = processor.image_processor - # The current implementation of get_number_of_image_patches - # is incorrect, so we patch it here. - # TODO: Revert once - # https://github.com/huggingface/transformers/pull/40312 is released. - # return image_processor.get_number_of_image_patches(image_height, - # image_width, {}) - - min_patches = image_processor.min_patches - max_patches = image_processor.max_patches - patch_size = image_processor.size - crop_to_patches = image_processor.crop_to_patches - - if not crop_to_patches: - return 1 - - num_columns, num_rows = get_optimal_tiled_canvas( - (image_height, image_width), - (patch_size["height"], patch_size["width"]), - min_patches, - max_patches, + return image_processor.get_number_of_image_patches( + image_height, + image_width, + self.ctx.get_merged_mm_kwargs(mm_kwargs), ) - num_patches = num_columns * num_rows - if num_patches > 1: - num_patches += 1 # Thumbnail image - - return num_patches class Cohere2VisionDummyInputsBuilder( @@ -271,6 +248,7 @@ class Cohere2VisionMultiModalProcessor( image_width=parsed_images.get_image_size(i).width, image_height=parsed_images.get_image_size(i).height, processor=hf_processor, + mm_kwargs=mm_kwargs, ) for i in range(len(parsed_images)) ] @@ -311,6 +289,7 @@ class Cohere2VisionMultiModalProcessor( image_width=image_size.width, image_height=image_size.height, processor=hf_processor, + mm_kwargs=hf_processor_mm_kwargs, ) patch_tokens = image_token * img_tokens_per_tile + img_line_break_token repl = f"{boi_token}{patch_tokens * num_patches}{eoi_token}" diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index 50d3954b6..37e95b261 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ 
b/vllm/model_executor/models/ernie45_vl.py @@ -34,7 +34,7 @@ import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from transformers import BatchFeature +from transformers import BaseImageProcessor, BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions @@ -818,10 +818,9 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): image_height: int, num_frames: int = 1, do_resize: bool = True, - image_processor: Any | None, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[ImageSize, int]: - if image_processor is None: - image_processor = self.get_image_processor() hf_config = self.get_hf_config() vision_config = hf_config.vision_config @@ -829,13 +828,16 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): spatial_conv_size = hf_config.spatial_conv_size temporal_conv_size = hf_config.temporal_conv_size + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + if do_resize: resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * spatial_conv_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, + min_pixels=size["min_pixels"], + max_pixels=size["max_pixels"], ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) else: @@ -855,12 +857,14 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - image_processor: Any | None, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_image_tokens @@ -870,35 +874,43 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, num_frames: int, - image_processor: Any | 
None, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_video_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, num_frames=num_frames, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_video_tokens def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + max_image_size, _ = self._get_vision_info( image_width=9999999, image_height=9999999, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) return max_image_size def get_max_image_tokens(self) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() num_image_tokens = self.get_num_image_tokens( image_width=target_width, image_height=target_height, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) return num_image_tokens def _get_max_video_frames(self, max_tokens: int) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 @@ -909,7 +921,8 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): image_width=target_width, image_height=target_height, num_frames=next_num_frames, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) if next_max_tokens > max_tokens: @@ -942,13 +955,15 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts), - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 
1e803f89b..d0a326ccd 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -7,6 +7,7 @@ from typing import Annotated, Any, Literal import torch from torch import nn from transformers import BatchFeature, Gemma3Config, Gemma3Processor +from transformers.models.gemma3.image_processing_gemma3 import Gemma3ImageProcessor from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs from vllm.config import VllmConfig @@ -84,54 +85,35 @@ class Gemma3ProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} - def _resolve_image_kwargs( - self, - processor: Gemma3Processor, - keys: set[str], - ) -> dict[str, Any]: - image_processor = processor.image_processor - kwargs = processor._merge_kwargs( - Gemma3ProcessorKwargs, - tokenizer_init_kwargs=processor.tokenizer.init_kwargs, - ) - - images_kwargs = kwargs["images_kwargs"] - - def _resolve_kw(key: str): - val = getattr(image_processor, key) - if val is None: - val = images_kwargs[key] - - return val - - return {k: _resolve_kw(k) for k in keys} - def get_num_crops( self, *, image_width: int, image_height: int, - processor: Gemma3Processor | None, + processor: Gemma3Processor, + mm_kwargs: Mapping[str, object], ) -> int: - if processor is None: - processor = self.get_hf_processor() - - images_kwargs = self._resolve_image_kwargs( - processor, - { - "do_pan_and_scan", - "pan_and_scan_min_crop_size", - "pan_and_scan_max_num_crops", - "pan_and_scan_min_ratio_to_activate", - }, - ) + image_processor: Gemma3ImageProcessor = processor.image_processor - do_pan_and_scan = images_kwargs["do_pan_and_scan"] - pan_and_scan_min_crop_size = images_kwargs["pan_and_scan_min_crop_size"] - pan_and_scan_max_num_crops = images_kwargs["pan_and_scan_max_num_crops"] - pan_and_scan_min_ratio_to_activate = images_kwargs[ - "pan_and_scan_min_ratio_to_activate" - ] + images_kwargs = processor._merge_kwargs( + 
Gemma3ProcessorKwargs, + tokenizer_init_kwargs=processor.tokenizer.init_kwargs, + **self.ctx.get_merged_mm_kwargs(mm_kwargs), + )["images_kwargs"] + + do_pan_and_scan = images_kwargs.get( + "do_pan_and_scan", image_processor.do_pan_and_scan + ) + pan_and_scan_min_crop_size = images_kwargs.get( + "pan_and_scan_min_crop_size", image_processor.pan_and_scan_min_crop_size + ) + pan_and_scan_max_num_crops = images_kwargs.get( + "pan_and_scan_max_num_crops", image_processor.pan_and_scan_max_num_crops + ) + pan_and_scan_min_ratio_to_activate = images_kwargs.get( + "pan_and_scan_min_ratio_to_activate", + image_processor.pan_and_scan_min_ratio_to_activate, + ) if not do_pan_and_scan: return 0 @@ -180,17 +162,16 @@ class Gemma3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Gemma3Processor | None, + processor: Gemma3Processor, + mm_kwargs: Mapping[str, object], ) -> PromptUpdateDetails[str]: - if processor is None: - processor = self.get_hf_processor() - boi_token = processor.boi_token num_crops = self.get_num_crops( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) if num_crops == 0: @@ -215,15 +196,14 @@ class Gemma3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Gemma3Processor | None, + processor: Gemma3Processor, + mm_kwargs: Mapping[str, object], ) -> int: - if processor is None: - processor = self.get_hf_processor() - num_crops = self.get_num_crops( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) image_seq_len = processor.image_seq_length @@ -231,11 +211,17 @@ class Gemma3ProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self) -> ImageSize: processor = self.get_hf_processor() + image_processor: Gemma3ImageProcessor = processor.image_processor + + images_kwargs = processor._merge_kwargs( + Gemma3ProcessorKwargs, + tokenizer_init_kwargs=processor.tokenizer.init_kwargs, + 
**self.ctx.get_merged_mm_kwargs({}), + )["images_kwargs"] - images_kwargs = self._resolve_image_kwargs( - processor, {"pan_and_scan_max_num_crops"} + max_num_crops = images_kwargs.get( + "pan_and_scan_max_num_crops", image_processor.pan_and_scan_max_num_crops ) - max_num_crops = images_kwargs["pan_and_scan_max_num_crops"] vision_config = self.get_hf_config().vision_config native_size = vision_config.image_size @@ -303,6 +289,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): image_width=size.width, image_height=size.height, processor=hf_processor, + mm_kwargs=mm_kwargs, ) for size in image_sizes ] @@ -339,6 +326,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): image_width=image_size.width, image_height=image_size.height, processor=hf_processor, + mm_kwargs=hf_processor_mm_kwargs, ) return [ diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 8588e51f5..3e4745f7c 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -131,7 +131,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Gemma3nProcessor | None, + processor: Gemma3nProcessor, ) -> str: """ Get the replacement text for image tokens. @@ -139,9 +139,6 @@ class Gemma3nProcessingInfo(BaseProcessingInfo): For Gemma3n, this should return the full_image_sequence which includes BOI token, repeated image tokens, and EOI token. """ - if processor is None: - processor = self.get_hf_processor() - return PromptUpdateDetails.select_token_id( processor.full_image_sequence, processor.image_token_id ) @@ -149,7 +146,7 @@ class Gemma3nProcessingInfo(BaseProcessingInfo): def get_audio_repl( self, *, - processor: Gemma3nProcessor | None, + processor: Gemma3nProcessor, ) -> str: """ Get the replacement text for audio tokens. 
@@ -157,9 +154,6 @@ class Gemma3nProcessingInfo(BaseProcessingInfo): For Gemma3n, this should return the full_audio_sequence which includes BOA token, repeated audio tokens, and EOA token. """ - if processor is None: - processor = self.get_hf_processor() - # Return the full audio sequence as defined by the processor return PromptUpdateDetails.select_token_id( processor.full_audio_sequence, processor.audio_token_id diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 90b495e0d..ea25f884f 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -424,12 +424,9 @@ class H2OVLProcessingInfo(BaseInternVLProcessingInfo): *, image_width: int, image_height: int, - processor: H2OVLProcessor | None, + processor: H2OVLProcessor, use_msac: bool | None = None, ) -> int: - if processor is None: - processor = self.get_hf_processor() - return processor.get_num_image_tokens( image_width=image_width, image_height=image_height, diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index edd00c5cd..50b6bd427 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -78,7 +78,10 @@ from vllm.transformers_utils.configs.hunyuan_vl import ( HunYuanVLVisionConfig, ) from vllm.transformers_utils.processors.hunyuan_vl import HunYuanVLProcessor -from vllm.transformers_utils.processors.hunyuan_vl_image import smart_resize +from vllm.transformers_utils.processors.hunyuan_vl_image import ( + HunYuanVLImageProcessor, + smart_resize, +) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( @@ -596,7 +599,7 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo): def get_image_processor( self, **kwargs: object, - ) -> HunYuanVLProcessor: + ) -> HunYuanVLImageProcessor: return self.get_hf_processor(**kwargs).image_processor def get_data_parser(self): @@ -624,23 +627,24 @@ class 
HunYuanVLProcessingInfo(BaseProcessingInfo): image_height: int, num_frames: int = 1, do_resize: bool = True, - image_processor: HunYuanVLProcessor | None, + image_processor: HunYuanVLImageProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[ImageSize, int]: - if image_processor is None: - image_processor = self.get_image_processor() - hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size spatial_merge_size = vision_config.spatial_merge_size + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + if do_resize: resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * spatial_merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, + min_pixels=size["shortest_edge"], + max_pixels=size["longest_edge"], ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) else: @@ -662,29 +666,37 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - image_processor: HunYuanVLProcessor | None, + image_processor: HunYuanVLImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_image_tokens def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + max_image_size, _ = self._get_vision_info( image_width=512, image_height=8192, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) return max_image_size def get_max_image_tokens(self) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( image_width=target_width, image_height=target_height, - image_processor=None, + 
image_processor=image_processor, + mm_kwargs={}, ) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e2cfd1d63..434bc7318 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -16,7 +16,6 @@ # limitations under the License. """Inference-only Idefics3 model compatible with HuggingFace weights.""" -import math from collections.abc import Iterable, Mapping, Sequence from typing import Annotated, Literal, TypeAlias @@ -168,54 +167,35 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Idefics3Processor | None, - ) -> tuple[int, int]: - if processor is None: - processor = self.get_hf_processor() - + processor: Idefics3Processor, + mm_kwargs: Mapping[str, object], + ) -> tuple[int, int, int]: image_processor: Idefics3ImageProcessor = processor.image_processor - max_image_size = image_processor.max_image_size["longest_edge"] - size = image_processor.size["longest_edge"] - assert size % max_image_size == 0, ( - "`longest_edge` in image_processor's `size` must be divisible by " - "`longest_edge` in `max_image_size`, this may be caused by " - "incorrect mm_kwargs override." 
- ) - - resized_height, resized_width = self._get_resize_output_image_size( - image_width=image_width, - image_height=image_height, - resolution_max_side=size, + return image_processor.get_number_of_image_patches( + image_height, + image_width, + self.ctx.get_merged_mm_kwargs(mm_kwargs), ) - if resized_height > max_image_size or resized_width > max_image_size: - grid_h = math.ceil(resized_height / max_image_size) - grid_w = math.ceil(resized_width / max_image_size) - else: - grid_h = grid_w = 0 - return grid_w, grid_h def get_num_patches( self, *, image_width: int, image_height: int, - processor: Idefics3Processor | None, + processor: Idefics3Processor, + mm_kwargs: Mapping[str, object], ) -> int: - grid_w, grid_h = self._get_image_feature_grid_size( + num_patches, _, _ = self._get_image_feature_grid_size( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) - return grid_w * grid_h + 1 - - def _get_image_token( - self, processor: Idefics3Processor | None - ) -> tuple[str, str, str]: - if processor is None: - processor = self.get_hf_processor() + return num_patches + def _get_image_token(self, processor: Idefics3Processor) -> tuple[str, str, str]: image_token = processor.image_token fake_image_token = processor.fake_image_token global_image_token = processor.global_image_tag @@ -226,11 +206,9 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Idefics3Processor | None, + processor: Idefics3Processor, + mm_kwargs: Mapping[str, object], ) -> str: - if processor is None: - processor = self.get_hf_processor() - image_token, fake_image_token, global_img_token = self._get_image_token( processor ) @@ -241,10 +219,11 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): global_img_placeholder = fake_image_token + global_img_token + p_img tile_img_placeholder = fake_image_token + grid_placeholder + p_img - grid_w, grid_h = self._get_image_feature_grid_size( + _, grid_h, grid_w 
= self._get_image_feature_grid_size( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) if grid_w == 0 and grid_h == 0: return global_img_placeholder + fake_image_token @@ -272,15 +251,14 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: Idefics3Processor | None, + processor: Idefics3Processor, + mm_kwargs: Mapping[str, object], ) -> int: - if processor is None: - processor = self.get_hf_processor() - num_patches = self.get_num_patches( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) return num_patches * processor.image_seq_len @@ -353,6 +331,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo image_width=size.width, image_height=size.height, processor=hf_processor, + mm_kwargs=mm_kwargs, ) for size in image_sizes ] @@ -398,6 +377,7 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo image_width=image_size.width, image_height=image_size.height, processor=hf_processor, + mm_kwargs=hf_processor_mm_kwargs, ) return PromptUpdateDetails.select_text( diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index dd1332dfd..5e973aa83 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -197,20 +197,18 @@ class InternS1ProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: GotOcr2ImageProcessorFast | None = None, + processor: InternVLProcessor, + mm_kwargs: Mapping[str, object], ) -> int: - if processor is None: - processor = self.get_hf_processor().image_processor + image_processor: GotOcr2ImageProcessorFast = processor.image_processor - if not isinstance(processor, GotOcr2ImageProcessorFast): - raise ValueError( - f"GotOcr2ImageProcessorFast is expected but got {type(processor)}" - ) - num_image_patches = 
processor.get_number_of_image_patches( - image_height, image_width, images_kwargs=dict() + num_image_patches = image_processor.get_number_of_image_patches( + image_height, + image_width, + self.ctx.get_merged_mm_kwargs(mm_kwargs), ) - num_image_tokens = self.get_hf_processor().image_seq_length * num_image_patches - return num_image_tokens + + return processor.image_seq_length * num_image_patches def resolve_target_ratios(self, use_thumbnail: bool | None = None): image_processor = self.get_hf_processor().image_processor @@ -243,7 +241,8 @@ class InternS1ProcessingInfo(BaseProcessingInfo): feat_size = self.get_num_image_tokens( image_width=width, image_height=height, - processor=processor.image_processor, + processor=processor, + mm_kwargs={}, ) if feat_size > largest_feature_size: largest_feature_size = feat_size @@ -262,7 +261,8 @@ class InternS1ProcessingInfo(BaseProcessingInfo): return self.get_num_image_tokens( image_width=target_width, image_height=target_height, - processor=processor.image_processor, + processor=processor, + mm_kwargs={}, ) def get_num_frames_with_most_features( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 334ee3cbe..7fbbb7237 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -705,11 +705,8 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: BaseInternVLProcessor | None, + processor: BaseInternVLProcessor, ) -> int: - if processor is None: - processor = self.get_hf_processor() - return processor.get_num_image_tokens( image_width=image_width, image_height=image_height, diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 960915af6..2ae044c28 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -10,7 +10,7 @@ import numpy as np import torch import torch.nn as nn from einops import rearrange -from transformers 
import PretrainedConfig +from transformers import BaseImageProcessor, PretrainedConfig from transformers.activations import GELUActivation from transformers.feature_extraction_utils import BatchFeature from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling @@ -1011,24 +1011,25 @@ class KeyeProcessingInfo(BaseProcessingInfo): image_height: int, num_frames: int = 1, do_resize: bool = True, - image_processor, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[ImageSize, int]: - if image_processor is None: - image_processor = self.get_image_processor() - hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = 1 + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + if do_resize: resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, + min_pixels=size["min_pixels"], + max_pixels=size["max_pixels"], ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) else: @@ -1050,12 +1051,14 @@ class KeyeProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - image_processor, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_image_tokens @@ -1065,36 +1068,42 @@ class KeyeProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, num_frames: int, - image_processor, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_video_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, 
num_frames=num_frames, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_video_tokens - def get_image_size_with_most_features( - self, - ) -> ImageSize: + def get_image_size_with_most_features(self) -> ImageSize: + image_processor = self.get_image_processor() + max_image_size, _ = self._get_vision_info( image_width=self.get_max_image_size(), image_height=self.get_max_image_size(), - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) return max_image_size def get_max_image_tokens(self) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_image_tokens( image_width=target_width, image_height=target_height, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) def _get_max_video_frames(self, max_tokens: int) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() num_frames = 0 @@ -1105,7 +1114,8 @@ class KeyeProcessingInfo(BaseProcessingInfo): image_width=target_width, image_height=target_height, num_frames=next_num_frames, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) if next_max_tokens > max_tokens: @@ -1130,13 +1140,15 @@ class KeyeProcessingInfo(BaseProcessingInfo): return max(max_frames_per_video, 1) def get_max_video_tokens(self, seq_len: int) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=self.get_num_frames_with_most_features(seq_len), - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index b77b93196..98fd0b1b0 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -176,7 
+176,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): min_tiles: int, max_tiles: int, tile_size: int, - ) -> tuple[int, int]: + ) -> tuple[int, int, int]: aspect_ratio = width / height target_ratios = self._target_ratios(min_tiles, max_tiles) # find best matching grid configuration @@ -190,18 +190,27 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): self, image_width: int, image_height: int, - processor: Lfm2VlProcessor | None, - ) -> tuple[int, int]: - if processor is None: - processor = self.get_image_processor() + processor: Lfm2VlProcessor, + mm_kwargs: Mapping[str, object], + ) -> tuple[int, int, int]: + image_processor: Lfm2VlImageProcessorFast = processor.image_processor - downsample_factor = processor.image_processor.downsample_factor - encoder_patch_size = processor.image_processor.encoder_patch_size - max_pixels_tolerance = processor.image_processor.max_pixels_tolerance - min_tiles = processor.image_processor.min_tiles - max_tiles = processor.image_processor.max_tiles - max_image_tokens = processor.image_processor.max_image_tokens - tile_size = processor.image_processor.tile_size + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + downsample_factor = mm_kwargs.get( + "downsample_factor", image_processor.downsample_factor + ) + encoder_patch_size = mm_kwargs.get( + "encoder_patch_size", image_processor.encoder_patch_size + ) + max_pixels_tolerance = mm_kwargs.get( + "max_pixels_tolerance", image_processor.max_pixels_tolerance + ) + min_tiles = mm_kwargs.get("min_tiles", image_processor.min_tiles) + max_tiles = mm_kwargs.get("max_tiles", image_processor.max_tiles) + max_image_tokens = mm_kwargs.get( + "max_image_tokens", image_processor.max_image_tokens + ) + tile_size = mm_kwargs.get("tile_size", image_processor.tile_size) do_image_splitting = not min_tiles == max_tiles == 1 is_image_large = self._is_image_too_large( @@ -235,12 +244,14 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: 
Lfm2VlProcessor | None, + processor: Lfm2VlProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, _, total_patches = self._get_image_feature_grid_size( image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) return total_patches @@ -249,11 +260,9 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, spatial_shapes: torch.Tensor, - processor: Lfm2VlProcessor | None, + processor: Lfm2VlProcessor, + mm_kwargs: Mapping[str, object], ) -> str: - if processor is None: - processor = self.get_hf_processor() - grid_placeholder = "<|img_row_{n_h}_col_{n_w}|>" image_token = processor.image_token image_start_token = processor.image_start_token @@ -263,6 +272,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): num_thumbnail_tokens, num_tokens_per_tile = self.get_num_image_tokens( spatial_shapes=spatial_shapes, processor=processor, + mm_kwargs=mm_kwargs, ) tile_img_placeholder = grid_placeholder + (image_token * num_tokens_per_tile) @@ -270,6 +280,7 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): image_width=image_width, image_height=image_height, processor=processor, + mm_kwargs=mm_kwargs, ) if grid_w > 1 or grid_h > 1: @@ -295,15 +306,25 @@ class Lfm2VLProcessingInfo(BaseProcessingInfo): self, *, spatial_shapes: torch.Tensor, - processor: Lfm2VlProcessor | None, + processor: Lfm2VlProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[int, int]: - tile_size = processor.image_processor.tile_size - downsample_factor = processor.image_processor.downsample_factor - encoder_patch_size = processor.image_processor.encoder_patch_size + image_processor: Lfm2VlImageProcessorFast = processor.image_processor + + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + downsample_factor = mm_kwargs.get( + "downsample_factor", image_processor.downsample_factor + ) + encoder_patch_size = mm_kwargs.get( + "encoder_patch_size", image_processor.encoder_patch_size + ) + tile_size = mm_kwargs.get("tile_size", 
image_processor.tile_size) + num_thumbnail_tokens = spatial_shapes[-1].prod() // (downsample_factor**2) num_patches_tile = tile_size // encoder_patch_size dwn_num_patches_tile = math.ceil(num_patches_tile / downsample_factor) num_tiles_tokens = dwn_num_patches_tile * dwn_num_patches_tile + return num_thumbnail_tokens, num_tiles_tokens @@ -372,6 +393,7 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]): image_width=size.width, image_height=size.height, processor=hf_processor, + mm_kwargs=mm_kwargs, ) for size in image_sizes ] @@ -414,6 +436,7 @@ class Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]): image_height=image_size.height, spatial_shapes=spatial_shapes, processor=hf_processor, + mm_kwargs=hf_processor_mm_kwargs, ) return PromptUpdateDetails.select_text( image_repl, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 6edec9719..b3689ed19 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1224,11 +1224,8 @@ class MolmoProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: MolmoProcessorWrapper | None, + processor: MolmoProcessorWrapper, ) -> int: - if processor is None: - processor = self.get_hf_processor() - ncols, nrows = processor.get_patches_grid_size( image_width=image_width, image_height=image_height, diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index e0f74ce46..d32c034b5 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -1869,12 +1869,9 @@ class Molmo2ProcessingInfo(BaseProcessingInfo): *, image_height: int, image_width: int, - processor: Molmo2ProcessorWrapper | None = None, + processor: Molmo2ProcessorWrapper, ) -> int: - if processor is None: - processor = self.get_hf_processor() - - hf_processor = processor.processor # type: ignore + hf_processor = processor.processor resize_nrows, 
resize_cols = processor.get_base_grid_size(is_video=False) # start/end tokens + image patch token + col tokens @@ -1897,11 +1894,8 @@ class Molmo2ProcessingInfo(BaseProcessingInfo): self, *, num_frames: int, - processor: Molmo2ProcessorWrapper | None = None, + processor: Molmo2ProcessorWrapper, ) -> int: - if processor is None: - processor = self.get_hf_processor() - resize_nrows, resize_cols = processor.get_base_grid_size(is_video=True) # start/end tokens extra = 2 + resize_nrows * ( @@ -1929,7 +1923,9 @@ class Molmo2ProcessingInfo(BaseProcessingInfo): width = wr * crop_window_size + total_margin_pixels feat_size = self.get_num_image_tokens( - image_height=height, image_width=width, processor=processor + image_height=height, + image_width=width, + processor=processor, ) if feat_size > largest_feature_size: largest_feature_size = feat_size @@ -1940,8 +1936,15 @@ class Molmo2ProcessingInfo(BaseProcessingInfo): return largest_feature_pinpoint - def _get_max_video_frames(self, max_tokens: int) -> int: - num_tokens_per_frame = self.get_num_video_tokens(num_frames=1) + def _get_max_video_frames( + self, + max_tokens: int, + processor: Molmo2ProcessorWrapper, + ) -> int: + num_tokens_per_frame = self.get_num_video_tokens( + num_frames=1, + processor=processor, + ) max_frames = max_tokens // num_tokens_per_frame return max(max_frames, 1) @@ -1950,10 +1953,11 @@ class Molmo2ProcessingInfo(BaseProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> int: - video_processor = self.get_hf_processor().processor.video_processor + processor = self.get_hf_processor() + video_processor = processor.processor.video_processor num_frames = video_processor.num_frames max_videos = mm_counts.get("video", 0) - max_total_frames = self._get_max_video_frames(seq_len) + max_total_frames = self._get_max_video_frames(seq_len, processor) max_frames_per_video = min( max_total_frames // max(max_videos, 1), num_frames, diff --git a/vllm/model_executor/models/ovis2_5.py 
b/vllm/model_executor/models/ovis2_5.py index 69c0600d8..8d038d4ad 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -215,7 +215,7 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, num_frames: int = 1, - ) -> tuple[ImageSize, int]: + ) -> int: hf_config = self.get_hf_config() vit_config = hf_config.vit_config patch_size = vit_config.patch_size @@ -245,7 +245,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo): image_width=target_width, image_height=target_height, num_frames=next_num_frames, - image_processor=None, ) if next_max_tokens > max_tokens: break @@ -270,7 +269,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, num_frames: int, - image_processor: BaseImageProcessor | None, ) -> int: num_video_tokens = self.get_num_image_tokens( image_width=image_width, image_height=image_height, num_frames=num_frames @@ -287,7 +285,6 @@ class Ovis2_5ProcessingInfo(BaseProcessingInfo): image_width=target_width, image_height=target_height, num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts), - image_processor=None, ) diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 8d287e342..021f24e11 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -23,7 +23,7 @@ import numpy as np import torch import torch.nn as nn from einops import rearrange -from transformers import BatchFeature, PretrainedConfig +from transformers import BaseImageProcessor, BatchFeature, PretrainedConfig from transformers.activations import GELUActivation from transformers.modeling_outputs import ( BaseModelOutputWithPooling, @@ -147,21 +147,23 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - image_processor, + image_processor: BaseImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: - if image_processor 
is None: - image_processor = self.get_image_processor() - hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size + + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, + min_pixels=size["min_pixels"], + max_pixels=size["max_pixels"], ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) @@ -176,12 +178,13 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo): def get_image_size_with_most_features(self) -> ImageSize: hf_config = self.get_hf_config() + image_processor = self.get_image_processor() # See `smart_resize` for the calculation of the image size. merge_size = hf_config.vision_config.spatial_merge_size patch_size = hf_config.vision_config.patch_size factor = merge_size * patch_size - max_num_tokens = self.get_image_processor().max_pixels // (factor**2) + max_num_tokens = image_processor.max_pixels // (factor**2) # Find factors of max_num_tokens close to its square root # to create a dummy image with a reasonable aspect ratio. 
h_patches = int(math.sqrt(max_num_tokens)) @@ -276,6 +279,7 @@ class PaddleOCRVLMultiModalProcessor( image_width=image_size.width, image_height=image_size.height, image_processor=image_processor, + mm_kwargs=hf_processor_mm_kwargs, ) return [image_token_id] * num_image_tokens diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 8f33cc859..a5a346e72 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -351,11 +351,8 @@ class Phi3VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: ProcessorMixin | None = None, + processor: ProcessorMixin, ) -> int: - if processor is None: - processor = self.get_hf_processor() - return processor.calc_num_image_tokens_from_image_size( # type: ignore width=image_width, height=image_height, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index d11483a6b..89676a9a7 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -558,10 +558,8 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_dynamic_hd( self, - processor: ProcessorMixin | None = None, + processor: ProcessorMixin, ) -> int: - if processor is None: - processor = self.get_hf_processor() image_processor = processor.image_processor return image_processor.dynamic_hd @@ -715,7 +713,7 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: ProcessorMixin | None = None, + processor: ProcessorMixin, ) -> int: hf_config = self.get_hf_config() vision_encoder_name = hf_config.img_processor @@ -739,10 +737,9 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): return image_num_tokens - def get_image_size_with_most_features( - self, - processor: ProcessorMixin | None = None, - ) -> ImageSize: + def get_image_size_with_most_features(self) -> ImageSize: + processor = self.get_hf_processor() + hf_config = self.get_hf_config() vision_encoder_name = 
hf_config.img_processor if vision_encoder_name is None: @@ -874,9 +871,12 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): prompt, mm_data, mm_kwargs, tok_kwargs ) + hf_processor = self.info.get_hf_processor(**mm_kwargs) num_img_tokens = [ self.info.get_num_image_tokens( - image_width=img_size[0], image_height=img_size[1] + image_width=img_size[0], + image_height=img_size[1], + processor=hf_processor, ) for img_size in processed_outputs["image_sizes"] ] diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 7d12cffcd..407cf3ff5 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -217,28 +217,13 @@ class PixtralProcessingInfo(BaseProcessingInfo): def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} - def get_vision_config( - self, - processor: PixtralProcessorAdapter | None = None, - ): - if processor is None: - processor = self.get_hf_processor() - - return PixtralVisionConfig( - image_size=processor.image_size, - patch_size=processor.patch_size, - ) - def get_num_image_tokens( self, *, image_width: int, image_height: int, - processor: PixtralProcessorAdapter | None = None, + processor: PixtralProcessorAdapter, ) -> int: - if processor is None: - processor = self.get_hf_processor() - ncols, nrows = processor.image_processor._image_to_num_tokens( Image.new("RGB", (image_width, image_height)) ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 62df900ad..1c568bdff 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -832,24 +832,25 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_height: int, num_frames: int = 1, do_resize: bool = True, - image_processor: Qwen2VLImageProcessor | None, + image_processor: Qwen2VLImageProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[ImageSize, int]: - if image_processor is 
None: - image_processor = self.get_image_processor() - hf_config = self.get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + if do_resize: resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * merge_size, - min_pixels=image_processor.size["shortest_edge"], - max_pixels=image_processor.size["longest_edge"], + min_pixels=size["shortest_edge"], + max_pixels=size["longest_edge"], ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) else: @@ -873,13 +874,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - image_processor: Qwen2VLImageProcessor | None, + image_processor: Qwen2VLImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, num_frames=1, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_image_tokens @@ -889,13 +892,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_width: int, image_height: int, num_frames: int, - image_processor: Qwen2VLImageProcessor | None, + image_processor: Qwen2VLImageProcessor, + mm_kwargs: Mapping[str, object], ) -> int: _, num_video_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, num_frames=num_frames, image_processor=image_processor, + mm_kwargs=mm_kwargs, ) return num_video_tokens @@ -941,15 +946,18 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): return ImageSize(width=unit * width_factor, height=unit * height_factor) def get_max_image_tokens(self) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() return 
self.get_num_image_tokens( image_width=target_width, image_height=target_height, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) def _get_max_video_frames(self, max_tokens: int, start_num_frames: int = 1) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() num_frames = start_num_frames @@ -960,7 +968,8 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): image_width=target_width, image_height=target_height, num_frames=next_num_frames, - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) if next_max_tokens > max_tokens: @@ -990,13 +999,15 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): seq_len: int, mm_counts: Mapping[str, int], ) -> int: + image_processor = self.get_image_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_video_tokens( image_width=target_width, image_height=target_height, num_frames=self.get_num_frames_with_most_features(seq_len, mm_counts), - image_processor=None, + image_processor=image_processor, + mm_kwargs={}, ) diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 7d9785141..c18fc77f7 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -642,13 +642,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): image_height: int, num_frames: int = 2, do_resize: bool = True, - image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor | None, + image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor, + mm_kwargs: Mapping[str, object], ) -> tuple[ImageSize, int]: - if image_processor is None and num_frames > 1: - image_processor = self.get_video_processor() - elif image_processor is None: - image_processor = self.get_image_processor() - is_video = isinstance(image_processor, Qwen3VLVideoProcessor) hf_config = self.get_hf_config() @@ -657,6 +653,9 @@ class 
Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size + mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs) + size = mm_kwargs.get("size", image_processor.size) + if do_resize: if is_video: smart_resize = video_smart_resize @@ -667,12 +666,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): else: smart_resize = image_smart_resize extra_kwargs = {} + resized_height, resized_width = smart_resize( height=image_height, width=image_width, factor=patch_size * merge_size, - min_pixels=image_processor.size["shortest_edge"], - max_pixels=image_processor.size["longest_edge"], + min_pixels=size["shortest_edge"], + max_pixels=size["longest_edge"], **extra_kwargs, ) preprocessed_size = ImageSize(width=resized_width, height=resized_height) @@ -720,7 +720,8 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): image_width=target_width, image_height=target_height, num_frames=2, - image_processor=None, + image_processor=video_processor, + mm_kwargs={}, ) return num_video_soft_tokens @@ -846,6 +847,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): image_height=target_video_height, num_frames=target_num_frames, image_processor=video_processor, + mm_kwargs={}, ) # NOTE: we need to do this check here since Qwen3-VL resizes video # frames depending on how many frames there are. 
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index 4fadad14d..acedb04bc 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -487,11 +487,8 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo): *, image_width: int, image_height: int, - processor: SkyworkR1VProcessor | None, + processor: SkyworkR1VProcessor, ) -> int: - if processor is None: - processor = self.get_hf_processor() - return processor.get_num_image_tokens( image_width=image_width, image_height=image_height, diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index e8b805297..aef00ec59 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -16,9 +16,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo): def get_hf_processor(self, **kwargs: object) -> SmolVLMProcessor: return self.ctx.get_hf_processor(SmolVLMProcessor, **kwargs) - def _get_image_token(self, processor: SmolVLMProcessor | None) -> tuple[str, str]: - if processor is None: - processor = self.get_hf_processor() + def _get_image_token(self, processor: SmolVLMProcessor) -> tuple[str, str, str]: image_token = processor.image_token fake_image_token = processor.fake_image_token global_image_token = processor.global_image_token diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py index d5c14310c..34a186710 100644 --- a/vllm/multimodal/processing/context.py +++ b/vllm/multimodal/processing/context.py @@ -409,6 +409,10 @@ class InputProcessingContext: return json_map_leaves(_postprocess_one, output) + def get_merged_mm_kwargs(self, kwargs: Mapping[str, object]): + mm_config = self.model_config.get_multimodal_config() + return mm_config.merge_mm_processor_kwargs(kwargs) + def call_hf_processor( self, hf_processor: ProcessorMixin, @@ -424,8 +428,7 @@ class InputProcessingContext: """ assert callable(hf_processor) - mm_config = 
self.model_config.get_multimodal_config() - merged_kwargs = mm_config.merge_mm_processor_kwargs(kwargs) + merged_kwargs = self.get_merged_mm_kwargs(kwargs) allowed_kwargs = get_allowed_kwarg_only_overrides( hf_processor, -- GitLab From dcf6ee8592b4f33593feb579b7a420d155ada374 Mon Sep 17 00:00:00 2001 From: haosdent Date: Fri, 13 Feb 2026 13:04:06 +0800 Subject: [PATCH 0165/1166] [Bugfix] Fix encoder cache underestimation for GLM-4V/GLM-OCR single image (#34483) Signed-off-by: haosdent --- vllm/model_executor/models/glm4_1v.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 8440c3946..23f27db3c 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -869,9 +869,28 @@ class Glm4vProcessingInfo(BaseProcessingInfo): return preprocessed_size, num_vision_tokens + def _get_image_max_pixels(self) -> int: + """Read max_pixels from the HF image processor config. + + Despite the name, ``longest_edge`` is a pixel **area** (total pixel + count), not an edge length. The HF processor passes it directly to + ``smart_resize`` as the ``max_pixels`` argument, which constrains + ``t_bar * h_bar * w_bar <= max_pixels``. + """ + return self.get_image_processor().size["longest_edge"] + def get_image_size_with_most_features(self) -> ImageSize: + # Use num_frames=1 for single-image budget estimation. + # _get_vision_info defaults to num_frames=16 (video), which + # makes smart_resize constrain 16*H*W <= max_pixels, vastly + # underestimating the spatial budget for a single image and + # causing encoder cache overflow for large images + # (see https://github.com/vllm-project/vllm/issues/34040). 
max_image_size, _ = self._get_vision_info( - image_width=9999999, image_height=9999999 + image_width=9999999, + image_height=9999999, + num_frames=1, + max_image_pixels=self._get_image_max_pixels(), ) return max_image_size @@ -884,7 +903,8 @@ class Glm4vProcessingInfo(BaseProcessingInfo): _, num_image_tokens = self._get_vision_info( image_width=image_width, image_height=image_height, - max_image_pixels=28 * 28 * 2 * 6144, + num_frames=1, + max_image_pixels=self._get_image_max_pixels(), ) return num_image_tokens -- GitLab From 1b4e8e53f87b2c6f5cd30d0eace501d7d2192236 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 14:43:53 +0800 Subject: [PATCH 0166/1166] [CI/Build] Fix CUDA re-initialization error in distributed model tests (#34491) Signed-off-by: DarkLight1337 --- tests/models/multimodal/generation/test_voxtral_realtime.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/multimodal/generation/test_voxtral_realtime.py b/tests/models/multimodal/generation/test_voxtral_realtime.py index 2b769e3ed..ebd979ddb 100644 --- a/tests/models/multimodal/generation/test_voxtral_realtime.py +++ b/tests/models/multimodal/generation/test_voxtral_realtime.py @@ -14,7 +14,6 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.audio import AudioAsset from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.model_executor.models.voxtral_realtime import VoxtralRealtimeBuffer from vllm.v1.engine.async_llm import AsyncLLM MODEL_NAME = "mistralai/Voxtral-Mini-4B-Realtime-2602" @@ -114,6 +113,9 @@ def test_voxtral_realtime_forward(audio_assets, tokenizer, engine): @pytest.mark.asyncio async def test_voxtral_realtime_generator(audio_assets, tokenizer, async_engine): + # Lazy import to avoid CUDA-reinitialization error + from vllm.model_executor.models.voxtral_realtime import VoxtralRealtimeBuffer + sampling_params = SamplingParams(temperature=0.0, 
max_tokens=1) audio_config = tokenizer.instruct_tokenizer.audio_encoder.audio_config -- GitLab From 2f308214c0ff6cfa849879c5beb884192714f429 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 14:48:38 +0800 Subject: [PATCH 0167/1166] [Refactor] Pass full VllmConfig to Renderer (#34485) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_chat_error.py | 7 ++- .../openai/test_completion_error.py | 7 ++- .../entrypoints/openai/test_lora_resolvers.py | 7 ++- tests/entrypoints/openai/test_serving_chat.py | 17 +++++- tests/renderers/test_completions.py | 55 +++++++++++-------- tests/renderers/test_mistral.py | 10 +++- tests/test_inputs.py | 5 +- vllm/inputs/preprocess.py | 11 ++-- vllm/renderers/base.py | 14 ++--- vllm/renderers/deepseek_v32.py | 13 +++-- vllm/renderers/grok2.py | 13 +++-- vllm/renderers/hf.py | 15 ++--- vllm/renderers/mistral.py | 13 +++-- vllm/renderers/registry.py | 14 +++-- vllm/renderers/terratorch.py | 13 +++-- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/input_processor.py | 5 +- vllm/v1/engine/llm_engine.py | 2 +- 18 files changed, 137 insertions(+), 86 deletions(-) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py index 760ec8acb..6095d1ec8 100644 --- a/tests/entrypoints/openai/test_chat_error.py +++ b/tests/entrypoints/openai/test_chat_error.py @@ -59,11 +59,16 @@ class MockModelConfig: return self.diff_sampling_param or {} +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + def _build_renderer(model_config: MockModelConfig): _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) return HfRenderer( - model_config, + MockVllmConfig(model_config), tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, ) diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py index 800bf75f0..d5a266831 100644 --- a/tests/entrypoints/openai/test_completion_error.py +++ 
b/tests/entrypoints/openai/test_completion_error.py @@ -58,6 +58,11 @@ class MockModelConfig: return self.diff_sampling_param or {} +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: models = OpenAIServingModels( engine_client=engine, @@ -74,7 +79,7 @@ def _build_renderer(model_config: MockModelConfig): _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) return HfRenderer( - model_config, + MockVllmConfig(model_config), tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, ) diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index 56fe31556..450a788a3 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -57,6 +57,11 @@ class MockModelConfig: return self.diff_sampling_param or {} +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str @@ -91,7 +96,7 @@ def _build_renderer(model_config: MockModelConfig): _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) return HfRenderer( - model_config, + MockVllmConfig(model_config), tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, ) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index b57f00ab7..2cef772c2 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -534,11 +534,16 @@ class MockModelConfig: return self.diff_sampling_param or {} +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + def _build_renderer(model_config: MockModelConfig): _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) return HfRenderer( - model_config, + MockVllmConfig(model_config), 
tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, ) @@ -749,7 +754,10 @@ async def test_serving_chat_mistral_token_ids_prompt_is_validated(): mock_engine.io_processor = MagicMock() mock_tokenizer = MagicMock(spec=MistralTokenizer) - mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={}) + mock_renderer = MistralRenderer( + MockVllmConfig(mock_engine.model_config), + tokenizer_kwargs={}, + ) mock_renderer._tokenizer = mock_tokenizer # Force the Mistral chat template renderer to return token IDs. # Choose a prompt length that is < max_model_len, but large enough that @@ -788,7 +796,10 @@ async def test_serving_chat_mistral_token_ids_prompt_too_long_is_rejected(): mock_engine.io_processor = MagicMock() mock_tokenizer = MagicMock(spec=MistralTokenizer) - mock_renderer = MistralRenderer(mock_engine.model_config, tokenizer_kwargs={}) + mock_renderer = MistralRenderer( + MockVllmConfig(mock_engine.model_config), + tokenizer_kwargs={}, + ) mock_renderer._tokenizer = mock_tokenizer # prompt_token_ids length == max_model_len should be rejected for # completion-like requests (ChatCompletionRequest). 
diff --git a/tests/renderers/test_completions.py b/tests/renderers/test_completions.py index 1cef8551c..ec6d8a688 100644 --- a/tests/renderers/test_completions.py +++ b/tests/renderers/test_completions.py @@ -40,6 +40,11 @@ class MockModelConfig: is_encoder_decoder: bool = False +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + @dataclass class DummyTokenizer: truncation_side: str = "left" @@ -72,7 +77,7 @@ def _build_renderer( _, tokenizer_name, _, kwargs = tokenizer_args_from_config(model_config) renderer = HfRenderer( - model_config, + MockVllmConfig(model_config), tokenizer_kwargs={**kwargs, "tokenizer_name": tokenizer_name}, ) @@ -104,14 +109,14 @@ class TestValidatePrompt: renderer = _build_renderer(MockModelConfig()) with pytest.raises(ValueError, match="at least one prompt"): - renderer.render_prompts(_preprocess_prompt(renderer.config, [])) + renderer.render_prompts(_preprocess_prompt(renderer.model_config, [])) def test_invalid_type(self): renderer = _build_renderer(MockModelConfig()) with pytest.raises(TypeError, match="should be a list of integers"): renderer.render_prompts( - _preprocess_prompt(renderer.config, [[1, 2], ["foo", "bar"]]) # type: ignore[arg-type] + _preprocess_prompt(renderer.model_config, [[1, 2], ["foo", "bar"]]) # type: ignore[arg-type] ) @@ -120,7 +125,9 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig()) tokens = [101, 7592, 2088] - prompts = renderer.render_prompts(_preprocess_prompt(renderer.config, tokens)) + prompts = renderer.render_prompts( + _preprocess_prompt(renderer.model_config, tokens) + ) results = renderer.tokenize_prompts( prompts, TokenizeParams(max_total_tokens=100), @@ -134,7 +141,7 @@ class TestRenderPrompt: token_lists = [[101, 7592, 2088], [102, 1234, 5678, 9012], [103, 4567]] prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, token_lists) + _preprocess_prompt(renderer.model_config, token_lists) ) results = renderer.tokenize_prompts( prompts, @@ 
-151,7 +158,7 @@ class TestRenderPrompt: text_input = "x" * 10 prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, text_input) + _preprocess_prompt(renderer.model_config, text_input) ) results = renderer.tokenize_prompts( prompts, @@ -166,7 +173,7 @@ class TestRenderPrompt: text_list_input = ["x" * 10, "x" * 12, "x" * 14] prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, text_list_input) + _preprocess_prompt(renderer.model_config, text_list_input) ) results = renderer.tokenize_prompts( prompts, @@ -181,7 +188,7 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig()) prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, "x" * 200) + _preprocess_prompt(renderer.model_config, "x" * 200) ) results = renderer.tokenize_prompts( prompts, @@ -195,7 +202,7 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig()) prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, "x" * 200) + _preprocess_prompt(renderer.model_config, "x" * 200) ) results = renderer.tokenize_prompts( prompts, @@ -209,7 +216,7 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig()) prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, "x" * 200) + _preprocess_prompt(renderer.model_config, "x" * 200) ) results = renderer.tokenize_prompts( prompts, @@ -224,7 +231,7 @@ class TestRenderPrompt: long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] # 10 tokens prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, long_tokens) + _preprocess_prompt(renderer.model_config, long_tokens) ) results = renderer.tokenize_prompts( prompts, @@ -240,7 +247,7 @@ class TestRenderPrompt: long_tokens = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109] # 10 tokens prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, long_tokens) + _preprocess_prompt(renderer.model_config, long_tokens) ) results = renderer.tokenize_prompts( prompts, 
@@ -257,7 +264,7 @@ class TestRenderPrompt: # Exceeds max_total_tokens and max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN long_tokens = "x" * 150 prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, long_tokens) + _preprocess_prompt(renderer.model_config, long_tokens) ) with pytest.raises( @@ -278,7 +285,7 @@ class TestRenderPrompt: # Exceeds max_total_tokens but not max_total_tokens * VLLM_MAX_CHARS_PER_TOKEN long_tokens = "x" * 150 prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, long_tokens) + _preprocess_prompt(renderer.model_config, long_tokens) ) with pytest.raises( @@ -299,7 +306,7 @@ class TestRenderPrompt: long_tokens = list(range(150)) # Exceeds max_total_tokens=100 prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, long_tokens) + _preprocess_prompt(renderer.model_config, long_tokens) ) with pytest.raises( @@ -315,7 +322,7 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig(skip_tokenizer_init=True)) prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, "Hello world") + _preprocess_prompt(renderer.model_config, "Hello world") ) with pytest.raises(ValueError, match="`skip_tokenizer_init=True`"): @@ -328,7 +335,9 @@ class TestRenderPrompt: renderer = _build_renderer(MockModelConfig()) tokens = [1, 2, 3, 4] - prompts = renderer.render_prompts(_preprocess_prompt(renderer.config, tokens)) + prompts = renderer.render_prompts( + _preprocess_prompt(renderer.model_config, tokens) + ) results = renderer.tokenize_prompts( prompts, TokenizeParams( @@ -358,7 +367,7 @@ class TestRenderEmbedPrompt: embed_bytes = self._create_test_embed_bytes(tensor_input) prompts = renderer.render_prompts( - _preprocess_prompt(renderer.config, embed_bytes) + _preprocess_prompt(renderer.model_config, embed_bytes) ) results = renderer.tokenize_prompts( prompts, @@ -379,7 +388,7 @@ class TestRenderEmbedPrompt: prompts = renderer.render_prompts( _preprocess_prompt( - renderer.config, + 
renderer.model_config, [self._create_test_embed_bytes(t) for t in tensor_inputs], ) ) @@ -400,7 +409,7 @@ class TestRenderEmbedPrompt: prompts = renderer.render_prompts( _preprocess_prompt( - renderer.config, self._create_test_embed_bytes(tensor_input) + renderer.model_config, self._create_test_embed_bytes(tensor_input) ) ) results = renderer.tokenize_prompts( @@ -427,7 +436,7 @@ class TestRenderEmbedPrompt: prompts = renderer.render_prompts( _preprocess_prompt( - renderer.config, self._create_test_embed_bytes(tensor_input) + renderer.model_config, self._create_test_embed_bytes(tensor_input) ) ) results = renderer.tokenize_prompts( @@ -446,7 +455,7 @@ class TestRenderEmbedPrompt: prompts = renderer.render_prompts( _preprocess_prompt( - renderer.config, self._create_test_embed_bytes(tensor_input) + renderer.model_config, self._create_test_embed_bytes(tensor_input) ) ) results = renderer.tokenize_prompts( @@ -466,7 +475,7 @@ class TestRenderEmbedPrompt: prompts = renderer.render_prompts( _preprocess_prompt( - renderer.config, + renderer.model_config, [text_input, self._create_test_embed_bytes(tensor_input)], ) ) diff --git a/tests/renderers/test_mistral.py b/tests/renderers/test_mistral.py index f1d73e738..8c68f750a 100644 --- a/tests/renderers/test_mistral.py +++ b/tests/renderers/test_mistral.py @@ -38,6 +38,11 @@ class MockModelConfig: is_encoder_decoder: bool = False +@dataclass +class MockVllmConfig: + model_config: MockModelConfig + + @pytest.mark.asyncio async def test_async_mistral_tokenizer_does_not_block_event_loop(): expected_tokens = [1, 2, 3] @@ -50,7 +55,10 @@ async def test_async_mistral_tokenizer_does_not_block_event_loop(): mock_model_config = MockModelConfig(skip_tokenizer_init=True) mock_tokenizer = Mock(spec=MistralTokenizer) mock_tokenizer.apply_chat_template = mocked_apply_chat_template - mock_renderer = MistralRenderer(mock_model_config, tokenizer_kwargs={}) + mock_renderer = MistralRenderer( + MockVllmConfig(mock_model_config), + 
tokenizer_kwargs={}, + ) mock_renderer._tokenizer = mock_tokenizer task = mock_renderer.render_messages_async([], ChatParams()) diff --git a/tests/test_inputs.py b/tests/test_inputs.py index 03e470427..fb1bbd21e 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, VllmConfig from vllm.inputs.preprocess import InputPreprocessor pytestmark = pytest.mark.cpu_test @@ -20,7 +20,8 @@ pytestmark = pytest.mark.cpu_test ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - input_preprocessor = InputPreprocessor(model_config) + vllm_config = VllmConfig(model_config=model_config) + input_preprocessor = InputPreprocessor(vllm_config) # HF processor adds sep token tokenizer = input_preprocessor.get_tokenizer() diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 2699f70cb..ef1f2e0bf 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing import Any, overload from typing_extensions import assert_never -from vllm.config import ModelConfig, ObservabilityConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -54,17 +54,16 @@ logger = init_logger(__name__) class InputPreprocessor: def __init__( self, - model_config: ModelConfig, - observability_config: ObservabilityConfig | None = None, + vllm_config: VllmConfig, renderer: BaseRenderer | None = None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: super().__init__() - self.model_config = model_config - self.observability_config = observability_config - self.renderer = renderer or renderer_from_config(model_config) + self.model_config = vllm_config.model_config + 
self.observability_config = vllm_config.observability_config + self.renderer = renderer or renderer_from_config(vllm_config) self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index 0002bdf89..05058c549 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -21,7 +21,7 @@ from .inputs.preprocess import extract_target_prompt from .params import ChatParams, TokenizeParams if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ConversationMessage, @@ -35,15 +35,15 @@ class BaseRenderer(ABC): @abstractmethod def from_config( cls, - config: "ModelConfig", + config: "VllmConfig", tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": raise NotImplementedError - def __init__(self, config: "ModelConfig") -> None: + def __init__(self, config: "VllmConfig") -> None: super().__init__() - self.config = config + self.model_config = config.model_config # Lazy initialization since offline LLM doesn't use async self._async_tokenizer: AsyncMicrobatchTokenizer | None = None @@ -90,7 +90,7 @@ class BaseRenderer(ABC): prompt: DictPrompt | bytes, ) -> DictPrompt: if isinstance(prompt, bytes): - embeds = safe_load_prompt_embeds(self.config, prompt) + embeds = safe_load_prompt_embeds(self.model_config, prompt) prompt = EmbedsPrompt(prompt_embeds=embeds) return prompt @@ -310,7 +310,7 @@ class BaseRenderer(ABC): return for prompt in prompts: - target_prompt = extract_target_prompt(self.config, prompt) + target_prompt = extract_target_prompt(self.model_config, prompt) target_prompt.update(prompt_extras) # type: ignore[arg-type] # Top-level methods @@ -325,7 +325,7 @@ class BaseRenderer(ABC): # NOTE: Some MM models have non-default `add_special_tokens` # so we handle tokenization in multi-modal processor - if self.config.is_multimodal_model: + if self.model_config.is_multimodal_model: 
self._apply_prompt_extras(dict_prompts, prompt_extras) return dict_prompts diff --git a/vllm/renderers/deepseek_v32.py b/vllm/renderers/deepseek_v32.py index e4cc3f0fb..f03a5973f 100644 --- a/vllm/renderers/deepseek_v32.py +++ b/vllm/renderers/deepseek_v32.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any -from vllm.config import ModelConfig +from vllm.config import VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ConversationMessage, @@ -26,19 +26,20 @@ class DeepseekV32Renderer(BaseRenderer): @classmethod def from_config( cls, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": return cls(config, tokenizer_kwargs) def __init__( self, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> None: super().__init__(config) - if config.skip_tokenizer_init: + model_config = self.model_config + if model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = cached_get_tokenizer( @@ -67,7 +68,7 @@ class DeepseekV32Renderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( messages, - self.config, + self.model_config, content_format="string", ) @@ -93,7 +94,7 @@ class DeepseekV32Renderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, - self.config, + self.model_config, content_format="string", ) diff --git a/vllm/renderers/grok2.py b/vllm/renderers/grok2.py index 141c72aa7..7e8681d82 100644 --- a/vllm/renderers/grok2.py +++ b/vllm/renderers/grok2.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any -from vllm.config import ModelConfig +from vllm.config import VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ConversationMessage, @@ -25,19 +25,20 @@ class Grok2Renderer(BaseRenderer): 
@classmethod def from_config( cls, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": return cls(config, tokenizer_kwargs) def __init__( self, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> None: super().__init__(config) - if config.skip_tokenizer_init: + model_config = self.model_config + if model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = cached_get_tokenizer( @@ -66,7 +67,7 @@ class Grok2Renderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( messages, - self.config, + self.model_config, content_format="string", ) @@ -92,7 +93,7 @@ class Grok2Renderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, - self.config, + self.model_config, content_format="string", ) diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index 83b17e961..407b28ae1 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -14,7 +14,7 @@ import jinja2.nodes import jinja2.parser import jinja2.sandbox -from vllm.config import ModelConfig +from vllm.config import ModelConfig, VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ChatTemplateContentFormat, @@ -589,23 +589,24 @@ class HfRenderer(BaseRenderer): @classmethod def from_config( cls, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": return cls(config, tokenizer_kwargs) def __init__( self, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> None: super().__init__(config) + model_config = self.model_config self.use_unified_vision_chunk = getattr( - config.hf_config, "use_unified_vision_chunk", False + model_config.hf_config, "use_unified_vision_chunk", False ) - if config.skip_tokenizer_init: + if model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = cast( @@ -634,7 
+635,7 @@ class HfRenderer(BaseRenderer): messages: list[ChatCompletionMessageParam], params: ChatParams, ) -> tuple[list[ConversationMessage], DictPrompt]: - model_config = self.config + model_config = self.model_config tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( @@ -688,7 +689,7 @@ class HfRenderer(BaseRenderer): messages: list[ChatCompletionMessageParam], params: ChatParams, ) -> tuple[list[ConversationMessage], DictPrompt]: - model_config = self.config + model_config = self.model_config tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( diff --git a/vllm/renderers/mistral.py b/vllm/renderers/mistral.py index 3d3141bdc..ae8078f41 100644 --- a/vllm/renderers/mistral.py +++ b/vllm/renderers/mistral.py @@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor from typing import Any -from vllm.config import ModelConfig +from vllm.config import VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ConversationMessage, @@ -54,19 +54,20 @@ class MistralRenderer(BaseRenderer): @classmethod def from_config( cls, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": return cls(config, tokenizer_kwargs) def __init__( self, - config: ModelConfig, + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> None: super().__init__(config) - if config.skip_tokenizer_init: + model_config = self.model_config + if model_config.skip_tokenizer_init: tokenizer = None else: tokenizer = cached_get_tokenizer( @@ -100,7 +101,7 @@ class MistralRenderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = parse_chat_messages( messages, - self.config, + self.model_config, content_format="string", ) @@ -126,7 +127,7 @@ class MistralRenderer(BaseRenderer): tokenizer = self.get_tokenizer() conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, - self.config, + 
self.model_config, content_format="string", ) diff --git a/vllm/renderers/registry.py b/vllm/renderers/registry.py index 3abc7c9fe..cd09c80f9 100644 --- a/vllm/renderers/registry.py +++ b/vllm/renderers/registry.py @@ -10,7 +10,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .base import BaseRenderer if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import VllmConfig logger = init_logger(__name__) @@ -55,7 +55,7 @@ class RendererRegistry: def load_renderer( self, renderer_mode: str, - config: "ModelConfig", + config: "VllmConfig", tokenizer_kwargs: dict[str, Any], ) -> BaseRenderer: renderer_cls = self.load_renderer_cls(renderer_mode) @@ -71,12 +71,16 @@ RENDERER_REGISTRY = RendererRegistry( """The global `RendererRegistry` instance.""" -def renderer_from_config(config: "ModelConfig", **kwargs): +def renderer_from_config(config: "VllmConfig", **kwargs): + model_config = config.model_config tokenizer_mode, tokenizer_name, args, kwargs = tokenizer_args_from_config( - config, **kwargs + model_config, **kwargs ) - if config.tokenizer_mode == "auto" and config.model_impl == "terratorch": + if ( + model_config.tokenizer_mode == "auto" + and model_config.model_impl == "terratorch" + ): renderer_mode = "terratorch" else: renderer_mode = tokenizer_mode diff --git a/vllm/renderers/terratorch.py b/vllm/renderers/terratorch.py index 2d00ebccb..0ee97f852 100644 --- a/vllm/renderers/terratorch.py +++ b/vllm/renderers/terratorch.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any -from vllm.config import ModelConfig +from vllm.config import VllmConfig from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ConversationMessage, @@ -24,15 +24,16 @@ class TerratorchRenderer(BaseRenderer): @classmethod def from_config( cls, - config: "ModelConfig", + config: VllmConfig, tokenizer_kwargs: dict[str, Any], ) -> "BaseRenderer": return cls(config) - def __init__(self, 
config: ModelConfig) -> None: + def __init__(self, config: VllmConfig) -> None: super().__init__(config) - if not config.skip_tokenizer_init: + model_config = self.model_config + if not model_config.skip_tokenizer_init: raise ValueError("Terratorch renderer requires `skip_tokenizer_init=True`") @property @@ -47,7 +48,7 @@ class TerratorchRenderer(BaseRenderer): messages: list[ChatCompletionMessageParam], params: ChatParams, ) -> tuple[list[ConversationMessage], DictPrompt]: - model_config = self.config + model_config = self.model_config conversation, mm_data, mm_uuids = parse_chat_messages( messages, @@ -68,7 +69,7 @@ class TerratorchRenderer(BaseRenderer): messages: list[ChatCompletionMessageParam], params: ChatParams, ) -> tuple[list[ConversationMessage], DictPrompt]: - model_config = self.config + model_config = self.model_config conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index bab898da6..87410c420 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -132,7 +132,7 @@ class AsyncLLM(EngineClient): "enabling logging without default stat loggers." 
) - self.renderer = renderer = renderer_from_config(self.model_config) + self.renderer = renderer = renderer_from_config(self.vllm_config) self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 4c105c87b..1bda736fe 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -59,7 +59,7 @@ class InputProcessor: self.generation_config_fields = model_config.try_get_generation_config() - self.renderer = renderer or renderer_from_config(model_config) + self.renderer = renderer or renderer_from_config(vllm_config) self.mm_registry = mm_registry self.mm_processor_cache = mm_registry.processor_cache_from_config(vllm_config) @@ -75,8 +75,7 @@ class InputProcessor: mm_budget.reset_cache() # Not used anymore self.input_preprocessor = InputPreprocessor( - model_config, - self.observability_config, + vllm_config, renderer=renderer, mm_registry=mm_registry, mm_processor_cache=self.mm_processor_cache, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 76aa8f438..c7eb93dc8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -90,7 +90,7 @@ class LLMEngine: self.dp_group = None self.should_execute_dummy_batch = False - self.renderer = renderer = renderer_from_config(self.model_config) + self.renderer = renderer = renderer_from_config(self.vllm_config) self.io_processor = get_io_processor( self.vllm_config, self.model_config.io_processor_plugin, -- GitLab From eea3024f43e06ea4e037ec86464dcc249d0c0b44 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 12 Feb 2026 22:48:42 -0800 Subject: [PATCH 0168/1166] [Bugfix] Fix mamba state dtype setting for Qwen3-Next and Qwen3.5 (#34489) Signed-off-by: Roger Wang --- .../layers/mamba/mamba_utils.py | 6 ++-- vllm/model_executor/models/config.py | 29 +++++++++++++++++++ vllm/model_executor/models/qwen3_5.py | 5 ++-- 
vllm/model_executor/models/qwen3_next.py | 8 +++-- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py index 7181ada1c..d66dee7c9 100644 --- a/vllm/model_executor/layers/mamba/mamba_utils.py +++ b/vllm/model_executor/layers/mamba/mamba_utils.py @@ -80,9 +80,11 @@ class MambaStateDtypeCalculator: cls, model_dtype: ModelDType | torch.dtype, mamba_cache_dtype: MambaDType, + mamba_ssm_cache_dtype: MambaDType = "auto", ) -> tuple[torch.dtype, torch.dtype]: - state_dtype = get_kv_cache_torch_dtype(mamba_cache_dtype, model_dtype) - return (state_dtype, state_dtype) + return cls._mamba_state_dtype( + model_dtype, mamba_cache_dtype, mamba_ssm_cache_dtype + ) @classmethod def kda_state_dtype( diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index a6c244b6e..749a97d0a 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -582,6 +582,33 @@ class NemotronHForCausalLMConfig(VerifyAndUpdateConfig): cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype +class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig): + @staticmethod + def verify_and_update_config(vllm_config: "VllmConfig") -> None: + """Update mamba_ssm_cache_dtype for Qwen3.5 models when set to 'auto' + (or not explicitly set), to the value specified in the HF config's + mamba_ssm_dtype field. Warn if the user explicitly overrides it to a + different value. 
+ """ + cache_config = vllm_config.cache_config + hf_text_config = vllm_config.model_config.hf_text_config + mamba_ssm_dtype = getattr(hf_text_config, "mamba_ssm_dtype", None) + if cache_config.mamba_ssm_cache_dtype == "auto": + if mamba_ssm_dtype is not None: + cache_config.mamba_ssm_cache_dtype = mamba_ssm_dtype + elif ( + mamba_ssm_dtype is not None + and cache_config.mamba_ssm_cache_dtype != mamba_ssm_dtype + ): + logger.warning( + "Qwen3.5 model specifies mamba_ssm_dtype='%s' in its config, " + "but --mamba-ssm-cache-dtype='%s' was passed. " + "Using the user-specified value.", + mamba_ssm_dtype, + cache_config.mamba_ssm_cache_dtype, + ) + + class VoyageQwen3BidirectionalEmbedModelConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -611,5 +638,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM, "NemotronHForCausalLM": NemotronHForCausalLMConfig, "NemotronHPuzzleForCausalLM": NemotronHForCausalLMConfig, + "Qwen3_5ForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig, + "Qwen3_5MoeForConditionalGeneration": Qwen3_5ForConditionalGenerationConfig, "VoyageQwen3BidirectionalEmbedModel": VoyageQwen3BidirectionalEmbedModelConfig, } diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index c317c1e1a..55eb3408d 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -870,9 +870,10 @@ class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid) cls, vllm_config: "VllmConfig", ) -> tuple[torch.dtype, torch.dtype]: - mamba_ssm_dtype = vllm_config.model_config.hf_text_config.mamba_ssm_dtype return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - vllm_config.model_config.dtype, mamba_ssm_dtype + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, ) 
@classmethod diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index d0c13dd49..6da5bca1b 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -341,7 +341,9 @@ class Qwen3NextGatedDeltaNet(nn.Module, MambaBase): def get_state_dtype(self) -> tuple[torch.dtype, torch.dtype]: return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - self.model_config.dtype, self.cache_config.mamba_cache_dtype + self.model_config.dtype, + self.cache_config.mamba_cache_dtype, + self.cache_config.mamba_ssm_cache_dtype, ) def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: @@ -1372,7 +1374,9 @@ class Qwen3NextForCausalLM( vllm_config: "VllmConfig", ) -> tuple[torch.dtype, torch.dtype]: return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - vllm_config.model_config.dtype, vllm_config.cache_config.mamba_cache_dtype + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, ) @classmethod -- GitLab From ec090c2429d179309641cba9e7793eab34e19f8d Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 13 Feb 2026 14:48:45 +0800 Subject: [PATCH 0169/1166] [Refactor] Call renderer for online IO processor request (#34490) Signed-off-by: DarkLight1337 Signed-off-by: Cyrus Leung Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/entrypoints/llm.py | 16 ++++++++-------- vllm/entrypoints/openai/engine/serving.py | 15 +++++++++++---- vllm/entrypoints/pooling/pooling/protocol.py | 12 ++++++++++++ vllm/entrypoints/pooling/pooling/serving.py | 14 ++++++++------ 4 files changed, 39 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f54d9121c..9474c543e 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -500,7 +500,7 @@ class LLM: engine_prompts: Sequence[DictPrompt | TokPrompt] = [ 
engine_prompt for prompt, param in zip(seq_prompts, seq_params) - for engine_prompt in self._preprocess_completion( + for engine_prompt in self._preprocess_cmpl( [prompt], tokenization_kwargs=merge_kwargs( tokenization_kwargs, @@ -509,7 +509,7 @@ class LLM: ) ] else: - engine_prompts = self._preprocess_completion( + engine_prompts = self._preprocess_cmpl( seq_prompts, tokenization_kwargs=tokenization_kwargs, ) @@ -889,7 +889,7 @@ class LLM: add_special_tokens=not model_config.is_encoder_decoder, ).with_kwargs(tokenization_kwargs) - def _preprocess_completion( + def _preprocess_cmpl( self, prompts: Sequence[PromptType], tokenization_kwargs: dict[str, Any] | None = None, @@ -901,7 +901,7 @@ class LLM: Refer to [LLM.generate][] for a complete description of the arguments. Returns: - A list of `TokensPrompts` objects containing the tokenized prompt + A list of `TokPrompt` objects containing the tokenized prompt after chat template interpolation, and the raw multi-modal inputs. """ renderer = self.renderer @@ -943,7 +943,7 @@ class LLM: Refer to [LLM.chat][] for a complete description of the arguments. Returns: - A list of `TokensPrompts` objects containing the tokenized prompt + A list of `TokPrompt` objects containing the tokenized prompt after chat template interpolation, and the raw multi-modal inputs. 
""" renderer = self.renderer @@ -1823,11 +1823,11 @@ class LLM: if any(param.truncate_prompt_tokens is not None for param in seq_params): # TODO: Remove this after deprecating `param.truncate_prompt_tokens` # Then, move the code from the `else` block to the top and let - # `self._preprocess_completion` handle prompt normalization + # `self._preprocess_cmpl` handle prompt normalization engine_prompts: Sequence[DictPrompt | TokPrompt] = [ engine_prompt for prompt, param in zip(seq_prompts, seq_params) - for engine_prompt in self._preprocess_completion( + for engine_prompt in self._preprocess_cmpl( [prompt], tokenization_kwargs=merge_kwargs( tokenization_kwargs, @@ -1836,7 +1836,7 @@ class LLM: ) ] else: - engine_prompts = self._preprocess_completion( + engine_prompts = self._preprocess_cmpl( seq_prompts, tokenization_kwargs=tokenization_kwargs, ) diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index d39decaa7..1484fca5b 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -5,7 +5,7 @@ import json import sys import time import traceback -from collections.abc import AsyncGenerator, Callable, Mapping +from collections.abc import AsyncGenerator, Callable, Mapping, Sequence from dataclasses import dataclass, field from http import HTTPStatus from typing import Any, ClassVar, Generic, Protocol, TypeAlias, TypeVar @@ -959,15 +959,22 @@ class OpenAIServing: prompt_input: str | list[str] | list[int] | list[list[int]] | None, prompt_embeds: bytes | list[bytes] | None, ) -> list[TokPrompt]: - renderer = self.renderer - model_config = self.model_config - prompts = list[SingletonPrompt | bytes]() if prompt_embeds is not None: # embeds take higher priority prompts.extend(prompt_to_seq(prompt_embeds)) if prompt_input is not None: prompts.extend(prompt_to_seq(prompt_input)) + return await self._preprocess_cmpl(request, prompts) + + async def _preprocess_cmpl( + self, + request: 
RendererRequest, + prompts: Sequence[PromptType | bytes], + ) -> list[TokPrompt]: + renderer = self.renderer + model_config = self.model_config + parsed_prompts = [ ( prompt diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 6a5a743cd..a8c1c59ff 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -100,6 +100,18 @@ class IOProcessorRequest(PoolingBasicRequestMixin, EncodingRequestMixin, Generic data: T task: PoolingTask = "plugin" + def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: + encoder_config = model_config.encoder_config or {} + + return TokenizeParams( + max_total_tokens=model_config.max_model_len, + max_output_tokens=0, + truncate_prompt_tokens=self.truncate_prompt_tokens, + do_lower_case=encoder_config.get("do_lower_case", False), + add_special_tokens=not model_config.is_encoder_decoder, + max_total_tokens_param="max_model_len", + ) + class IOProcessorResponse(OpenAIBaseModel, Generic[T]): request_id: str | None = None diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 5c5d649f6..16a9722c0 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -6,7 +6,7 @@ import json import time from collections.abc import AsyncGenerator, Callable, Sequence from functools import partial -from typing import Any, Final, Literal, cast +from typing import Final, Literal, cast import jinja2 from fastapi import Request @@ -108,7 +108,10 @@ class OpenAIServingPooling(OpenAIServing): raw_prompts = await self.io_processor.pre_process_async( prompt=validated_prompt, request_id=request_id ) - engine_prompts = prompt_to_seq(raw_prompts) + engine_prompts = await self._preprocess_cmpl( + request, + prompt_to_seq(raw_prompts), + ) elif isinstance(request, PoolingChatRequest): error_check_ret = self._validate_chat_template( 
request_chat_template=request.chat_template, @@ -146,12 +149,11 @@ class OpenAIServingPooling(OpenAIServing): pooling_params = self.io_processor.merge_pooling_params() if pooling_params.task is None: pooling_params.task = "plugin" - - tokenization_kwargs: dict[str, Any] = {} else: pooling_params = request.to_pooling_params() # type: ignore - tok_params = request.build_tok_params(self.model_config) # type: ignore - tokenization_kwargs = tok_params.get_encode_kwargs() + + tok_params = request.build_tok_params(self.model_config) + tokenization_kwargs = tok_params.get_encode_kwargs() for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" -- GitLab From bcf0731aa07c11d92b6261c58f42d9ad07b949c6 Mon Sep 17 00:00:00 2001 From: myselvess <23743269+myselvess@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:12:45 +0800 Subject: [PATCH 0170/1166] [New Model] support new model ovis2.6 (#34426) Signed-off-by: myselvess <23743269+myselvess@users.noreply.github.com> --- docs/models/supported_models.md | 2 + tests/models/registry.py | 6 +++ vllm/model_executor/models/ovis2_5.py | 42 ++++++++----------- vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/siglip2navit.py | 1 - vllm/transformers_utils/processors/ovis2_5.py | 42 +++++++++++-------- 6 files changed, 52 insertions(+), 43 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7f20d2052..d30518da2 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -728,6 +728,8 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + IE+ + VE+ |`FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ | | `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | | `Ovis2_5` | Ovis2.5 | T + I+ + V | `AIDC-AI/Ovis2.5-9B`, etc. 
| | | +| `Ovis2_6ForCausalLM` | Ovis2.6 | T + I+ + V | `AIDC-AI/Ovis2.6-2B`, etc. | | | +| `Ovis2_6_MoeForCausalLM` | Ovis2.6 | T + I+ + V | `AIDC-AI/Ovis2.6-30B-A3B`, etc. | | | | `PaddleOCRVLForConditionalGeneration` | Paddle-OCR | T + I+ | `PaddlePaddle/PaddleOCR-VL`, etc. | | | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ✅︎ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index dcd1fa8ed..78d478020 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -915,6 +915,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { }, ), "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True), + "Ovis2_6ForCausalLM": _HfExamplesInfo( + "AIDC-AI/Ovis2.6-2B", is_available_online=False, trust_remote_code=True + ), + "Ovis2_6_MoeForCausalLM": _HfExamplesInfo( + "AIDC-AI/Ovis2.6-30B-A3B", trust_remote_code=True + ), "PaddleOCRVLForConditionalGeneration": _HfExamplesInfo( "PaddlePaddle/PaddleOCR-VL", trust_remote_code=True, diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 8d038d4ad..00418d707 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -42,21 +42,12 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP IMAGE_TOKEN = "" +IMAGE_PLACEHOLDER_ID = 151669 VIDEO_TOKEN = "