From b48d5cca16a5583d58e998ecd52ad949c450a3b9 Mon Sep 17 00:00:00 2001 From: Carol Zheng Date: Tue, 27 May 2025 14:54:59 -0700 Subject: [PATCH 001/274] [CI/Build] [TPU] Fix TPU CI exit code (#18282) Signed-off-by: Carol Zheng --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 217 +++++++++++------- 1 file changed, 133 insertions(+), 84 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 2d375d7e9..eb82da3a8 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -13,91 +13,140 @@ remove_docker_container # For HF_TOKEN. source /etc/environment -# Run a simple end-to-end example. + docker run --privileged --net host --shm-size=16G -it \ -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ - && python3 -m pip install pytest pytest-asyncio tpu-info \ - && python3 -m pip install lm_eval[api]==0.4.4 \ - && export VLLM_XLA_CACHE_PATH= \ - && export VLLM_USE_V1=1 \ - && export VLLM_XLA_CHECK_RECOMPILATION=1 \ - && echo HARDWARE \ - && tpu-info \ - && { \ - echo TEST_0: Running test_perf.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \ - echo TEST_0_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_1: Running test_compilation.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ - echo TEST_1_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_2: Running test_basic.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ - echo TEST_2_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ - echo TEST_3_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_4: Running test_quantization_accuracy.py; \ - python3 -m pytest -s -v 
/workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ - echo TEST_4_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_5: Running examples/offline_inference/tpu.py; \ - python3 /workspace/vllm/examples/offline_inference/tpu.py; \ - echo TEST_5_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_6: Running test_tpu_model_runner.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ - echo TEST_6_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_7: Running test_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \ - echo TEST_7_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_8: Running test_topk_topp_sampler.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ - echo TEST_8_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_9: Running test_multimodal.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ - echo TEST_9_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_10: Running test_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ - echo TEST_10_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_11: Running test_struct_output_generate.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ - echo TEST_11_EXIT_CODE: \$?; \ - } & \ - { \ - echo TEST_12: Running test_moe_pallas.py; \ - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ - echo TEST_12_EXIT_CODE: \$?; \ - } & \ - # Disable the TPU LoRA tests until the feature is activated - # & { \ - # echo TEST_13: Running test_moe_pallas.py; \ - # python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ - # echo TEST_13_EXIT_CODE: \$?; \ - # } & \ - wait \ - && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \ -" + vllm-tpu /bin/bash -c ' +set -e # Exit immediately if a command exits with a non-zero status. +set -u # Treat unset variables as an error. 
+ +echo "--- Starting script inside Docker container ---" + +# Create results directory +RESULTS_DIR=$(mktemp -d) +# If mktemp fails, set -e will cause the script to exit. +echo "Results will be stored in: $RESULTS_DIR" + +# Install dependencies +echo "--- Installing Python dependencies ---" +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 +echo "--- Python dependencies installed ---" +export VLLM_USE_V1=1 +export VLLM_XLA_CHECK_RECOMPILATION=1 +export VLLM_XLA_CACHE_PATH= +echo "Using VLLM V1" + +echo "--- Hardware Information ---" +tpu-info +echo "--- Starting Tests ---" +set +e +overall_script_exit_code=0 + +# --- Test Definitions --- +# If a test fails, this function will print logs and will not cause the main script to exit. +run_test() { + local test_num=$1 + local test_name=$2 + local test_command=$3 + local log_file="$RESULTS_DIR/test_${test_num}.log" + local actual_exit_code + + echo "--- TEST_$test_num: Running $test_name ---" + + # Execute the test command. + eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) + actual_exit_code=$? + + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log + echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log + + if [ "$actual_exit_code" -ne 0 ]; then + echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 + echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 + if [ -f "$log_file" ]; then + cat "$log_file" >&2 + else + echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 + fi + echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 + return "$actual_exit_code" # Return the failure code + else + echo "TEST_$test_num ($test_name) PASSED." 
+ return 0 # Return success + fi +} + +# Helper function to call run_test and update the overall script exit code +run_and_track_test() { + local test_num_arg="$1" + local test_name_arg="$2" + local test_command_arg="$3" + + # Run the test + run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" + local test_specific_exit_code=$? + + # If the test failed, set the overall script exit code to 1 + if [ "$test_specific_exit_code" -ne 0 ]; then + # No need for extra echo here, run_test already logged the failure. + overall_script_exit_code=1 + fi +} + +# --- Actual Test Execution --- +run_and_track_test 0 "test_perf.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" +run_and_track_test 1 "test_compilation.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" +run_and_track_test 2 "test_basic.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" +run_and_track_test 4 "test_quantization_accuracy.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" +run_and_track_test 5 "examples/offline_inference/tpu.py" \ + "python3 /workspace/vllm/examples/offline_inference/tpu.py" +run_and_track_test 6 "test_tpu_model_runner.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" +run_and_track_test 7 "test_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" +run_and_track_test 8 "test_topk_topp_sampler.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" +run_and_track_test 9 "test_multimodal.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" +run_and_track_test 10 "test_pallas.py" \ + "python3 -m pytest -s -v 
/workspace/vllm/tests/v1/tpu/test_pallas.py" +run_and_track_test 11 "test_struct_output_generate.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" +run_and_track_test 12 "test_moe_pallas.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" + +# Disable the TPU LoRA tests until the feature is activated +# run_and_track_test 13 "test_lora (directory)" \ +# "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/" + +# After all tests have been attempted, exit with the overall status. +if [ "$overall_script_exit_code" -ne 0 ]; then + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" +else + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" +fi +exit "$overall_script_exit_code" +' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. + +# Capture the exit code of the docker run command +DOCKER_RUN_EXIT_CODE=$? +# The trap will run for cleanup. +# Exit the main script with the Docker run command's exit code. +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." + exit "$DOCKER_RUN_EXIT_CODE" +else + echo "Docker run command completed successfully." 
+ exit 0 +fi # TODO: This test fails because it uses RANDOM_SEED sampling -# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ -- GitLab From e0cbad4e30d7df4e9ee4634939ce19042639b735 Mon Sep 17 00:00:00 2001 From: Satyajith Chilappagari Date: Tue, 27 May 2025 15:10:33 -0700 Subject: [PATCH 002/274] [Neuron] Support quantization on neuron (#18283) Signed-off-by: Satyajith Chilappagari --- tests/neuron/1_core/test_neuron_quant.py | 11 +++++++++++ .../layers/quantization/neuron_quant.py | 9 ++++++++- vllm/platforms/neuron.py | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 tests/neuron/1_core/test_neuron_quant.py diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py new file mode 100644 index 000000000..68f0cb805 --- /dev/null +++ b/tests/neuron/1_core/test_neuron_quant.py @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: Apache-2.0 +from vllm.model_executor.layers.quantization.neuron_quant import ( + NeuronQuantConfig) + + +def test_get_supported_act_dtypes(): + neuron_quant_config = NeuronQuantConfig() + supported_act_dtypes = neuron_quant_config.get_supported_act_dtypes() + target_list = ["any_dtype1", "any_dtype2"] + for dtype in target_list: + assert dtype in supported_act_dtypes diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 38b374fee..b2d6bf5db 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -13,6 +13,12 @@ from vllm.model_executor.layers.quantization.base_config import ( SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] +class AlwaysSupportedDtypes(list): + + def __contains__(self, item): + return True + + class NeuronQuantConfig(QuantizationConfig): """Int8 Quantization Config class for Neuron Backend.""" @@ -35,7 +41,8 @@ 
class NeuronQuantConfig(QuantizationConfig): return "neuron_quant" def get_supported_act_dtypes(self) -> list[str]: - return SUPPORTED_QUANT_DTYPE_LIST + # Neuron implements custom handling logic for quantization support + return AlwaysSupportedDtypes() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 9cd49fd34..474c70d04 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -28,7 +28,7 @@ class NeuronPlatform(Platform): device_name: str = "neuron" device_type: str = "neuron" ray_device_key: str = "neuron_cores" - supported_quantization: list[str] = ["neuron_quant"] + supported_quantization: list[str] = ["neuron_quant", "fbgemm_fp8"] device_control_env_var: str = "NEURON_RT_VISIBLE_CORES" @classmethod -- GitLab From e56f44d9ec6f8f07f0d2c7936eea9bb2c0212bf2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 27 May 2025 19:59:48 -0400 Subject: [PATCH 003/274] Support datasets in `vllm bench serve` and sync with benchmark_[serving,datasets].py (#18566) --- vllm/benchmarks/datasets.py | 185 ++++++++++- vllm/benchmarks/endpoint_request_func.py | 226 +++++++++++++- vllm/benchmarks/serve.py | 382 +++++++++++++++++------ 3 files changed, 692 insertions(+), 101 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 74a9b2b03..712e83528 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -62,6 +62,7 @@ class SampleRequest: class BenchmarkDataset(ABC): DEFAULT_SEED = 0 + IS_MULTIMODAL = False def __init__( self, @@ -316,13 +317,15 @@ class RandomDataset(BenchmarkDataset): ) vocab_size = tokenizer.vocab_size + num_special_tokens = tokenizer.num_special_tokens_to_add() + real_input_len = input_len - num_special_tokens prefix_token_ids = (np.random.randint( 0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else []) # New sampling logic: [X * (1 - b), X * (1 + b)] - input_low = int(input_len * (1 - range_ratio)) - 
input_high = int(input_len * (1 + range_ratio)) + input_low = int(real_input_len * (1 - range_ratio)) + input_high = int(real_input_len * (1 + range_ratio)) output_low = int(output_len * (1 - range_ratio)) output_high = int(output_len * (1 + range_ratio)) @@ -345,6 +348,17 @@ class RandomDataset(BenchmarkDataset): vocab_size).tolist() token_sequence = prefix_token_ids + inner_seq prompt = tokenizer.decode(token_sequence) + # After decoding the prompt we have to encode and decode it again. + # This is done because in some cases N consecutive tokens + # give a string tokenized into != N number of tokens. + # For example for GPT2Tokenizer: + # [6880, 6881] -> ['Ġcalls', 'here'] -> + # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + # To avoid uncontrolled change of the prompt length, + # the encoded sequence is truncated before being decoded again. + re_encoded_sequence = tokenizer.encode( + prompt, add_special_tokens=False)[:input_lens[i]] + prompt = tokenizer.decode(re_encoded_sequence) total_input_len = prefix_len + int(input_lens[i]) requests.append( SampleRequest( @@ -637,6 +651,7 @@ class ConversationDataset(HuggingFaceDataset): SUPPORTED_DATASET_PATHS = { 'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered' } + IS_MULTIMODAL = True def sample(self, tokenizer: PreTrainedTokenizerBase, @@ -701,6 +716,7 @@ class VisionArenaDataset(HuggingFaceDataset): "lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"] } + IS_MULTIMODAL = True def sample( self, @@ -784,6 +800,64 @@ class InstructCoderDataset(HuggingFaceDataset): return sampled_requests +# ----------------------------------------------------------------------------- +# MT-Bench Dataset Implementation +# ----------------------------------------------------------------------------- + + +class MTBenchDataset(HuggingFaceDataset): + """ + MT-Bench Dataset. + https://huggingface.co/datasets/philschmid/mt-bench + + We create a single turn dataset for MT-Bench. 
+ This is similar to Spec decoding benchmark setup in vLLM + https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18 + """ # noqa: E501 + + DEFAULT_OUTPUT_LEN = 256 # avg len used in SD bench in vLLM + SUPPORTED_DATASET_PATHS = { + "philschmid/mt-bench", + } + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + **kwargs, + ) -> list: + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + sampled_requests = [] + + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["turns"][0] + + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests + + # ----------------------------------------------------------------------------- # AIMO Dataset Implementation # ----------------------------------------------------------------------------- @@ -858,18 +932,18 @@ def _format_zeta_prompt( sample: dict, original_start_marker: str = "<|editable_region_start|>") -> dict: """Format the zeta prompt for the Next Edit Prediction (NEP) dataset. - - This function formats examples from the NEP dataset - into prompts and expected outputs. It could be + + This function formats examples from the NEP dataset + into prompts and expected outputs. It could be further extended to support more NEP datasets. - + Args: - sample: The dataset sample containing events, + sample: The dataset sample containing events, inputs, and outputs. - original_start_marker: The marker indicating the - start of the editable region. 
Defaults to + original_start_marker: The marker indicating the + start of the editable region. Defaults to "<|editable_region_start|>". - + Returns: A dictionary with the formatted prompts and expected outputs. """ @@ -919,3 +993,94 @@ class NextEditPredictionDataset(HuggingFaceDataset): break self.maybe_oversample_requests(samples, num_requests) return samples + + +# ----------------------------------------------------------------------------- +# ASR Dataset Implementation +# ----------------------------------------------------------------------------- + + +class ASRDataset(HuggingFaceDataset): + """ + Dataset class for processing an ASR dataset for transcription. + Tested on the following set: + + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | Dataset | Domain | Speaking Style | hf-subset | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + | TED-LIUM | TED talks | Oratory | release1, release2, release3| + | | | | release3-speaker-adaptation | + | VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... | + | LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" | + | GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test | + | SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test | + | AMI | Meetings | Spontaneous | ihm, sdm | + +----------------+----------------------------------------+--------------------------+-----------------------------+ + + """ # noqa: E501 + + SUPPORTED_DATASET_PATHS = { + "openslr/librispeech_asr", + "facebook/voxpopuli", + "LIUM/tedlium", + "edinburghcstr/ami", + "speechcolab/gigaspeech", + "kensho/spgispeech", + } + + DEFAULT_OUTPUT_LEN = 128 + IS_MULTIMODAL = True + + # TODO Whisper-specific. Abstract interface when more models are supported. 
+ TRANSCRIPTION_PREAMBLE = ( + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>") + skip_long_audios: bool = True + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + output_len: Optional[int] = None, + **kwargs, + ) -> list: + try: + import librosa + except ImportError as e: + raise ImportError( + "librosa is required for ASRDataset. Please install it " + "using `pip install librosa`.") from e + + output_len = (output_len + if output_len is not None else self.DEFAULT_OUTPUT_LEN) + prompt = ASRDataset.TRANSCRIPTION_PREAMBLE + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests = [] + skipped = 0 + for item in self.data: + if len(sampled_requests) >= num_requests: + break + audio = item["audio"] + y, sr = audio["array"], audio["sampling_rate"] + duration_s = librosa.get_duration(y=y, sr=sr) + # Whisper max supported duration + if self.skip_long_audios and duration_s > 30: + skipped += 1 + continue + + mm_content = {"audio": (y, sr)} + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=mm_content, + )) + if skipped: + logger.warning( + "%d samples discarded from dataset due to" + " their length being greater than" + " what Whisper supports.", + skipped, + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + return sampled_requests diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py index 32767a896..a28630d50 100644 --- a/vllm/benchmarks/endpoint_request_func.py +++ b/vllm/benchmarks/endpoint_request_func.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """The request function for API endpoints.""" +import io import json import os import sys @@ -24,11 +25,11 @@ class RequestFuncInput: output_len: int model: str model_name: Optional[str] = None - best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None multi_modal_content: Optional[dict] 
= None ignore_eos: bool = False + language: Optional[str] = None @dataclass @@ -71,7 +72,7 @@ async def async_request_openai_completions( if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, - "best_of": request_func_input.best_of, + "repetition_penalty": 1.0, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, @@ -154,7 +155,226 @@ async def async_request_openai_completions( return output +async def async_request_openai_chat_completions( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + assert api_url.endswith(("chat/completions", "profile")), ( + "OpenAI Chat Completions API URL must end with 'chat/completions'.") + + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + if request_func_input.multi_modal_content: + content.append(request_func_input.multi_modal_content) + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "messages": [ + { + "role": "user", + "content": content + }, + ], + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "stream_options": { + "include_usage": True, + }, + } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, json=payload, + headers=headers) as 
response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - + most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + +async def async_request_openai_audio( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + # Lazy import without PlaceholderModule to avoid vllm dep. + import soundfile + + api_url = request_func_input.api_url + assert api_url.endswith(("transcriptions", "translations")), ( + "OpenAI Chat Completions API URL must end with 'transcriptions' ") + "or `translations`." 
+ + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: + content = [{"type": "text", "text": request_func_input.prompt}] + payload = { + "model": + request_func_input.model_name + if request_func_input.model_name else request_func_input.model, + "temperature": + 0.0, + "max_completion_tokens": + request_func_input.output_len, + "stream": + True, + "language": + "en", + # Flattened due to multipart/form-data + "stream_include_usage": + True, + "stream_continuous_usage_stats": + True, + } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + } + + # Send audio file + def to_bytes(y, sr): + buffer = io.BytesIO() + soundfile.write(buffer, y, sr, format="WAV") + buffer.seek(0) + return buffer + + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: + form = aiohttp.FormData() + form.add_field("file", f, content_type="audio/wav") + for key, value in payload.items(): + form.add_field(key, str(value)) + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post(url=api_url, + data=form, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") + if chunk != "[DONE]": + timestamp = time.perf_counter() + data = json.loads(chunk) + + if choices := data.get("choices"): + content = choices[0]["delta"].get( + "content") + # First token + if ttft == 0.0: + ttft = timestamp - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append( + timestamp - most_recent_timestamp) + + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + 
"completion_tokens") + + most_recent_timestamp = timestamp + + output.generated_text = generated_text + output.success = True + output.latency = most_recent_timestamp - st + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + # TODO: Add more request functions for different API protocols. ASYNC_REQUEST_FUNCS = { - "openai-comp": async_request_openai_completions, + "vllm": async_request_openai_completions, + "openai": async_request_openai_completions, + "openai-chat": async_request_openai_chat_completions, + "openai-audio": async_request_openai_audio, } + +OPENAI_COMPATIBLE_BACKENDS = [ + k for k, v in ASYNC_REQUEST_FUNCS.items() + if v in (async_request_openai_completions, + async_request_openai_chat_completions) +] diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index dc0ec3219..040815e87 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -7,7 +7,7 @@ to launch the vLLM OpenAI API server: On the client side, run: vllm bench serve \ - --endpoint-type \ + --endpoint-type \ --label \ --model \ --dataset-name \ @@ -22,7 +22,7 @@ import os import random import time import warnings -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional @@ -31,7 +31,14 @@ import numpy as np from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase +from vllm.benchmarks.datasets import (AIMODataset, ASRDataset, BurstGPTDataset, + ConversationDataset, HuggingFaceDataset, + InstructCoderDataset, MTBenchDataset, + NextEditPredictionDataset, RandomDataset, + SampleRequest, ShareGPTDataset, + SonnetDataset, VisionArenaDataset) from vllm.benchmarks.endpoint_request_func import (ASYNC_REQUEST_FUNCS, 
+ OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, RequestFuncOutput) from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, @@ -71,53 +78,18 @@ class BenchmarkMetrics: percentiles_e2el_ms: list[tuple[float, float]] -def sample_random_requests( - prefix_len: int, - input_len: int, - output_len: int, - num_prompts: int, - range_ratio: float, - tokenizer: PreTrainedTokenizerBase, -) -> list[tuple[str, int, int]]: - prefix_token_ids = np.random.randint(0, - tokenizer.vocab_size, - size=prefix_len).tolist() - - input_lens = np.random.randint( - int(input_len * range_ratio), - input_len + 1, - size=num_prompts, - ) - output_lens = np.random.randint( - int(output_len * range_ratio), - output_len + 1, - size=num_prompts, - ) - offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts) - input_requests = [] - for i in range(num_prompts): - prompt = tokenizer.decode(prefix_token_ids + - [(offsets[i] + i + j) % tokenizer.vocab_size - for j in range(input_lens[i])]) - - input_requests.append((prompt, int(prefix_len + input_lens[i]), - int(output_lens[i]), None)) - - return input_requests - - async def get_request( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], request_rate: float, burstiness: float = 1.0, -) -> AsyncGenerator[tuple[str, int, int], None]: +) -> AsyncGenerator[SampleRequest, None]: """ Asynchronously generates requests at a specified rate with OPTIONAL burstiness. Args: input_requests: - A list of input requests, each represented as a tuple. + A list of input requests, each represented as a SampleRequest. request_rate: The rate at which requests are generated (requests/s). burstiness (optional): @@ -129,7 +101,7 @@ async def get_request( in more bursty requests, while a higher burstiness value (burstiness > 1) results in a more uniform arrival of requests. 
""" - input_requests = iter(input_requests) + input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. assert burstiness > 0, ( @@ -151,7 +123,7 @@ async def get_request( def calculate_metrics( - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, @@ -184,7 +156,7 @@ def calculate_metrics( if outputs[i].success: output_len = outputs[i].output_tokens - if output_len is None: + if not output_len: # We use the tokenizer to count the number of output tokens # for some serving backends instead of looking at # len(outputs[i].itl) since multiple output tokens may be @@ -194,7 +166,7 @@ def calculate_metrics( tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i][1] + total_input += input_requests[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].latency - outputs[i].ttft @@ -277,19 +249,19 @@ async def benchmark( model_id: str, model_name: str, tokenizer: PreTrainedTokenizerBase, - input_requests: list[tuple[str, int, int]], + input_requests: list[SampleRequest], logprobs: Optional[int], - best_of: int, request_rate: float, burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: list[str], - selected_percentiles: list[str], + selected_percentiles: list[float], ignore_eos: bool, goodput_config_dict: dict[str, float], max_concurrency: Optional[int], - lora_modules: Optional[list[str]], + lora_modules: Optional[Iterable[str]], + extra_body: Optional[dict], ): if endpoint_type in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[endpoint_type] @@ -298,11 +270,13 @@ async def benchmark( print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len, test_mm_content = ( - input_requests[0]) - if 
endpoint_type != "openai-chat" and test_mm_content is not None: - # multi-modal benchmark is only available on OpenAI Chat endpoint. - raise ValueError("Multi-modal content is only supported on " - "'openai-chat' endpoint_type.") + input_requests[0].prompt, + input_requests[0].prompt_len, + input_requests[0].expected_output_len, + input_requests[0].multi_modal_data, + ) + + assert test_mm_content is None or isinstance(test_mm_content, dict) test_input = RequestFuncInput( model=model_id, model_name=model_name, @@ -311,9 +285,9 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, ignore_eos=ignore_eos, + extra_body=extra_body, ) test_output = await request_func(request_func_input=test_input) @@ -338,9 +312,9 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=test_mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -374,7 +348,12 @@ async def benchmark( benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, prompt_len, output_len, mm_content = request + prompt, prompt_len, output_len, mm_content = ( + request.prompt, + request.prompt_len, + request.expected_output_len, + request.multi_modal_data, + ) req_model_id, req_model_name = model_id, model_name if lora_modules: req_lora_module = next(lora_modules) @@ -387,9 +366,9 @@ async def benchmark( prompt_len=prompt_len, output_len=output_len, logprobs=logprobs, - best_of=best_of, multi_modal_content=mm_content, - ignore_eos=ignore_eos) + ignore_eos=ignore_eos, + extra_body=extra_body) tasks.append( asyncio.create_task( 
limited_request_func(request_func_input=request_func_input, @@ -405,7 +384,6 @@ async def benchmark( prompt_len=test_prompt_len, output_len=test_output_len, logprobs=logprobs, - best_of=best_of, ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: @@ -567,7 +545,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--endpoint-type", type=str, - default="openai-comp", + default="openai", choices=list(ASYNC_REQUEST_FUNCS.keys()), ) parser.add_argument( @@ -596,9 +574,16 @@ def add_cli_args(parser: argparse.ArgumentParser): "--dataset-name", type=str, default="random", - choices=["random"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], help="Name of the dataset to benchmark on.", ) + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " + "Or the huggingface dataset ID if using HF dataset.", + ) parser.add_argument( "--max-concurrency", type=int, @@ -624,13 +609,6 @@ def add_cli_args(parser: argparse.ArgumentParser): help= "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 ) - parser.add_argument( - "--best-of", - type=int, - default=1, - help="Generates `best_of` sequences per prompt and " - "returns the best one.", - ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( "--num-prompts", @@ -691,6 +669,17 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="Specify to save benchmark results to a json file", ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) parser.add_argument( "--metadata", metavar="KEY=VALUE", @@ -733,6 +722,7 @@ def 
add_cli_args(parser: argparse.ArgumentParser): default="99", help="Comma-separated list of percentiles for selected metrics. " "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\"." "Use \"--percentile-metrics\" to select metrics.", ) parser.add_argument( @@ -745,7 +735,41 @@ def add_cli_args(parser: argparse.ArgumentParser): "separated by spaces. Allowed request level metric names are " "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + + # group for dataset specific arguments + sonnet_group = parser.add_argument_group("sonnet dataset options") + sonnet_group.add_argument( + "--sonnet-input-len", + type=int, + default=550, + help= + "Number of input tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-output-len", + type=int, + default=150, + help= + "Number of output tokens per request, used only for sonnet dataset.", + ) + sonnet_group.add_argument( + "--sonnet-prefix-len", + type=int, + default=200, + help= + "Number of prefix tokens per request, used only for sonnet dataset.", + ) + + sharegpt_group = parser.add_argument_group("sharegpt dataset options") + sharegpt_group.add_argument( + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the output length " + "from the ShareGPT dataset.", + ) random_group = parser.add_argument_group("random dataset options") random_group.add_argument( @@ -765,9 +789,11 @@ def add_cli_args(parser: argparse.ArgumentParser): random_group.add_argument( "--random-range-ratio", type=float, - default=1.0, - help="Range of sampled ratio of input/output length, " - "used only for random sampling.", + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for random sampling. Must be in the range [0, 1) to define " + "a symmetric sampling range" + "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) random_group.add_argument( "--random-prefix-len", @@ -778,6 +804,54 @@ def add_cli_args(parser: argparse.ArgumentParser): " request is [random-prefix-len, " " random-prefix-len + random-prefix-len * random-range-ratio).") + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument("--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", + type=str, + default=None, + help="Split of the HF dataset.") + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output lengths " + "from the sampled HF dataset.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. Only has effect on " + "openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. 
Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + parser.add_argument( '--tokenizer-mode', type=str, @@ -826,27 +900,142 @@ def main(args: argparse.Namespace): tokenizer = get_tokenizer(tokenizer_id, tokenizer_mode=tokenizer_mode, trust_remote_code=args.trust_remote_code) - # TODO: This should be refactored to use the benchmark_dataset.py - # in later PRs. + if args.dataset_name is None: raise ValueError( "Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") - elif args.dataset_name == "random": - input_requests = sample_random_requests( - prefix_len=args.random_prefix_len, - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, + + if args.dataset_name == "sonnet": + dataset = SonnetDataset(dataset_path=args.dataset_path) + # For the "sonnet" dataset, formatting depends on the backend. 
+ if args.backend == "openai-chat": + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=False, + ) + else: + assert tokenizer.chat_template or tokenizer.default_chat_template, ( + "Tokenizer/model must have chat template for sonnet dataset.") + input_requests = dataset.sample( + num_requests=args.num_prompts, + input_len=args.sonnet_input_len, + output_len=args.sonnet_output_len, + prefix_len=args.sonnet_prefix_len, + tokenizer=tokenizer, + return_prompt_formatted=True, + ) + + elif args.dataset_name == "hf": + # all following datasets are implemented from the + # HuggingFaceDataset base class + if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: + dataset_class = VisionArenaDataset + args.hf_split = "train" + args.hf_subset = None + elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS: + dataset_class = InstructCoderDataset + args.hf_split = "train" + elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS: + dataset_class = MTBenchDataset + args.hf_split = "train" + elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ConversationDataset + args.hf_split = "train" + elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: + dataset_class = AIMODataset + args.hf_split = "train" + elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501 + dataset_class = NextEditPredictionDataset + args.hf_split = "train" + elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS: + dataset_class = ASRDataset + args.hf_split = "train" + else: + supported_datasets = set([ + dataset_name for cls in HuggingFaceDataset.__subclasses__() + for dataset_name in cls.SUPPORTED_DATASET_PATHS + ]) + raise ValueError( + f"Unsupported dataset path: {args.dataset_path}. 
" + "Huggingface dataset only supports dataset_path" + f" from one of following: {supported_datasets}. " + "Please consider contributing if you would " + "like to add support for additional dataset formats.") + + if dataset_class.IS_MULTIMODAL and endpoint_type not in [ + "openai-chat", + "openai-audio", + ]: + # multi-modal benchmark is only available on OpenAI Chat backend. + raise ValueError( + "Multi-modal content is only supported on 'openai-chat' and " + "'openai-audio' backend.") + input_requests = dataset_class( + dataset_path=args.dataset_path, + dataset_subset=args.hf_subset, + dataset_split=args.hf_split, + random_seed=args.seed, + ).sample( + num_requests=args.num_prompts, tokenizer=tokenizer, + output_len=args.hf_output_len, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") - + # For datasets that follow a similar structure, use a mapping. + dataset_mapping = { + "sharegpt": + lambda: ShareGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, + ), + "burstgpt": + lambda: BurstGPTDataset(random_seed=args.seed, + dataset_path=args.dataset_path). + sample(tokenizer=tokenizer, num_requests=args.num_prompts), + "random": + lambda: RandomDataset(dataset_path=args.dataset_path).sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + prefix_len=args.random_prefix_len, + input_len=args.random_input_len, + output_len=args.random_output_len, + range_ratio=args.random_range_ratio, + ), + } + + try: + input_requests = dataset_mapping[args.dataset_name]() + except KeyError as err: + raise ValueError(f"Unknown dataset: {args.dataset_name}") from err goodput_config_dict = check_goodput_args(args) + # Collect the sampling parameters. 
+ sampling_params = { + k: v + for k, v in { + "top_p": args.top_p, + "top_k": args.top_k, + "min_p": args.min_p, + "temperature": args.temperature, + }.items() if v is not None + } + + # Sampling parameters are only supported by openai-compatible backend. + if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: + raise ValueError("Sampling parameters are only supported by " + "openai-compatible backends.") + + if "temperature" not in sampling_params: + sampling_params["temperature"] = 0.0 # Default to greedy decoding. + # Avoid GC processing "static" data - reduce pause times. gc.collect() gc.freeze() @@ -861,7 +1050,6 @@ def main(args: argparse.Namespace): tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, - best_of=args.best_of, request_rate=args.request_rate, burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, @@ -874,10 +1062,11 @@ def main(args: argparse.Namespace): goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, + extra_body=sampling_params, )) # Save config and results to json - if args.save_result: + if args.save_result or args.append_result: result_json: dict[str, Any] = {} # Setup @@ -887,7 +1076,6 @@ def main(args: argparse.Namespace): result_json["label"] = label result_json["model_id"] = model_id result_json["tokenizer_id"] = tokenizer_id - result_json["best_of"] = args.best_of result_json["num_prompts"] = args.num_prompts # Metadata @@ -910,6 +1098,19 @@ def main(args: argparse.Namespace): # Merge with benchmark result result_json = {**result_json, **benchmark_result} + if not args.save_detailed: + # Remove fields with too many data points + for field in [ + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", + ]: + if field in result_json: + del result_json[field] + # Save to file base_model_id = model_id.split("/")[-1] max_concurrency_str = (f"-concurrency{args.max_concurrency}" @@ -920,6 +1121,11 @@ def 
main(args: argparse.Namespace): file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, + mode="a+" if args.append_result else "w", + encoding="utf-8") as outfile: + # Append a newline. + if args.append_result and outfile.tell() != 0: + outfile.write("\n") json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) -- GitLab From 51e98e4ffd696289bce1ec92c78cc7298dc1600b Mon Sep 17 00:00:00 2001 From: cascade Date: Tue, 27 May 2025 17:18:09 -0700 Subject: [PATCH 004/274] [Bugfix] Disable prefix caching by default for benchmark (#18771) Signed-off-by: cascade812 --- vllm/benchmarks/latency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 2c992727b..0dd938e75 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -82,7 +82,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser = EngineArgs.add_cli_args(parser) # V1 enables prefix caching by default which skews the latency # numbers. We need to disable prefix caching by default. 
- parser.set_defaults(enable_prefix_caching=True) + parser.set_defaults(enable_prefix_caching=False) def main(args: argparse.Namespace): -- GitLab From a3896c7f0216d6930a912924e9149f31087232c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Tue, 27 May 2025 20:49:24 -0400 Subject: [PATCH 005/274] [Build] Fixes for CMake install (#18570) --- CMakeLists.txt | 5 +++++ cmake/external_projects/vllm_flash_attn.cmake | 20 +++++++++++++++++-- cmake/utils.cmake | 2 +- setup.py | 5 +---- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c5856fc5..6536e9a57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake) # Suppress potential warnings about unused manually-specified variables set(ignoreMe "${VLLM_PYTHON_PATH}") +# Prevent installation of dependencies (cutlass) by default. +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. @@ -785,5 +788,7 @@ endif() # For CUDA we also build and ship some external projects. 
if (VLLM_GPU_LANG STREQUAL "CUDA") include(cmake/external_projects/flashmla.cmake) + + # vllm-flash-attn should be last as it overwrites some CMake functions include(cmake/external_projects/vllm_flash_attn.cmake) endif () diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index b04e4c2d0..a4edd5b96 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -46,22 +46,38 @@ else() endif() +# Ensure the vllm/vllm_flash_attn directory exists before installation +install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS) + +# Make sure vllm-flash-attn install rules are nested under vllm/ +# This is here to support installing all components under the same prefix with cmake --install. +# setup.py installs every component separately but uses the same prefix for all. +# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3, +# and these statements don't hurt when installing neither component. 
+install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS) +install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS) + # Fetch the vllm-flash-attn library FetchContent_MakeAvailable(vllm-flash-attn) message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}") +# Restore the install prefix +install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS) +install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) + # Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in # case only one is built, in the case both are built redundant work is done) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa2_C FILES_MATCHING PATTERN "*.py" ) install( DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/ - DESTINATION vllm_flash_attn + DESTINATION vllm/vllm_flash_attn COMPONENT _vllm_fa3_C FILES_MATCHING PATTERN "*.py" ) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 12e4e3902..6d90555f2 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc) add_custom_target( hipify${NAME} - COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} + COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS} DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS} BYPRODUCTS ${HIP_SRCS} COMMENT "Running hipify on ${NAME} extension source files.") diff --git a/setup.py b/setup.py index 180f2f978..b822a4ec3 100644 --- a/setup.py +++ b/setup.py @@ -251,11 +251,8 @@ class cmake_build_ext(build_ext): # CMake appends the extension prefix to the install path, # 
and outdir already contains that prefix, so we need to remove it. - # We assume only the final component of extension prefix is added by - # CMake, this is currently true for current extensions but may not - # always be the case. prefix = outdir - if '.' in ext.name: + for _ in range(ext.name.count('.')): prefix = prefix.parent # prefix here should actually be the same for all components -- GitLab From d73a9457a56e99d33676f668611bfb232357f1b9 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 28 May 2025 02:46:21 +0100 Subject: [PATCH 006/274] [Core] Improve Tensor serialisation (#18774) Signed-off-by: Lukas Geiger --- vllm/v1/serial_utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 0dcf02113..fbd38fc47 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -158,10 +158,8 @@ class MsgpackEncoder: self, obj: torch.Tensor ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None - # this creates a copy of the tensor if it's not already contiguous - obj = obj.contiguous() # view the tensor as a 1D array of bytes - arr = obj.view((obj.numel(), )).view(torch.uint8).numpy() + arr = obj.flatten().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data) @@ -169,7 +167,7 @@ class MsgpackEncoder: # Otherwise encode index of backing buffer to avoid copy. data = len(self.aux_buffers) self.aux_buffers.append(arr.data) - dtype = str(obj.dtype)[6:] # remove 'torch.' prefix + dtype = str(obj.dtype).removeprefix("torch.") return dtype, obj.shape, data def _encode_nested_tensors(self, nt: NestedTensors) -> Any: @@ -245,7 +243,7 @@ class MsgpackDecoder: # zero-copy decode. We assume the ndarray will not be kept around, # as it now locks the whole received message buffer in memory. 
buffer = self.aux_buffers[data] if isinstance(data, int) else data - return np.ndarray(buffer=buffer, dtype=np.dtype(dtype), shape=shape) + return np.frombuffer(buffer, dtype=dtype).reshape(shape) def _decode_tensor(self, arr: Any) -> torch.Tensor: dtype, shape, data = arr @@ -254,12 +252,15 @@ class MsgpackDecoder: # not complain about a readonly memoryview. buffer = self.aux_buffers[data] if isinstance(data, int) \ else bytearray(data) - # Create numpy wrapper around the bytes - arr = np.ndarray(buffer=buffer, dtype=np.uint8, shape=(len(buffer), )) torch_dtype = getattr(torch, dtype) assert isinstance(torch_dtype, torch.dtype) + if not buffer: # torch.frombuffer doesn't like empty buffers + assert 0 in shape + return torch.empty(shape, dtype=torch_dtype) + # Create uint8 array + arr = torch.frombuffer(buffer, dtype=torch.uint8) # Convert back to proper shape & type - return torch.from_numpy(arr).view(torch_dtype).view(shape) + return arr.view(torch_dtype).view(shape) def _decode_mm_items(self, obj: list) -> list[MultiModalKwargsItem]: decoded_items = [] -- GitLab From 794ae1f551e4152a258f88c0c914ce5ea130bd05 Mon Sep 17 00:00:00 2001 From: fxmarty-amd Date: Wed, 28 May 2025 04:45:41 +0200 Subject: [PATCH 007/274] [rocm] Fix wrong attention log (#18764) Signed-off-by: Felix Marty --- vllm/platforms/rocm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index e1dcd9870..b5e742c65 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -194,8 +194,9 @@ class RocmPlatform(Platform): f" The selected backend, {selected_backend.name}," f"is not MLA type while requested for MLA backend.") - selected_backend = (_Backend.ROCM_FLASH if selected_backend - == _Backend.FLASH_ATTN else selected_backend) + if selected_backend is None or selected_backend == _Backend.FLASH_ATTN: + selected_backend = _Backend.ROCM_FLASH + if envs.VLLM_USE_V1: logger.info("Using Triton Attention backend on V1 
engine.") return ("vllm.v1.attention.backends." -- GitLab From 3e9ce609bd0636301b3dd95deeea4b7b6df3f654 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 28 May 2025 11:29:53 +0800 Subject: [PATCH 008/274] [Bugfix] Fix nomic max_model_len (#18755) --- .../offline_inference/context_extension.py | 46 +++++++ .../pooling/test_nomic_max_model_len.py | 130 ++++++++++++++++++ vllm/config.py | 14 ++ vllm/model_executor/models/bert_with_rope.py | 55 +++++++- 4 files changed, 242 insertions(+), 3 deletions(-) create mode 100644 examples/offline_inference/context_extension.py create mode 100644 tests/models/language/pooling/test_nomic_max_model_len.py diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py new file mode 100644 index 000000000..1a70446c3 --- /dev/null +++ b/examples/offline_inference/context_extension.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 + +from vllm import LLM, SamplingParams + +rope_theta = 1000000 +original_max_position_embeddings = 32768 +factor = 4.0 + +# Use yarn to extend context +hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": original_max_position_embeddings, + }, + "max_model_len": int(original_max_position_embeddings * factor), +} + +llm = LLM(model="Qwen/Qwen3-0.6B", hf_overrides=hf_overrides) + +sampling_params = SamplingParams( + temperature=0.8, + top_p=0.95, + max_tokens=128, +) + +conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! 
How can I assist you today?"}, +] +outputs = llm.chat(conversation, sampling_params, use_tqdm=False) + + +def print_outputs(outputs): + print("\nGenerated Outputs:\n" + "-" * 80) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\n") + print(f"Generated text: {generated_text!r}") + print("-" * 80) + + +print_outputs(outputs) diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py new file mode 100644 index 000000000..68603e628 --- /dev/null +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 +# ruff: noqa: SIM117 +import pytest + +from ...utils import EmbedModelInfo + +MODELS = [ + EmbedModelInfo("nomic-ai/nomic-embed-text-v1"), + #EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"), + #EmbedModelInfo("nomic-ai/CodeRankEmbed"), + EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"), + #EmbedModelInfo("Snowflake/snowflake-arctic-embed-m-long"), +] + +rope_theta = 1000 +factor = 4.0 +original_max_position_embeddings = 2048 +max_model_len = int(original_max_position_embeddings * factor) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_default(model_info, vllm_runner): + with vllm_runner(model_info.name, task="embed", + max_model_len=None) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. 
+ assert model_config.max_model_len == 512 + else: + assert ( + model_config.max_model_len == original_max_position_embeddings) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_legal(model_info, vllm_runner): + # set max_model_len <= 512 + with vllm_runner(model_info.name, task="embed", + max_model_len=256) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 256 + + # set 512 < max_model_len <= 2048 + if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": + # For nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. + with pytest.raises(ValueError): + with vllm_runner(model_info.name, task="embed", + max_model_len=1024): + pass + else: + with vllm_runner(model_info.name, task="embed", + max_model_len=1024) as vllm_model: + model_config = vllm_model.model.llm_engine.model_config + assert model_config.max_model_len == 1024 + + +@pytest.mark.parametrize("model_info", MODELS) +def test_set_max_model_len_illegal(model_info, vllm_runner): + # set max_model_len > 2048 + with pytest.raises(ValueError): + with vllm_runner(model_info.name, task="embed", max_model_len=4096): + pass + + # set max_model_len > 2048 by hf_overrides + hf_overrides = {"max_model_len": 4096} + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_legal(model_info, vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + } + + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass + + +@pytest.mark.parametrize("model_info", MODELS) +def test_use_rope_scaling_illegal(model_info, 
vllm_runner): + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + } + } + # illegal max_model_len + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=max_model_len + 1, + hf_overrides=hf_overrides): + pass + + hf_overrides = { + "rope_theta": rope_theta, + "rope_scaling": { + "rope_type": "yarn", + "factor": factor, + "original_max_position_embeddings": + original_max_position_embeddings + }, + "max_model_len": max_model_len + 1 + } + # illegal max_model_len by hf_overrides + with pytest.raises(ValueError): + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + hf_overrides=hf_overrides): + pass diff --git a/vllm/config.py b/vllm/config.py index db35c848b..4afdda3cc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -571,6 +571,7 @@ class ModelConfig: sliding_window = None + self.original_max_model_len = self.max_model_len self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, max_model_len=self.max_model_len, @@ -4471,6 +4472,19 @@ class VllmConfig: self.compilation_config.init_with_cudagraph_sizes( batch_size_capture_list) + def recalculate_max_model_len(self, max_model_len: int): + model_config = self.model_config + max_model_len = _get_and_verify_max_len( + hf_config=model_config.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=model_config.disable_sliding_window, + sliding_window_len=model_config.get_hf_config_sliding_window(), + spec_target_max_model_len=model_config.spec_target_max_model_len, + encoder_config=model_config.encoder_config) + self.model_config.max_model_len = max_model_len + self.scheduler_config.max_model_len = max_model_len + self.compute_hash() + def __str__(self): return ( f"model={self.model_config.model!r}," diff --git a/vllm/model_executor/models/bert_with_rope.py 
b/vllm/model_executor/models/bert_with_rope.py index af6deb3bf..8a387d71f 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable +from copy import deepcopy from typing import Optional import torch @@ -10,6 +11,7 @@ from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import (get_act_and_mul_fn, get_act_fn) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -27,6 +29,8 @@ from vllm.model_executor.models.interfaces import SupportsQuant from vllm.model_executor.models.utils import WeightsMapper from vllm.sequence import IntermediateTensors +logger = init_logger(__name__) + class BertWithRopeEmbedding(nn.Module): @@ -513,10 +517,11 @@ class NomicBertModel(BertWithRope): head_dim = config.hidden_size // config.num_attention_heads rotary_emb_dim = head_dim * config.rotary_emb_fraction + max_trained_positions = getattr(config, "max_trained_positions", 2048) config.rotary_kwargs = { "head_size": head_dim, "rotary_dim": rotary_emb_dim, - "max_position": config.max_trained_positions, + "max_position": max_trained_positions, "base": getattr(config, "rope_theta", config.rotary_emb_base), "rope_scaling": getattr(config, "rope_scaling", None) } @@ -525,8 +530,52 @@ class NomicBertModel(BertWithRope): # than max_trained_positions 2048, the results are consistent # with SentenceTransformer. # The context extension uses vllm style rope_theta and rope_scaling. - # See #17785 - + # See #17785 #18755 + if (not vllm_config.model_config.hf_overrides + and vllm_config.model_config.original_max_model_len is None): + # Default + # Reset max_model_len to max_trained_positions. 
+ # nomic-embed-text-v2-moe the length is set to 512 + # by sentence_bert_config.json. + max_model_len_before = vllm_config.model_config.max_model_len + max_model_len = min(vllm_config.model_config.max_model_len, + max_trained_positions) + + vllm_config.recalculate_max_model_len(max_model_len) + logger.warning( + "Nomic context extension is disabled. " + "Changing max_model_len from %s to %s. " + "To enable context extension, see: " + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html", + max_model_len_before, vllm_config.model_config.max_model_len) + else: + # We need to re-verify max_model_len to avoid lengths + # greater than position_embedding. + model_config = vllm_config.model_config + hf_text_config = model_config.hf_text_config + + if isinstance(model_config.hf_overrides, dict): + # hf_overrides_kw + max_model_len = model_config.hf_overrides.get( + "max_model_len", vllm_config.model_config.max_model_len) + else: + # hf_overrides_fn + # This might be overridden by sentence_bert_config.json. + max_model_len = vllm_config.model_config.max_model_len + + # reset hf_text_config for recalculate_max_model_len. 
+ if hasattr(hf_text_config, "max_model_len"): + delattr(hf_text_config, "max_model_len") + hf_text_config.max_position_embeddings = max_trained_positions + hf_text_config.rope_scaling = config.rotary_kwargs["rope_scaling"] + + # The priority of sentence_bert_config.json is higher + # than max_position_embeddings + encoder_config = deepcopy(model_config.encoder_config) + encoder_config.pop("max_seq_length", None) + model_config.encoder_config = encoder_config + + vllm_config.recalculate_max_model_len(max_model_len) return config -- GitLab From 9a21e331ff8c4ab052a654b3ebd9f67ddfff4845 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 28 May 2025 05:35:43 +0200 Subject: [PATCH 009/274] [Bugfix]: correctly propagate errors message caught at the chat_templating step to the client (#18769) Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/chat_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ec1b327da..b051cd333 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1252,7 +1252,7 @@ def apply_hf_chat_template( # investigation. logger.exception( "An error occurred in `transformers` while applying chat template") - raise ValueError from e + raise ValueError(str(e)) from e def apply_mistral_chat_template( tokenizer: MistralTokenizer, @@ -1281,7 +1281,7 @@ def apply_mistral_chat_template( # We convert those assertion errors to ValueErrors so they can be # are properly caught in the preprocessing_input step except (AssertionError, MistralCommonException) as e: - raise ValueError from e + raise ValueError(str(e)) from e # External library exceptions can sometimes occur despite the framework's # internal exception management capabilities. 
@@ -1292,7 +1292,7 @@ def apply_mistral_chat_template( logger.exception( "An error occurred in `mistral_common` while applying chat " "template") - raise ValueError from e + raise ValueError(str(e)) from e def random_tool_call_id() -> str: return f"chatcmpl-tool-{random_uuid()}" -- GitLab From 774c5fde307dcadb8e0d2a58bf2d429eb6cb648d Mon Sep 17 00:00:00 2001 From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com> Date: Tue, 27 May 2025 23:16:30 -0500 Subject: [PATCH 010/274] [V1] fix torch profiling for V1 offline scenarios (#18445) Signed-off-by: Divakar Verma --- benchmarks/benchmark_latency.py | 37 +++++++++------------------------ vllm/benchmarks/latency.py | 33 ++++++++--------------------- vllm/v1/worker/gpu_worker.py | 2 ++ vllm/worker/worker.py | 2 ++ 4 files changed, 23 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 84759c5c3..de62bf5c6 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -6,13 +6,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs @@ -80,17 +79,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir) - ), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -103,11 +94,7 @@ def main(args: argparse.Namespace): 
run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = ( - Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}" - ) + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return @@ -164,15 +151,6 @@ if __name__ == "__main__": action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=( - "path to save the pytorch profiler output. Can be visualized " - "with ui.perfetto.dev or Tensorboard." - ), - ) parser.add_argument( "--output-json", type=str, @@ -193,4 +171,9 @@ if __name__ == "__main__": # numbers. We need to disable prefix caching by default. parser.set_defaults(enable_prefix_caching=False) args = parser.parse_args() + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) main(args) diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index 0dd938e75..c9e03cc3b 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -6,13 +6,12 @@ import dataclasses import json import os import time -from pathlib import Path from typing import Any, Optional import numpy as np -import torch from tqdm import tqdm +import vllm.envs as envs from vllm import LLM, SamplingParams from vllm.benchmarks.utils import (convert_to_pytorch_benchmark_format, write_to_json) @@ -59,13 +58,6 @@ def add_cli_args(parser: argparse.ArgumentParser): action="store_true", help="profile the generation process of a single batch", ) - parser.add_argument( - "--profile-result-dir", - type=str, - default=None, - help=("path to save the pytorch profiler output. 
Can be visualized " - "with ui.perfetto.dev or Tensorboard."), - ) parser.add_argument( "--output-json", type=str, @@ -87,7 +79,10 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): print(args) - + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler.") engine_args = EngineArgs.from_cli_args(args) # NOTE(woosuk): If the request cannot be processed in a single batch, @@ -131,16 +126,9 @@ def main(args: argparse.Namespace): def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir)), - ) as p: - llm_generate() - print(p.key_averages().table(sort_by="self_cuda_time_total")) + llm.start_profile() + llm_generate() + llm.stop_profile() else: start_time = time.perf_counter() llm_generate() @@ -153,10 +141,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = (Path(".") / "vllm_benchmark_result" / - f"latency_result_{time.time()}") + profile_dir = envs.VLLM_TORCH_PROFILER_DIR print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index bce5cbb5f..dd06e7296 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -292,6 +292,8 @@ class Worker(WorkerBase): self.profiler.start() else: self.profiler.stop() + print(self.profiler.key_averages().table( + sort_by="self_cuda_time_total")) def execute_dummy_batch(self) -> None: self.model_runner._dummy_run(1) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py 
index 6e45b8423..2a4317271 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -128,6 +128,8 @@ class Worker(LocalOrDistributedWorkerBase): if self.profiler is None: raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + print( + self.profiler.key_averages().table(sort_by="self_cuda_time_total")) def sleep(self, level: int = 1) -> None: free_bytes_before_sleep = torch.cuda.mem_get_info()[0] -- GitLab From 5e13c07d00df0ca906c4c06110c277421202d3e2 Mon Sep 17 00:00:00 2001 From: RonaldBXu <72748153+RonaldBXu@users.noreply.github.com> Date: Tue, 27 May 2025 22:09:14 -0700 Subject: [PATCH 011/274] [V1] [Bugfix] eagle bugfix and enable correct lm_head for multimodal (2) (#18781) Signed-off-by: Ronald Xu --- vllm/transformers_utils/configs/eagle.py | 2 +- vllm/v1/spec_decode/eagle.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index 377523efe..a43e4746c 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -70,7 +70,7 @@ class EAGLEConfig(PretrainedConfig): if self.model is not None: for k, v in self.model.to_dict().items(): - if not hasattr(self, k): + if k not in kwargs: setattr(self, k, v) @classmethod diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 971b06758..1ca856423 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -9,6 +9,7 @@ from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata, FlashAttentionMetadata) @@ -346,7 +347,10 @@ class EagleProposer: if 
self.vllm_config.speculative_config.method != "eagle3" and \ hasattr(target_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") - self.model.lm_head = target_model.lm_head + if supports_multimodal(target_model): + self.model.lm_head = target_model.get_language_model().lm_head + else: + self.model.lm_head = target_model.lm_head @torch.inference_mode() def dummy_run( -- GitLab From b78f844a6743732b58022f2f84858d61b40b5913 Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Wed, 28 May 2025 11:12:54 +0530 Subject: [PATCH 012/274] [Bugfix][FailingTest]Fix test_model_load_with_params.py (#18758) Signed-off-by: rabi --- .buildkite/test-pipeline.yaml | 22 +++++++++---------- .../test_logits_processor.py | 0 .../test_model_load_with_params.py | 19 +++++++--------- .../{weight_utils.py => test_weight_utils.py} | 0 4 files changed, 19 insertions(+), 22 deletions(-) rename tests/{ => model_executor}/test_logits_processor.py (100%) rename tests/model_executor/{weight_utils.py => test_weight_utils.py} (100%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 80a5a610c..4e7bea25e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -274,17 +274,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: LogitsProcessor Test # 5min - mirror_hardwares: [amdexperimental, amdproduction] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/model_executor/guided_decoding - - tests/test_logits_processor - - tests/model_executor/test_guided_processors - commands: - - pytest -v -s test_logits_processor.py - - pytest -v -s model_executor/test_guided_processors.py - - label: Speculative decoding tests # 40min mirror_hardwares: [amdexperimental] source_file_dependencies: @@ -397,6 +386,17 @@ steps: - pytest -v -s tensorizer_loader - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py +- label: Model Executor Test + mirror_hardwares: 
[amdexperimental, amdproduction] + soft_fail: true + source_file_dependencies: + - vllm/model_executor + - tests/model_executor + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - label: Benchmarks # 9min mirror_hardwares: [amdexperimental, amdproduction] working_dir: "/vllm-workspace/.buildkite" diff --git a/tests/test_logits_processor.py b/tests/model_executor/test_logits_processor.py similarity index 100% rename from tests/test_logits_processor.py rename to tests/model_executor/test_logits_processor.py diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index f8efa2eff..7fda1f0e8 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -4,7 +4,7 @@ import os import pytest -from vllm.model_executor.layers.pooler import CLSPool, PoolingType +from vllm.model_executor.layers.pooler import CLSPool, MeanPool, PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform @@ -14,7 +14,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", - "intfloat/multilingual-e5-small") + "intfloat/multilingual-e5-base") REVISION_ROBERTA = os.environ.get("REVISION", "main") @@ -40,17 +40,15 @@ def test_model_loading_with_params(vllm_runner): # asserts on the pooling config files assert model_config.pooler_config.pooling_type == PoolingType.CLS.name - assert model_config.pooler_config.pooling_norm + assert model_config.pooler_config.normalize # asserts on the tokenizer loaded assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.tokenizer_config["do_lower_case"] assert 
model_tokenizer.tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + assert isinstance(model._pooler, CLSPool) vllm_model.apply_model(check_model) @@ -80,16 +78,15 @@ def test_roberta_model_loading_with_params(vllm_runner): # asserts on the pooling config files assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name - assert model_config.pooler_config.pooling_norm + assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-small" - assert not model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-base" + assert model_tokenizer.tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + assert isinstance(model._pooler, MeanPool) vllm_model.apply_model(check_model) diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/test_weight_utils.py similarity index 100% rename from tests/model_executor/weight_utils.py rename to tests/model_executor/test_weight_utils.py -- GitLab From 7f2c1a87e9c34df7c8820ab6c916a608d518c1e9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 28 May 2025 15:08:35 +0800 Subject: [PATCH 013/274] [Deprecation] Require overriding `get_dummy_text` and `get_dummy_mm_data` (#18796) Signed-off-by: DarkLight1337 --- vllm/multimodal/profiling.py | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index a85b13fb2..53f5b243d 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 - -from abc import ABC +from abc import ABC, 
abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast @@ -60,29 +59,14 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): self.info = info - # TODO: @abstractmethod after transition + @abstractmethod def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: """ Build the text input corresponding to `mm_counts`. """ - if (type(self).get_dummy_processor_inputs == - BaseDummyInputsBuilder.get_dummy_processor_inputs): - raise NotImplementedError - - logger.warning_once("`get_dummy_processor_inputs` has been split up " - "into `get_dummy_text` and `get_dummy_mm_data`. " - "These two methods will be marked as abstract " - "in an upcoming release.") - - seq_len = self.info.ctx.model_config.max_model_len - - prompt = self.get_dummy_processor_inputs(seq_len, mm_counts).prompt - if not isinstance(prompt, str): - prompt = self.info.get_tokenizer().decode(prompt) - - return prompt + raise NotImplementedError - # TODO: @abstractmethod after transition + @abstractmethod def get_dummy_mm_data( self, seq_len: int, -- GitLab From 0f0926b43fb5445e2468034eb0bc8076cdd6af86 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 28 May 2025 15:08:48 +0800 Subject: [PATCH 014/274] [Deprecation] Remove unused sync methods in `async_timeout` (#18792) Signed-off-by: DarkLight1337 --- vllm/engine/async_timeout.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index aa54c0693..94674262b 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -8,7 +8,6 @@ import asyncio import enum import sys -import warnings from types import TracebackType from typing import Any, Optional, Type @@ -66,24 +65,6 @@ else: else: self.update(deadline) - def __enter__(self) -> "Timeout": - warnings.warn( - "with timeout() is deprecated, use async with timeout()", - DeprecationWarning, - 
stacklevel=2, - ) - self._do_enter() - return self - - def __exit__( - self, - exc_type: Optional[Type[BaseException]], - exc_val: Optional[BaseException], - exc_tb: Optional[TracebackType], - ) -> Optional[bool]: - self._do_exit(exc_type) - return None - async def __aenter__(self) -> "Timeout": self._do_enter() return self -- GitLab From 0c492b782420f9043e06a7c7f21ff62496a7a392 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 28 May 2025 15:09:04 +0800 Subject: [PATCH 015/274] [Deprecation] Remove fallbacks for Embeddings API (#18795) Signed-off-by: DarkLight1337 --- vllm/config.py | 17 +++++------- vllm/entrypoints/openai/api_server.py | 39 ++++----------------------- vllm/outputs.py | 16 +---------- 3 files changed, 12 insertions(+), 60 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4afdda3cc..738a9b337 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -797,17 +797,12 @@ class ModelConfig: else: # Aliases if task_option == "embedding": - preferred_task = self._get_preferred_task( - architectures, supported_tasks) - if preferred_task != "embed": - msg = ("The 'embedding' task will be restricted to " - "embedding models in a future release. Please " - "pass `--task classify`, `--task score`, or " - "`--task reward` explicitly for other pooling " - "models.") - warnings.warn(msg, DeprecationWarning, stacklevel=2) - - task_option = preferred_task or "embed" + msg = ("The 'embedding' task has been renamed to " + "'embed', please use the new name. 
The old name " + "will be removed in v1.0.") + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + task_option = "embed" if task_option not in supported_tasks: msg = ( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2da89b4f5..b991cb3a4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,7 +17,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus from json import JSONDecodeError -from typing import Annotated, Optional, Union +from typing import Annotated, Optional import prometheus_client import regex as re @@ -59,9 +59,7 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, EmbeddingRequest, - EmbeddingResponse, - EmbeddingResponseData, - ErrorResponse, + EmbeddingResponse, ErrorResponse, LoadLoRAAdapterRequest, PoolingChatRequest, PoolingCompletionRequest, @@ -627,37 +625,10 @@ async def create_completion(request: CompletionRequest, raw_request: Request): async def create_embedding(request: EmbeddingRequest, raw_request: Request): handler = embedding(raw_request) if handler is None: - fallback_handler = pooling(raw_request) - if fallback_handler is None: - return base(raw_request).create_error_response( - message="The model does not support Embeddings API") + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") - logger.warning( - "Embeddings API will become exclusive to embedding models " - "in a future release. 
To return the hidden states directly, " - "use the Pooling API (`/pooling`) instead.") - - res = await fallback_handler.create_pooling(request, raw_request) - - generator: Union[ErrorResponse, EmbeddingResponse] - if isinstance(res, PoolingResponse): - generator = EmbeddingResponse( - id=res.id, - object=res.object, - created=res.created, - model=res.model, - data=[ - EmbeddingResponseData( - index=d.index, - embedding=d.data, # type: ignore - ) for d in res.data - ], - usage=res.usage, - ) - else: - generator = res - else: - generator = await handler.create_embedding(request, raw_request) + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), diff --git a/vllm/outputs.py b/vllm/outputs.py index 33cc50c87..3960388bf 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from typing import Any, Generic, Optional, Union import torch -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -76,14 +76,6 @@ class PoolingOutput: return (isinstance(other, self.__class__) and bool( (self.data == other.data).all())) - @property - @deprecated("`LLM.encode()` now stores raw outputs in the `data` " - "attribute. To return embeddings, use `LLM.embed()`. " - "To return class probabilities, use `LLM.classify()` " - "and access the `probs` attribute. ") - def embedding(self) -> list[float]: - return self.data.tolist() - class RequestOutput: """The output data of a completion request to the LLM. @@ -506,12 +498,6 @@ class ScoringOutput: def __repr__(self) -> str: return f"ScoringOutput(score={self.score})" - @property - @deprecated("`LLM.score()` now returns scalar scores. " - "Please access it via the `score` attribute. 
") - def embedding(self) -> list[float]: - return [self.score] - class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]): -- GitLab From de65fc8e1e7056e845a90bf4df4ff39c25cb03fa Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 28 May 2025 15:16:35 +0800 Subject: [PATCH 016/274] [CI] improve embed testing (#18747) --- .../openai/correctness/test_mteb.py | 3 +- tests/entrypoints/openai/test_embedding.py | 3 +- .../openai/test_embedding_dimensions.py | 4 +- tests/models/language/pooling/embed_utils.py | 72 ++++++++++++++ tests/models/language/pooling/mteb_utils.py | 11 ++- tests/models/language/pooling/test_baai.py | 71 ++++++++++++++ tests/models/language/pooling/test_gte.py | 33 ++----- tests/models/language/pooling/test_jina.py | 93 +++++++++---------- tests/models/language/pooling/test_nomic.py | 36 +++---- .../pooling/test_snowflake_arctic_embed.py | 40 ++------ tests/models/registry.py | 2 +- tests/models/utils.py | 24 +---- vllm/config.py | 26 +++--- 13 files changed, 244 insertions(+), 174 deletions(-) create mode 100644 tests/models/language/pooling/embed_utils.py create mode 100644 tests/models/language/pooling/test_baai.py diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py index ebf2f829b..44d7ac193 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -4,6 +4,7 @@ import os import pytest from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS, + MTEB_EMBED_TOL, OpenAIClientMtebEncoder, run_mteb_embed_task, run_mteb_embed_task_st) @@ -38,4 +39,4 @@ def test_mteb(server): print("SentenceTransformer main score: ", st_main_score) print("Difference: ", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4) + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/entrypoints/openai/test_embedding.py 
b/tests/entrypoints/openai/test_embedding.py index 1019bfd58..81ca65b65 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,7 +11,8 @@ import requests from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer -from ...models.utils import run_embedding_correctness_test +from ...models.language.pooling.embed_utils import ( + run_embedding_correctness_test) from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 332fa332a..341defae0 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -11,7 +11,9 @@ import pytest from vllm.entrypoints.openai.protocol import EmbeddingResponse from ...conftest import HfRunner -from ...models.utils import EmbedModelInfo, run_embedding_correctness_test +from ...models.language.pooling.embed_utils import ( + run_embedding_correctness_test) +from ...models.utils import EmbedModelInfo from ...utils import RemoteOpenAIServer MODELS = [ diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py new file mode 100644 index 000000000..0c8ac2ab1 --- /dev/null +++ b/tests/models/language/pooling/embed_utils.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +from collections.abc import Sequence +from typing import Optional + +import pytest + +from tests.conftest import HfRunner +from tests.models.utils import (EmbedModelInfo, check_embeddings_close, + matryoshka_fy) + + +def run_embedding_correctness_test( + hf_model: "HfRunner", + inputs: list[str], + vllm_outputs: Sequence[list[float]], + dimensions: Optional[int] = None, +): + hf_outputs = hf_model.encode(inputs) + if dimensions: + hf_outputs = matryoshka_fy(hf_outputs, dimensions) + + 
check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + tol=1e-2, + ) + + +def correctness_test_embed_models(hf_runner, + vllm_runner, + model_info: EmbedModelInfo, + example_prompts, + vllm_extra_kwargs=None, + hf_model_callback=None): + if not model_info.enable_test: + # A model family has many models with the same architecture, + # and we don't need to test each one. + pytest.skip("Skipping test.") + + # The example_prompts has ending "\n", for example: + # "Write a short story about a robot that dreams for the first time.\n" + # sentence_transformers will strip the input texts, see: + # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159 + # This makes the input_ids different between hf_model and vllm_model. + # So we need to strip the input texts to avoid test failing. + example_prompts = [str(s).strip() for s in example_prompts] + + vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype + + with vllm_runner(model_info.name, + task="embed", + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: + vllm_outputs = vllm_model.encode(example_prompts) + vllm_dtype = vllm_model.model.llm_engine.model_config.dtype + model_dtype = getattr( + vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", + vllm_dtype) + + with hf_runner( + model_info.name, + dtype=model_dtype, + is_sentence_transformer=True, + ) as hf_model: + + if hf_model_callback is not None: + hf_model_callback(hf_model) + + run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index f83c9940d..f4837ae95 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -80,18 +80,19 @@ def run_mteb_embed_task_st(model_name, tasks): def mteb_test_embed_models(hf_runner, 
vllm_runner, model_info: EmbedModelInfo, - vllm_extra_kwargs=None): + vllm_extra_kwargs=None, + hf_model_callback=None): if not model_info.enable_test: # A model family has many models with the same architecture, # and we don't need to test each one. pytest.skip("Skipping test.") vllm_extra_kwargs = vllm_extra_kwargs or {} + vllm_extra_kwargs["dtype"] = model_info.dtype with vllm_runner(model_info.name, task="embed", max_model_len=None, - dtype=model_info.dtype, **vllm_extra_kwargs) as vllm_model: if model_info.architecture: @@ -108,10 +109,14 @@ def mteb_test_embed_models(hf_runner, with set_default_torch_dtype(model_dtype) and hf_runner( model_info.name, is_sentence_transformer=True, dtype=model_dtype) as hf_model: + + if hf_model_callback is not None: + hf_model_callback(hf_model) + st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) print("VLLM:", vllm_dtype, vllm_main_score) print("SentenceTransformer:", model_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) - assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL) + assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py new file mode 100644 index 000000000..fc0e82079 --- /dev/null +++ b/tests/models/language/pooling/test_baai.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models + +MODELS = [ + ########## BertModel + EmbedModelInfo("BAAI/bge-base-en", + architecture="BertModel", + enable_test=True), + EmbedModelInfo("BAAI/bge-base-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-en", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-en", 
+ architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh-noinstruct", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-base-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-base-zh-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-small-zh-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-en-v1.5", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("BAAI/bge-large-zh-v1.5", + architecture="BertModel", + enable_test=False), + ########## XLMRobertaModel + EmbedModelInfo("BAAI/bge-m3", + architecture="XLMRobertaModel", + enable_test=True), + ########## Qwen2Model + EmbedModelInfo("BAAI/bge-code-v1", + architecture="Qwen2Model", + dtype="float32", + enable_test=True), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 91d10f529..18b27a688 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -3,7 +3,8 @@ from typing import Any import pytest -from ...utils import EmbedModelInfo, run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ ########## 
BertModel @@ -53,9 +54,8 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": @@ -66,28 +66,13 @@ def test_models_mteb(hf_runner, vllm_runner, @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, - example_prompts) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: vllm_extra_kwargs: dict[str, Any] = {} if model_info.architecture == "GteNewModel": vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} - with vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None, - **vllm_extra_kwargs) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts, vllm_extra_kwargs) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 0ddff2146..0403a20a4 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,9 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 +from functools import partial + import pytest from vllm import PoolingParams -from ...utils import check_embeddings_close, matryoshka_fy +from 
.embed_utils import (EmbedModelInfo, check_embeddings_close, + correctness_test_embed_models, matryoshka_fy) +from .mteb_utils import mteb_test_embed_models SCORING_MODELS = [ "jinaai/jina-reranker-v2-base-multilingual", # Roberta @@ -25,16 +29,10 @@ TEXTS_2 = [ ] EMBEDDING_MODELS = [ - "jinaai/jina-embeddings-v3", -] - -EMBEDDING_PROMPTS = [ - "Follow the white rabbit.", # English - "Sigue al conejo blanco.", # Spanish - "Suis le lapin blanc.", # French - "跟着白兔走。", # Chinese - "اتبع الأرنب الأبيض.", # Arabic - "Folge dem weißen Kaninchen.", # German + EmbedModelInfo("jinaai/jina-embeddings-v3", + architecture="XLMRobertaModel", + is_matryoshka=True, + dtype="float32") ] @@ -80,73 +78,66 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01) -@pytest.fixture(scope="module", params=EMBEDDING_MODELS) -def emb_model_name(request): - yield request.param +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + def hf_model_callback(model): + model.encode = partial(model.encode, task="text-matching") -def test_is_matryoshka(vllm_runner, emb_model_name): - with vllm_runner(emb_model_name, task="embed", - max_model_len=None) as vllm_model: - assert vllm_model.model.llm_engine.model_config.is_matryoshka - - -@pytest.mark.parametrize("model", EMBEDDING_MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_embeddings( - hf_runner, - vllm_runner, - model, - dtype: str, - monkeypatch, -) -> None: + mteb_test_embed_models(hf_runner, + vllm_runner, + model_info, + hf_model_callback=hf_model_callback) - example_prompts = EMBEDDING_PROMPTS - with hf_runner( - model, - dtype=dtype, - is_sentence_transformer=True, - ) as hf_model: - hf_outputs = hf_model.encode(example_prompts, task="text-matching") +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) +def test_embed_models_correctness(hf_runner, 
vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: - with vllm_runner(model, task="embed", dtype=dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + def hf_model_callback(model): + model.encode = partial(model.encode, task="text-matching") - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) + correctness_test_embed_models(hf_runner, + vllm_runner, + model_info, + example_prompts, + hf_model_callback=hf_model_callback) -@pytest.mark.parametrize("model", EMBEDDING_MODELS) +@pytest.mark.parametrize("model_info", EMBEDDING_MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dimensions", [16, 32]) def test_matryoshka( hf_runner, vllm_runner, - model, + model_info, dtype: str, dimensions: int, + example_prompts, monkeypatch, ) -> None: + if not model_info.is_matryoshka: + pytest.skip("Model is not matryoshka") - example_prompts = EMBEDDING_PROMPTS + # ST will strip the input texts, see test_embedding.py + example_prompts = [str(s).strip() for s in example_prompts] with hf_runner( - model, + model_info.name, dtype=dtype, is_sentence_transformer=True, ) as hf_model: hf_outputs = hf_model.encode(example_prompts, task="text-matching") hf_outputs = matryoshka_fy(hf_outputs, dimensions) - with vllm_runner(model, task="embed", dtype=dtype, + with vllm_runner(model_info.name, + task="embed", + dtype=dtype, max_model_len=None) as vllm_model: + assert vllm_model.model.llm_engine.model_config.is_matryoshka + matryoshka_dimensions = ( vllm_model.model.llm_engine.model_config.matryoshka_dimensions) assert matryoshka_dimensions is not None diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 28df32e0c..92cd7cc56 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -2,7 +2,8 @@ import pytest -from 
...utils import EmbedModelInfo, run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1", @@ -13,6 +14,9 @@ MODELS = [ architecture="NomicBertModel", dtype="float32", enable_test=False), + EmbedModelInfo("nomic-ai/CodeRankEmbed", + architecture="NomicBertModel", + enable_test=False), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", dtype="float32", @@ -21,30 +25,14 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb(hf_runner, vllm_runner, - model_info: EmbedModelInfo) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, - example_prompts) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] - - with vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index 5679e0e1c..c6c2d1e7a 100644 --- 
a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -2,7 +2,8 @@ import pytest -from ...utils import EmbedModelInfo, run_embedding_correctness_test +from .embed_utils import EmbedModelInfo, correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("Snowflake/snowflake-arctic-embed-xs", @@ -41,37 +42,14 @@ MODELS = [ @pytest.mark.parametrize("model_info", MODELS) -def test_models_mteb( - hf_runner, - vllm_runner, - model_info: EmbedModelInfo, -) -> None: - from .mteb_utils import mteb_test_embed_models +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: mteb_test_embed_models(hf_runner, vllm_runner, model_info) @pytest.mark.parametrize("model_info", MODELS) -def test_models_correctness( - hf_runner, - vllm_runner, - model_info: EmbedModelInfo, - example_prompts, -) -> None: - if not model_info.enable_test: - pytest.skip("Skipping test.") - - # ST will strip the input texts, see test_embedding.py - example_prompts = [str(s).strip() for s in example_prompts] - - with vllm_runner(model_info.name, - task="embed", - dtype=model_info.dtype, - max_model_len=None) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) - - with hf_runner( - model_info.name, - dtype=model_info.dtype, - is_sentence_transformer=True, - ) as hf_model: - run_embedding_correctness_test(hf_model, example_prompts, vllm_outputs) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git a/tests/models/registry.py b/tests/models/registry.py index a49e3ad6b..18342b671 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -283,7 +283,7 @@ _EMBEDDING_EXAMPLE_MODELS = { "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": 
_HfExamplesInfo("Alibaba-NLP/gte-modernbert-base", trust_remote_code=True), - "NomicBertModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-long", # noqa: E501 + "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", trust_remote_code=True), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), diff --git a/tests/models/utils.py b/tests/models/utils.py index a43fd77c6..ac1fc6c8f 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -2,7 +2,7 @@ import warnings from collections.abc import Sequence -from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +from typing import Any, NamedTuple, Optional, Union import torch import torch.nn.functional as F @@ -13,9 +13,6 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs from .registry import HF_EXAMPLE_MODELS -if TYPE_CHECKING: - from ..conftest import HfRunner - TokensText = tuple[list[int], str] @@ -337,22 +334,3 @@ class EmbedModelInfo(NamedTuple): architecture: str = "" dtype: str = "auto" enable_test: bool = True - - -def run_embedding_correctness_test( - hf_model: "HfRunner", - inputs: list[str], - vllm_outputs: Sequence[list[float]], - dimensions: Optional[int] = None, -): - hf_outputs = hf_model.encode(inputs) - if dimensions: - hf_outputs = matryoshka_fy(hf_outputs, dimensions) - - check_embeddings_close( - embeddings_0_lst=hf_outputs, - embeddings_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - tol=1e-2, - ) diff --git a/vllm/config.py b/vllm/config.py index 738a9b337..30d1a64a4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -572,13 +572,7 @@ class ModelConfig: sliding_window = None self.original_max_model_len = self.max_model_len - self.max_model_len = _get_and_verify_max_len( - hf_config=self.hf_text_config, - max_model_len=self.max_model_len, - disable_sliding_window=self.disable_sliding_window, - sliding_window_len=self.get_hf_config_sliding_window(), 
- spec_target_max_model_len=self.spec_target_max_model_len, - encoder_config=self.encoder_config) + self.max_model_len = self.get_and_verify_max_len(self.max_model_len) self.served_model_name = get_served_model_name(self.model, self.served_model_name) self.multimodal_config = self._init_multimodal_config() @@ -1382,6 +1376,16 @@ class ModelConfig: def matryoshka_dimensions(self): return getattr(self.hf_config, "matryoshka_dimensions", None) + def get_and_verify_max_len(self, max_model_len: int): + max_model_len = _get_and_verify_max_len( + hf_config=self.hf_text_config, + max_model_len=max_model_len, + disable_sliding_window=self.disable_sliding_window, + sliding_window_len=self.get_hf_config_sliding_window(), + spec_target_max_model_len=self.spec_target_max_model_len, + encoder_config=self.encoder_config) + return max_model_len + BlockSize = Literal[1, 8, 16, 32, 64, 128] CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2"] @@ -4469,13 +4473,7 @@ class VllmConfig: def recalculate_max_model_len(self, max_model_len: int): model_config = self.model_config - max_model_len = _get_and_verify_max_len( - hf_config=model_config.hf_text_config, - max_model_len=max_model_len, - disable_sliding_window=model_config.disable_sliding_window, - sliding_window_len=model_config.get_hf_config_sliding_window(), - spec_target_max_model_len=model_config.spec_target_max_model_len, - encoder_config=model_config.encoder_config) + max_model_len = model_config.get_and_verify_max_len(max_model_len) self.model_config.max_model_len = max_model_len self.scheduler_config.max_model_len = max_model_len self.compute_hash() -- GitLab From aa42561e4054f43f41bf0ea564369f5ea3147316 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 28 May 2025 04:40:53 -0400 Subject: [PATCH 017/274] Fix PiecewiseCompileInterpreter (#17338) Signed-off-by: rzou --- vllm/compilation/backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/backends.py 
b/vllm/compilation/backends.py index 8114cddcd..0358c9d0d 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -10,6 +10,7 @@ from typing import Any, Callable, Optional import torch import torch.fx as fx +from torch._dispatch.python import enable_python_dispatcher import vllm.envs as envs from vllm.config import CompilationConfig, VllmConfig @@ -269,7 +270,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in args ] - with self.fake_mode: + with self.fake_mode, enable_python_dispatcher(): return super().run(*fake_args) def call_module(self, target: torch.fx.node.Target, -- GitLab From ce75efeecb57acb5421aeb545a95e922f3dc8b3e Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 28 May 2025 04:59:39 -0400 Subject: [PATCH 018/274] [BugFix] FA2 MLA Accuracy Issue (#18807) Signed-off-by: LucasWilkinson --- csrc/attention/merge_attn_states.cu | 8 ++++++++ vllm/attention/backends/mla/common.py | 8 ++++---- vllm/v1/attention/backends/mla/common.py | 8 ++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu index 14e5edd7e..6bee9e4ce 100644 --- a/csrc/attention/merge_attn_states.cu +++ b/csrc/attention/merge_attn_states.cu @@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output, const uint pack_size = 16 / sizeof(scalar_t); TORCH_CHECK(head_size % pack_size == 0, "headsize must be multiple of pack_size:", pack_size); + TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1, + "output heads must be contiguous in memory"); + TORCH_CHECK( + prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1, + "prefix_output heads must be contiguous in memory"); + TORCH_CHECK( + suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1, + "suffix_output heads must be contiguous in memory"); float* output_lse_ptr = nullptr; if 
(output_lse.has_value()) { output_lse_ptr = output_lse.value().data_ptr(); diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index d48462684..1007140ef 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1093,10 +1093,6 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): if isinstance(attn_out, tuple): attn_out, *rest = attn_out - # unpad if necessary - if self._pad_v: - attn_out = attn_out[..., :v.shape[-1]] - # Remain consistent with old `flash_attn_varlen_func` where there # is only one output tensor if `return_softmax_lse` is False. if return_softmax_lse: @@ -1294,6 +1290,10 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): suffix_lse=suffix_lse, ) + # unpad if necessary + if self._pad_v: + output = output[..., :v.shape[-1]] + return output.flatten(start_dim=-2) @abstractmethod diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 83e181116..1edfab26b 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -653,10 +653,6 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): if isinstance(attn_out, tuple): attn_out, lse = attn_out[0], attn_out[1] - # unpad if necessary - if self._pad_v: - attn_out = attn_out[..., :v.shape[-1]] - # Remain consistent with old `flash_attn_varlen_func` where there # is only one output tensor if `return_softmax_lse` is False. 
if return_softmax_lse: @@ -839,6 +835,10 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): suffix_lse=suffix_lse, ) + # unpad if necessary + if self._pad_v: + output = output[..., :v.shape[-1]] + return output.flatten(start_dim=-2) @abstractmethod -- GitLab From d781930f90c0863ee1978820caf0568f59bf39ca Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 28 May 2025 18:52:34 +0800 Subject: [PATCH 019/274] [Platform][Dist] Make torch distributed process group extendable (#18763) Signed-off-by: Mengqing Cao --- vllm/distributed/utils.py | 89 ++++++++++++++++++++++--------------- vllm/platforms/cuda.py | 33 ++++++++++++++ vllm/platforms/interface.py | 16 +++++++ vllm/platforms/rocm.py | 33 ++++++++++++++ 4 files changed, 134 insertions(+), 37 deletions(-) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 93a069d36..96d08dc1a 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -5,7 +5,6 @@ # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
import dataclasses -import datetime import os import pickle import socket @@ -14,14 +13,14 @@ import time import uuid from collections import deque from collections.abc import Sequence +from datetime import timedelta from typing import Any, Optional import torch from torch.distributed import ProcessGroup, TCPStore from torch.distributed.distributed_c10d import (Backend, PrefixStore, _get_default_timeout, - _unregister_process_group, - is_nccl_available) + _unregister_process_group) from torch.distributed.rendezvous import rendezvous import vllm.envs as envs @@ -406,7 +405,7 @@ class StatelessProcessGroup: port=port, world_size=world_size, is_master=launch_server, - timeout=datetime.timedelta(seconds=store_timeout), + timeout=timedelta(seconds=store_timeout), use_libuv=False, # for now: github.com/pytorch/pytorch/pull/150215 master_listen_fd=listen_fd, ) @@ -419,6 +418,43 @@ class StatelessProcessGroup: data_expiration_seconds=data_expiration_seconds) +def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore, + group_rank: int, group_size: int, + timeout: timedelta) -> ProcessGroup: + """ + Stateless init ProcessGroup with gloo backend compatible with + different torch versions. 
+ """ + if is_torch_equal_or_newer("2.6"): + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + else: + options = ProcessGroup.Options(backend=backend) + pg = ProcessGroup( + prefix_store, + group_rank, + group_size, + options, + ) + from torch.distributed.distributed_c10d import ProcessGroupGloo + backend_class = ProcessGroupGloo(prefix_store, + group_rank, + group_size, + timeout=timeout) + backend_type = ProcessGroup.BackendType.GLOO + device = torch.device("cpu") + if is_torch_equal_or_newer("2.6"): + # _set_default_backend is supported in torch >= 2.6 + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg + + def stateless_init_torch_distributed_process_group( host: str, port: int, rank: int, world_size: int, backend: str) -> ProcessGroup: @@ -468,40 +504,19 @@ def stateless_init_torch_distributed_process_group( # different systems (e.g. RPC) in case the store is multi-tenant. 
prefix_store = PrefixStore(init_method, store) - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - ) - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - else: - raise RuntimeError(f"Unsupported torch distributed backend: {backend}") - - pg._set_default_backend(backend_type) - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg + return init_gloo_process_group(backend=backend, + prefix_store=prefix_store, + group_rank=group_rank, + group_size=group_size, + timeout=timeout) + from vllm.platforms import current_platform + return current_platform.stateless_init_device_torch_dist_pg( + backend=backend, + prefix_store=prefix_store, + group_rank=group_rank, + group_size=group_size, + timeout=timeout) def stateless_destroy_torch_distributed_process_group( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8bb3dfe74..0bed44f73 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,10 +4,13 @@ pynvml. However, it should not initialize cuda context. 
""" import os +from datetime import timedelta from functools import wraps from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union import torch +from torch.distributed import PrefixStore, ProcessGroup +from torch.distributed.distributed_c10d import is_nccl_available from typing_extensions import ParamSpec # import custom ops, trigger op registration @@ -316,6 +319,36 @@ class CudaPlatformBase(Platform): def get_piecewise_backend_cls(cls) -> str: return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + assert is_nccl_available() + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 504c3b42a..5c4f7a2f7 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -3,11 +3,13 @@ import enum import os import platform import random +from datetime import timedelta from platform import uname from typing import TYPE_CHECKING, NamedTuple, Optional, Union import numpy as np import torch +from torch.distributed import PrefixStore, ProcessGroup from vllm.inputs import ProcessorInputs, PromptType from vllm.logger import init_logger @@ -486,6 +488,20 @@ class Platform: """ return 
"vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend" # noqa + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + """ + Init platform-specific torch distributed process group. + """ + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index b5e742c65..d544b4ab4 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import os +from datetime import timedelta from functools import cache, lru_cache, wraps from typing import TYPE_CHECKING, Optional import torch +from torch.distributed import PrefixStore, ProcessGroup +from torch.distributed.distributed_c10d import is_nccl_available import vllm.envs as envs from vllm.logger import init_logger @@ -387,3 +390,33 @@ class RocmPlatform(Platform): @classmethod def get_piecewise_backend_cls(cls) -> str: return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend" # noqa + + @classmethod + def stateless_init_device_torch_dist_pg( + cls, + backend: str, + prefix_store: PrefixStore, + group_rank: int, + group_size: int, + timeout: timedelta, + ) -> ProcessGroup: + assert is_nccl_available() + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + ) + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + return pg -- 
GitLab From 4c2b38ce9e90a0ac7c3e7ca400daf3a622cc7bca Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 28 May 2025 13:46:04 +0100 Subject: [PATCH 020/274] Enable Pydantic mypy checks and convert configs to Pydantic dataclasses (#17599) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- pyproject.toml | 1 + tests/lora/test_quant_model.py | 14 +-- tests/tracing/test_tracing.py | 2 +- vllm/config.py | 108 ++++++++++-------- vllm/engine/arg_utils.py | 43 +++---- vllm/entrypoints/llm.py | 5 +- vllm/entrypoints/openai/protocol.py | 12 +- vllm/entrypoints/openai/serving_engine.py | 16 +-- .../guided_decoding/guided_fields.py | 6 +- vllm/utils.py | 8 -- 11 files changed, 115 insertions(+), 102 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b45619a32..628782228 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,7 +58,7 @@ repos: entry: tools/mypy.sh 0 "local" language: python types: [python] - additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests] + additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic] stages: [pre-commit] # Don't run in CI - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward name: Run mypy for Python 3.9 diff --git a/pyproject.toml b/pyproject.toml index 62a734d79..eb55a9ffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,7 @@ ignore = [ ] [tool.mypy] +plugins = ['pydantic.mypy'] ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index caf71976a..7a76ffb74 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -24,16 +24,16 @@ if current_platform.is_rocm(): MODELS = 
[ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), + quantization="gptq"), ] else: MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="AWQ"), + quantization="awq"), ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), + quantization="gptq"), ] @@ -100,7 +100,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): "#ff8050", "#ff8080", ] - elif model.quantization == "AWQ": + elif model.quantization == "awq": expected_no_lora_output = [ "I'm sorry, I don't understand", "I'm sorry, I don't understand", @@ -109,7 +109,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): "#f07700: A v", "#f00000: A v", ] - elif model.quantization == "GPTQ": + elif model.quantization == "gptq": expected_no_lora_output = [ "I'm sorry, I don't have", "I'm sorry, I don't have", @@ -122,7 +122,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): def expect_match(output, expected_output): # HACK: GPTQ lora outputs are just incredibly unstable. # Assert that the outputs changed. 
- if (model.quantization == "GPTQ" + if (model.quantization == "gptq" and expected_output is expected_lora_output): assert output != expected_no_lora_output for i, o in enumerate(output): @@ -172,7 +172,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model): if num_gpus_available < 2: pytest.skip(f"Not enough GPUs for tensor parallelism {2}") - if model.quantization == "GPTQ": + if model.quantization == "gptq": pytest.skip("GPTQ lora outputs are just incredibly unstable") llm_tp1 = vllm.LLM( model=model.model_path, diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index a781b8b56..caa233ec3 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -173,7 +173,7 @@ def test_traces_with_detailed_steps( llm = LLM( model=model, otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS, - collect_detailed_traces="all", + collect_detailed_traces=["all"], ) prompts = ["This is a short prompt"] outputs = llm.generate(prompts, sampling_params=sampling_params) diff --git a/vllm/config.py b/vllm/config.py index 30d1a64a4..4d9ca580f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -11,8 +11,8 @@ import uuid import warnings from collections import Counter from contextlib import contextmanager -from dataclasses import (MISSING, Field, asdict, dataclass, field, fields, - is_dataclass, replace) +from dataclasses import (MISSING, Field, asdict, field, fields, is_dataclass, + replace) from functools import cached_property from importlib.util import find_spec from pathlib import Path @@ -21,9 +21,12 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional, import regex as re import torch +from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, + model_validator) +from pydantic.dataclasses import dataclass from torch.distributed import ProcessGroup, ReduceOp from transformers import PretrainedConfig -from typing_extensions import deprecated +from typing_extensions 
import deprecated, runtime_checkable import vllm.envs as envs from vllm import version @@ -57,10 +60,15 @@ if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.model_loader import BaseModelLoader + from vllm.model_executor.model_loader.tensorizer import TensorizerConfig ConfigType = type[DataclassInstance] else: + PlacementGroup = Any + ExecutorBase = Any QuantizationConfig = Any + BaseModelLoader = Any + TensorizerConfig = Any ConfigType = type logger = init_logger(__name__) @@ -92,6 +100,7 @@ HfOverrides = Union[dict[str, Any], Callable[[PretrainedConfig], PretrainedConfig]] +@runtime_checkable class SupportsHash(Protocol): def compute_hash(self) -> str: @@ -223,7 +232,7 @@ ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class ModelConfig: """Configuration for the model.""" @@ -236,7 +245,7 @@ class ModelConfig: task, even if the same model can be used for multiple tasks. When the model only supports one task, "auto" can be used to select it; otherwise, you must specify explicitly which task to use.""" - tokenizer: str = None # type: ignore + tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" @@ -284,7 +293,7 @@ class ModelConfig: """The specific revision to use for the tokenizer on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - max_model_len: int = None # type: ignore + max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. 
@@ -602,6 +611,22 @@ class ModelConfig: self._verify_cuda_graph() self._verify_bnb_config() + @field_validator("quantization", mode="before") + @classmethod + def validate_quantization_before(cls, value: Any) -> Any: + if isinstance(value, str): + return value.lower() + return value + + @model_validator(mode="after") + def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": + if not isinstance(self.tokenizer, str): + raise ValueError("tokenizer must be a string after __post_init__.") + if not isinstance(self.max_model_len, int): + raise ValueError( + "max_model_len must be an integer after __post_init__.") + return self + @property def registry(self): return ModelRegistry @@ -823,8 +848,7 @@ class ModelConfig: "quark", "modelopt_fp4", "bitblas", "gptq_bitblas" ] if self.quantization is not None: - self.quantization = cast(QuantizationMethods, - self.quantization.lower()) + self.quantization = cast(QuantizationMethods, self.quantization) # Parse quantization method from the HF model config, if available. quant_cfg = self._parse_quant_hf_config() @@ -1397,7 +1421,7 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"] class CacheConfig: """Configuration for the KV cache.""" - block_size: BlockSize = None # type: ignore + block_size: SkipValidation[BlockSize] = None # type: ignore """Size of a contiguous cache block in number of tokens. This is ignored on neuron devices and set to `--max-model-len`. On CUDA devices, only block sizes up to 32 are supported. On HPU devices, block size defaults to 128. @@ -1619,7 +1643,8 @@ class LoadConfig: download_dir: Optional[str] = None """Directory to download and load the weights, default to the default cache directory of Hugging Face.""" - model_loader_extra_config: dict = field(default_factory=dict) + model_loader_extra_config: Union[dict, TensorizerConfig] = field( + default_factory=dict) """Extra config for model loader. 
This will be passed to the model loader corresponding to the chosen load_format.""" ignore_patterns: Optional[Union[list[str], str]] = None @@ -1929,19 +1954,19 @@ class SchedulerConfig: runner_type: RunnerType = "generate" """The runner type to launch for the model.""" - max_num_batched_tokens: int = None # type: ignore + max_num_batched_tokens: SkipValidation[int] = None # type: ignore """Maximum number of tokens to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_num_seqs: int = None # type: ignore + max_num_seqs: SkipValidation[int] = None # type: ignore """Maximum number of sequences to be processed in a single iteration. This config has no static default. If left unspecified by the user, it will be set in `EngineArgs.create_engine_config` based on the usage context.""" - max_model_len: int = None # type: ignore + max_model_len: SkipValidation[int] = None # type: ignore """Maximum length of a sequence (including prompt and generated text). 
This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" @@ -1980,7 +2005,7 @@ class SchedulerConfig: """Apply a delay (of delay factor multiplied by previous prompt latency) before scheduling next prompt.""" - enable_chunked_prefill: bool = None # type: ignore + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.""" @@ -2202,7 +2227,7 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class DeviceConfig: """Configuration for the device to use for vLLM execution.""" @@ -2260,8 +2285,8 @@ class DeviceConfig: self.device = torch.device(self.device_type) -SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator", - "draft_model", "deepseek_mtp"] +SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa", + "mlp_speculator", "draft_model", "deepseek_mtp"] SpeculativeAcceptanceMethod = Literal["rejection_sampler", "typical_acceptance_sampler"] @@ -2272,8 +2297,7 @@ class SpeculativeConfig: """Configuration for speculative decoding.""" # General speculative decoding control - num_speculative_tokens: int = field(default=None, - init=True) # type: ignore + num_speculative_tokens: SkipValidation[int] = None # type: ignore """The number of speculative tokens, if provided. It will default to the number in the draft model config if present, otherwise, it is required.""" model: Optional[str] = None @@ -2349,26 +2373,23 @@ class SpeculativeConfig: """Specifies the tree structure for speculative token generation. 
""" # required configuration params passed from engine - target_model_config: ModelConfig = field(default=None, - init=True) # type: ignore + target_model_config: SkipValidation[ModelConfig] = None # type: ignore """The configuration of the target model.""" - target_parallel_config: ParallelConfig = field(default=None, - init=True) # type: ignore + target_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore """The parallel configuration for the target model.""" - enable_chunked_prefill: bool = field(default=None, - init=True) # type: ignore + enable_chunked_prefill: SkipValidation[bool] = None # type: ignore """Whether vLLM is configured to use chunked prefill or not. Used for raising an error since it's not yet compatible with speculative decode.""" - disable_log_stats: bool = field(default=None, init=True) # type: ignore + disable_log_stats: SkipValidation[bool] = None # type: ignore """Whether to disable the periodic printing of stage times in speculative decoding.""" # params generated in the post-init stage - draft_model_config: ModelConfig = field(default=None, - init=True) # type: ignore + draft_model_config: SkipValidation[ModelConfig] = None # type: ignore """The configuration of the draft model initialized internal.""" - draft_parallel_config: ParallelConfig = field(default=None, - init=True) # type: ignore + draft_parallel_config: SkipValidation[ + ParallelConfig] = None # type: ignore """The parallel configuration for the draft model initialized internal.""" def compute_hash(self) -> str: @@ -2766,7 +2787,7 @@ LoRADType = Literal["auto", "float16", "bfloat16"] @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class LoRAConfig: """Configuration for LoRA.""" @@ -2863,7 +2884,7 @@ class LoRAConfig: @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class PromptAdapterConfig: """Configuration for PromptAdapters.""" @@ -3892,17 +3913,11 @@ class CompilationConfig: "pass_config", 
"traced_files", } - include = dict() - for k, v in asdict(self).items(): - if k in exclude: - continue - f = get_field(CompilationConfig, k) - if (d := f.default) is not MISSING and d == v: - continue - if (df := f.default_factory) is not MISSING and df() == v: - continue - include[k] = v - return json.dumps(include) + # The cast to string is necessary because Pydantic is mocked in docs + # builds and sphinx-argparse doesn't know the return type of decode() + return str( + TypeAdapter(CompilationConfig).dump_json( + self, exclude=exclude, exclude_unset=True).decode()) __str__ = __repr__ @@ -3911,7 +3926,7 @@ class CompilationConfig: """Parse the CLI value for the compilation config.""" if cli_value in ["0", "1", "2", "3"]: return cls(level=int(cli_value)) - return cls(**json.loads(cli_value)) + return TypeAdapter(CompilationConfig).validate_json(cli_value) def __post_init__(self) -> None: count_none = self.custom_ops.count("none") @@ -4037,7 +4052,7 @@ class CompilationConfig: @config -@dataclass +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) class VllmConfig: """Dataclass which contains all vllm-related configuration. This simplifies passing around the distinct configurations in the codebase. @@ -4294,9 +4309,6 @@ class VllmConfig: "To workaround this limitation, vLLM will set 'ieee' input " "precision for chunked prefill triton kernels.") - if self.compilation_config is None: - self.compilation_config = CompilationConfig() - # async tp is built on top of sequence parallelism # and requires it to be enabled. 
if self.compilation_config.pass_config.enable_async_tp: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 442e4100f..2a1a34211 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -14,6 +14,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional, import regex as re import torch +from pydantic import SkipValidation, TypeAdapter, ValidationError from typing_extensions import TypeIs, deprecated import vllm.envs as envs @@ -38,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, is_in_doc_build, is_in_ray_actor) + GiB_bytes, is_in_ray_actor) # yapf: enable @@ -156,7 +157,8 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the set of possible types for the field type_hints: set[TypeHint] = set() if get_origin(field.type) in {Union, Annotated}: - type_hints.update(get_args(field.type)) + predicate = lambda arg: not isinstance(arg, SkipValidation) + type_hints.update(filter(predicate, get_args(field.type))) else: type_hints.add(field.type) @@ -168,10 +170,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: if field.default is not MISSING: default = field.default elif field.default_factory is not MISSING: - if is_dataclass(field.default_factory) and is_in_doc_build(): - default = {} - else: - default = field.default_factory() + default = field.default_factory() # Get the help text for the field name = field.name @@ -189,12 +188,16 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" if dataclass_cls is not None: - dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x)) - # Special case for configs with a from_cli method - if 
hasattr(dataclass_cls, "from_cli"): - from_cli = dataclass_cls.from_cli - dataclass_init = lambda x, f=from_cli: f(x) - kwargs[name]["type"] = dataclass_init + + def parse_dataclass(val: str, cls=dataclass_cls) -> Any: + try: + if hasattr(cls, "from_cli"): + return cls.from_cli(val) + return TypeAdapter(cls).validate_json(val) + except ValidationError as e: + raise argparse.ArgumentTypeError(repr(e)) from e + + kwargs[name]["type"] = parse_dataclass kwargs[name]["help"] += json_tip elif contains_type(type_hints, bool): # Creates --no- and -- flags @@ -225,12 +228,11 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name]["type"] = human_readable_int elif contains_type(type_hints, float): kwargs[name]["type"] = float - elif contains_type(type_hints, - dict) and (contains_type(type_hints, str) or any( - is_not_builtin(th) for th in type_hints)): + elif (contains_type(type_hints, dict) + and (contains_type(type_hints, str) + or any(is_not_builtin(th) for th in type_hints))): kwargs[name]["type"] = union_dict_and_str elif contains_type(type_hints, dict): - # Dict arguments will always be optional kwargs[name]["type"] = parse_type(json.loads) kwargs[name]["help"] += json_tip elif (contains_type(type_hints, str) @@ -317,8 +319,7 @@ class EngineArgs: rope_scaling: dict[str, Any] = get_field(ModelConfig, "rope_scaling") rope_theta: Optional[float] = ModelConfig.rope_theta hf_token: Optional[Union[bool, str]] = ModelConfig.hf_token - hf_overrides: Optional[HfOverrides] = \ - get_field(ModelConfig, "hf_overrides") + hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") tokenizer_revision: Optional[str] = ModelConfig.tokenizer_revision quantization: Optional[QuantizationMethods] = ModelConfig.quantization enforce_eager: bool = ModelConfig.enforce_eager @@ -398,7 +399,8 @@ class EngineArgs: get_field(ModelConfig, "override_neuron_config") override_pooler_config: Optional[Union[dict, PoolerConfig]] = \ ModelConfig.override_pooler_config - 
compilation_config: Optional[CompilationConfig] = None + compilation_config: CompilationConfig = \ + get_field(VllmConfig, "compilation_config") worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls @@ -413,7 +415,8 @@ class EngineArgs: calculate_kv_scales: bool = CacheConfig.calculate_kv_scales - additional_config: Optional[Dict[str, Any]] = None + additional_config: dict[str, Any] = \ + get_field(VllmConfig, "additional_config") enable_reasoning: Optional[bool] = None # DEPRECATED reasoning_parser: str = DecodingConfig.reasoning_backend diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 59cc44eb0..7e2e5161c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -207,6 +207,9 @@ class LLM: if isinstance(worker_cls, type): kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) + if hf_overrides is None: + hf_overrides = {} + if compilation_config is not None: if isinstance(compilation_config, int): compilation_config_instance = CompilationConfig( @@ -218,7 +221,7 @@ class LLM: else: compilation_config_instance = compilation_config else: - compilation_config_instance = None + compilation_config_instance = CompilationConfig() engine_args = EngineArgs( model=model, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 393cf381b..a7f85e9ee 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -175,11 +175,15 @@ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel): type: Literal["function"] = "function" +# extra="forbid" is a workaround to have kwargs as a field, +# see https://github.com/pydantic/pydantic/issues/3125 class LogitsProcessorConstructor(BaseModel): qualname: str args: Optional[list[Any]] = None kwargs: Optional[dict[str, Any]] = None + model_config = ConfigDict(extra="forbid") + LogitsProcessors = list[Union[str, LogitsProcessorConstructor]] @@ -234,7 +238,7 @@ class 
ChatCompletionRequest(OpenAIBaseModel): presence_penalty: Optional[float] = 0.0 response_format: Optional[AnyResponseFormat] = None seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, list[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = [] stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None temperature: Optional[float] = None @@ -258,7 +262,7 @@ class ChatCompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[list[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = [] include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 @@ -756,7 +760,7 @@ class CompletionRequest(OpenAIBaseModel): n: int = 1 presence_penalty: Optional[float] = 0.0 seed: Optional[int] = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max) - stop: Optional[Union[str, list[str]]] = Field(default_factory=list) + stop: Optional[Union[str, list[str]]] = [] stream: Optional[bool] = False stream_options: Optional[StreamOptions] = None suffix: Optional[str] = None @@ -770,7 +774,7 @@ class CompletionRequest(OpenAIBaseModel): min_p: Optional[float] = None repetition_penalty: Optional[float] = None length_penalty: float = 1.0 - stop_token_ids: Optional[list[int]] = Field(default_factory=list) + stop_token_ids: Optional[list[int]] = [] include_stop_str_in_output: bool = False ignore_eos: bool = False min_tokens: int = 0 diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index c73575b48..f96a4ac8b 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -134,11 +134,9 @@ class RequestProcessingMixin(BaseModel): Mixin for request processing, handling prompt preparation and engine input. 
""" - request_prompts: Optional[Sequence[RequestPrompt]] = \ - Field(default_factory=list) + request_prompts: Optional[Sequence[RequestPrompt]] = [] engine_prompts: Optional[Union[list[EngineTokensPrompt], - list[EngineEmbedsPrompt]]] = Field( - default_factory=list) + list[EngineEmbedsPrompt]]] = [] model_config = ConfigDict(arbitrary_types_allowed=True) @@ -528,12 +526,14 @@ class OpenAIServing: if isinstance(request, (EmbeddingChatRequest, EmbeddingCompletionRequest, ScoreRequest, RerankRequest, ClassificationRequest)): - operation = { - ScoreRequest: "score", - ClassificationRequest: "classification" - }.get(type(request), "embedding generation") if token_num > self.max_model_len: + operations: dict[type[AnyRequest], str] = { + ScoreRequest: "score", + ClassificationRequest: "classification" + } + operation = operations.get(type(request), + "embedding generation") raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 085f37a5d..316860718 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -3,12 +3,10 @@ from dataclasses import dataclass from typing import Optional, TypedDict, Union -from pydantic import BaseModel - # These classes are deprecated, see SamplingParams class LLMGuidedOptions(TypedDict, total=False): - guided_json: Union[dict, BaseModel, str] + guided_json: Union[dict, str] guided_regex: str guided_choice: list[str] guided_grammar: str @@ -20,7 +18,7 @@ class LLMGuidedOptions(TypedDict, total=False): @dataclass class GuidedDecodingRequest: """One of the fields will be used to retrieve the logit processor.""" - guided_json: Optional[Union[dict, BaseModel, str]] = None + guided_json: Optional[Union[dict, str]] = None guided_regex: Optional[str] = None guided_choice: Optional[list[str]] = 
None guided_grammar: Optional[str] = None diff --git a/vllm/utils.py b/vllm/utils.py index 846df7743..c1213d463 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1878,14 +1878,6 @@ def get_cuda_view_from_cpu_tensor(cpu_tensor: torch.Tensor) -> torch.Tensor: return torch.ops._C.get_cuda_view_from_cpu_tensor(cpu_tensor) -def is_in_doc_build() -> bool: - try: - from sphinx.ext.autodoc.mock import _MockModule - return isinstance(zmq, _MockModule) - except ModuleNotFoundError: - return False - - def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): """ Import a Python file according to its file path. -- GitLab From 435fa95444b2b0e408dc4b51ce7e759bbb3d0bcc Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 28 May 2025 22:08:57 +0800 Subject: [PATCH 021/274] [Frontend] add run batch to CLI (#18804) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- .../offline_inference/openai_batch/README.md | 37 ++++++++++++- tests/entrypoints/openai/test_run_batch.py | 26 ++++----- vllm/entrypoints/cli/main.py | 2 + vllm/entrypoints/cli/run_batch.py | 55 +++++++++++++++++++ vllm/entrypoints/openai/run_batch.py | 12 ++-- 5 files changed, 110 insertions(+), 22 deletions(-) create mode 100644 vllm/entrypoints/cli/run_batch.py diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 42a19f71e..ce7529782 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -48,7 +48,19 @@ The batch running tool is designed to be used from the command line. 
You can run the batch with the following command, which will write its results to a file called `results.jsonl` ```console -python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +or use command-line: + +```console +vllm run-batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -68,7 +80,19 @@ The batch runner supports remote input and output urls that are accessible via h For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run ```console -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +or use command-line: + +```console +vllm run-batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -164,6 +188,15 @@ python -m vllm.entrypoints.openai.run_batch \ --model --model meta-llama/Meta-Llama-3-8B-Instruct ``` +or use command-line: + +```console +vllm run-batch \ + -i 
"https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ + --model --model meta-llama/Meta-Llama-3-8B-Instruct +``` + ### Step 4: View your results Your results are now on S3. You can view them in your terminal by running diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 643d0d06a..27802945a 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -2,7 +2,6 @@ import json import subprocess -import sys import tempfile from vllm.entrypoints.openai.protocol import BatchRequestOutput @@ -35,9 +34,8 @@ def test_empty_file(): input_file.write("") input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "intfloat/multilingual-e5-small" ], ) proc.communicate() proc.wait() @@ -54,9 +52,8 @@ def test_completions(): input_file.write(INPUT_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "NousResearch/Meta-Llama-3-8B-Instruct" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "NousResearch/Meta-Llama-3-8B-Instruct" ], ) proc.communicate() proc.wait() @@ -79,9 +76,8 @@ def test_completions_invalid_input(): input_file.write(INVALID_INPUT_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - 
"NousResearch/Meta-Llama-3-8B-Instruct" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "NousResearch/Meta-Llama-3-8B-Instruct" ], ) proc.communicate() proc.wait() @@ -95,9 +91,8 @@ def test_embeddings(): input_file.write(INPUT_EMBEDDING_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", - input_file.name, "-o", output_file.name, "--model", - "intfloat/multilingual-e5-small" + "vllm", "run-batch", "-i", input_file.name, "-o", output_file.name, + "--model", "intfloat/multilingual-e5-small" ], ) proc.communicate() proc.wait() @@ -117,9 +112,8 @@ def test_score(): input_file.write(INPUT_SCORE_BATCH) input_file.flush() proc = subprocess.Popen([ - sys.executable, - "-m", - "vllm.entrypoints.openai.run_batch", + "vllm", + "run-batch", "-i", input_file.name, "-o", diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 6676c294c..5eba72fec 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -7,6 +7,7 @@ import sys import vllm.entrypoints.cli.benchmark.main import vllm.entrypoints.cli.collect_env import vllm.entrypoints.cli.openai +import vllm.entrypoints.cli.run_batch import vllm.entrypoints.cli.serve import vllm.version from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup @@ -17,6 +18,7 @@ CMD_MODULES = [ vllm.entrypoints.cli.serve, vllm.entrypoints.cli.benchmark.main, vllm.entrypoints.cli.collect_env, + vllm.entrypoints.cli.run_batch, ] diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py new file mode 100644 index 000000000..f74c8da9b --- /dev/null +++ b/vllm/entrypoints/cli/run_batch.py @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import asyncio + +from prometheus_client import start_http_server + +from vllm.entrypoints.cli.types import CLISubcommand +from vllm.entrypoints.logger import logger +from vllm.entrypoints.openai.run_batch 
import main as run_batch_main +from vllm.entrypoints.openai.run_batch import make_arg_parser +from vllm.utils import FlexibleArgumentParser +from vllm.version import __version__ as VLLM_VERSION + + +class RunBatchSubcommand(CLISubcommand): + """The `run-batch` subcommand for vLLM CLI.""" + + def __init__(self): + self.name = "run-batch" + super().__init__() + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + logger.info("vLLM batch processing API version %s", VLLM_VERSION) + logger.info("args: %s", args) + + # Start the Prometheus metrics server. + # LLMEngine uses the Prometheus client + # to publish metrics at the /metrics endpoint. + if args.enable_metrics: + logger.info("Prometheus metrics enabled") + start_http_server(port=args.port, addr=args.url) + else: + logger.info("Prometheus metrics disabled") + + asyncio.run(run_batch_main(args)) + + def subparser_init( + self, + subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser: + run_batch_parser = subparsers.add_parser( + "run-batch", + help="Run batch prompts and write results to file.", + description=( + "Run batch prompts using vLLM's OpenAI-compatible API.\n" + "Supports local or HTTP input/output files."), + usage= + "vllm run-batch -i INPUT.jsonl -o OUTPUT.jsonl --model ", + ) + return make_arg_parser(run_batch_parser) + + +def cmd_init() -> list[CLISubcommand]: + return [RunBatchSubcommand()] diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index eae83c9a4..f38465b22 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -33,9 +33,7 @@ from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION -def parse_args(): - parser = FlexibleArgumentParser( - description="vLLM OpenAI-Compatible batch runner.") +def make_arg_parser(parser: FlexibleArgumentParser): parser.add_argument( "-i", "--input-file", @@ -98,7 +96,13 @@ def parse_args(): default=False, 
help="If set to True, enable prompt_tokens_details in usage.") - return parser.parse_args() + return parser + + +def parse_args(): + parser = FlexibleArgumentParser( + description="vLLM OpenAI-Compatible batch runner.") + return make_arg_parser(parser).parse_args() # explicitly use pure text format, with a newline at the end -- GitLab From 6e4cea1cc56da6d4a558a50196c5698c36385890 Mon Sep 17 00:00:00 2001 From: daniel-salib Date: Wed, 28 May 2025 07:15:12 -0700 Subject: [PATCH 022/274] decrement server_load on listen for disconnect (#18784) Signed-off-by: Daniel Salib --- vllm/entrypoints/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index cc651a172..1b0ea6909 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -26,6 +26,11 @@ async def listen_for_disconnect(request: Request) -> None: while True: message = await request.receive() if message["type"] == "http.disconnect": + if request.app.state.enable_server_load_tracking: + # on timeout/cancellation the BackgroundTask in load_aware_call + # cannot decrement the server load metrics. + # Must be decremented by with_cancellation instead. 
+ request.app.state.server_load_metrics -= 1 break -- GitLab From 321331b8ae41f13e519a63f99a0c427dc3907126 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 28 May 2025 09:58:24 -0600 Subject: [PATCH 023/274] [Core] Add Lora Support to Beam Search (#18346) Signed-off-by: Alex-Brooks --- .../entrypoints/openai/test_lora_adapters.py | 34 ++++++++++ tests/lora/test_qwen2vl.py | 62 ++++++++++++++++++- vllm/beam_search.py | 4 ++ vllm/engine/protocol.py | 22 ++++--- vllm/entrypoints/llm.py | 42 +++++++++++-- vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + 7 files changed, 150 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index 2fc08b475..cd07ca46c 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -313,3 +313,37 @@ async def test_loading_invalid_adapters_does_not_break_others( prompt=["Hello there", "Foo bar bazz buzz"], max_tokens=5, ) + + +@pytest.mark.asyncio +async def test_beam_search_with_lora_adapters( + client: openai.AsyncOpenAI, + tmp_path, + zephyr_lora_files, +): + """Validate that async beam search can be used with lora.""" + + async def load_and_run_adapter(adapter_name: str): + await client.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": adapter_name, + "lora_path": str(zephyr_lora_files) + }) + for _ in range(3): + await client.completions.create( + model=adapter_name, + prompt=["Hello there", "Foo bar bazz buzz"], + max_tokens=5, + extra_body=dict(use_beam_search=True), + ) + + lora_tasks = [] + for i in range(3): + lora_tasks.append( + asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))) + + results, _ = await asyncio.wait(lora_tasks) + + for r in results: + assert not isinstance(r, Exception), f"Got exception {r}" diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 7bd3e3d0f..162714df2 100644 --- 
a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -10,6 +10,7 @@ import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest from vllm.platforms import current_platform +from vllm.sampling_params import BeamSearchParams @pytest.fixture(autouse=not current_platform.is_cpu()) @@ -69,7 +70,7 @@ class Qwen2VLTester: expected_outputs: list[str], lora_id: Optional[int] = None, temperature: float = 0, - max_tokens: int = 5) -> list[str]: + max_tokens: int = 5): sampling_params = vllm.SamplingParams( temperature=temperature, @@ -97,7 +98,35 @@ class Qwen2VLTester: generated), f"Generated text {generated} doesn't " f"match expected pattern {expected}" - return generated_texts + def run_beam_search_test(self, + images: list[ImageAsset], + expected_outputs: list[list[str]], + lora_id: Optional[int] = None, + temperature: float = 0, + beam_width: int = 2, + max_tokens: int = 5): + + beam_search_params = BeamSearchParams(beam_width=beam_width, + max_tokens=max_tokens, + temperature=temperature) + + inputs = [{ + "prompt": self.PROMPT_TEMPLATE, + "multi_modal_data": { + "image": asset.pil_image + }, + } for asset in images] + + lora_request = LoRARequest(str(lora_id), lora_id, + self.config.lora_path) + outputs = self.llm.beam_search(inputs, + beam_search_params, + lora_request=lora_request) + + for output_obj, expected_outs in zip(outputs, expected_outputs): + output_texts = [seq.text for seq in output_obj.sequences] + assert output_texts == expected_outs, \ + f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501 TEST_IMAGES = [ @@ -110,6 +139,14 @@ EXPECTED_OUTPUTS = [ "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 ] +# NOTE - beam search .text contains the whole text +EXPECTED_BEAM_SEARCH_OUTPUTS = [ + [ + "<|im_start|>system\nYou are a helpful 
assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands", # noqa: E501 + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall", # noqa: E501 + ], +] + QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" @@ -130,6 +167,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files): lora_id=lora_id) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") +def test_qwen2vl_lora_beam_search(qwen2vl_lora_files): + """Test Qwen 2.0 VL model with LoRA through beam search.""" + config = TestConfig(model_path=QWEN2VL_MODEL_PATH, + lora_path=qwen2vl_lora_files) + tester = Qwen2VLTester(config) + + # Test with different LoRA IDs + for lora_id in [1, 2]: + # NOTE currently, we only test cherry blossom since stop sign + # output is slightly different for v1; - the root cause is likely + # independent of the intent of this test, which is to ensure beam + # search passes through lora through correctly. + tester.run_beam_search_test( + [ImageAsset("cherry_blossom")], + expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS, + lora_id=lora_id) + + @pytest.mark.xfail( current_platform.is_rocm(), reason="Qwen2.5-VL dependency xformers incompatible with ROCm", diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 967510aba..ddacc6695 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional, Union +from vllm.lora.request import LoRARequest from vllm.sequence import Logprob if TYPE_CHECKING: @@ -19,6 +20,7 @@ class BeamSearchSequence: # The tokens includes the prompt. 
tokens: list[int] logprobs: list[dict[int, Logprob]] + lora_request: Optional[LoRARequest] = None cum_logprob: float = 0.0 text: Optional[str] = None finish_reason: Optional[str] = None @@ -41,6 +43,7 @@ class BeamSearchInstance: def __init__( self, prompt_tokens: list[int], + lora_request: Optional[LoRARequest] = None, logprobs: Optional[list[dict[int, Logprob]]] = None, **kwargs, ): @@ -48,6 +51,7 @@ class BeamSearchInstance: BeamSearchSequence( tokens=prompt_tokens, logprobs=[] if logprobs is None else list(logprobs), + lora_request=lora_request, **kwargs, ) ] diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index a837a2d28..28341c2c6 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -65,6 +65,7 @@ class EngineClient(ABC): prompt: PromptType, request_id: str, params: BeamSearchParams, + lora_request: Optional[LoRARequest] = None, ) -> AsyncGenerator[RequestOutput, None]: beam_width = params.beam_width @@ -106,27 +107,31 @@ class EngineClient(ABC): cum_logprob=0, logprobs=[], multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs) + mm_processor_kwargs=mm_processor_kwargs, + lora_request=lora_request) ] completed = [] for _ in range(max_tokens): - prompts_batch = [ + prompts_batch, lora_req_batch = zip(*[( TokensPrompt(prompt_token_ids=beam.tokens, multi_modal_data=beam.multi_modal_data, - mm_processor_kwargs=beam.mm_processor_kwargs) - for beam in all_beams - ] + mm_processor_kwargs=beam.mm_processor_kwargs), + beam.lora_request, + ) for beam in all_beams]) tasks = [] request_id = f"beam_search-{random_uuid()}" - for i, individual_prompt in enumerate(prompts_batch): + for i, (individual_prompt, + lora_req) in enumerate(zip(prompts_batch, lora_req_batch)): request_id_item = f"{request_id}-{i}" task = asyncio.create_task( collect_from_async_generator( - self.generate(individual_prompt, beam_search_params, - request_id_item))) + self.generate(individual_prompt, + beam_search_params, + request_id_item, + 
lora_request=lora_req))) tasks.append(task) output = await asyncio.gather(*tasks) @@ -159,6 +164,7 @@ class EngineClient(ABC): tokens=current_beam.tokens + [token_id], logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, cum_logprob=current_beam.cum_logprob + logprob_obj.logprob, multi_modal_data=current_beam. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 7e2e5161c..f8eeae61f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -522,10 +522,28 @@ class LLM: executor = self.llm_engine.model_executor return executor.apply_model(func) + def _get_beam_search_lora_requests( + self, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]], + prompts: list[Union[TokensPrompt, TextPrompt]], + ) -> list[Optional[LoRARequest]]: + """Get the optional lora request corresponding to each prompt.""" + if isinstance(lora_request, + Sequence) and len(lora_request) != len(prompts): + raise ValueError( + "Lora request list should be the same length as the prompts") + return lora_request + + if lora_request is None or isinstance(lora_request, LoRARequest): + return [lora_request] * len(prompts) + + raise TypeError(f"Invalid lora_request type {type(lora_request)}") + def beam_search( self, prompts: list[Union[TokensPrompt, TextPrompt]], params: BeamSearchParams, + lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. @@ -534,6 +552,7 @@ class LLM: prompts: A list of prompts. Each prompt can be a string or a list of token IDs. params: The beam search parameters. + lora_request: LoRA request to use for generation, if any. """ # TODO: how does beam search work together with length penalty, # frequency, penalty, and stopping criteria, etc.? 
@@ -543,6 +562,9 @@ class LLM: ignore_eos = params.ignore_eos length_penalty = params.length_penalty + lora_requests = self._get_beam_search_lora_requests( + lora_request, prompts) + def sort_beams_key(x: BeamSearchSequence) -> float: return get_beam_search_score(x.tokens, x.cum_logprob, tokenizer.eos_token_id, @@ -570,7 +592,7 @@ class LLM: temperature=temperature) instances: list[BeamSearchInstance] = [] - for prompt in prompts: + for lora_req, prompt in zip(lora_requests, prompts): # Add multimodal processor kwargs & data mm_kwargs = {} if "multi_modal_data" in prompt: @@ -586,7 +608,12 @@ class LLM: prompt_tokens = tokenizer.encode(prompt["prompt"]) instances.append( - BeamSearchInstance(prompt_tokens, logprobs=None, **mm_kwargs)) + BeamSearchInstance( + prompt_tokens, + lora_request=lora_req, + logprobs=None, + **mm_kwargs, + ), ) for _ in range(max_tokens): all_beams: list[BeamSearchSequence] = list( @@ -600,15 +627,17 @@ class LLM: if len(all_beams) == 0: break - prompts_batch = [ - create_tokens_prompt_from_beam(beam) for beam in all_beams - ] + # create the corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[(create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams]) # only runs for one step # we don't need to use tqdm here output = self.generate(prompts_batch, sampling_params=beam_search_params, - use_tqdm=False) + use_tqdm=False, + lora_request=lora_req_batch) for (start, end), instance in zip(instance_start_and_end, instances): @@ -626,6 +655,7 @@ class LLM: new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], logprobs=current_beam.logprobs + [logprobs], + lora_request=current_beam.lora_request, cum_logprob=current_beam.cum_logprob + logprob_obj.logprob, multi_modal_data=current_beam.multi_modal_data, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index bc11686d7..6a0e3b14d 100644 --- 
a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -236,6 +236,7 @@ class OpenAIServingChat(OpenAIServing): prompt=engine_prompt, request_id=request_id, params=sampling_params, + lora_request=lora_request, ) else: generator = self.engine_client.generate( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 7beaae287..1c06070cb 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -186,6 +186,7 @@ class OpenAIServingCompletion(OpenAIServing): prompt=engine_prompt, request_id=request_id, params=sampling_params, + lora_request=lora_request, ) else: generator = self.engine_client.generate( -- GitLab From fced756923e3b65dad0e1f4da202c4d2f82d2a4b Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 28 May 2025 11:59:11 -0400 Subject: [PATCH 024/274] [Chore] update ty configuration (#18839) Signed-off-by: Aaron Pham --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index eb55a9ffc..5286724b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -172,7 +172,8 @@ plugins.md033.enabled = false # inline-html plugins.md046.enabled = false # code-block-style plugins.md024.allow_different_nesting = true # no-duplicate-headers -[tool.ty] +[tool.ty.src] +root = "./vllm" respect-ignore-files = true [tool.ty.environment] -- GitLab From c68b5c63eba0ecccda8ab34ca1829cb4ced2c430 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Thu, 29 May 2025 01:36:21 +0800 Subject: [PATCH 025/274] [Misc] fix olmoe model layer can't laod in tp gt 1 (#18828) Signed-off-by: rongfu.leng --- vllm/model_executor/models/olmoe.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 6364b89fb..af2894555 100644 --- a/vllm/model_executor/models/olmoe.py +++ 
b/vllm/model_executor/models/olmoe.py @@ -13,6 +13,7 @@ # limitations under the License. """Inference-only OLMoE model compatible with HuggingFace weights.""" from collections.abc import Iterable +from functools import partial from typing import Any, Optional, Union import torch @@ -22,7 +23,10 @@ from transformers import PretrainedConfig from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) +from vllm.distributed.utils import split_tensor_along_last_dim from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -140,8 +144,11 @@ class OlmoeAttention(nn.Module): bias=False, quant_config=quant_config, ) - self.q_norm = RMSNorm(hidden_size, eps=1e-5) - self.k_norm = RMSNorm(hidden_size, eps=1e-5) + self.tp_size = tp_size + self.tp_rank = get_tensor_model_parallel_rank() + self.q_norm = RMSNorm(self.total_num_heads * self.head_dim, eps=1e-5) + self.k_norm = RMSNorm(self.total_num_kv_heads * self.head_dim, + eps=1e-5) self.o_proj = RowParallelLinear( self.total_num_heads * self.head_dim, hidden_size, @@ -165,6 +172,20 @@ class OlmoeAttention(nn.Module): quant_config=quant_config, prefix=f"{prefix}.attn") + def _apply_qk_norm(self, q: torch.Tensor, + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + if self.tp_size > 1: + q = tensor_model_parallel_all_gather(q.contiguous()) + k = tensor_model_parallel_all_gather(k.contiguous()) + q = self.q_norm(q) + k = self.k_norm(k) + if self.tp_size > 1: + splitter = partial(split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + return q, k + 
def forward( self, positions: torch.Tensor, @@ -172,7 +193,7 @@ class OlmoeAttention(nn.Module): ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.q_norm(q.contiguous()), self.k_norm(k.contiguous()) + q, k = self._apply_qk_norm(q, k) q, k = self.rotary_emb(positions, q, k) attn_output = self.attn(q, k, v) output, _ = self.o_proj(attn_output) -- GitLab From 0e98964e94385c914783842eac4b448236018689 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 28 May 2025 19:54:12 +0100 Subject: [PATCH 026/274] [V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837) Signed-off-by: Mark McLoughlin --- docs/usage/metrics.md | 13 --- .../prometheus_grafana/grafana.json | 30 ------- tests/entrypoints/openai/test_metrics.py | 7 +- vllm/engine/llm_engine.py | 15 ---- vllm/engine/metrics.py | 89 ------------------- vllm/engine/metrics_types.py | 3 - 6 files changed, 1 insertion(+), 156 deletions(-) diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index 9ad725318..6603aa83b 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -35,19 +35,6 @@ The following metrics are exposed: --8<-- "vllm/engine/metrics.py:metrics-definitions" ``` -The following metrics are deprecated and due to be removed in a future version: - -- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and - `vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not - used in V1. -- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits - counters in V1. -- `vllm:time_in_queue_requests` because it duplicates - `vllm:request_queue_time_seconds`. -- `vllm:model_forward_time_milliseconds` and - `vllm:model_execute_time_milliseconds` because - prefill/decode/inference time metrics should be used instead. 
- Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, and are then removed in version `X.Y+2`. diff --git a/examples/online_serving/prometheus_grafana/grafana.json b/examples/online_serving/prometheus_grafana/grafana.json index fbe96b48e..3488956a5 100644 --- a/examples/online_serving/prometheus_grafana/grafana.json +++ b/examples/online_serving/prometheus_grafana/grafana.json @@ -577,23 +577,6 @@ "refId": "A", "useBackend": false }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "Num Swapped", - "range": true, - "refId": "B", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -874,19 +857,6 @@ "legendFormat": "GPU Cache Usage", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", - "hide": false, - "instant": false, - "legendFormat": "CPU Cache Usage", - "range": true, - "refId": "B" } ], "title": "Cache Utilization", diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 42f7b098f..b21c0173c 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, EXPECTED_METRICS = [ "vllm:num_requests_running", - "vllm:num_requests_swapped", # deprecated "vllm:num_requests_waiting", "vllm:gpu_cache_usage_perc", - "vllm:cpu_cache_usage_perc", # deprecated "vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_bucket", 
"vllm:time_to_first_token_seconds_count", @@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [ "vllm:request_decode_time_seconds_count", ] -HIDDEN_DEPRECATED_METRICS = [ - "vllm:num_requests_swapped", - "vllm:cpu_cache_usage_perc", -] +HIDDEN_DEPRECATED_METRICS: list[str] = [] @pytest.mark.asyncio diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ff33d566a..a9600a2c8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1680,9 +1680,6 @@ class LLMEngine: time_inference_requests: List[float] = [] time_prefill_requests: List[float] = [] time_decode_requests: List[float] = [] - time_in_queue_requests: List[float] = [] - model_forward_time_requests: List[float] = [] - model_execute_time_requests: List[float] = [] # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] @@ -1790,15 +1787,6 @@ class LLMEngine: now - seq_group.metrics.first_token_time) time_inference_requests.append( now - seq_group.metrics.first_scheduled_time) - if seq_group.metrics.time_in_queue is not None: - time_in_queue_requests.append( - seq_group.metrics.time_in_queue) - if seq_group.metrics.model_forward_time is not None: - model_forward_time_requests.append( - seq_group.metrics.model_forward_time) - if seq_group.metrics.model_execute_time is not None: - model_execute_time_requests.append( - seq_group.metrics.model_execute_time * 1000) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) @@ -1867,9 +1855,6 @@ class LLMEngine: time_inference_requests=time_inference_requests, time_prefill_requests=time_prefill_requests, time_decode_requests=time_decode_requests, - time_in_queue_requests=time_in_queue_requests, - model_forward_time_requests=model_forward_time_requests, - model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, diff --git a/vllm/engine/metrics.py 
b/vllm/engine/metrics.py index 34b48f83b..916afe0c8 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -80,17 +80,6 @@ class Metrics: multiprocess_mode="livemostrecent", ) - # Deprecated in 0.8 - KV cache offloading is not used in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_scheduler_swapped = self._gauge_cls( - name="vllm:num_requests_swapped", - documentation=( - "Number of requests swapped to CPU. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - # KV Cache Usage in % self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", @@ -98,35 +87,6 @@ class Metrics: labelnames=labelnames, multiprocess_mode="sum") - # Deprecated in 0.8 - KV cache offloading is not used in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_cpu_cache_usage = self._gauge_cls( - name="vllm:cpu_cache_usage_perc", - documentation=( - "CPU KV-cache usage. 1 means 100 percent usage. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( - name="vllm:cpu_prefix_cache_hit_rate", - documentation=( - "CPU prefix cache block hit rate. " - "DEPRECATED: KV cache offloading is not used in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - - # Deprecated in 0.8 - replaced by queries+hits counters in V1 - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( - name="vllm:gpu_prefix_cache_hit_rate", - documentation=("GPU prefix cache block hit rate. 
" - "DEPRECATED: use vllm:gpu_prefix_cache_queries " - "and vllm:gpu_prefix_cache_queries in V1"), - labelnames=labelnames, - multiprocess_mode="sum") - # Iteration stats self.counter_num_preemption = self._counter_cls( name="vllm:num_preemptions_total", @@ -200,36 +160,6 @@ class Metrics: "Histogram of time spent in DECODE phase for request.", labelnames=labelnames, buckets=request_latency_buckets) - # Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds: - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.histogram_time_in_queue_request = self._histogram_cls( - name="vllm:time_in_queue_requests", - documentation= - ("Histogram of time the request spent in the queue in seconds. " - "DEPRECATED: use vllm:request_queue_time_seconds instead."), - labelnames=labelnames, - buckets=request_latency_buckets) - - # Deprecated in 0.8 - use prefill/decode/inference time metrics - # Hidden in 0.9, due to be removed in 0.10 - if self.show_hidden_metrics: - self.histogram_model_forward_time_request = self._histogram_cls( - name="vllm:model_forward_time_milliseconds", - documentation= - ("Histogram of time spent in the model forward pass in ms. " - "DEPRECATED: use prefill/decode/inference time metrics instead" - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) - self.histogram_model_execute_time_request = self._histogram_cls( - name="vllm:model_execute_time_milliseconds", - documentation= - ("Histogram of time spent in the model execute function in ms." 
- "DEPRECATED: use prefill/decode/inference time metrics instead" - ), - labelnames=labelnames, - buckets=build_1_2_3_5_8_buckets(3000)) # Metadata self.histogram_num_prompt_tokens_request = self._histogram_cls( @@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase): # System state data self._log_gauge(self.metrics.gauge_scheduler_running, stats.num_running_sys) - if self.metrics.show_hidden_metrics: - self._log_gauge(self.metrics.gauge_scheduler_swapped, - stats.num_swapped_sys) self._log_gauge(self.metrics.gauge_scheduler_waiting, stats.num_waiting_sys) self._log_gauge(self.metrics.gauge_gpu_cache_usage, stats.gpu_cache_usage_sys) - if self.metrics.show_hidden_metrics: - self._log_gauge(self.metrics.gauge_cpu_cache_usage, - stats.cpu_cache_usage_sys) - self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate, - stats.cpu_prefix_cache_hit_rate) - self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, - stats.gpu_prefix_cache_hit_rate) # Including max-lora in metric, in future this property of lora # config maybe extended to be dynamic. 
lora_info = { @@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase): stats.time_prefill_requests) self._log_histogram(self.metrics.histogram_decode_time_request, stats.time_decode_requests) - if self.metrics.show_hidden_metrics: - self._log_histogram(self.metrics.histogram_time_in_queue_request, - stats.time_in_queue_requests) - self._log_histogram( - self.metrics.histogram_model_forward_time_request, - stats.model_forward_time_requests) - self._log_histogram( - self.metrics.histogram_model_execute_time_request, - stats.model_execute_time_requests) # Metadata finished_reason_counter = CollectionsCounter( stats.finished_reason_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9e6d5ef29..acc83011d 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -53,9 +53,6 @@ class Stats: time_inference_requests: List[float] time_prefill_requests: List[float] time_decode_requests: List[float] - time_in_queue_requests: List[float] - model_forward_time_requests: List[float] - model_execute_time_requests: List[float] # Metadata num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] -- GitLab From a09c7ca9f2c01325eb46e82272c55358e5f98e8e Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 28 May 2025 14:57:19 -0400 Subject: [PATCH 027/274] [Chore][Spec Decode] Update check NoneType instead of assigning variables (#18836) Signed-off-by: Aaron Pham --- vllm/v1/worker/gpu_model_runner.py | 50 ++++++++++++++---------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 910c0e80b..5d5558162 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -146,31 +146,27 @@ class GPUModelRunner(LoRAModelRunnerMixin): # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} - # Set up speculative decoding. 
- self.use_spec_decode = False self.use_aux_hidden_state_outputs = False - if self.speculative_config: - self.use_spec_decode = True - - # NOTE(Jiayi): currently we put the entire draft model on - # the last PP rank. This is not ideal if there are many - # layers in the draft model. - if get_pp_group().is_last_rank: - if self.speculative_config.method == "ngram": - self.drafter = NgramProposer(self.vllm_config) - elif self.speculative_config.use_eagle(): - self.drafter = EagleProposer(self.vllm_config, self.device, - self) # type: ignore - if self.speculative_config.method == "eagle3": - self.use_aux_hidden_state_outputs = True - elif self.speculative_config.method == "medusa": - self.drafter = MedusaProposer( - vllm_config=self.vllm_config, - device=self.device) # type: ignore - else: - raise ValueError("Unknown speculative decoding method: " - f"{self.speculative_config.method}") - self.rejection_sampler = RejectionSampler() + # Set up speculative decoding. + # NOTE(Jiayi): currently we put the entire draft model on + # the last PP rank. This is not ideal if there are many + # layers in the draft model. + if self.speculative_config and get_pp_group().is_last_rank: + if self.speculative_config.method == "ngram": + self.drafter = NgramProposer(self.vllm_config) + elif self.speculative_config.use_eagle(): + self.drafter = EagleProposer(self.vllm_config, self.device, + self) # type: ignore + if self.speculative_config.method == "eagle3": + self.use_aux_hidden_state_outputs = True + elif self.speculative_config.method == "medusa": + self.drafter = MedusaProposer( + vllm_config=self.vllm_config, + device=self.device) # type: ignore + else: + raise ValueError("Unknown speculative decoding method: " + f"{self.speculative_config.method}") + self.rejection_sampler = RejectionSampler() # Request states. 
self.requests: dict[str, CachedRequestState] = {} @@ -1318,7 +1314,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): for i in discard_sampled_tokens_req_indices: valid_sampled_token_ids[i].clear() - if not self.use_spec_decode: + if not self.speculative_config: # Speculative decoding is not enabled. spec_token_ids = None elif self.speculative_config.method == "ngram": @@ -1740,7 +1736,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): else: hidden_states = outputs - if self.use_spec_decode and self.speculative_config.use_eagle(): + if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) self.drafter.dummy_run(num_tokens) @@ -1795,7 +1791,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): "initializing the engine.") from e else: raise e - if self.use_spec_decode: + if self.speculative_config: draft_token_ids = [[0] for _ in range(num_reqs)] dummy_spec_decode_metadata = SpecDecodeMetadata.make_dummy( draft_token_ids, self.device) -- GitLab From 643622ba465f0bbcb07e1d81c60dac35ed994ebf Mon Sep 17 00:00:00 2001 From: Akshat Tripathi Date: Wed, 28 May 2025 20:59:09 +0100 Subject: [PATCH 028/274] [Hardware][TPU][V1] Multi-LoRA Optimisations for the V1 TPU backend (#15655) Signed-off-by: Akshat Tripathi Signed-off-by: Chengji Yao Signed-off-by: xihajun Signed-off-by: Jorge de Freitas Signed-off-by: Jorge de Freitas Co-authored-by: Chengji Yao Co-authored-by: xihajun Co-authored-by: Jorge de Freitas Co-authored-by: Jorge de Freitas --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 6 +- tests/tpu/lora/test_pallas_kernels.py | 73 -------- vllm/lora/models.py | 4 +- vllm/lora/ops/xla_ops/lora_ops.py | 140 ++++++++++------ vllm/lora/ops/xla_ops/pallas.py | 133 --------------- vllm/lora/punica_wrapper/punica_tpu.py | 157 +++++++++++++----- vllm/v1/worker/lora_model_runner_mixin.py | 57 +++++-- vllm/v1/worker/tpu_model_runner.py | 82 +++++++-- vllm/v1/worker/tpu_worker.py | 7 +- 9 files changed, 325 insertions(+), 334 
deletions(-) delete mode 100644 tests/tpu/lora/test_pallas_kernels.py delete mode 100644 vllm/lora/ops/xla_ops/pallas.py diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index eb82da3a8..5dd53420d 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -122,10 +122,8 @@ run_and_track_test 11 "test_struct_output_generate.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" - -# Disable the TPU LoRA tests until the feature is activated -# run_and_track_test 13 "test_lora (directory)" \ -# "python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/" +run_and_track_test 13 "test_lora.py" \ + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" # After all tests have been attempted, exit with the overall status. 
if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/tests/tpu/lora/test_pallas_kernels.py b/tests/tpu/lora/test_pallas_kernels.py deleted file mode 100644 index 8bd47de50..000000000 --- a/tests/tpu/lora/test_pallas_kernels.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import pytest -import torch - -# Required to register the custom ops -import vllm.lora.ops.xla_ops.pallas # noqa # pylint: disable=unused-import - -N_TOKENS = [16, 1024, 4096] -HIDDEN_SIZES = [1024, 2048, 4096] - -DTYPES = [torch.bfloat16] -NUM_LORA = [1, 4, 16] -RANKS = [32, 256, 512] - - -def generate_test_data(T, D, L, N, seed, dtype=torch.float32): - """ - Inputs: (All integers) - T: Total number of tokens - D: Input dim - L: LoRA Dim - N: N LoRAs - - Outputs: - inputs: torch.Tensor - shape (T, D) - loras: torch.Tensor - shape (N, 1, L, D) - idxs: torch.Tensor - shape (T, ) - all values must be in [0, N) - - ref_output: torch.Tensor - shape (T, L) - inputs @ loras[idxs].T - """ - torch.manual_seed(seed) - - inputs = torch.randn((T, D), device="xla", dtype=dtype) - loras = torch.randn((N, 1, L, D), device="xla", dtype=dtype) - idxs = torch.randint(0, N, (T, ), dtype=torch.int32, device="xla") - - ref_output = ref_bgmv(inputs, loras, idxs) - return inputs, loras, idxs, ref_output - - -def ref_bgmv(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.Tensor): - selected_loras = loras[idxs] - if len(selected_loras.shape) == 4: - selected_loras = selected_loras.squeeze(axis=1) - - batch_size, output_size, input_size = selected_loras.shape - return (selected_loras @ inputs.reshape( - (batch_size, input_size, 1))).reshape((batch_size, output_size)) - - -# Parameterize tests with various shapes and dtypes -@pytest.mark.parametrize("T", N_TOKENS) -@pytest.mark.parametrize("D", HIDDEN_SIZES) -@pytest.mark.parametrize("L", RANKS) -@pytest.mark.parametrize("N", NUM_LORA) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("op_type", ["shrink", "expand"]) 
-@pytest.mark.parametrize("seed", [0]) -def test_bgmv_correctness(T, D, L, N, dtype, op_type, seed): - if op_type == "expand": - D, L = L, D - - inputs, loras, idxs, ref_output = generate_test_data( - T, D, L, N, seed, dtype) - - # Run bgmv - output = torch.ops.xla.bgmv(inputs, loras, idxs) - - # Make sure we have no NaNs - assert not torch.any(torch.isnan(output)) - - # Compare with reference output - assert torch.allclose(output, ref_output, rtol=1e-2, atol=1e-2) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index af5cebdf2..d3b1374a9 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -200,7 +200,7 @@ class LoRAModel(AdapterModel): weights_mapper: Optional[WeightsMapper] = None, tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel": """Create a LoRAModel from a local checkpoint. - + Args: lora_dir: The local path that has lora data. expected_lora_modules: Name of modules that are expected to be @@ -620,7 +620,7 @@ class LoRAModelManager(AdapterModelManager): def _filter_unsupported_mm_module(self, module_name: str) -> bool: """ Regarding multimodal models, vLLM currently only supports adding LoRA to - language model. LoRA for other modules, such as the vision tower, will + language model. LoRA for other modules, such as the vision tower, will be filtered out. 
""" if self.supports_mm: diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index acbec0cfa..dff4d5181 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -1,63 +1,99 @@ # SPDX-License-Identifier: Apache-2.0 +import jax +import jax.numpy as jnp import torch +import torch.nn.functional as F +import torch_xla.core.xla_builder as xb +from torch.library import impl +from torch_xla.experimental.custom_kernel import XLA_LIB, jax_import_guard -# Required to register the custom ops -import vllm.lora.ops.xla_ops.pallas # noqa # pylint: disable=unused-import +@jax.jit +def bgmv_jax(inputs, loras, idxs): + return jnp.einsum( + "td,tX,Xld->tl", + inputs, + jax.nn.one_hot(idxs, loras.shape[0], dtype=inputs.dtype), + loras, + ) -def bgmv_expand(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - add_inputs: bool = True): + +XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor") + + +@impl(XLA_LIB, "bgmv", "XLA") +def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + + jax_import_guard() + return xb.call_jax(bgmv_jax, (inputs, loras, idxs)) + + +@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd") +def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, + idxs: torch.IntTensor): + T, _ = inputs.shape + if len(loras.shape) == 4: + loras = loras.squeeze(axis=1) + _, L, _ = loras.shape + + return torch.empty((T, L), device=inputs.device) + + +def bgmv_expand( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. 
- - lora_b_weights (torch.Tensor): LoRA weights of shape + + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - - output_tensor (torch.Tensor): output tensor of shape + + output_tensor (torch.Tensor): output tensor of shape [num_tokens, hidden_size * num_slices]. - - lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. - add_inputs (bool): Whether or not to add the input tensor to the output + add_inputs (bool): Whether or not to add the input tensor to the output tensor. """ outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) - n_tokens = outputs.size(0) limit = output_tensor.shape[0] if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: limit = 1 - outputs = torch.cat( - (outputs, - torch.zeros((n_tokens, output_tensor.shape[1] - outputs.shape[1]), - device=outputs.device)), - dim=1) + if output_tensor.shape[1] > outputs.shape[1]: + outputs = F.pad(outputs, + (0, output_tensor.shape[1] - outputs.shape[1], 0, 0)) if add_inputs: - return output_tensor + outputs[:limit, :] + return output_tensor + outputs[:limit, :output_tensor.shape[1]] else: - return outputs[:limit, :] + return outputs[:limit, :output_tensor.shape[1]] -def bgmv_shrink(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - scaling: float = 1.0): +def bgmv_shrink( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. - lora_b_weights (torch.Tensor): LoRA weights of shape + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. output_tensor (torch.Tensor): (Unused) output tensor (placeholder). 
- lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. scaling (float, optional): Scalar multiplier applied to the output. """ @@ -66,39 +102,41 @@ def bgmv_shrink(inputs: torch.Tensor, lora_indices_tensor) -def bgmv_expand_slice(inputs: torch.Tensor, - lora_b_weights: torch.Tensor, - output_tensor: torch.Tensor, - lora_indices_tensor: torch.Tensor, - slice_offset: int, - slice_size: int, - add_inputs: bool = True): +def bgmv_expand_slice( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +): """ Args: inputs (torch.Tensor): Input tensor of shape [num_tokens, hidden_size]. - - lora_b_weights (torch.Tensor): LoRA weights of shape + + lora_b_weights (torch.Tensor): LoRA weights of shape [num_loras, lora_rank, hidden_size]. - - output_tensor (torch.Tensor): output tensor of shape + + output_tensor (torch.Tensor): output tensor of shape [num_tokens, hidden_size * num_slices]. - - lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] + + lora_indices_tensor (torch.Tensor): Tensor of shape [num_tokens] indicating which LoRA matrix to use for each token. - add_inputs (bool): Whether or not to add the input tensor to the output + add_inputs (bool): Whether or not to add the input tensor to the output tensor. 
""" outputs = torch.ops.xla.bgmv(inputs, lora_b_weights, lora_indices_tensor) - n_tokens = outputs.size(0) - outputs = torch.cat(( - torch.zeros((n_tokens, slice_offset), device=outputs.device), + outputs = F.pad( outputs, - torch.zeros( - (n_tokens, output_tensor.shape[1] - (slice_offset + slice_size)), - device=outputs.device), - ), - dim=1) + ( + slice_offset, + output_tensor.shape[1] - (slice_offset + slice_size), + 0, + 0, + ), + ) if add_inputs: return output_tensor + outputs diff --git a/vllm/lora/ops/xla_ops/pallas.py b/vllm/lora/ops/xla_ops/pallas.py deleted file mode 100644 index 35dc30753..000000000 --- a/vllm/lora/ops/xla_ops/pallas.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -import functools - -import jax -import jax.numpy as jnp -import torch -from jax.experimental import pallas as pl -from jax.experimental.pallas import tpu as pltpu -from torch.library import impl -from torch_xla.experimental.custom_kernel import (XLA_LIB, jax_import_guard, - make_kernel_from_pallas) - -# TODO: Tune these -TOKENS_BLOCK = 16 -LORA_RANK_BLOCK = 128 -DIM_BLOCK_SIZE = 128 - - -def _bgmv_kernel(bT: int, bL: int, idx_ref, inp_ref, lora_ref, out_ref, - acc_ref, mask_ref): - - @pl.when(pl.program_id(2) == 0) - def _(): - acc_ref[...] = jnp.zeros_like(acc_ref[...], dtype=jnp.float32) - - t = pl.program_id(0) - - for i in range(bT): - idx = idx_ref[i + bT * t] - mask_ref[...] = jnp.zeros_like(mask_ref[...], dtype=jnp.float32) - mask_ref[i, :] = jnp.ones((bL, ), dtype=jnp.float32) - - acc_ref[...] += jax.lax.dot_general( - inp_ref[...], - lora_ref[idx, ...], (((1, ), (1, )), ((), ())), - preferred_element_type=jnp.float32) * mask_ref[...] - - @pl.when(pl.program_id(2) == pl.num_programs(2) - 1) - def _(): - out_ref[...] 
= acc_ref[...].astype(out_ref.dtype) - - -@jax.jit -def _bgmv( - idxs: jax.Array, # (T, ) int32 - inputs: jax.Array, # (T, D) model dtype - loras: jax.Array # (N, L, D) model dtype -) -> jax.Array: # (T, L) model dtype - T, D = inputs.shape - N, L, _ = loras.shape - - return pl.pallas_call( - kernel=functools.partial(_bgmv_kernel, TOKENS_BLOCK, LORA_RANK_BLOCK), - out_shape=jax.ShapeDtypeStruct((T, L), dtype=inputs.dtype), - grid_spec=pltpu.PrefetchScalarGridSpec( - num_scalar_prefetch=1, - grid=(T // TOKENS_BLOCK, L // LORA_RANK_BLOCK, - D // DIM_BLOCK_SIZE), - in_specs=[ - pl.BlockSpec((TOKENS_BLOCK, DIM_BLOCK_SIZE), - lambda i, j, k, block_idx: (i, k)), - pl.BlockSpec((N, LORA_RANK_BLOCK, DIM_BLOCK_SIZE), - lambda i, j, k, block_idx: (0, j, k)), - ], - out_specs=pl.BlockSpec((TOKENS_BLOCK, LORA_RANK_BLOCK), - lambda i, j, k, block_idx: (i, j)), - scratch_shapes=[ - pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32), - pltpu.VMEM((TOKENS_BLOCK, LORA_RANK_BLOCK), jnp.float32) - ]), - compiler_params=pltpu.TPUCompilerParams( - dimension_semantics=("parallel", "parallel", "arbitrary")), - name="bgmv")(idxs, inputs, loras) - - -def bgmv_shape_function(idxs, inputs, loras): - T, _ = inputs.shape - _, L, _ = loras.shape - - return [((T, L), inputs.dtype)] - - -XLA_LIB.define("bgmv(Tensor inputs, Tensor loras, Tensor idxs) -> Tensor", ) - - -@impl(XLA_LIB, "bgmv", "XLA") -def bgmv_xla(inputs: torch.Tensor, loras: torch.Tensor, idxs: torch.IntTensor): - inputs = inputs.to(dtype=loras.dtype) - - if len(loras.shape) == 4: - loras = loras.squeeze(axis=1) - - jax_import_guard() - kernel = make_kernel_from_pallas(_bgmv, bgmv_shape_function) - - T, _ = inputs.shape - _, L, D = loras.shape - - # Pad the loras' rank if it's too low. This is to allow it to fit in a TPU - # register. 
This has to happen in pytorch, doing it in Jax will lead to NaNs - L1 = L - if LORA_RANK_BLOCK > L or L % LORA_RANK_BLOCK != 0: - L1 = (L // LORA_RANK_BLOCK + 1) * LORA_RANK_BLOCK - - D1 = D - if DIM_BLOCK_SIZE > D or D % DIM_BLOCK_SIZE != 0: - D1 = (D // DIM_BLOCK_SIZE + 1) * DIM_BLOCK_SIZE - - T1 = T - if TOKENS_BLOCK > T or T % TOKENS_BLOCK != 0: - T1 = (T // TOKENS_BLOCK + 1) * TOKENS_BLOCK - - if D1 != D or L1 != L: - loras = torch.nn.functional.pad(loras, (0, D1 - D, 0, L1 - L, 0, 0)) - if D1 != D or T1 != T: - inputs = torch.nn.functional.pad(inputs, (0, D1 - D, 0, T1 - T)) - if T1 != T: - idxs = torch.nn.functional.pad(idxs, ((0, T1 - T))) - - return kernel(idxs, inputs, loras)[:T, :L] - - -@impl(XLA_LIB, "bgmv", "CompositeExplicitAutograd") -def bgmv_non_xla(inputs: torch.Tensor, loras: torch.Tensor, - idxs: torch.IntTensor): - T, _ = inputs.shape - - if len(loras.shape) == 4: - loras = loras.squeeze(axis=1) - - _, L, _ = loras.shape - - return torch.empty((T, L), device=inputs.device) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index f3153c6da..0556e583f 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -1,11 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Union +import math +from typing import TYPE_CHECKING, Optional, Union import torch import torch.nn.functional as F +import torch_xla.core.xla_model as xm from vllm.lora.ops.xla_ops import bgmv_expand, bgmv_expand_slice, bgmv_shrink +from vllm.lora.punica_wrapper.utils import convert_mapping + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext from .punica_base import PunicaWrapperBase @@ -31,6 +39,15 @@ class PunicaWrapperTPU(PunicaWrapperBase): self._sampler_indices_padded = self._sampler_indices_padded.to( dtype=torch.int32) + 
torch.ops.xla.dynamo_set_buffer_donor_(self._token_lora_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, + True) + torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._long_lora_indices, True) + torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, + True) + torch._dynamo.mark_dynamic(self._token_lora_indices, 0) torch._dynamo.mark_dynamic(self._embeddings_indices, 1) torch._dynamo.mark_dynamic(self._sampler_indices_padded, 0) @@ -55,15 +72,11 @@ class PunicaWrapperTPU(PunicaWrapperBase): def shrink( self, - y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, scale: float, ): - if self.no_lora: - return y - return bgmv_shrink(x, w_t_all, y, self._get_token_lora_indices(x), - scale) + return bgmv_shrink(x, w_t_all, self._get_token_lora_indices(x), scale) def expand(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, add_inputs: bool): @@ -72,7 +85,7 @@ class PunicaWrapperTPU(PunicaWrapperBase): def expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, y_offset: int, y_slice_size: int, - y_total_size: int, add_inputs: bool) -> torch.Tensor: + add_inputs: bool) -> torch.Tensor: return bgmv_expand_slice(x, w_t_all, y, self._get_token_lora_indices(x), y_offset, y_slice_size, add_inputs) @@ -98,9 +111,8 @@ class PunicaWrapperTPU(PunicaWrapperBase): x = x.view(-1, x.shape[-1]) for slice_idx in range(len(lora_a_stacked)): - y_s = y[slice_idx] lora_s = lora_a_stacked[slice_idx] - y_s = self.shrink(y_s, x, lora_s, scale) + y_s = self.shrink(x, lora_s, scale) y[slice_idx, :, :] = y_s # type: ignore[index] return y @@ -140,15 +152,12 @@ class PunicaWrapperTPU(PunicaWrapperBase): y = self._apply_bias(self._get_token_lora_indices(y), y, output_slices, lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): - y = self.expand_slice( - y, - 
x[slice_idx], - lora_b_stacked[slice_idx], - offset_left, - output_slices[slice_idx], - y_total_size=sum(output_slices), - add_inputs=add_inputs, - ) + y = self.expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_inputs=add_inputs) offset_left += output_slices[slice_idx] return y.view_as(y_org) @@ -216,12 +225,10 @@ class PunicaWrapperTPU(PunicaWrapperBase): if buffer is None: r = lora_b_stacked[0].size(-1) - # We set the buffer to be float32 by default, consistent with the - # triton op T = x.size(0) buffer = torch.zeros( (len(output_slices), T, r), - dtype=torch.float32, + dtype=x.dtype, device=x.device, ) buffer = self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) @@ -257,26 +264,16 @@ class PunicaWrapperTPU(PunicaWrapperBase): scale (float): Scaling factor. buffer (Optional[torch.Tensor]):Default to None. """ - if self.no_lora: - return y - y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - r = lora_b_stacked.size(-1) - if buffer is None: - # We set the buffer to be float32 by default, consistent with the - # triton op - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - - buffer = bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, - scale) + + sampler_indices = torch.narrow(self._sampler_indices, 0, 0, x.size(0)) + buffer = bgmv_shrink(x, lora_a_stacked, sampler_indices, scale) y = bgmv_expand(buffer, lora_b_stacked, y, - self.sampler_indices, + sampler_indices, add_inputs=True) return y.view_as(y_org) @@ -316,10 +313,92 @@ class PunicaWrapperTPU(PunicaWrapperBase): return output.view_as(org_output) + # This performs the same tensor ops as the base method, except it does them + # on the CPU then transfers the results to the TPU + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: list[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: 
Optional["LongContextLoRAContext"] = None, + ): + # Make sure we don't accidentally collect outside operations + xm.mark_step() + + # Pad the prompt mapping to avoid running into recompiles on the TPU + # TODO: Should this happen inside mapping internally? If so how can we + # avoid having backend specific LoRAMapping classes? + mapping.prompt_mapping = self._pad_prompt_mapping( + mapping.prompt_mapping) + + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + "cpu", + long_lora_context, + ) + self._token_lora_indices = self._pad_to_shape( + base_indices, self._token_lora_indices.shape, + dims=1).to(self.device) + self._sampler_indices = self._pad_to_shape(sampler_indices, + self._sampler_indices.shape, + dims=1).to(self.device) + self._sampler_indices_padded = self._pad_to_shape( + sampler_indices_padded, self._sampler_indices_padded.shape, + dims=1).to(self.device) + self._embeddings_indices = self._pad_to_shape( + embeddings_indices, self._embeddings_indices.shape, + dims=2).to(self.device) + if long_lora_offsets_tensor is not None: + self._long_lora_indices = self._pad_to_shape( + long_lora_offsets_tensor, + self._long_lora_indices.shape, + dims=1).to(self.device) + else: + zeroed = torch.zeros_like(self._long_lora_indices.cpu(), + dtype=torch.int32) + self._long_lora_indices = zeroed.to(self.device) + self.indices_len[:] = indices_len + def _update_prefill_metadata(self, token_lora_tensor: torch.Tensor) -> None: self.batch_size = 1 - self._lora_indices_per_batch[:self.batch_size].copy_( - token_lora_tensor[:self.batch_size]) - # TODO: .item() is extremely inefficient on TPU, so find a way around it - self.no_lora = torch.all(token_lora_tensor == -1).item() + self._lora_indices_per_batch[:self. + batch_size] = token_lora_tensor[:self. 
+ batch_size] + + def _pad_prompt_mapping( + self, prompt_mapping: tuple[int, ...]) -> tuple[int, ...]: + num_reqs = len(prompt_mapping) + + # From vllm/v1/worker/tpu_model_runner:51, but need to avoid a circular + # import + MIN_NUM_SEQS = 8 + + padded_num_reqs = max(2**math.ceil(math.log2(num_reqs)), MIN_NUM_SEQS) + pad_len = padded_num_reqs - num_reqs + + padding = [-1] * pad_len + return tuple(list(prompt_mapping) + padding) + + def _pad_to_shape(self, src, target_shape, dims=1): + if dims == 1: + pad_len = target_shape[0] - src.shape[0] + return F.pad(src, (0, pad_len), value=0).to(torch.int32) + else: + pad_rows = target_shape[0] - src.shape[0] + pad_cols = target_shape[1] - src.shape[1] + return F.pad(src, (0, pad_cols, 0, pad_rows), + value=0).to(torch.int32) diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index 3cbab840e..eb8ed6221 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -80,8 +80,38 @@ class LoRAModelRunnerMixin: lora_requests) @contextmanager - def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, - num_scheduled_tokens: np.ndarray): + def maybe_setup_dummy_loras(self, lora_config): + if lora_config is None: + yield + else: + # __enter__ code + assert self.lora_manager is not None, "LoRA is not enabled" + + num_loras = lora_config.max_loras + + # Make dummy lora requests + lora_requests: set[LoRARequest] = { + LoRARequest(lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path") + for lora_id in range(1, num_loras + 1) + } + + with self.lora_manager.dummy_lora_cache(): + # Add the dummy LoRAs here so _set_active_loras doesn't try to + # load from disk. 
+ for lr in lora_requests: + self.lora_manager.add_dummy_lora( + lr, rank=self.LORA_WARMUP_RANK) + + yield + + # __exit__ code + self.lora_manager.remove_all_adapters() + + @contextmanager + def maybe_select_dummy_loras(self, lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray): if lora_config is None: yield else: @@ -108,21 +138,18 @@ class LoRAModelRunnerMixin: for lora_id in range(1, num_loras + 1) } - with self.lora_manager.dummy_lora_cache(): - # Add the dummy LoRAs here so _set_active_loras doesn't try to - # load from disk. - for lr in lora_requests: - self.lora_manager.add_dummy_lora( - lr, rank=self.LORA_WARMUP_RANK) - - self._set_active_loras(tuple(prompt_lora_mapping), - tuple(token_lora_mapping), - lora_requests) + self._set_active_loras(tuple(prompt_lora_mapping), + tuple(token_lora_mapping), lora_requests) - yield + yield - # __exit__ code - self.lora_manager.remove_all_adapters() + @contextmanager + def maybe_dummy_run_with_lora(self, lora_config: LoRAConfig, + num_scheduled_tokens: np.ndarray): + with self.maybe_setup_dummy_loras( + lora_config), self.maybe_select_dummy_loras( + lora_config, num_scheduled_tokens): + yield def add_lora(self, lora_request: LoRARequest) -> bool: if not self.lora_manager: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 46bcf64ed..669908cb5 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -20,6 +20,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger +from vllm.lora.layers import BaseLayerWithLoRA from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, @@ -152,6 +153,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): 
self.hidden_size = model_config.get_hidden_size() self.vocab_size = model_config.get_vocab_size() + if self.lora_config is not None: + self.vocab_size += self.lora_config.lora_extra_vocab_size + # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope @@ -591,6 +595,17 @@ class TPUModelRunner(LoRAModelRunnerMixin): logits_indices = self.query_start_loc_cpu[1:padded_num_reqs + 1] - 1 logits_indices = logits_indices.to(self.device) + if self.lora_config is not None: + # We need to respect padding when activating LoRA adapters + padded_num_scheduled_tokens_per_req = np.copy( + num_scheduled_tokens_per_req + ) # Copying to avoid accidental state corruption bugs + padded_num_scheduled_tokens_per_req[-1] += \ + padded_total_num_scheduled_tokens - total_num_scheduled_tokens + + self.set_active_loras(self.input_batch, + padded_num_scheduled_tokens_per_req) + layer_names = get_layers_from_vllm_config(self.vllm_config, Attention).keys() per_layer_attn_metadata = { @@ -916,6 +931,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): model = self.load_lora_model(model, self.model_config, self.scheduler_config, self.lora_config, self.device) + replace_set_lora(model) # Sync all pending XLA execution during model initialization and weight # loading. 
@@ -980,7 +996,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): for layer_name in layer_names } - with self.maybe_dummy_run_with_lora( + with self.maybe_select_dummy_loras( self.lora_config, np.array([num_tokens], dtype=np.int32)), set_forward_context( per_layer_attn_metadata, self.vllm_config, 0): @@ -989,6 +1005,13 @@ class TPUModelRunner(LoRAModelRunnerMixin): inputs_embeds=inputs_embeds) self._hidden_states_dtype = out.dtype + def _set_active_loras(self, prompt_lora_mapping, token_lora_mapping, + lora_requests) -> None: + xm.mark_step() # Captures input updates + super()._set_active_loras(prompt_lora_mapping, token_lora_mapping, + lora_requests) + xm.mark_step() # Captures metadata updates + def _precompile_mm_encoder(self) -> None: # Pre-compile MM encoder for all supported data modalities. hf_config = self.vllm_config.model_config.hf_config @@ -1151,7 +1174,10 @@ class TPUModelRunner(LoRAModelRunnerMixin): generate_params_if_all_greedy, )) sampling_metadata.all_greedy = all_greedy - self.sample_from_logits(dummy_logits, sampling_metadata) + with self.maybe_select_dummy_loras( + self.lora_config, np.array([num_reqs], + dtype=np.int32)): + self.sample_from_logits(dummy_logits, sampling_metadata) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1167,7 +1193,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): dtype=self._hidden_states_dtype) dummy_tokens = torch.zeros((num_reqs, 1), dtype=torch.int64).to(self.device) - self.gather_logprobs(dummy_logits, dummy_tokens) + with self.maybe_select_dummy_loras( + self.lora_config, np.array([num_reqs], dtype=np.int32)): + self.gather_logprobs(dummy_logits, dummy_tokens) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1178,13 +1206,14 @@ class TPUModelRunner(LoRAModelRunnerMixin): """ Precompile all the subgraphs with possible input shapes. 
""" - self._precompile_mm_encoder() - self._precompile_backbone() - self._precompile_select_hidden_states() - self._precompile_compute_logits() - self._precompile_structured_decoding() - self._precompile_sample_from_logits() - self._precompile_gather_logprobs() + with self.maybe_setup_dummy_loras(self.lora_config): + self._precompile_mm_encoder() + self._precompile_backbone() + self._precompile_select_hidden_states() + self._precompile_compute_logits() + self._precompile_structured_decoding() + self._precompile_sample_from_logits() + self._precompile_gather_logprobs() def profile_run( self, @@ -1467,11 +1496,11 @@ def _get_token_paddings(min_token_size: int, max_token_size: int, padding_gap: int) -> list[int]: """Generate a list of padding size, starting from min_token_size, ending with a number that can cover max_token_size - + If padding_gap == 0 then: increase 2X each time (exponential) else: - first increase the size to twice, + first increase the size to twice, then increase the padding size by padding_gap. """ # assert min_token_size is power of 2 @@ -1508,3 +1537,32 @@ def _get_padded_token_len(paddings: list[int], x: int) -> int: index = bisect.bisect_left(paddings, x) assert index < len(paddings) return paddings[index] + + +def replace_set_lora(model): + + def _tpu_set_lora( + self, + index: int, + lora_a: torch.Tensor, + lora_b: torch.Tensor, + embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, + ): + # TODO: The integer index leads to a recompilation, but converting it + # to a tensor doesn't seem to work anymore. This might be fixed with a + # later release of torch_xla. 
+ self._original_set_lora(index, lora_a, lora_b, embeddings_tensor, bias) + xm.mark_step() + + def _tpu_reset_lora(self, index: int): + self._original_reset_lora(index) + xm.mark_step() + + for _, module in model.named_modules(): + if isinstance(module, BaseLayerWithLoRA): + module._original_set_lora = module.set_lora + module._original_reset_lora = module.reset_lora + module.set_lora = _tpu_set_lora.__get__(module, module.__class__) + module.reset_lora = _tpu_reset_lora.__get__( + module, module.__class__) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index fa4eb30cc..0707e17af 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -83,10 +83,6 @@ class TPUWorker: if self.model_config.seed is None: self.model_config.seed = 0 - if vllm_config.lora_config is not None: - raise NotImplementedError( - "The V1 TPU backend doesn't support LoRA serving") - def init_device(self): os.environ["PJRT_DEVICE"] = "TPU" # Note: Currently the XLA compiler wrongly uses 2D ring strategy on 1D @@ -166,7 +162,8 @@ class TPUWorker: runner_kv_caches) # `max_num_tokens >= max_num_batched_tokens` due to padding. - self.model_runner.profile_run(self.model_runner.max_num_tokens) + with self.model_runner.maybe_setup_dummy_loras(self.lora_config): + self.model_runner.profile_run(self.model_runner.max_num_tokens) # Synchronize before measuring the memory usage. 
xm.wait_device_ops() -- GitLab From 6dbe5b5c934268387fb03485a5625b1793049d8c Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 28 May 2025 22:32:19 +0100 Subject: [PATCH 029/274] Remove checks for `None` for fields which should never be `None` (#17985) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/config.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4d9ca580f..fe2ad70f5 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4282,25 +4282,22 @@ class VllmConfig: self.model_config.verify_dual_chunk_attention_config( self.load_config) - if self.cache_config is not None: - self.cache_config.verify_with_parallel_config(self.parallel_config) + self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: + if self.lora_config is not None: self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) self.lora_config.verify_lora_support() - if self.prompt_adapter_config: + if self.prompt_adapter_config is not None: self.prompt_adapter_config.verify_with_model_config( self.model_config) - if self.quant_config is None and \ - self.model_config is not None and self.load_config is not None: + if self.quant_config is None and self.model_config is not None: self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) from vllm.platforms import current_platform - if self.scheduler_config is not None and \ - self.model_config is not None and \ + if self.model_config is not None and \ self.scheduler_config.chunked_prefill_enabled and \ self.model_config.dtype == torch.float32 and \ current_platform.get_device_capability() == (7, 5): @@ -4335,8 +4332,7 @@ class VllmConfig: self._set_cudagraph_sizes() - if self.cache_config is not None and \ - self.cache_config.cpu_offload_gb > 0 and \ + if 
self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION \ and not envs.VLLM_USE_V1: logger.warning( @@ -4358,16 +4354,16 @@ class VllmConfig: "full_cuda_graph is not supported with " "cascade attention. Disabling cascade attention.") self.model_config.disable_cascade_attn = True - if self.cache_config is not None: - self.cache_config.enable_prefix_caching = False + self.cache_config.enable_prefix_caching = False - if (self.kv_events_config + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): logger.warning( "KV cache events are on, but prefix caching is not enabled." "Use --enable-prefix-caching to enable.") - if (self.kv_events_config and self.kv_events_config.publisher != "null" + if (self.kv_events_config is not None + and self.kv_events_config.publisher != "null" and not self.kv_events_config.enable_kv_cache_events): logger.warning("KV cache events are disabled," "but the scheduler is configured to publish them." 
-- GitLab From 7951d78738581c336db7c1a77f94f1fea8f09fca Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Wed, 28 May 2025 18:55:30 -0400 Subject: [PATCH 030/274] [Core] Enable CUDA graphs for DP + All2All kernels (#18724) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/forward_context.py | 63 ++++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 42 ++++++++++++- vllm/platforms/cuda.py | 11 ---- vllm/v1/worker/gpu_model_runner.py | 21 ++++++- 4 files changed, 100 insertions(+), 37 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 3c8083e3d..592ca650a 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -10,7 +10,7 @@ import torch import torch.distributed as dist import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import ParallelConfig, VllmConfig from vllm.logger import init_logger if TYPE_CHECKING: @@ -30,6 +30,44 @@ class DPMetadata: max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor + @staticmethod + def num_tokens_across_dp(num_tokens: int, dp_size: int, + dp_rank: int) -> torch.Tensor: + """ + Gather the num_tokens across all DP ranks and return results in a + CPU tensor of size dp_size. 
+ """ + num_tokens_across_dp = [0] * dp_size + num_tokens_across_dp[dp_rank] = num_tokens + num_tokens_tensor = torch.tensor(num_tokens_across_dp, + device="cpu", + dtype=torch.int32) + from vllm.distributed.parallel_state import get_dp_group + dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) + return num_tokens_tensor + + @staticmethod + def make(parallel_config: ParallelConfig, attn_metadata: Any, + num_tokens: int) -> "DPMetadata": + + assert parallel_config.data_parallel_size > 1 + dp_size = parallel_config.data_parallel_size + dp_rank = parallel_config.data_parallel_rank + if attn_metadata is not None and hasattr(attn_metadata, + "num_prefill_tokens"): + # for v0 attention backends + batchsize = attn_metadata.num_prefill_tokens + \ + attn_metadata.num_decode_tokens + else: + # for v1 attention backends or no attn_metadata + batchsize = num_tokens + + num_tokens_tensor = DPMetadata.num_tokens_across_dp( + batchsize, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) + cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) + return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) + @dataclass class ForwardContext: @@ -74,27 +112,8 @@ def set_forward_context(attn_metadata: Any, forward_start_time = time.perf_counter() dp_metadata: Optional[DPMetadata] = None if vllm_config.parallel_config.data_parallel_size > 1: - dp_size = vllm_config.parallel_config.data_parallel_size - dp_rank = vllm_config.parallel_config.data_parallel_rank - if attn_metadata is not None and hasattr(attn_metadata, - "num_prefill_tokens"): - # for v0 attention backends - batchsize = attn_metadata.num_prefill_tokens + \ - attn_metadata.num_decode_tokens - else: - # for v1 attention backends or no attn_metadata - batchsize = num_tokens - num_tokens_across_dp = [0] * dp_size - num_tokens_across_dp[dp_rank] = batchsize - num_tokens_tensor = torch.tensor(num_tokens_across_dp, - device="cpu", - dtype=torch.int32) - from 
vllm.distributed.parallel_state import get_dp_group - dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) - max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) - cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) - dp_metadata = DPMetadata(max_tokens_across_dp_cpu, - cu_tokens_across_dp_cpu) + dp_metadata = DPMetadata.make(vllm_config.parallel_config, + attn_metadata, num_tokens) global _forward_context prev_context = _forward_context diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 29b41e720..838a7c24b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -828,6 +828,21 @@ class FusedMoE(torch.nn.Module): self.quant_method.create_weights(layer=self, **moe_quant_params) + # Chunked all2all staging tensor + self.batched_hidden_states: Optional[torch.Tensor] = None + self.batched_router_logits: Optional[torch.Tensor] = None + if self.moe_parallel_config.use_pplx_kernels: + act_dtype = vllm_config.model_config.dtype + self.batched_hidden_states = torch.zeros( + (MOE_DP_CHUNK_SIZE, self.hidden_size), + dtype=act_dtype, + device=torch.cuda.current_device()) + + self.batched_router_logits = torch.zeros( + (MOE_DP_CHUNK_SIZE, self.global_num_experts), + dtype=act_dtype, + device=torch.cuda.current_device()) + @property def tp_size(self): return self.moe_parallel_config.tp_size @@ -1217,18 +1232,39 @@ class FusedMoE(torch.nn.Module): def forward_impl_chunked(self, full_hidden_states: torch.Tensor, full_router_logits: torch.Tensor): + assert self.batched_hidden_states is not None + assert self.batched_router_logits is not None + assert self.batched_hidden_states.dtype == full_hidden_states.dtype + assert self.batched_router_logits.dtype == full_router_logits.dtype + # Check size compatibility. 
+ assert ( + self.batched_hidden_states.size(-1) == full_hidden_states.size(-1)) + assert ( + self.batched_router_logits.size(-1) == full_router_logits.size(-1)) full_final_hidden_states = torch.empty_like(full_hidden_states) def process_chunk(chunk_start, chunk_end, skip_result_store=False): + chunk_size = chunk_end - chunk_start hidden_states = full_hidden_states[chunk_start:chunk_end, :] router_logits = full_router_logits[chunk_start:chunk_end, :] + assert (self.batched_hidden_states.size(0) # type: ignore + >= chunk_size) + assert (self.batched_router_logits.size(0) # type: ignore + >= chunk_size) + staged_hidden_states = self.batched_hidden_states[: + chunk_size, :] # type: ignore + staged_router_logits = self.batched_router_logits[: + chunk_size, :] # type: ignore + staged_hidden_states.copy_(hidden_states, non_blocking=True) + staged_router_logits.copy_(router_logits, non_blocking=True) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, - x=hidden_states, - router_logits=router_logits, + x=staged_hidden_states, + router_logits=staged_router_logits, top_k=self.top_k, renormalize=self.renormalize, use_grouped_topk=self.use_grouped_topk, @@ -1244,7 +1280,7 @@ class FusedMoE(torch.nn.Module): if not skip_result_store: full_final_hidden_states[chunk_start:chunk_end, :].copy_( - final_hidden_states) + final_hidden_states, non_blocking=True) ctx = get_forward_context() max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0bed44f73..9f833cbb5 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -106,7 +106,6 @@ class CudaPlatformBase(Platform): def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config - compilation_config = vllm_config.compilation_config model_config = vllm_config.model_config if parallel_config.worker_cls == "auto": @@ 
-154,16 +153,6 @@ class CudaPlatformBase(Platform): logger.info( "Forcing kv cache block size to 64 for FlashMLA backend.") - if (parallel_config.data_parallel_size > 1 - and compilation_config.use_cudagraph): - logger.info( - "Data Parallel: Forcing enforce eager to be True since DP is " - "currently not supported with CUDA Graphs.") - vllm_config.model_config.enforce_eager = True - compilation_config.use_cudagraph = False - # FIXME: inductor breaks cudagraph (from @bnell) - compilation_config.use_inductor = False - @classmethod def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5d5558162..d1195bcfb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -24,7 +24,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.parallel_state import ( get_pp_group, get_tp_group, graph_capture, prepare_communication_buffer_for_model) -from vllm.forward_context import get_forward_context, set_forward_context +from vllm.forward_context import (DPMetadata, get_forward_context, + set_forward_context) from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import TensorizerLoader, get_model @@ -1104,6 +1105,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): for k, v in self.intermediate_tensors.items() }) + def get_dp_padding(self, num_tokens: int): + dp_size = self.vllm_config.parallel_config.data_parallel_size + dp_rank = self.vllm_config.parallel_config.data_parallel_rank + if dp_size == 1: + # Early exit. 
+ return 0 + + num_tokens_across_dp = DPMetadata.num_tokens_across_dp( + num_tokens, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item() + return max_tokens_across_dp_cpu - num_tokens + @torch.inference_mode() def execute_model( self, @@ -1141,6 +1154,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): else: num_input_tokens = num_scheduled_tokens + # Padding for DP + num_input_tokens += self.get_dp_padding(num_input_tokens) + # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order if self.is_multimodal_model: @@ -1658,6 +1674,9 @@ class GPUModelRunner(LoRAModelRunnerMixin): skip_attn: bool = True, ) -> torch.Tensor: + # Padding for DP + num_tokens += self.get_dp_padding(num_tokens) + # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively # has num_tokens in total. -- GitLab From 269d901734326432d5ef15deaca07364149f9b48 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Wed, 28 May 2025 19:21:46 -0400 Subject: [PATCH 031/274] [Bugfix][ROCm] fix the power of 2 exception from triton_unified_attention.py when running llama4 models and unit test fix (#18100) Signed-off-by: Hongxia Yang Signed-off-by: tjtanaa Co-authored-by: tjtanaa --- .../test_triton_unified_attention.py | 4 +- .../attention/ops/triton_unified_attention.py | 106 +++++++++--------- 2 files changed, 54 insertions(+), 56 deletions(-) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 4e15d0025..be3d1879d 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -13,7 +13,9 @@ HEAD_SIZES = [128, 256] BLOCK_SIZES = [16, 32] DTYPES = [torch.float16, torch.bfloat16] -QDTYPES = [None, torch.float8_e4m3fn] +QDTYPES = [None, torch.float8_e4m3fn] 
if not current_platform.is_rocm() else [ + None, torch.float8_e4m3fnuz +] # one value large enough to test overflow in index calculation. # one value small enough to test the schema op check NUM_BLOCKS = [32768, 2048] diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 4bced7797..87cf333f7 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -29,41 +29,42 @@ def apply_softcap(S, x): @triton.jit def kernel_unified_attention_2d( - output_ptr, # [num_tokens, num_query_heads, head_size] - query_ptr, # [num_tokens, num_query_heads, head_size] - key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] - block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] - seq_lens_ptr, # [num_seqs] - alibi_slopes_ptr, # [num_query_heads] - scale, # float32 - k_scale, # float32 - v_scale, # float32 - softcap, # float32 - num_query_heads: tl.constexpr, # int - num_queries_per_kv: tl.constexpr, # int - block_table_stride: tl.int64, # int - query_stride_0: tl.int64, # int - query_stride_1: tl.int64, # int, should be equal to head_size - output_stride_0: tl.int64, # int - output_stride_1: tl.int64, # int, should be equal to head_size - BLOCK_SIZE: tl.constexpr, # int - HEAD_SIZE: tl.constexpr, # int - HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 - USE_ALIBI_SLOPES: tl.constexpr, # bool - USE_SOFTCAP: tl.constexpr, # bool - SLIDING_WINDOW: tl.constexpr, # int - stride_k_cache_0: tl.int64, # int - stride_k_cache_1: tl.int64, # int - stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.constexpr, # int - stride_v_cache_0: tl.int64, # int - stride_v_cache_1: tl.int64, # int - stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.constexpr, # int - query_start_len_ptr, # [num_seqs+1] - BLOCK_Q: tl.constexpr, # int - num_seqs: tl.int32, + output_ptr, # [num_tokens, 
num_query_heads, head_size] + query_ptr, # [num_tokens, num_query_heads, head_size] + key_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + value_cache_ptr, # [num_blks, blk_size, num_kv_heads, head_size] + block_tables_ptr, # [num_seqs, max_num_blocks_per_seq] + seq_lens_ptr, # [num_seqs] + alibi_slopes_ptr, # [num_query_heads] + scale, # float32 + k_scale, # float32 + v_scale, # float32 + softcap, # float32 + num_query_heads: tl.constexpr, # int + num_queries_per_kv: tl.constexpr, # int + block_table_stride: tl.int64, # int + query_stride_0: tl.int64, # int + query_stride_1: tl.int64, # int, should be equal to head_size + output_stride_0: tl.int64, # int + output_stride_1: tl.int64, # int, should be equal to head_size + BLOCK_SIZE: tl.constexpr, # int + HEAD_SIZE: tl.constexpr, # int + HEAD_SIZE_PADDED: tl.constexpr, # int, must be power of 2 + USE_ALIBI_SLOPES: tl.constexpr, # bool + USE_SOFTCAP: tl.constexpr, # bool + SLIDING_WINDOW: tl.constexpr, # int + stride_k_cache_0: tl.int64, # int + stride_k_cache_1: tl.int64, # int + stride_k_cache_2: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int + stride_v_cache_0: tl.int64, # int + stride_v_cache_1: tl.int64, # int + stride_v_cache_2: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int + query_start_len_ptr, # [num_seqs+1] + BLOCK_Q: tl.constexpr, # int + num_seqs: tl.int32, + BLOCK_M: tl.constexpr, # int ): q_block_global_idx = tl.program_id(0) @@ -94,15 +95,13 @@ def kernel_unified_attention_2d( if q_block_local_idx * BLOCK_Q >= cur_batch_query_len: return - offs_m = tl.arange(0, BLOCK_Q * num_queries_per_kv) + offs_m = tl.arange(0, BLOCK_M) offs_d = tl.arange(0, HEAD_SIZE_PADDED) - query_pos = q_block_local_idx * BLOCK_Q + offs_m // num_queries_per_kv query_offset_0 = cur_batch_in_all_start_index + query_pos query_offset_1 = kv_head_idx * num_queries_per_kv + \ offs_m % num_queries_per_kv - query_offset = (query_offset_0[:, None] * query_stride_0 + query_offset_1[:, None] * query_stride_1 
+ offs_d[None, :]) @@ -110,7 +109,7 @@ def kernel_unified_attention_2d( query_mask_0 = tl.where(query_pos < cur_batch_query_len, 1, 0).to(tl.int1) query_mask_1 = tl.where(query_offset_1 < num_query_heads, 1, 0).to(tl.int1) - # Q : (BLOCK_Q * num_queries_per_kv, HEAD_SIZE,) + # Q : (BLOCK_M, HEAD_SIZE_PADDED) Q = tl.load( query_ptr + query_offset, mask=dim_mask[None, :] & query_mask_0[:, None] & query_mask_1[:, None], @@ -119,12 +118,9 @@ def kernel_unified_attention_2d( block_table_offset = seq_idx * block_table_stride - M = tl.full([BLOCK_Q * num_queries_per_kv], - float("-inf"), - dtype=tl.float32) - L = tl.full([BLOCK_Q * num_queries_per_kv], 1.0, dtype=tl.float32) - acc = tl.zeros([BLOCK_Q * num_queries_per_kv, HEAD_SIZE_PADDED], - dtype=tl.float32) + M = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + L = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, HEAD_SIZE_PADDED], dtype=tl.float32) # sequence len for this particular sequence seq_len = tl.load(seq_lens_ptr + seq_idx) @@ -183,13 +179,12 @@ def kernel_unified_attention_2d( else: V = V_load - seq_offset = j * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + seq_offset = j * BLOCK_SIZE + offs_n seq_mask = seq_offset[None, :] < context_len + query_pos[:, None] + 1 - # S : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) - S = tl.zeros(shape=(BLOCK_Q * num_queries_per_kv, BLOCK_SIZE), - dtype=tl.float32) + # S : (BLOCK_M, BLOCK_SIZE) + S = tl.zeros(shape=(BLOCK_M, BLOCK_SIZE), dtype=tl.float32) S += scale * tl.dot(Q, K) @@ -207,29 +202,29 @@ def kernel_unified_attention_2d( S += alibi_slope[:, None] * (seq_offset - context_len) # compute running maximum - # m_j : (BLOCK_Q * num_queries_per_kv,) + # m_j : (BLOCK_M,) m_j = tl.maximum(M, tl.max(S, axis=1)) # For sliding window there's a chance the max is -inf due to masking of # the entire row. 
In this case we need to set m_j 0 to avoid NaN m_j = tl.where(m_j > float("-inf"), m_j, 0.0) - # P : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # P : (BLOCK_M, BLOCK_SIZE) P = tl.exp(S - m_j[:, None]) - # l_j : (BLOCK_Q * num_queries_per_kv,) + # l_j : (BLOCK_M,) l_j = tl.sum(P, axis=1) - # alpha : (BLOCK_Q * num_queries_per_kv, ) + # alpha : (BLOCK_M, ) alpha = tl.exp(M - m_j) - # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc = acc * alpha[:, None] # update constants L = L * alpha + l_j M = m_j - # acc : (BLOCK_Q * num_queries_per_kv, BLOCK_SIZE,) + # acc : (BLOCK_M, HEAD_SIZE_PADDED) acc += tl.dot(P.to(V.dtype), V) # epilogue @@ -334,4 +329,5 @@ def unified_attention( query_start_len_ptr=cu_seqlens_q, BLOCK_Q=BLOCK_Q, num_seqs=num_seqs, + BLOCK_M=BLOCK_M, ) -- GitLab From 515b413ebf8feb8093556dbef21a02f9f6f877cc Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Wed, 28 May 2025 23:16:17 -0300 Subject: [PATCH 032/274] Prevent the cross-encoder logic from being applied to classification tasks (#18838) Signed-off-by: Max de Bayser Co-authored-by: Cyrus Leung --- vllm/model_executor/layers/pooler.py | 34 ++++++++++++++---------- vllm/model_executor/models/bert.py | 6 ++--- vllm/model_executor/models/modernbert.py | 7 ++--- vllm/model_executor/models/roberta.py | 6 +++-- vllm/transformers_utils/config.py | 11 ++++++-- 5 files changed, 40 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 6abbc9081..d2c42191b 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -6,10 +6,9 @@ from typing import Optional, Union import torch import torch.nn as nn import torch.nn.functional as F -from transformers import PretrainedConfig from typing_extensions import assert_never -from vllm.config import PoolerConfig +from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.pooling_metadata import 
(PoolingMetadata, PoolingTensors) from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput @@ -283,30 +282,37 @@ class Pooler(nn.Module): ) -class CrossEncodingPooler(nn.Module): - """A layer that pools specific information from hidden states. +class ClassifierPooler(nn.Module): + """A pooling layer for classification tasks. This layer does the following: - 1. Extracts specific tokens or aggregates data based on pooling method. - 2. Normalizes output if specified. - 3. Returns structured results as `PoolerOutput`. - - Attributes: - pooling_type: The type of pooling to use. - normalize: Whether to normalize the pooled data. + 1. Applies a classification layer to the hidden states. + 2. Optionally applies a pooler layer. + 3. Applies an activation function to the output. In the case of + classification models it is either sigmoid or softmax. In the + case of scoring models, the same behavior is configuration + dependent, as in the sentence-transformers library. """ def __init__( self, - config: PretrainedConfig, + config: ModelConfig, classifier: nn.Module, pooler: Optional[nn.Module] = None, ): super().__init__() self.classifier = classifier self.pooler = pooler - self.default_activation_function = \ - get_cross_encoder_activation_function(config) + + if config.task == "score": + self.default_activation_function = \ + get_cross_encoder_activation_function(config.hf_config) + elif config.task == "classify": + self.default_activation_function = nn.Sigmoid() \ + if config.hf_config.num_labels == 1 else nn.Softmax() + else: + raise NotImplementedError(f"task={config.task!r} is not supported" + " with the classification pooler") def forward( self, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 0c6593bbe..0b1d0f103 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear 
import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, +from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -470,8 +470,8 @@ class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, embedding_class=BertEmbedding, add_pooling_layer=True) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self._pooler = CrossEncodingPooler(config, self.classifier, - self.bert.pooler) + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier, self.bert.pooler) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 86552aa05..18eab6051 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import CrossEncodingPooler +from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -278,8 +278,9 @@ class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self._pooler = CrossEncodingPooler(config, self.classifier, - ModernBertPooler(config)) + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier, + 
ModernBertPooler(config)) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 9a4d0ab2d..76008b729 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -9,7 +9,7 @@ from torch import nn from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import CrossEncodingPooler +from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -186,7 +186,9 @@ class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, embedding_class=RobertaEmbedding, add_pooling_layer=False) self.classifier = RobertaClassificationHead(config) - self._pooler = CrossEncodingPooler(config, self.classifier) + + self._pooler = ClassifierPooler(vllm_config.model_config, + self.classifier) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 69e7207cc..2ed71a4d3 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -823,10 +823,17 @@ def try_get_generation_config( def get_cross_encoder_activation_function(config: PretrainedConfig): - if (hasattr(config, "sbert_ce_default_activation_function") - and config.sbert_ce_default_activation_function is not None): + function_name: Optional[str] = None + if hasattr(config, "sentence_transformers") and "activation_fn" in \ + config.sentence_transformers: + function_name = config.sentence_transformers["activation_fn"] + + elif (hasattr(config, "sbert_ce_default_activation_function") + and config.sbert_ce_default_activation_function is not None): function_name = 
config.sbert_ce_default_activation_function + + if function_name is not None: assert function_name.startswith("torch.nn.modules."), \ "Loading of activation functions is restricted to " \ "torch.nn.modules for security reasons" -- GitLab From 26b4fa45bead5d65d4e15bfaffaa52ac71bea270 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Wed, 28 May 2025 22:16:52 -0400 Subject: [PATCH 033/274] Add ability to use CUDAGraphs with use_inductor=False (#17345) Signed-off-by: rzou --- tests/compile/piecewise/test_simple.py | 11 ++++++- tests/compile/piecewise/test_toy_llama.py | 40 +++++++++++++++++++---- vllm/compilation/compiler_interface.py | 4 +++ vllm/compilation/counter.py | 4 +++ vllm/config.py | 5 --- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 143cb4969..5ce520a44 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -74,11 +74,12 @@ class SillyModel(nn.Module): return x -def test_simple_piecewise_compile(): +def _test_simple_piecewise_compile(*, use_inductor): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + use_inductor=use_inductor, splitting_ops=["silly.attention"], cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], @@ -108,3 +109,11 @@ def test_simple_piecewise_compile(): output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + + +def test_simple_piecewise_compile_inductor(): + _test_simple_piecewise_compile(use_inductor=True) + + +def test_simple_piecewise_compile_no_inductor(): + _test_simple_piecewise_compile(use_inductor=False) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index d4551b1cc..22560befc 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -261,12 +261,14 @@ def 
tractable_computation(input_ids: torch.Tensor, @torch.inference_mode def run_model(llama_config, use_compile: bool, + use_inductor: bool, split_attn: bool = False) -> torch.Tensor: if use_compile: compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + use_inductor=use_inductor, cudagraph_capture_sizes=[1, 2], ) if split_attn: @@ -304,7 +306,7 @@ def run_model(llama_config, return output.cpu() -def test_toy_llama(): +def _test_toy_llama(*, use_inductor): # compare output with and without piecewise compilation llama_config = LlamaConfig(hidden_size=128, @@ -326,8 +328,14 @@ def test_toy_llama(): num_backend_compilations=0, num_cudagraph_caputured=0, ): - outputs.append(run_model(llama_config, use_compile=False)) - run_model(tractable_config, use_compile=False) + outputs.append( + run_model(llama_config, use_inductor=False, use_compile=False)) + run_model(tractable_config, use_inductor=False, use_compile=False) + + if use_inductor: + kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0} + else: + kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0} with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -336,9 +344,13 @@ def test_toy_llama(): num_backend_compilations=1, # num_piecewise_capturable_graphs_seen num_cudagraph_caputured= 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + **kwargs, ): - outputs.append(run_model(llama_config, use_compile=True)) - run_model(tractable_config, use_compile=True) + outputs.append( + run_model(llama_config, + use_inductor=use_inductor, + use_compile=True)) + run_model(tractable_config, use_inductor=use_inductor, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -353,13 +365,27 @@ def test_toy_llama(): ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append( - run_model(llama_config, use_compile=True, split_attn=True)) - run_model(tractable_config, 
use_compile=True, split_attn=True) + run_model(llama_config, + use_inductor=use_inductor, + use_compile=True, + split_attn=True)) + run_model(tractable_config, + use_inductor=use_inductor, + use_compile=True, + split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) +def test_toy_llama_inductor(): + _test_toy_llama(use_inductor=True) + + +def test_toy_no_inductor(): + _test_toy_llama(use_inductor=False) + + @torch.inference_mode def benchmark(): from triton.testing import do_bench diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 21af5eb76..7e9186f86 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -12,6 +12,7 @@ import torch._inductor.compile_fx import torch.fx as fx import vllm.envs as envs +from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig from vllm.utils import is_torch_equal_or_newer @@ -175,6 +176,7 @@ class InductorStandaloneAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_inductor_compiles += 1 current_config = {} if compiler_config is not None: current_config.update(compiler_config) @@ -262,6 +264,7 @@ class InductorAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_inductor_compiles += 1 from torch._inductor.compile_fx import compile_fx current_config = {} if compiler_config is not None: @@ -528,6 +531,7 @@ class EagerAdaptor(CompilerInterface): runtime_shape: Optional[int] = None, key: Optional[str] = None, ) -> tuple[Optional[Callable], Optional[Any]]: + compilation_counter.num_eager_compiles += 1 # we don't need to compile the graph, just return the graph itself. # It does not support caching, return None for the handle. 
return graph, None diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 5be452593..2200671b8 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -15,6 +15,10 @@ class CompilationCounter: num_piecewise_capturable_graphs_seen: int = 0 num_backend_compilations: int = 0 num_cudagraph_caputured: int = 0 + # InductorAdapter.compile calls + num_inductor_compiles: int = 0 + # EagerAdapter.compile calls + num_eager_compiles: int = 0 def clone(self) -> "CompilationCounter": return copy.deepcopy(self) diff --git a/vllm/config.py b/vllm/config.py index fe2ad70f5..3172cbe45 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4315,15 +4315,10 @@ class VllmConfig: self.compilation_config.custom_ops.append("+rms_norm") if envs.VLLM_USE_V1 and self.model_config is not None and \ not self.model_config.enforce_eager: - # NOTE(woosuk): Currently, we use inductor because the piecewise - # CUDA graphs do not work properly with the custom CUDA kernels. - # FIXME(woosuk): Disable inductor to reduce the compilation time - # and avoid any potential issues with the inductor. # FIXME(rob): Add function to set all of these. 
if not self.compilation_config.custom_ops: self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True - self.compilation_config.use_inductor = True self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_noop = False -- GitLab From 8e882ffdc0b4e9e3e6fc3fabe1c199fb6b8736a2 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Wed, 28 May 2025 19:34:19 -0700 Subject: [PATCH 034/274] [Bugfix][TPU] fix moe custom kernel import (#18853) Signed-off-by: Chengji Yao --- vllm/model_executor/layers/fused_moe/moe_pallas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index babeb9730..539459992 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -2,6 +2,7 @@ import torch import torch.nn.functional as F +import torch_xla.experimental.custom_kernel # noqa: F401 def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor: -- GitLab From 1661a9c28f090505fe127ca4cb82678486a523cc Mon Sep 17 00:00:00 2001 From: aws-elaineyz Date: Wed, 28 May 2025 19:44:01 -0700 Subject: [PATCH 035/274] [Doc][Neuron] Update documentation for Neuron (#18868) Signed-off-by: Elaine Zhao --- docs/features/compatibility_matrix.md | 3 + .../quantization/supported_hardware.md | 6 +- .../installation/ai_accelerator/neuron.inc.md | 184 +++++++++--------- vllm/config.py | 2 +- 4 files changed, 100 insertions(+), 95 deletions(-) diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 77ceea49f..71882d317 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -75,3 +75,6 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | beam-search | ✅ | ✅ | 
✅ | ✅ | ✅ | ✅ | ✅ | + +!!! note + Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md index 2967bf9c7..6a585b1cc 100644 --- a/docs/features/quantization/supported_hardware.md +++ b/docs/features/quantization/supported_hardware.md @@ -5,13 +5,13 @@ title: Supported Hardware The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU | +| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU | |-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------| | AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | | GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ | | Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | -| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ | ✅︎ | -| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | +| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | +| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ | | BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | | AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | | bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md index f08c78fba..86c12472f 100644 --- a/docs/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md @@ -1,8 +1,9 @@ # --8<-- [start:installation] -vLLM 0.3.3 onwards supports 
model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. -Paged Attention and Chunked Prefill are currently in development and will be available soon. -Data types currently supported in Neuron SDK are FP16 and BF16. +[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the software development kit (SDK) used to run deep learning and + generative AI workloads on AWS Inferentia and AWS Trainium powered Amazon EC2 instances and UltraServers (Inf1, Inf2, Trn1, Trn2, + and Trn2 UltraServer). Both Trainium and Inferentia are powered by fully-independent heterogeneous compute-units called NeuronCores. + This tab describes how to set up your environment to run vLLM on Neuron. !!! warning There are no pre-built wheels or images for this device, so you must build vLLM from source. @@ -11,58 +12,30 @@ Data types currently supported in Neuron SDK are FP16 and BF16. # --8<-- [start:requirements] - OS: Linux -- Python: 3.9 -- 3.11 -- Accelerator: NeuronCore_v2 (in trn1/inf2 instances) -- Pytorch 2.0.1/2.1.1 -- AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) +- Python: 3.9 or newer +- Pytorch 2.5/2.6 +- Accelerator: NeuronCore-v2 (in trn1/inf2 chips) or NeuronCore-v3 (in trn2 chips) +- AWS Neuron SDK 2.23 ## Configure a new environment -### Launch Trn1/Inf2 instances +### Launch a Trn1/Trn2/Inf2 instance and verify Neuron dependencies -Here are the steps to launch trn1/inf2 instances, in order to install [PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html). 
+The easiest way to launch a Trainium or Inferentia instance with pre-installed Neuron dependencies is to follow this +[quick start guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/multiframework/multi-framework-ubuntu22-neuron-dlami.html#setup-ubuntu22-multi-framework-dlami) using the Neuron Deep Learning AMI (Amazon machine image). -- Please follow the instructions at [launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance) to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type. -- To get more information about instances sizes and pricing see: [Trn1 web page](https://aws.amazon.com/ec2/instance-types/trn1/), [Inf2 web page](https://aws.amazon.com/ec2/instance-types/inf2/) -- Select Ubuntu Server 22.04 TLS AMI -- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB. - After launching the instance, follow the instructions in [Connect to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) to connect to the instance - -### Install drivers and tools - -The installation of drivers and tools wouldn't be necessary, if [Deep Learning AMI Neuron](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) is installed. In case the drivers and tools are not installed on the operating system, follow the steps below: - +- Once inside your instance, activate the pre-installed virtual environment for inference by running ```console -# Configure Linux for Neuron repository updates -. /etc/os-release -sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. 
- -Following instructions are applicable to Neuron SDK 2.16 and beyond. - -#### Install transformers-neuronx and its dependencies +#### Install vLLM from source -[transformers-neuronx](https://github.com/aws-neuron/transformers-neuronx) will be the backend to support inference on trn1/inf2 instances. -Follow the steps below to install transformer-neuronx package and its dependencies. +Install vllm as follows: ```console -# Install Python venv -sudo apt-get install -y python3.10-venv g++ - -# Create Python venv -python3.10 -m venv aws_neuron_venv_pytorch - -# Activate Python venv -source aws_neuron_venv_pytorch/bin/activate - -# Install Jupyter notebook kernel -pip install ipykernel -python3.10 -m ipykernel install \ - --user \ - --name aws_neuron_venv_pytorch \ - --display-name "Python (torch-neuronx)" -pip install jupyter notebook -pip install environment_kernels - -# Set pip repository pointing to the Neuron repository -python -m pip config set \ - global.extra-index-url \ - https://pip.repos.neuron.amazonaws.com - -# Install wget, awscli -python -m pip install wget -python -m pip install awscli - -# Update Neuron Compiler and Framework -python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements/neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install -e . ``` -#### Install vLLM from source +AWS Neuron maintains a [Github fork of vLLM](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2) at + [https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2), which contains several features in addition to what's + available on vLLM V0. 
Please utilize the AWS fork for the following features: + +- Llama-3.2 multi-modal support +- Multi-node distributed inference -Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: +Refer to [vLLM User Guide for NxD Inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/vllm-user-guide.html) + for more details and usage examples. + +To install the AWS Neuron fork, run the following: ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -U -r requirements/neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install . +git clone -b neuron-2.23-vllm-v0.7.2 https://github.com/aws-neuron/upstreaming-to-vllm.git +cd upstreaming-to-vllm +pip install -r requirements/neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install -e . ``` -If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. +Note that the AWS Neuron fork is only intended to support Neuron hardware; compatibility with other hardware is not tested. # --8<-- [end:build-wheel-from-source] # --8<-- [start:set-up-using-docker] @@ -148,5 +98,57 @@ Make sure to use in place of the default Dock # --8<-- [end:build-image-from-source] # --8<-- [start:extra-information] -There is no extra information for this device. +[](){ #feature-support-through-nxd-inference-backend } +### Feature support through NxD Inference backend + +The current vLLM and Neuron integration relies on either the `neuronx-distributed-inference` (preferred) or `transformers-neuronx` backend + to perform most of the heavy lifting, which includes PyTorch model initialization, compilation, and runtime execution. Therefore, most + [features supported on Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html) are also available via the vLLM integration.
+ +To configure NxD Inference features through the vLLM entrypoint, use the `override_neuron_config` setting. Provide the configs you want to override +as a dictionary (or JSON object when starting vLLM from the CLI). For example, to disable auto bucketing, include +```console +override_neuron_config={ + "enable_bucketing":False, +} +``` +or when launching vLLM from the CLI, pass +```console +--override-neuron-config "{\"enable_bucketing\":false}" +``` + +Alternatively, you can directly call the NxDI library to trace and compile your model, then load the pre-compiled artifacts +(via the `NEURON_COMPILED_ARTIFACTS` environment variable) in vLLM to run inference workloads. + +### Known limitations + +- EAGLE speculative decoding: NxD Inference requires the EAGLE draft checkpoint to include the LM head weights from the target model. Refer to this + [guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/feature-guide.html#eagle-checkpoint-compatibility) + for how to convert pretrained EAGLE model checkpoints to be compatible with NxDI. +- Quantization: the native quantization flow in vLLM is not well supported on NxD Inference. It is recommended to follow this + [Neuron quantization guide](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/nxd-inference/developer_guides/custom-quantization.html) + to quantize and compile your model using NxD Inference, and then load the compiled artifacts into vLLM. +- Multi-LoRA serving: NxD Inference only supports loading of LoRA adapters at server startup. Dynamic loading of LoRA adapters at + runtime is not currently supported. Refer to the [multi-LoRA example](https://github.com/aws-neuron/upstreaming-to-vllm/blob/neuron-2.23-vllm-v0.7.2/examples/offline_inference/neuron_multi_lora.py). +- Multi-modal support: multi-modal support is only available through the AWS Neuron fork.
This feature has not been upstreamed + to vLLM main because NxD Inference currently relies on certain adaptations to the core vLLM logic to support this feature. +- Multi-node support: distributed inference across multiple Trainium/Inferentia instances is only supported on the AWS Neuron fork. Refer + to this [multi-node example](https://github.com/aws-neuron/upstreaming-to-vllm/tree/neuron-2.23-vllm-v0.7.2/examples/neuron/multi_node) + to run. Note that tensor parallelism (distributed inference across NeuronCores) is available in vLLM main. +- Known edge case bug in speculative decoding: An edge case failure may occur in speculative decoding when the sequence length approaches + max model length (e.g. when requesting max tokens up to the max model length and ignoring eos). In this scenario, vLLM may attempt + to allocate an additional block to ensure there is enough memory for the number of lookahead slots, but since we do not have good support + for paged attention, there isn't another Neuron block for vLLM to allocate. A workaround fix (to terminate 1 iteration early) is + implemented in the AWS Neuron fork but is not upstreamed to vLLM main as it modifies core vLLM logic. + + +### Environment variables +- `NEURON_COMPILED_ARTIFACTS`: set this environment variable to point to your pre-compiled model artifacts directory to avoid + compilation time upon server initialization. If this variable is not set, the Neuron module will perform compilation and save the + artifacts under `neuron-compiled-artifacts/{unique_hash}/` sub-directory in the model path. If this environment variable is set, + but the directory does not exist, or the contents are invalid, Neuron will also fall back to a new compilation and store the artifacts + under this specified path. +- `NEURON_CONTEXT_LENGTH_BUCKETS`: Bucket sizes for context encoding. (Only applicable to `transformers-neuronx` backend). +- `NEURON_TOKEN_GEN_BUCKETS`: Bucket sizes for token generation.
(Only applicable to `transformers-neuronx` backend). + # --8<-- [end:extra-information] diff --git a/vllm/config.py b/vllm/config.py index 3172cbe45..085060535 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -380,7 +380,7 @@ class ModelConfig: """Initialize non-default neuron config or override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm - arguments. e.g. `{"cast_logits_dtype": "bloat16"}`.""" + arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`.""" pooler_config: Optional["PoolerConfig"] = field(init=False) """Pooler config which controls the behaviour of output pooling in pooling models.""" -- GitLab From 3c49dbdd03f33fb938bc67230dbc2e8f536ed490 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Thu, 29 May 2025 11:12:30 +0800 Subject: [PATCH 036/274] Skip device and quant Pydantic validation to make plugin device work (#18843) Signed-off-by: Yikun Jiang --- vllm/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 085060535..6cec97a5f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -304,7 +304,7 @@ class ModelConfig: - 25.6k -> 25,600""" spec_target_max_model_len: Optional[int] = None """Specify the maximum length for spec decoding draft models.""" - quantization: Optional[QuantizationMethods] = None + quantization: SkipValidation[Optional[QuantizationMethods]] = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. 
If that is `None`, we assume the model weights are not quantized and use `dtype` to @@ -2231,7 +2231,7 @@ Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"] class DeviceConfig: """Configuration for the device to use for vLLM execution.""" - device: Union[Device, torch.device] = "auto" + device: SkipValidation[Union[Device, torch.device]] = "auto" """Device type for vLLM execution. This parameter is deprecated and will be removed in a future release. -- GitLab From fd7bb88d72ba721d6eb4f9d34198ad930c36c177 Mon Sep 17 00:00:00 2001 From: Brent Salisbury Date: Thu, 29 May 2025 00:41:39 -0400 Subject: [PATCH 037/274] Fixes a dead link in nightly benchmark readme (#18856) Signed-off-by: Brent Salisbury --- .buildkite/nightly-benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index d3f5fc5cd..72c52d5bb 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do ### Visualizing the results -The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. +The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. If you do not see the table, please wait till the benchmark finish running. The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. 
-- GitLab From 972eddf7c9cee072bb1f92618072782a4ed0b7c0 Mon Sep 17 00:00:00 2001 From: Satyajith Chilappagari Date: Thu, 29 May 2025 01:41:22 -0700 Subject: [PATCH 038/274] [Neuron] Add multi-LoRA support for Neuron. (#18284) Signed-off-by: Satyajith Chilappagari --- tests/neuron/2_core/test_multi_lora.py | 98 +++++++++++ .../model_loader/neuronx_distributed.py | 31 ++-- vllm/platforms/neuron.py | 3 - vllm/worker/neuron_model_runner.py | 32 +++- vllm/worker/neuron_worker.py | 42 ++++- .../neuronx_distributed_model_runner.py | 163 +++++++++++++++++- 6 files changed, 343 insertions(+), 26 deletions(-) create mode 100644 tests/neuron/2_core/test_multi_lora.py diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py new file mode 100644 index 000000000..6fa8f9128 --- /dev/null +++ b/tests/neuron/2_core/test_multi_lora.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 + +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + + +def test_llama_single_lora(): + sql_lora_files = snapshot_download( + repo_id="yard1/llama-2-7b-sql-lora-test") + llm = LLM(model="meta-llama/Llama-2-7b-hf", + tensor_parallel_size=2, + max_num_seqs=4, + max_model_len=512, + use_v2_block_manager=True, + override_neuron_config={ + "sequence_parallel_enabled": False, + "skip_warmup": True, + "lora_modules": [{ + "name": "lora_id_1", + "path": sql_lora_files + }] + }, + enable_lora=True, + max_loras=1, + max_lora_rank=256, + device="neuron") + """For multi-lora requests using NxDI as the backend, only the lora_name + needs to be specified. 
The lora_id and lora_path are supplied at the LLM + class/server initialization, after which the paths are handled by NxDI""" + lora_req_1 = LoRARequest("lora_id_1", 0, " ") + prompts = [ + "The president of the United States is", + "The capital of France is", + ] + outputs = llm.generate(prompts, + SamplingParams(top_k=1), + lora_request=[lora_req_1, lora_req_1]) + + expected_outputs = [ + " the head of state and head of government of the United States. " + "The president direct", + " a city of contrasts. The city is home to the Eiffel Tower" + ] + + for expected_output, output in zip(expected_outputs, outputs): + generated_text = output.outputs[0].text + assert (expected_output == generated_text) + + +def test_llama_multiple_lora(): + sql_lora_files = snapshot_download( + repo_id="yard1/llama-2-7b-sql-lora-test") + llm = LLM(model="meta-llama/Llama-2-7b-hf", + tensor_parallel_size=2, + max_num_seqs=4, + max_model_len=512, + use_v2_block_manager=True, + override_neuron_config={ + "sequence_parallel_enabled": + False, + "skip_warmup": + True, + "lora_modules": [{ + "name": "lora_id_1", + "path": sql_lora_files + }, { + "name": "lora_id_2", + "path": sql_lora_files + }] + }, + enable_lora=True, + max_loras=2, + max_lora_rank=256, + device="neuron") + """For multi-lora requests using NxDI as the backend, only the lora_name + needs to be specified. The lora_id and lora_path are supplied at the LLM + class/server initialization, after which the paths are handled by NxDI""" + lora_req_1 = LoRARequest("lora_id_1", 0, " ") + lora_req_2 = LoRARequest("lora_id_2", 1, " ") + prompts = [ + "The president of the United States is", + "The capital of France is", + ] + outputs = llm.generate(prompts, + SamplingParams(top_k=1), + lora_request=[lora_req_1, lora_req_2]) + + expected_outputs = [ + " the head of state and head of government of the United States. " + "The president direct", + " a city of contrasts. 
The city is home to the Eiffel Tower" + ] + + for expected_output, output in zip(expected_outputs, outputs): + generated_text = output.outputs[0].text + assert (expected_output == generated_text) diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index 557feea46..624bd476c 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -17,6 +17,8 @@ from neuronx_distributed_inference.models.config import ( FusedSpecNeuronConfig, OnDeviceSamplingConfig) from neuronx_distributed_inference.models.mllama.utils import ( create_vision_mask) +from neuronx_distributed_inference.modules.lora_serving import ( + LoraServingConfig) from neuronx_distributed_inference.utils.hf_adapter import ( load_pretrained_config) from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig @@ -80,25 +82,26 @@ class NeuronCausalLM(nn.Module): # Lazy initialized self.model: nn.Module - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - input_block_ids: torch.Tensor, - sampling_params: torch.Tensor, - ) -> torch.Tensor: + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + input_block_ids: torch.Tensor, + sampling_params: torch.Tensor, + prev_hidden: Optional[torch.Tensor] = None, + adapter_ids: Optional[torch.Tensor] = None) -> torch.Tensor: # sort block ids sequentially for perf/neuron support reasons sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids) input_ids = torch.index_select(input_ids, 0, sorted_indices) positions = torch.index_select(positions, 0, sorted_indices) sampling_params = torch.index_select(sampling_params, 0, sorted_indices) - output = self.model(input_ids, attention_mask=None, position_ids=positions, seq_ids=sorted_input_block_ids, - sampling_params=sampling_params) + sampling_params=sampling_params, + prev_hidden=prev_hidden, + 
adapter_ids=adapter_ids) # on-device sampling if self.config.neuron_config.on_device_sampling_config: output = output.hidden_states @@ -522,7 +525,8 @@ def _get_model_architecture(config: PretrainedConfig) -> str: def _get_default_neuron_config(model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig): + scheduler_config: SchedulerConfig, + lora_serving_config: LoraServingConfig): """Generate a neuron config based on vllm config args.""" on_device_sampling_config = OnDeviceSamplingConfig(dynamic=True, deterministic=False) @@ -541,7 +545,7 @@ def _get_default_neuron_config(model_config: ModelConfig, padding_side="right", on_device_sampling_config=on_device_sampling_config, sequence_parallel_enabled=True, - ) + lora_serving_config=lora_serving_config) return neuron_config @@ -581,7 +585,8 @@ def _get_neuron_config_after_override(default_neuron_config, def get_neuron_model(model_config: ModelConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig) -> nn.Module: + scheduler_config: SchedulerConfig, + lora_serving_config: LoraServingConfig) -> nn.Module: """Initializes a neuron-optimized model for inference.""" model_arch = _get_model_architecture(model_config.hf_config) if model_arch == "MllamaForConditionalGeneration": @@ -589,7 +594,7 @@ def get_neuron_model(model_config: ModelConfig, else: model = NeuronCausalLM(model_config.hf_config) default_neuron_config_args = _get_default_neuron_config( - model_config, parallel_config, scheduler_config) + model_config, parallel_config, scheduler_config, lora_serving_config) neuron_config = _get_neuron_config_after_override( default_neuron_config_args, model_config.override_neuron_config) diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 474c70d04..56f204e71 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -49,9 +49,6 @@ class NeuronPlatform(Platform): if parallel_config.world_size > 1: 
parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config - is None), "LoRA is not supported for Neuron backend." - if vllm_config.cache_config and vllm_config.model_config: # neuron needs block_size = max_model_len vllm_config.cache_config.block_size = \ diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 968596471..292fe57f3 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -2,13 +2,15 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union import torch from torch import nn from vllm.config import DeviceConfig, VllmConfig from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model @@ -36,6 +38,7 @@ class ModelInputForNeuron(ModelRunnerInputBase): input_block_ids: Optional[torch.Tensor] = None sampling_metadata: SamplingMetadata = None multi_modal_kwargs: BatchedTensorInputs = None + adapter_ids: Optional[str] = None def as_broadcastable_tensor_dict( self) -> Dict[str, Union[int, torch.Tensor]]: @@ -80,6 +83,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): "The model will run without sliding window.") self.device_config = (self.device_config if self.device_config is not None else DeviceConfig()) + self.lora_config = vllm_config.lora_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -378,6 +382,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, sampling_params=sampling_params, + adapter_ids=model_input.adapter_ids, 
**MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, dtype=self.model_config.dtype, @@ -416,3 +421,28 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + + def remove_all_loras(self): + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def add_lora(self, lora_request: LoRARequest): + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index aa8e39613..64daee31b 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """A Neuron worker class.""" import os -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple import torch.distributed @@ -9,19 +9,19 @@ from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger +from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.platforms import current_platform from vllm.platforms.neuron import NeuronFramework from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import 
NeuronModelRunner -from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoRANotSupportedWorkerBase, WorkerBase, +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) logger = init_logger(__name__) -class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): +class NeuronWorker(LocalOrDistributedWorkerBase): """A worker class that executes the model on a group of neuron cores. """ @@ -38,6 +38,7 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.rank = rank self.distributed_init_method = distributed_init_method self.is_driver_worker = is_driver_worker + self.lora_config = vllm_config.lora_config if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing @@ -59,6 +60,9 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): "[transformers-neuronx, neuronx-distributed-inference]") def get_tnx_model_runner(self, vllm_config): + assert (self.lora_config + is None), ("LoRA is not supported for TransformersNeuronX " + "framework.") from vllm.worker.multi_step_neuron_model_runner import ( MultiStepNeuronModelRunner) if self.speculative_config is not None: @@ -72,6 +76,8 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): from vllm.worker.neuronx_distributed_model_runner import ( NeuronxDistributedModelRunner) if self.speculative_config is not None: + assert (self.lora_config + is None), "LoRA is not supported for Speculative Decoding" return MultiStepNeuronxDistributedModelRunner( vllm_config=vllm_config) else: @@ -156,3 +162,31 @@ class NeuronWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase): 1, 1, ) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return 
self.model_runner.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + if current_platform.use_transformers_neuronx(): + raise NotImplementedError( + f"{type(self)} does not support LoRA with Neuron Framework " + f"Transformers NeuronX") + return self.model_runner.list_loras() diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py index 4e784e5e0..aa94706c8 100644 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ b/vllm/worker/neuronx_distributed_model_runner.py @@ -1,17 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional +from typing import List, Optional, Set import torch from neuronx_distributed_inference.modules.generation.sampling import ( prepare_sampling_params) +from neuronx_distributed_inference.modules.lora_serving import ( + LoraCheckpoint, LoraServingConfig) from vllm.config import VllmConfig +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuronx_distributed import ( _get_model_architecture, get_neuron_model) -from vllm.sequence import IntermediateTensors +from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors, 
SequenceGroupMetadata from vllm.worker.neuron_model_runner import (ModelInputForNeuron, NeuronModelRunner) @@ -25,11 +32,44 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): vllm_config: VllmConfig, ): super().__init__(vllm_config) + self.lora_checkpoint = None + self.model = None + self.lora_serving_config = None + + @staticmethod + def _get_lora_paths_strings(lora_modules: List[LoRAModulePath]): + if not lora_modules: + return None + return {_.get("name"): _.get("path") for _ in lora_modules} + + def _get_nxdi_lora_config(self): + override_neuron_config = self.model_config.override_neuron_config + lora_modules = override_neuron_config.pop("lora_modules", None) + target_modules = override_neuron_config.pop("target_modules", None) + lora_ckpt_paths = self._get_lora_paths_strings(lora_modules) + if self.lora_config.max_loras < len(lora_ckpt_paths): + raise ValueError( + "Number of LoRAs (%s) exceeds maximum " + "allowed (%s)", len(lora_ckpt_paths), + self.lora_config.max_loras) + + return LoraServingConfig( + max_loras=self.lora_config.max_loras, + max_lora_rank=self.lora_config.max_lora_rank, + target_modules=target_modules, + lora_ckpt_paths=lora_ckpt_paths, + ) def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + # Update LoRA config + if self.lora_config is not None: + self.lora_serving_config = self._get_nxdi_lora_config() + self.lora_checkpoint = LoraCheckpoint(self.lora_serving_config) + self.model = get_neuron_model( + self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + lora_serving_config=self.lora_serving_config) def get_nxd_sampling_params(self, sampling_metadata): if self.model.config.neuron_config.on_device_sampling_config: @@ -134,3 +174,116 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): ) return [output] + + def _get_lora_adapter_ids(self, seq_group_metadata_list): + # 
set LoRA adapter IDs for multi-lora serving + batch_size = len(seq_group_metadata_list) + if self.lora_checkpoint is not None: + # "0" indicates NxDI to use the base model for inference + adapter_ids = ["0"] * batch_size + for idx, seq_group_metadata in enumerate(seq_group_metadata_list): + if seq_group_metadata.lora_request is not None: + adapter_ids[ + idx] = seq_group_metadata.lora_request.lora_name + + # convert adapter_ids from strings to integers + adapter_ids = self.lora_checkpoint.convert_adapter_ids_to_indices( + adapter_ids, batch_size) + else: + adapter_ids = torch.zeros((batch_size), dtype=torch.int32) + + return adapter_ids + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForNeuron: + multi_modal_kwargs = None + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, input_block_ids, seq_lens, + multi_modal_kwargs + ) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, + input_block_ids) = self._prepare_decode(seq_group_metadata_list) + seq_lens = None + + if not self._on_device_sampling_disabled: + for seq_group_metadata in seq_group_metadata_list: + sampling_params = seq_group_metadata.sampling_params + top_k, top_p, temperature = ( + self._convert_to_neuron_sampling_params(sampling_params)) + sampling_params.top_k = top_k + sampling_params.top_p = top_p + sampling_params.temperature = temperature + + lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + # query_lens is not needed if chunked prefill is not + # supported. Since neuron worker doesn't support chunked prefill + # just use seq_lens instead. 
+ seq_lens, + self.device, + self.pin_memory, + generators=self.get_generators(finished_requests_ids)) + + if current_platform.use_transformers_neuronx( + ) and not self._on_device_sampling_disabled: + # Once the request IDs are changed in current iteration, we will + # update the on-device sampling parameters. + current_batch_request_ids = [ + seq_group_meta_data.request_id + for seq_group_meta_data in seq_group_metadata_list + ] + if current_batch_request_ids != self._previous_batch_request_ids: + self._update_neuron_sampling_params(seq_group_metadata_list) + self._previous_batch_request_ids = current_batch_request_ids + + return ModelInputForNeuron(input_tokens=input_tokens, + input_positions=input_positions, + input_block_ids=input_block_ids, + sampling_metadata=sampling_metadata, + multi_modal_kwargs=multi_modal_kwargs, + adapter_ids=lora_adapter_ids) + + def remove_all_loras(self): + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def add_lora(self, lora_request: LoRARequest): + logger.warning( + "Adding LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config. If you supplied " + "the parameter, you can ignore this warning. 
Ignoring" + "lora request: ", lora_request) + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "Managing LoRAs is only supported through the " + "lora_modules parameter in override_neuron_config") -- GitLab From 34d6c447c4b9cf2dd986df7788684cc40662e5e7 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 29 May 2025 16:46:24 +0800 Subject: [PATCH 039/274] [LoRA] Add LoRA support for InternVL (#18842) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/internvl.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 4612fc438..71be2b48d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import convert_image_mode @@ -36,7 +37,8 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.tokenizer import AnyTokenizer -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) from 
.utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -1014,7 +1016,17 @@ class InternVLMultiModalProcessor( InternVLMultiModalProcessor, info=InternVLProcessingInfo, dummy_inputs=InternVLDummyInputsBuilder) -class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): +class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, + SupportsLoRA): + + packed_modules_mapping = { + "wqkv": ["wqkv"], + "qkv": ["qkv"], + "gate_up_proj": [ + "w1", + "w3", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -1403,3 +1415,12 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): ] loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="mlp1", + tower_model="vision_model") -- GitLab From a652e71dd0ed2cfd951205bd836c069aeec34380 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Thu, 29 May 2025 17:51:20 +0800 Subject: [PATCH 040/274] [Doc] Remove redundant spaces from compatibility_matrix.md (#18891) Signed-off-by: windsonsea --- docs/features/compatibility_matrix.md | 35 ++++++++++++++------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md index 71882d317..5d448eb5c 100644 --- a/docs/features/compatibility_matrix.md +++ b/docs/features/compatibility_matrix.md @@ -10,6 +10,7 @@ The symbols used have the following meanings: - ✅ = Full compatibility - 🟠 = Partial compatibility - ❌ = No compatibility +- ❔ = Unknown or TBD !!! note Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination. 
@@ -36,23 +37,23 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | prmpt adptr | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | -|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | -| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | | -| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | | -| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | -| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] 
| [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | prmpt adptr | [SD][spec-decode] | CUDA graph | pooling | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | | +| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| prmpt adptr | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | | +| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | +| pooling | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | +| async output | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | +| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | +| mm | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | +| best-of | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | +| beam-search | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | [](){ #feature-x-hardware } -- GitLab From e740d07f07d82983217077b89e23beaae134a30b Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Thu, 29 May 2025 17:51:36 +0800 Subject: [PATCH 041/274] [doc] add CLI doc (#18871) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/.nav.yml | 3 + docs/cli/README.md | 179 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 docs/cli/README.md diff --git a/docs/.nav.yml b/docs/.nav.yml index 42aba9775..a9c594c29 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -12,6 +12,7 @@ nav: - User Guide: 
usage/README.md - Developer Guide: contributing/README.md - API Reference: api/README.md + - CLI Reference: cli/README.md - Timeline: - Roadmap: https://roadmap.vllm.ai - Releases: https://github.com/vllm-project/vllm/releases @@ -56,6 +57,8 @@ nav: - Contents: - glob: api/vllm/* preserve_directory_names: true + - CLI Reference: + - Summary: cli/README.md - Community: - community/* - Blog: https://blog.vllm.ai diff --git a/docs/cli/README.md b/docs/cli/README.md new file mode 100644 index 000000000..5feb316d6 --- /dev/null +++ b/docs/cli/README.md @@ -0,0 +1,179 @@ +# vLLM CLI Guide + +The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: + +``` +vllm --help +``` + +Available Commands: + +``` +vllm {chat,complete,serve,bench,collect-env,run-batch} +``` + +## Table of Contents + +- [serve](#serve) +- [chat](#chat) +- [complete](#complete) +- [bench](#bench) + - [latency](#latency) + - [serve](#serve-1) + - [throughput](#throughput) +- [collect-env](#collect-env) +- [run-batch](#run-batch) +- [More Help](#more-help) + +## serve + +Start the vLLM OpenAI Compatible API server. + +Examples: + +```bash +# Start with a model +vllm serve meta-llama/Llama-2-7b-hf + +# Specify the port +vllm serve meta-llama/Llama-2-7b-hf --port 8100 + +# Check with --help for more options +# To list all groups +vllm serve --help=listgroup + +# To view a argument group +vllm serve --help=ModelConfig + +# To view a single argument +vllm serve --help=max-num-seqs + +# To search by keyword +vllm serve --help=max +``` + +## chat + +Generate chat completions via the running API server. + +Examples: + +```bash +# Directly connect to localhost API without arguments +vllm chat + +# Specify API url +vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1 + +# Quick chat with a single prompt +vllm chat --quick "hi" +``` + +## complete + +Generate text completions based on the given prompt via the running API server. 
+ +Examples: + +```bash +# Directly connect to localhost API without arguments +vllm complete + +# Specify API url +vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1 + +# Quick complete with a single prompt +vllm complete --quick "The future of AI is" +``` + +## bench + +Run benchmark tests for latency online serving throughput and offline inference throughput. + +Available Commands: + +```bash +vllm bench {latency, serve, throughput} +``` + +### latency + +Benchmark the latency of a single batch of requests. + +Example: + +```bash +vllm bench latency \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --input-len 32 \ + --output-len 1 \ + --enforce-eager \ + --load-format dummy +``` + +### serve + +Benchmark the online serving throughput. + +Example: + +```bash +vllm bench serve \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --host server-host \ + --port server-port \ + --random-input-len 32 \ + --random-output-len 4 \ + --num-prompts 5 +``` + +### throughput + +Benchmark offline inference throughput. + +Example: + +```bash +vllm bench throughput \ + --model meta-llama/Llama-3.2-1B-Instruct \ + --input-len 32 \ + --output-len 1 \ + --enforce-eager \ + --load-format dummy +``` + +## collect-env + +Start collecting environment information. + +```bash +vllm collect-env +``` + +## run-batch + +Run batch prompts and write results to file. 
+ +Examples: + +```bash +# Running with a local file +vllm run-batch \ + -i offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct + +# Using remote file +vllm run-batch \ + -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \ + -o results.jsonl \ + --model meta-llama/Meta-Llama-3-8B-Instruct +``` + +## More Help + +For detailed options of any subcommand, use: + +```bash +vllm --help +``` -- GitLab From 7fcfd954ffbf947ea323abd364b2af8229aa0a93 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 29 May 2025 17:54:14 +0800 Subject: [PATCH 042/274] [Bugfix] Fix misleading information in the documentation (#18845) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 114 ++++++++++++++++---------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 7594c6e6f..0202ba5a6 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -302,31 +302,31 @@ Specified using `--task generate`. | Architecture | Models | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | |---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | -| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. | ✅︎ | | +| `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. 
| | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | -| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | | | -| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | ✅︎ | | +| `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | +| `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc. | ✅︎ | ✅︎ | | `CohereForCausalLM`, `Cohere2ForCausalLM` | Command-R | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. | ✅︎ | ✅︎ | -| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | ✅︎ | | -| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | | -| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | ✅︎ | | -| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. | ✅︎ | | -| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | ✅︎ | | +| `DbrxForCausalLM` | DBRX | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. | | ✅︎ | +| `DeciLMForCausalLM` | DeciLM | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc. | ✅︎ | ✅︎ | +| `DeepseekForCausalLM` | DeepSeek | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. | | ✅︎ | +| `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. 
| | ✅︎ | +| `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. | | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | -| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | ✅︎ | | -| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | ✅︎ | ✅︎ | +| `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | +| `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. | | ✅︎ | | `FalconH1ForCausalLM` | Falcon-H1 | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc. | ✅︎ | ✅︎ | | `GemmaForCausalLM` | Gemma | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc. | ✅︎ | ✅︎ | | `Gemma2ForCausalLM` | Gemma 2 | `google/gemma-2-9b`, `google/gemma-2-27b`, etc. | ✅︎ | ✅︎ | | `Gemma3ForCausalLM` | Gemma 3 | `google/gemma-3-1b-it`, etc. | ✅︎ | ✅︎ | | `GlmForCausalLM` | GLM-4 | `THUDM/glm-4-9b-chat-hf`, etc. | ✅︎ | ✅︎ | | `Glm4ForCausalLM` | GLM-4-0414 | `THUDM/GLM-4-32B-0414`, etc. | ✅︎ | ✅︎ | -| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | ✅︎ | | +| `GPT2LMHeadModel` | GPT-2 | `gpt2`, `gpt2-xl`, etc. | | ✅︎ | | `GPTBigCodeForCausalLM` | StarCoder, SantaCoder, WizardCoder | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. | ✅︎ | ✅︎ | -| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. | ✅︎ | | -| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎ | | +| `GPTJForCausalLM` | GPT-J | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. 
| | ✅︎ | +| `GPTNeoXForCausalLM` | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | | ✅︎ | | `GraniteForCausalLM` | Granite 3.0, Granite 3.1, PowerLM | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. | ✅︎ | ✅︎ | | `GraniteMoeForCausalLM` | Granite 3.0 MoE, PowerMoE | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. | ✅︎ | ✅︎ | | `GraniteMoeHybridForCausalLM` | Granite 4.0 MoE Hybrid | `ibm-granite/granite-4.0-tiny-preview`, etc. | ✅︎ | ✅︎ | @@ -336,39 +336,39 @@ Specified using `--task generate`. | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | -| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | ✅︎ | | +| `JAISLMHeadModel` | Jais | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. | | ✅︎ | | `JambaForCausalLM` | Jamba | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. | ✅︎ | ✅︎ | | `LlamaForCausalLM` | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. | ✅︎ | ✅︎ | -| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. 
| ✅︎ | | +| `MambaForCausalLM` | Mamba | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. | | ✅︎ | | `MiniCPMForCausalLM` | MiniCPM | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. | ✅︎ | ✅︎ | | `MiniCPM3ForCausalLM` | MiniCPM3 | `openbmb/MiniCPM3-4B`, etc. | ✅︎ | ✅︎ | | `MistralForCausalLM` | Mistral, Mistral-Instruct | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. | ✅︎ | ✅︎ | | `MixtralForCausalLM` | Mixtral-8x7B, Mixtral-8x7B-Instruct | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. | ✅︎ | ✅︎ | -| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | ✅︎ | | +| `MPTForCausalLM` | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. | | ✅︎ | | `NemotronForCausalLM` | Nemotron-3, Nemotron-4, Minitron | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. | ✅︎ | ✅︎ | -| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | ✅︎ | | -| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | ✅︎ | | -| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | ✅︎ | ✅︎ | -| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. | ✅︎ | | -| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | ✅︎ | | +| `OLMoForCausalLM` | OLMo | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. | | ✅︎ | +| `OLMo2ForCausalLM` | OLMo2 | `allenai/OLMo-2-0425-1B`, etc. | | ✅︎ | +| `OLMoEForCausalLM` | OLMoE | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. | | ✅︎ | +| `OPTForCausalLM` | OPT, OPT-IML | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. 
| | ✅︎ | +| `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | -| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | ✅︎ | | +| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | -| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | ✅︎ | | +| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | | `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | -| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | | +| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | | `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | -| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | | -| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | ✅︎ | | -| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | ✅︎ | | +| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. 
| | ✅︎ | +| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | +| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | | `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | | `TeleChat2ForCausalLM` | TeleChat2 | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. | ✅︎ | ✅︎ | | `TeleFLMForCausalLM` | TeleFLM | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc. | ✅︎ | ✅︎ | | `XverseForCausalLM` | XVERSE | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. | ✅︎ | ✅︎ | -| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | ✅︎ | | +| `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | !!! note @@ -512,44 +512,44 @@ Specified using `--task generate`. | Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | [V1](gh-issue:8779) | |----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------| -| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | ✅︎ | ✅︎ | | -| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | ✅︎ | ✅︎ | | -| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. 
| ✅︎ | ✅︎ | | -| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | ✅︎ | ✅︎ | | -| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | ✅︎ | ✅︎ | | +| `AriaForConditionalGeneration` | Aria | T + I+ | `rhymes-ai/Aria` | | | ✅︎ | +| `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc. | | ✅︎ | ✅︎ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | ✅︎ | +| `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b` etc. | | ✅︎ | ✅︎ | +| `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc. | | | | -| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | ✅︎ | ✅︎ | | +| `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b` etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | -| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | ✅︎ | ✅︎\* | | -| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. 
| ✅︎ | ✅︎ | | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | | -| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | ✅︎ | | | -| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | -| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | ✅︎ | ✅︎ | | -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | ✅︎ | ✅︎ | | -| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | ✅︎ | ✅︎ | | -| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | ✅︎ | ✅︎ | | +| `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | +| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| | ✅︎ | ✅︎ | +| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | +| `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | +| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | ✅︎ | +| `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | +| `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | ✅︎ | ✅︎ | | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. | ✅︎ | | ✅︎ | +| `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | | | `Mistral3ForConditionalGeneration` | Mistral3 | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. 
| | | | +| `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | ✅︎ | ✅︎ | | -| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | ✅︎ | | | -| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | ✅︎ | ⚠️ | | -| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | ✅︎ | ✅︎ | | -| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | | -| `PixtralForConditionalGeneration` | Pixtral | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. | ✅︎ | ✅︎ | | +| `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | +| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | | ✅︎ | +| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | +| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | +| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `PixtralForConditionalGeneration` | Pixtral | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc. 
| | ✅︎ | ✅︎ | | `QwenVLForConditionalGeneration`^ | Qwen-VL | T + IE+ | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | ✅︎ | ✅︎ | | +| `Qwen2AudioForConditionalGeneration` | Qwen2-Audio | T + A+ | `Qwen/Qwen2-Audio-7B-Instruct` | | ✅︎ | ✅︎ | | `Qwen2VLForConditionalGeneration` | QVQ, Qwen2-VL | T + IE+ + VE+ | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | T + IE+ + VE+ | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | ✅︎ | ✅︎\* | | -| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | ✅︎ | ✅︎ | | -| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | ✅︎ | | +| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* | +| `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | +| `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.     • For example, to use DeepSeek-VL2 series models: @@ -647,7 +647,7 @@ The following table lists those that are tested in vLLM. 
| Architecture | Models | Inputs | Example HF Models | [LoRA][lora-adapter] | [PP][distributed-serving] | |-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------| -| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | ✅︎ | | +| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | | | `Phi3VForCausalLM` | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | 🚧 | ✅︎ | #### Transcription -- GitLab From 24d0ef89705e0ab8df3d79fcbfd669cf5575772b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 29 May 2025 11:58:14 +0200 Subject: [PATCH 043/274] [Misc] Replace TODO in serving transcription (#18895) Signed-off-by: NickLucche --- vllm/entrypoints/openai/serving_transcription.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 13565d0ef..9fc5b562e 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -278,7 +278,9 @@ class OpenAIServingTranscription(OpenAIServing): result_generator: Optional[AsyncGenerator[RequestOutput, None]] = None try: - # TODO(rob): subtract len of tokenized prompt. + # Unlike most decoder-only models, whisper generation length is not + # constrained by the size of the input audio, which is mapped to a + # fixed-size log-mel-spectrogram.
default_max_tokens = self.model_config.max_model_len sampling_params = request.to_sampling_params( default_max_tokens, self.default_sampling_params) -- GitLab From 0b1447f890087c7610b1855ae12de023a26ddc7f Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 29 May 2025 11:05:20 +0100 Subject: [PATCH 044/274] [Bugfix] Ensure tensors are contiguous during serialisation (#18860) Signed-off-by: Lukas Geiger --- vllm/v1/serial_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index fbd38fc47..78f37c1e8 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -158,8 +158,8 @@ class MsgpackEncoder: self, obj: torch.Tensor ) -> tuple[str, tuple[int, ...], Union[int, memoryview]]: assert self.aux_buffers is not None - # view the tensor as a 1D array of bytes - arr = obj.flatten().view(torch.uint8).numpy() + # view the tensor as a contiguous 1D array of bytes + arr = obj.flatten().contiguous().view(torch.uint8).numpy() if obj.nbytes < self.size_threshold: # Smaller tensors are encoded inline, just like ndarrays. data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr.data) -- GitLab From f274581f44c26bcec8bcdf6d4cdfd92a4310f995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Thu, 29 May 2025 06:05:46 -0400 Subject: [PATCH 045/274] [BugFix] Update pydantic to fix error on python 3.10 (#18852) Signed-off-by: luka --- requirements/common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/common.txt b/requirements/common.txt index 625efc336..de4b3b531 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -14,7 +14,7 @@ protobuf # Required by LlamaTokenizer. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. 
aiohttp openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) -pydantic >= 2.9 +pydantic >= 2.10 prometheus_client >= 0.18.0 pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 -- GitLab From f8977c233f453313c124dc0753dfc1669ef401f0 Mon Sep 17 00:00:00 2001 From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com> Date: Thu, 29 May 2025 03:07:20 -0700 Subject: [PATCH 046/274] Fix an error in dummy weight loading for quantization models (#18855) Signed-off-by: Chenyaaang --- vllm/model_executor/model_loader/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index f61956f4e..7a9a68be8 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -696,7 +696,7 @@ def initialize_dummy_weights( # Note: We avoid using torch.rank_like as it doesn't currently # support the generator argument. param.copy_((high - low) * - torch.rand(*param.shape, + torch.rand(param.shape, generator=generator, dtype=param.dtype, layout=param.layout, -- GitLab From b169d5f7b6ad65586c5352030bda154d589a4d89 Mon Sep 17 00:00:00 2001 From: Duyi-Wang Date: Thu, 29 May 2025 20:02:08 +0800 Subject: [PATCH 047/274] [Misc][Tools][Benchmark] Add benchmark_serving supports for llama.cpp. 
(#18692) Signed-off-by: Duyi-Wang --- benchmarks/backend_request_func.py | 3 ++- benchmarks/benchmark_serving.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 88616e110..85e6eda7f 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -324,7 +324,7 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += text or "" - elif usage := data.get("usage"): + if usage := data.get("usage"): output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True @@ -611,6 +611,7 @@ ASYNC_REQUEST_FUNCS = { "tensorrt-llm": async_request_trt_llm, "scalellm": async_request_openai_completions, "sglang": async_request_openai_completions, + "llama.cpp": async_request_openai_completions, } OPENAI_COMPATIBLE_BACKENDS = [ diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a887e7150..79024a9d6 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -762,6 +762,10 @@ def main(args: argparse.Namespace): if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. + if args.backend == "llama.cpp": + # Disable prompt caching in llama.cpp backend + sampling_params["cache_prompt"] = False + # Avoid GC processing "static" data - reduce pause times. 
gc.collect() gc.freeze() -- GitLab From 6f2909405edbd39478891f71dc2bfcbcda8530fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Thu, 29 May 2025 23:38:55 +0900 Subject: [PATCH 048/274] [Doc] Fix codeblocks formatting in LoRA adapters documentation (#18907) Signed-off-by: Zerohertz --- docs/features/lora.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/features/lora.md b/docs/features/lora.md index 642462f7c..04e92dbc4 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt that adapter will then be available for normal use on the server. Alternatively, follow these example steps to implement your own plugin: + 1. Implement the LoRAResolver interface. Example of a simple S3 LoRAResolver implementation: @@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin: return lora_request ``` -2. Register LoRAResolver plugin. +2. Register `LoRAResolver` plugin. 
- ```python + ```python from vllm.lora.resolver import LoRAResolverRegistry s3_resolver = S3LoRAResolver() -- GitLab From c9479b292086b9ae0623f619af15cf54d58842c1 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 29 May 2025 22:39:25 +0800 Subject: [PATCH 049/274] [Bugfix] Fix the failing gte embedding test (#18720) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/conftest.py | 13 +++++++------ .../models/language/pooling/test_embedding.py | 18 +++++++++++------- tests/models/language/pooling/test_gte.py | 1 + tests/models/utils.py | 1 + 4 files changed, 20 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 19c2c6247..26674483f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -311,6 +311,7 @@ class HfRunner: dtype: str = "auto", *, model_kwargs: Optional[dict[str, Any]] = None, + trust_remote_code: bool = True, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, @@ -320,7 +321,7 @@ class HfRunner: self.config = AutoConfig.from_pretrained( model_name, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) self.device = self.get_default_device() self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype) @@ -336,7 +337,7 @@ class HfRunner: model_name, device=self.device, model_kwargs=model_kwargs, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) elif is_cross_encoder: # Lazy init required for AMD CI @@ -346,12 +347,12 @@ class HfRunner: model_name, device=self.device, automodel_args=model_kwargs, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) else: model = auto_cls.from_pretrained( model_name, - trust_remote_code=True, + trust_remote_code=trust_remote_code, **model_kwargs, ) @@ -372,7 +373,7 @@ class HfRunner: self.tokenizer = AutoTokenizer.from_pretrained( model_name, torch_dtype=torch_dtype, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) # don't put this import at the top level @@ 
-381,7 +382,7 @@ class HfRunner: self.processor = AutoProcessor.from_pretrained( model_name, torch_dtype=torch_dtype, - trust_remote_code=True, + trust_remote_code=trust_remote_code, ) if skip_tokenizer_init: self.tokenizer = self.processor.tokenizer diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index a44b2154b..306cfdf37 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -10,18 +10,22 @@ from ...utils import check_embeddings_close @pytest.mark.parametrize( "model", [ - # [Encoder-only] - pytest.param("BAAI/bge-base-en-v1.5", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("sentence-transformers/all-MiniLM-L12-v2"), - pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + # Be careful of the order of models, decoder-only models should be + # placed before encoder-only models, otherwise `Qwen2.5-0.5B-Instruct` + # case won't pass because gte-Qwen2-1.5B-instruct will cache custom + # model code with bidirectional attention. 
# [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), pytest.param("intfloat/e5-mistral-7b-instruct", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2"), + pytest.param("intfloat/multilingual-e5-small"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], @@ -44,7 +48,7 @@ def test_models( vllm_extra_kwargs = {} if model == "ssmits/Qwen2-7B-Instruct-embed-base": vllm_extra_kwargs["override_pooler_config"] = \ - PoolerConfig(pooling_type="MEAN") + PoolerConfig(pooling_type="MEAN", normalize=False) # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 18b27a688..725e3d168 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -45,6 +45,7 @@ MODELS = [ ########### Qwen2ForCausalLM EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", architecture="Qwen2ForCausalLM", + dtype="float32", enable_test=True), ########## ModernBertModel EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", diff --git a/tests/models/utils.py b/tests/models/utils.py index ac1fc6c8f..ffc904bd1 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -314,6 +314,7 @@ def check_embeddings_close( dim=0) fail_msg = (f"Test{prompt_idx}:" + f"\nCosine similarity: \t{sim:.4f}" f"\n{name_0}:\t{embeddings_0[:16]!r}" f"\n{name_1}:\t{embeddings_1[:16]!r}") -- GitLab From da4b69d0b435e74c0a208cffbf2f430f5529ec64 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 29 May 2025 
10:48:24 -0400 Subject: [PATCH 050/274] [Attention][V1] Toggle for v1 attention backend (#18275) Signed-off-by: Gregory Shtrasberg --- vllm/attention/ops/chunked_prefill_paged_decode.py | 4 ++-- vllm/attention/ops/prefix_prefill.py | 4 ++-- vllm/envs.py | 12 ++++++++++-- vllm/v1/attention/backends/triton_attn.py | 9 ++++++--- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 785799b6b..6ca2a6414 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -264,8 +264,8 @@ def chunked_prefill_paged_decode( # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton if "fp8" in kv_cache_dtype: - assert key_cache.dtype == torch.uint8 - assert value_cache.dtype == torch.uint8 + assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] if kv_cache_dtype in ("fp8", "fp8_e4m3"): target_dtype = current_platform.fp8_dtype() diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 86d256b63..729b61b02 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -744,8 +744,8 @@ def context_attention_fwd(q, # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton if "fp8" in kv_cache_dtype: - assert (k_cache.dtype == torch.uint8) - assert (v_cache.dtype == torch.uint8) + assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] + assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()] if kv_cache_dtype in ("fp8", "fp8_e4m3"): target_dtype = current_platform.fp8_dtype() diff --git a/vllm/envs.py b/vllm/envs.py index b007bf8c5..bd9104afa 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -15,6 +15,7 @@ if TYPE_CHECKING: 
VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = False + VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None @@ -290,6 +291,13 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), + # Use separate prefill and decode kernels for V1 attention instead of + # the unified triton kernel. + "VLLM_V1_USE_PREFILL_DECODE_ATTENTION": + lambda: + (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in + ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid # when using the flash-attention backend. "VLLM_FLASH_ATTN_VERSION": @@ -323,8 +331,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to log responses from API Server for debugging "VLLM_DEBUG_LOG_API_SERVER_RESPONSE": - lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False"). 
- lower() == "true", + lambda: os.environ.get("VLLM_DEBUG_LOG_API_SERVER_RESPONSE", "False" + ).lower() == "true", # S3 access information, used for tensorizer to load model from S3 "S3_ACCESS_KEY_ID": diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 4000f9398..a97bb8500 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Optional import torch from vllm import _custom_ops as ops +from vllm import envs from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.ops.chunked_prefill_paged_decode import ( @@ -126,6 +127,8 @@ class TritonAttentionImpl(AttentionImpl): "TritonAttentionImpl") self.fp8_dtype = current_platform.fp8_dtype() + self.force_prefill_decode_attn = \ + envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION def forward( self, @@ -166,9 +169,9 @@ class TritonAttentionImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. 
num_queries_per_kv = query.shape[1] // key.shape[1] - use_prefill_decode_attn = (num_queries_per_kv & - (num_queries_per_kv - 1)) != 0 - + num_q_is_pow2 = (num_queries_per_kv & (num_queries_per_kv - 1)) == 0 + use_prefill_decode_attn = (self.force_prefill_decode_attn + or not num_q_is_pow2) num_actual_tokens = attn_metadata.num_actual_tokens if use_prefill_decode_attn: -- GitLab From 1b7cfd5a367b8aa60ce6a9acd058047a3b20c797 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Thu, 29 May 2025 12:13:18 -0400 Subject: [PATCH 051/274] [ROCm][V0][Attention] Revert to the previous FA triton kernel (#18226) Signed-off-by: Gregory Shtrasberg --- vllm/attention/backends/rocm_flash_attn.py | 5 +- vllm/attention/ops/triton_flash_attention.py | 1766 +++++++----------- vllm/platforms/rocm.py | 6 + 3 files changed, 694 insertions(+), 1083 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index abcb68911..7134472da 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -770,8 +770,9 @@ class ROCmFlashAttentionImpl(AttentionImpl): and layer._v_scale and layer._prob_scale and self.kv_cache_dtype == "fp8") full_scales = ( - layer._q_scale, layer._k_scale, layer._v_scale, - layer._prob_scale) if use_fp8_scales else None + layer._q_scale.item(), layer._k_scale.item(), + layer._v_scale.item(), + layer._prob_scale.item()) if use_fp8_scales else None self.triton_attn_func( query, key, diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 8940d0b66..62cfb813d 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,236 +1,33 @@ +#!/usr/bin/env python # SPDX-License-Identifier: Apache-2.0 """ Fused Attention =============== -This is a Triton implementation of the Flash Attention v2 algorithm -See 
https://tridao.me/publications/flash2/flash2.pdf +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao +(https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team -Credits: -AMD Triton kernels team -OpenAI kernel team - -Currently only the forward kernel is supported, and contains these features: +Features supported: 1) Fwd with causal masking -2) Arbitrary Q and KV sequence lengths -3) Arbitrary head sizes -4) Multi and grouped query attention -5) Variable sequence lengths -6) ALiBi and matrix bias -7) FP8 support +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. -""" +Not currently supported: -from typing import Optional +1) Non power of two head dims + +""" import torch -from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.platforms.rocm import on_gfx1x from vllm.triton_utils import tl, triton -SUPPORTED_LAYOUTS = ['thd', 'bhsd', 'bshd'] - -default_eight_bit_dtype_triton = tl.float8e4b8 -default_eight_bit_dtype_torch = current_platform.fp8_dtype() -default_float8_info = torch.finfo(default_eight_bit_dtype_torch) - -FP8_MIN = triton.language.constexpr(default_float8_info.min) - -# According to https://github.com/vllm-project/vllm/blob/main -# /csrc/quantization/utils.cuh#L31, -# need to make the max for the uz datatype be 224.0 for accuracy reasons. 
-FP8_MAX = triton.language.constexpr( - default_float8_info.max if default_eight_bit_dtype_torch != - torch.float8_e4m3fnuz else 224.0) - - -class MetaData: - cu_seqlens_q = None - cu_seqlens_k = None - max_seqlens_q = 0 - max_seqlens_k = 0 - bias = None - alibi_slopes = None - causal = False - num_contexts = 0 - varlen = False - eight_bit = False - layout = None - return_encoded_softmax = False - eight_bit_dtype_triton = default_eight_bit_dtype_triton - eight_bit_dtype_torch = default_eight_bit_dtype_torch - output_dtype = None - - # Note about layouts: - # - # thd - [num_tokens, num_heads, head_size] - # bshd - [batch_size, seq_len, num_heads, head_size] - # bhsd - [batch_size, num_heads, seq_len, head_size] - # - # This is for each tensor, all tensors must have same layout. - # Q can have num_heads and seq_len differ from from K and V, - # however K and V must agree on this. - # - # Notes about varlen and bias: - # Only one or the other is implemented, meaning can't combine - # both varlen and bias right now. - # - # Note about quantization: - # Only 8-bit quantization supported (for now) and specifically fp8. - # Scales must be tensors. - # o_scale: This is 'output scaling', but comes from parameter called - # 'input_scale', this is applied to the output from the kernel. - # o_scale should be None if none of the other quantization parameters - # are used. - # - # NOTE: Object is in a tentatively good state after initialized, however, - # to verify, call check_args(q,k,v,o) where o is the output tensor. 
- def __init__( - self, - sm_scale=1.0, - layout=None, # layout can be 'bshd', 'bhsd', or 'thd' - output_dtype=None, - max_seqlens_q=0, - max_seqlens_k=0, - # varlen params - cu_seqlens_q=None, # only 'thd' layout supported for varlen - cu_seqlens_k=None, - # quant params - q_descale=None, - k_descale=None, - v_descale=None, - p_scale=None, - o_scale=None, - # bias params - bias=None, # varlen not implemented for bias - seqlen_q=None, - seqlen_k=None, - # alibi params - alibi_slopes=None, - alibi_batch=None, - alibi_nheads=None, - # causal - causal=None, - ): - self.sm_scale = sm_scale - self.output_dtype = output_dtype - self.max_seqlens_q = max_seqlens_q - self.max_seqlens_k = max_seqlens_k - self.layout = layout - if cu_seqlens_q is not None or cu_seqlens_k is not None: - assert cu_seqlens_q is not None and cu_seqlens_k is not None - assert layout is None or layout not in [ - 'bshd', 'bhsd' - ], "Varlen only implemented for thd layout" - self.set_varlen_params(cu_seqlens_q, cu_seqlens_k) - quant_params = [q_descale, k_descale, v_descale, p_scale, o_scale] - if any(x is not None for x in quant_params): - p_descale = 1.0 / p_scale if p_scale is not None else None - self.set_eight_bit_params(q_descale, k_descale, v_descale, p_scale, - p_descale, o_scale) - if bias is not None: - self.need_bias(bias, seqlen_q, seqlen_k) - if alibi_slopes is not None: - self.need_alibi(alibi_slopes, alibi_batch, alibi_nheads) - if causal is not None and causal: - self.need_causal() - - def set_varlen_params(self, cu_seqlens_q, cu_seqlens_k): - self.varlen = True - self.layout = 'thd' - self.cu_seqlens_q = cu_seqlens_q - self.cu_seqlens_k = cu_seqlens_k - # Without "varlen", there should still be one sequence. 
- assert len(cu_seqlens_q) >= 2 - assert len(cu_seqlens_q) == len(cu_seqlens_k) - self.num_contexts = len(cu_seqlens_q) - 1 - for i in range(0, self.num_contexts): - self.max_seqlens_q = max( - cu_seqlens_q[i + 1].item() - cu_seqlens_q[i].item(), - self.max_seqlens_q) - self.max_seqlens_k = max( - cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item(), - self.max_seqlens_k) - - def set_eight_bit_params(self, q_descale, k_descale, v_descale, p_scale, - p_descale, o_scale): - self.eight_bit = True - self.q_descale = q_descale - self.k_descale = k_descale - self.v_descale = v_descale - self.p_scale = p_scale - self.p_descale = p_descale - self.o_scale = o_scale - self.use_p_scale = (p_scale is not None) and ( - p_descale is not None) and (v_descale is not None) - self.eight_bit_kv = ((q_descale is None) and (k_descale is not None) - and (v_descale is not None)) - self.eight_bit_dtype_torch = default_eight_bit_dtype_torch - - def need_bias(self, bias, seqlen_q, seqlen_k): - assert bias is not None - assert bias.is_cuda - assert bias.dim() == 4 - assert bias.shape[0] == 1 - assert bias.shape[2:] == (seqlen_q, seqlen_k) - self.bias = bias - - def need_alibi(self, alibi_slopes, batch, nheads): - assert alibi_slopes.is_cuda - assert alibi_slopes.dim() == 2 - assert alibi_slopes.shape[0] == batch - assert alibi_slopes.shape[1] == nheads - self.alibi_slopes = alibi_slopes - - def need_causal(self): - self.causal = True - - def check_args(self, q, k, v, o): - assert q.dim() == k.dim() and q.dim() == v.dim() - - batch, nheads_q, nheads_k, head_size = get_shape_from_layout( - q, k, self) - if self.varlen: - assert q.dim() == 3 - assert self.cu_seqlens_q is not None - assert self.cu_seqlens_k is not None - assert len(self.cu_seqlens_q) == len(self.cu_seqlens_k) - # TODO: Remove once bias is supported with varlen - assert self.bias is None - assert not self.return_encoded_softmax - else: - assert q.dim() == 4 - assert self.max_seqlens_q > 0 and self.max_seqlens_k > 0 - assert 
self.cu_seqlens_q is None and self.cu_seqlens_k is None - assert k.shape == v.shape - assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] - # TODO: Change assert if we support qkl f8 and v f16 - if self.eight_bit: - if self.eight_bit_kv: - assert (v.dtype == k.dtype - and k.dtype == self.eight_bit_dtype_torch) - assert q.dtype != k.dtype - assert (self.v_descale is not None) and (self.k_descale - is not None) - else: - assert (q.dtype == k.dtype and q.dtype == v.dtype - and q.dtype == self.eight_bit_dtype_torch) - assert (self.q_descale - is not None) and (self.k_descale - is not None) and (self.v_descale - is not None) - if self.use_p_scale: - assert (self.p_scale is not None) and (self.p_descale - is not None) - else: - assert (q.dtype == k.dtype) and (q.dtype == v.dtype) - assert head_size <= 256 - assert o.shape == q.shape - assert (nheads_q % nheads_k) == 0 - assert self.layout is not None - assert self.layout == 'thd' or not self.varlen +torch_dtype: tl.constexpr = torch.float16 @triton.jit @@ -243,85 +40,40 @@ def max_fn(x, y): return tl.math.max(x, y) -# Convenience function to load with optional boundary checks. -# "First" is the major dim, "second" is the minor dim. 
@triton.jit -def masked_load(ptrs, offset_first, offset_second, boundary_first, - boundary_second): - if offset_first is not None and offset_second is not None: - mask = (offset_first[:, None] < boundary_first) & \ - (offset_second[None, :] < boundary_second) - tensor = tl.load(ptrs, mask=mask, other=0.0) - elif offset_first is not None: - mask = offset_first[:, None] < boundary_first - tensor = tl.load(ptrs, mask=mask, other=0.0) - elif offset_second is not None: - mask = offset_second[None, :] < boundary_second - tensor = tl.load(ptrs, mask=mask, other=0.0) - else: - tensor = tl.load(ptrs) - return tensor +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] @triton.jit -def compute_alibi_block(alibi_slope, - seqlen_q, - seqlen_k, - offs_m, - offs_n, - transpose=False): - # when seqlen_k and seqlen_q are different we want the diagonal to stick to - # the bottom right of the attention matrix - # for casual mask we want something like this where (1 is kept and 0 is - # masked) - # seqlen_q = 2 and seqlen_k = 5 - # 1 1 1 1 0 - # 1 1 1 1 1 - # seqlen_q = 5 and seqlen_k = 2 - # 0 0 - # 0 0 - # 0 0 - # 1 0 - # 1 1 - # for alibi the diagonal is 0 indicating no penalty for attending to that - # spot and increasing penalty for attending further from the diagonal - # e.g. alibi_slope = 1, seqlen_q = 2, seqlen_k = 5, - # offs_m = [0, 1, 2, 3], offs_n = [0, 1, 2, 3, 4], transpose = False - # 1. offs_m[:,None] = [[0], - # [1], - # 2. offs_m[:,None] + seqlen_k = [[5], - # [6], - # 3. offs_m[:,None] + seqlen_k - seqlen_q = [[3], - # [4], - # 4. offs_m[:,None] + seqlen_k - seqlen_q - offs_n[None,:] = - # [[3], - [[0, 1, 2, 3, 4]] = [[ 3, 2, 1, 0,-1], [4], [ 4, 3, 2, 1, 0]] - # 5. 
-1 * alibi_slope * tl.abs(relative_pos_block) = [[ -3, -2, -1, 0,-1], - # [ -4, -3, -2, -1, 0]], - relative_pos_block = (offs_m[:, None] + seqlen_k - seqlen_q - - offs_n[None, :]) - alibi_block = -1 * alibi_slope * tl.abs(relative_pos_block) - if transpose: - return alibi_block.T - else: - return alibi_block +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, + stride).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) -def compute_alibi_tensor(alibi_slopes, seqlen_q, seqlen_k): - q_idx = torch.arange(seqlen_q, dtype=torch.int32, - device="cuda").unsqueeze(-1) # (N_CTX_Q, 1) - k_idx = torch.arange(seqlen_k, dtype=torch.int32, - device="cuda").unsqueeze(0) # (1, N_CTX_K) - relative_pos = torch.abs(q_idx + seqlen_k - seqlen_q - - k_idx) # (N_CTX_Q, N_CTX_K) - return -1 * alibi_slopes.unsqueeze(-1).unsqueeze( - -1) * relative_pos # (Z, H, N_CTX_Q, N_CTX_K) +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, + stride) + rng_keep = rng_output > dropout_p + return rng_keep @triton.jit -def quant_fp8(x, scale): - x *= scale - x = tl.clamp(x, FP8_MIN, FP8_MAX) - return x +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0, 1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0, ), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1, ), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor @triton.jit @@ -330,68 +82,61 @@ def _attn_fwd_inner( l_i, m_i, q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, + K_block_ptr, + V_block_ptr, start_m, actual_seqlen_k, - actual_seqlen_q, + dropout_p, philox_seed, batch_philox_offset, - encoded_sm_ptrs, + 
encoded_softmax_block_ptr, block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, + bias_ptr, IS_CAUSAL: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, OFFS_M: tl.constexpr, OFFS_N: tl.constexpr, - SHOULD_PRE_LOAD_V: tl.constexpr, - SHOULD_MASK_STEPS: tl.constexpr, - SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr, - USE_PADDED_HEAD: tl.constexpr, - IS_ACTUAL_BLOCK_DMODEL: tl.constexpr, - QK_SCALE: tl.constexpr, - IS_EIGHT_BIT_GEMM: tl.constexpr, - USE_P_SCALE: tl.constexpr, - IS_EIGHT_BIT_KV: tl.constexpr, - QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr, + USE_FP8: tl.constexpr, + qk_scale, + p_descale, ): - # loop over k, v, and update accumulator for start_n in range(block_min, block_max, BLOCK_N): # For padded blocks, we will overrun the tensor size if # we load all BLOCK_N. For others, the blocks are all within range. - k_offs_n = start_n + tl.arange(0, - BLOCK_N) if SHOULD_MASK_STEPS else None - k_offs_k = None if not USE_PADDED_HEAD else tl.arange(0, BLOCK_DMODEL) - k = masked_load(k_ptrs, k_offs_k, k_offs_n, IS_ACTUAL_BLOCK_DMODEL, - actual_seqlen_k) - if SHOULD_PRE_LOAD_V: - # We can use the same offsets as k, just with dims transposed. 
- v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k, - IS_ACTUAL_BLOCK_DMODEL) + k = load_fn( + K_block_ptr, + PADDED_HEAD, + MASK_STEPS and (n_extra_tokens != 0), + "zero", + ) + if PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) # We start from end of seqlen_k so only the first iteration would need # to be checked for padding if it is not a multiple of block_n # TODO: This can be optimized to only be true for the padded block. - if SHOULD_MASK_STEPS: # noqa: SIM102 + if MASK_STEPS: # noqa: SIM102 # If this is the last block / iteration, we want to # mask if the sequence length is not a multiple of block size - # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not - # is_modulo_mn. last step might get wasted but that is okay. + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps + # if not is_modulo_mn. last step might get wasted but that is okay. # check if this masking works for that case. if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): boundary_m = tl.full([BLOCK_M], @@ -404,107 +149,112 @@ def _attn_fwd_inner( causal_boundary = start_n + offs_n_causal causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] qk = tl.where(causal_mask, qk, float("-inf")) - # -- compute qk ---- - if IS_EIGHT_BIT_GEMM: - qk += ((((tl.dot(q, k).to(tl.float32) * q_descale)) * k_descale) * - QK_SCALE) - else: - if IS_EIGHT_BIT_KV: - k = (k * k_descale).to(q.type.element_ty) - qk += (tl.dot(q, k) * QK_SCALE) - - if bias_ptrs is not None: - bias_offs_n = start_n + tl.arange( - 0, BLOCK_N) if SHOULD_MASK_STEPS else None - bias = masked_load(bias_ptrs, OFFS_M, bias_offs_n, actual_seqlen_q, - actual_seqlen_k) - # While bias is added after multiplying qk with sm_scale, - # our optimization to use 2^x instead of e^x results in an - # additional scale factor of log2(e) which we must also multiply - # the bias with. 
- qk += (bias * 1.44269504089) - - if alibi_slope is not None: - # Compute the global position of each token within the sequence - global_m_positions = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - global_n_positions = start_n + tl.arange(0, BLOCK_N) - alibi_block = compute_alibi_block(alibi_slope, actual_seqlen_q, - actual_seqlen_k, - global_m_positions, - global_n_positions) - qk += (alibi_block * 1.44269504089) # scale factor of log2(e) - - # softmax + qk += tl.dot(q, k) + if USE_FP8: + qk *= qk_scale + if bias_ptr is not None: + bias = load_fn(bias_ptr, False, MASK_STEPS + and (n_extra_tokens != 0), "zero") + # While bias is added after multiplying qk with sm_scale, our + # optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. + qk += bias * 1.44269504089 m_ij = tl.maximum(m_i, tl.max(qk, 1)) qk = qk - m_ij[:, None] p = tl.math.exp2(qk) # CAVEAT: Must update l_ij before applying dropout l_ij = tl.sum(p, 1) - if SHOULD_RETURN_ENCODED_SOFTMAX: - tl.store(encoded_sm_ptrs, p.to(encoded_sm_ptrs.type.element_ty)) + if ENABLE_DROPOUT: + philox_offset = (batch_philox_offset + + start_m * BLOCK_M * actual_seqlen_k + start_n - + BLOCK_N) + keep = dropout_mask( + philox_seed, + philox_offset, + dropout_p, + BLOCK_M, + BLOCK_N, + actual_seqlen_k, + ) + if RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + tl.where(keep, p, + -p).to(encoded_softmax_block_ptr.type.element_ty), + ) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store( + encoded_softmax_block_ptr, + p.to(encoded_softmax_block_ptr.type.element_ty), + ) # -- update output accumulator -- alpha = tl.math.exp2(m_i - m_ij) acc = acc * alpha[:, None] - if not SHOULD_PRE_LOAD_V: - v = masked_load(v_ptrs, k_offs_n, k_offs_k, actual_seqlen_k, - IS_ACTUAL_BLOCK_DMODEL) + if not PRE_LOAD_V: + v = load_fn( + V_block_ptr, + MASK_STEPS and (n_extra_tokens != 0), + PADDED_HEAD, + "zero", + ) # -- update m_i and l_i 
l_i = l_i * alpha + l_ij # update m_i and l_i m_i = m_ij - if IS_EIGHT_BIT_GEMM: - if USE_P_SCALE: - p = quant_fp8(p, p_scale).to(QUANT_DTYPE) - acc += tl.dot(p, v) - else: - # v is in eight_bit but p is not, we want the gemm in p's type - acc += tl.dot(p, v.to(p.type.element_ty)) - else: - if IS_EIGHT_BIT_KV: - v = (v * v_descale).to(p.type.element_ty) - acc += tl.dot(p.to(v.type.element_ty), v) - - k_ptrs += BLOCK_N * stride_kn - v_ptrs += BLOCK_N * stride_vk - if bias_ptrs is not None: - bias_ptrs += BLOCK_N * stride_bn - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_ptrs += BLOCK_N + if USE_FP8: + p *= p_descale + + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, BLOCK_N)) return acc, l_i, m_i def get_cdna_autotune_configs(): return [ + triton.Config( + { + 'BLOCK_M': 256, + 'BLOCK_N': 64, + 'waves_per_eu': 2, + 'PRE_LOAD_V': False + }, + num_stages=1, + num_warps=8), triton.Config( { 'BLOCK_M': 128, 'BLOCK_N': 128, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), triton.Config( { - 'BLOCK_M': 128, - 'BLOCK_N': 64, + 'BLOCK_M': 256, + 'BLOCK_N': 128, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, - num_warps=4), + num_warps=8), triton.Config( { 'BLOCK_M': 128, 'BLOCK_N': 64, - 'waves_per_eu': 3, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'waves_per_eu': 1, + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), @@ -512,168 +262,141 @@ def get_cdna_autotune_configs(): { 'BLOCK_M': 128, 'BLOCK_N': 64, - 'waves_per_eu': 1, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'waves_per_eu': 3, + 'PRE_LOAD_V': True }, num_stages=1, 
num_warps=4), triton.Config( { 'BLOCK_M': 128, - 'BLOCK_N': 32, - 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'BLOCK_N': 64, + 'waves_per_eu': 3, + 'PRE_LOAD_V': False }, num_stages=1, num_warps=4), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] - - -def get_rdna_autotune_configs(): - return [ triton.Config( { - 'BLOCK_M': 32, - 'BLOCK_N': 32, + 'BLOCK_M': 64, + 'BLOCK_N': 64, 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, - num_warps=2), + num_warps=8), triton.Config( { 'BLOCK_M': 32, 'BLOCK_N': 32, - 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'waves_per_eu': 4, + 'PRE_LOAD_V': False }, num_stages=1, - num_warps=2), + num_warps=8), + # TODO: This config fails with head_size not pow2 with data mismatches. + # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, + # 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # "BLOCK_M": 16, + # "BLOCK_N": 16, + # "waves_per_eu": 1, + # "PRE_LOAD_V": False, + # }, + # num_stages=1, + # num_warps=4, + # ), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8'] + + +def get_rdna_autotune_configs(): + return [ triton.Config( { 'BLOCK_M': 32, - 'BLOCK_N': 16, + 'BLOCK_N': 32, 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), triton.Config( { 'BLOCK_M': 32, - 'BLOCK_N': 16, + 'BLOCK_N': 32, 'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), triton.Config( { - 'BLOCK_M': 16, + 'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 4, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), triton.Config( { - 'BLOCK_M': 16, + 'BLOCK_M': 32, 'BLOCK_N': 16, 
'waves_per_eu': 2, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=2), - # Fall-back config. - triton.Config( - { - 'BLOCK_M': 16, - 'BLOCK_N': 16, - 'waves_per_eu': 1, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 + 'PRE_LOAD_V': False }, num_stages=1, num_warps=2), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] - - -def get_general_autotune_configs(): - return [ - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 128, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 64, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - triton.Config( - { - 'BLOCK_M': 128, - 'BLOCK_N': 32, - 'SHOULD_PRE_LOAD_V': False, - 'GRID_CU_MULTIP': 2 - }, - num_stages=1, - num_warps=4), - ], [ - 'IS_CAUSAL', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', - 'IS_ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK' - ] - - -def has_cdna_target(): - ROCM_CDNA_TARGETS = ["gfx942", "gfx90a", "gfx908"] - return triton.runtime.driver.active.get_current_target( - ).arch in ROCM_CDNA_TARGETS - - -def is_rocm_cdna(): - return current_platform.is_rocm() and has_cdna_target() + # Fails in AccelerateAMDMatmul (Triton) assert when using FP8: + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 4, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 2, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + # # Fall-back config. 
+ # triton.Config( + # { + # 'BLOCK_M': 16, + # 'BLOCK_N': 16, + # 'waves_per_eu': 1, + # 'PRE_LOAD_V': False + # }, + # num_stages=1, + # num_warps=2), + ], ['IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL', 'USE_FP8'] def get_autotune_configs(): - if is_rocm_cdna(): - return get_cdna_autotune_configs() - elif current_platform.is_rocm(): + if on_gfx1x(): return get_rdna_autotune_configs() else: - return get_general_autotune_configs() + return get_cdna_autotune_configs() autotune_configs, autotune_keys = get_autotune_configs() +float8_info = torch.finfo(current_platform.fp8_dtype()) + @triton.autotune( configs=autotune_configs, key=autotune_keys, - use_cuda_graph=True, ) @triton.jit def attn_fwd( @@ -681,7 +404,13 @@ def attn_fwd( K, V, bias, - SM_SCALE: tl.constexpr, + sm_scale, + q_scale, + k_scale, + v_scale, + p_scale, + p_descale, + o_descale, L, Out, stride_qz: tl.int64, @@ -704,70 +433,44 @@ def attn_fwd( stride_bh: tl.int64, stride_bm: tl.int64, stride_bn: tl.int64, - stride_az: tl.int64, - stride_ah: tl.int64, - q_descale_ptr, - k_descale_ptr, - p_scale_ptr, - p_descale_ptr, - o_descale_ptr, - v_descale_ptr, - q_descale_has_singleton: tl.constexpr, - k_descale_has_singleton: tl.constexpr, - p_descale_has_singleton: tl.constexpr, - v_descale_has_singleton: tl.constexpr, cu_seqlens_q, cu_seqlens_k, + dropout_p, philox_seed, - NUM_CU: tl.constexpr, - GRID_CU_MULTIP: tl.constexpr, - B: tl.constexpr, philox_offset_base, encoded_softmax, - alibi_slopes, HQ: tl.constexpr, HK: tl.constexpr, - IS_ACTUAL_BLOCK_DMODEL: tl.constexpr, + ACTUAL_BLOCK_DMODEL: tl.constexpr, MAX_SEQLENS_Q: tl.constexpr, MAX_SEQLENS_K: tl.constexpr, VARLEN: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + USE_FP8: tl.constexpr, + USE_FP8_OUT: tl.constexpr, BLOCK_N: tl.constexpr, - SHOULD_PRE_LOAD_V: tl.constexpr, - USE_BIAS: tl.constexpr, - SHOULD_RETURN_ENCODED_SOFTMAX: tl.constexpr, - USE_ALIBI: tl.constexpr, - IS_EIGHT_BIT: tl.constexpr, - USE_P_SCALE: 
tl.constexpr, - IS_EIGHT_BIT_KV: tl.constexpr, - QUANT_DTYPE: tl.constexpr = default_eight_bit_dtype_triton, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max, ): - - if o_descale_ptr is not None: - o_descale = tl.load(o_descale_ptr) - - start_m: tl.int64 = tl.program_id(0) - off_h_q: tl.int64 = tl.program_id(1) - off_z: tl.int64 = tl.program_id(2) - - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M).to(tl.int64) - offs_n = tl.arange(0, BLOCK_N).to(tl.int64) - offs_d = tl.arange(0, BLOCK_DMODEL).to(tl.int64) - - # as we can't have return statements inside while loop in Triton - continue_condition = True - + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) if VARLEN: cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start - # We have a one-size-fits-all grid in id(0). Some seqlens might be - # too small for all start_m so for those we return early. + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. if start_m * BLOCK_M > seqlen_q: - continue_condition = False - # return + return cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start @@ -777,598 +480,499 @@ def attn_fwd( seqlen_q = MAX_SEQLENS_Q seqlen_k = MAX_SEQLENS_K - if continue_condition: - # Now we compute whether we need to exit early due to causal - # masking. This is because for seqlen_q > seqlen_k, M rows of the - # attn scores are completely masked, resulting in 0s written to the - # output, and inf written to LSE. 
We don't need to do any GEMMs in - # this case. This block of code determines what N is, and if this - # WG is operating on those M rows. - n_blocks = cdiv_fn(seqlen_k, BLOCK_N) - if (IS_CAUSAL): - # If seqlen_q == seqlen_k, the attn scores are a square matrix. - # If seqlen_q != seqlen_k, attn scores are rectangular which - # means the causal mask boundary is bottom right aligned, and - # ends at either the top edge (seqlen_q < seqlen_k) or left - # edge. This captures the decrease in n_blocks if we have a - # rectangular attn matrix - n_blocks_seqlen = cdiv_fn( - (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N) - # This is what adjusts the block_max for the current WG, only - # if IS_CAUSAL. Otherwise we want to always iterate through all - # n_blocks - n_blocks = min(n_blocks, n_blocks_seqlen) - # If we have no blocks after adjusting for seqlen deltas, this - # WG is part of the blocks that are all 0. We exit early. - if n_blocks <= 0: - o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh + - cu_seqlens_q_start * stride_om) - o_ptrs = (o_offset + offs_m[:, None] * stride_om + - offs_d[None, :] * stride_on) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - o_ptrs_mask = (offs_m[:, None] < seqlen_q).broadcast_to( - [BLOCK_M, BLOCK_DMODEL]) - # We still need to write 0s to the result - tl.store(o_ptrs, acc, mask=o_ptrs_mask) - # The tensor allocated for L is based on MAX_SEQLENS_Q as - # that is statically known. - l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q + - off_h_q * MAX_SEQLENS_Q + offs_m) - # We store inf to LSE, not -inf because in the bwd pass, - # we subtract this from qk which makes it -inf, such that - # exp(qk - inf) = 0 for these masked blocks. - l_value = tl.full([BLOCK_M], - value=float("inf"), - dtype=tl.float32) - l_ptrs_mask = offs_m < MAX_SEQLENS_Q - tl.store(l_ptrs, l_value, mask=l_ptrs_mask) - # TODO: Should dropout and return encoded softmax be - # handled here too? 
- continue_condition = False - # return - - if continue_condition: - # If MQA / GQA, set the K and V head offsets appropriately. - GROUP_SIZE: tl.constexpr = HQ // HK - off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q - n_extra_tokens = 0 - if seqlen_k < BLOCK_N: - n_extra_tokens = BLOCK_N - seqlen_k - elif seqlen_k % BLOCK_N: - n_extra_tokens = seqlen_k % BLOCK_N - USE_PADDED_HEAD: tl.constexpr = (IS_ACTUAL_BLOCK_DMODEL - != BLOCK_DMODEL) - - # Compute pointers for all the tensors used in this kernel. - q_offset = (Q + off_z * stride_qz + off_h_q * stride_qh + - cu_seqlens_q_start * stride_qm) - q_ptrs = (q_offset + offs_m[:, None] * stride_qm + - offs_d[None, :] * stride_qk) - k_offset = (K + off_z * stride_kz + off_h_k * stride_kh + - cu_seqlens_k_start * stride_kn) - k_ptrs = (k_offset + offs_d[:, None] * stride_kk + - offs_n[None, :] * stride_kn) - v_offset = (V + off_z * stride_vz + off_h_k * stride_vh + - cu_seqlens_k_start * stride_vk) - v_ptrs = (v_offset + offs_n[:, None] * stride_vk + - offs_d[None, :] * stride_vn) - # Compute pointers for all scale tensors used in this kernel. 
- - IS_EIGHT_BIT_GEMM: tl.constexpr = IS_EIGHT_BIT & ( - not IS_EIGHT_BIT_KV) - if IS_EIGHT_BIT: - if k_descale_has_singleton: - k_descale_ptrs = k_descale_ptr - else: - k_descale_ptrs = k_descale_ptr + off_h_k - - if v_descale_has_singleton: - v_descale_ptrs = v_descale_ptr - else: - v_descale_ptrs = v_descale_ptr + off_h_k - - if not IS_EIGHT_BIT_KV: - if q_descale_has_singleton: - q_descale_ptrs = q_descale_ptr - else: - q_descale_ptrs = q_descale_ptr + off_h_q - if USE_P_SCALE: - if p_descale_has_singleton: - p_scale_ptrs = p_scale_ptr - p_descale_ptrs = p_descale_ptr - else: - p_scale_ptrs = p_scale_ptr + off_h_q - p_descale_ptrs = p_descale_ptr + off_h_q - - if USE_BIAS: - bias_offset = off_h_q * stride_bh - bias_ptrs = (bias + bias_offset + offs_m[:, None] * stride_bm + - offs_n[None, :] * stride_bn) - else: - bias_ptrs = None - - if USE_ALIBI: - a_offset = off_z * stride_az + off_h_q * stride_ah - alibi_slope = tl.load(alibi_slopes + a_offset) - else: - alibi_slope = None - - batch_philox_offset = 0 - # We can ask to return the dropout mask without doing any - # dropout. In this case, we return an invalid pointer so - # indicate the mask is not valid. - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_base = (encoded_softmax + - off_h_q * seqlen_q * seqlen_k) - encoded_sm_ptrs = (encoded_sm_base + - offs_m[:, None] * seqlen_k + - offs_n[None, :]) - else: - encoded_sm_ptrs = None - # initialize pointer to m and l - m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) - l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) - acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) - # scale sm_scale by log_2(e) and use 2^x in the loop as we do - # not have native e^x support in HW. - QK_SCALE: tl.constexpr = SM_SCALE * 1.44269504089 - # Q is loaded once at the beginning and shared by all N blocks. 
- q_ptrs_mask = offs_m[:, None] < seqlen_q - if USE_PADDED_HEAD: - q_ptrs_mask = q_ptrs_mask & (offs_d[None, :] - < IS_ACTUAL_BLOCK_DMODEL) - q = tl.load(q_ptrs, mask=q_ptrs_mask, other=0.0) - - if IS_EIGHT_BIT: - k_descale = tl.load(k_descale_ptrs) - v_descale = tl.load(v_descale_ptrs) - q_descale = None if IS_EIGHT_BIT_KV else tl.load( - q_descale_ptrs) - if USE_P_SCALE: - p_scale = tl.load(p_scale_ptrs) - p_descale = tl.load(p_descale_ptrs) - else: - p_scale = None - p_descale = None - else: - q_descale = None - k_descale = None - v_descale = None - p_scale = None - p_descale = None - # Here we compute how many full and masked blocks we have. - padded_block_k = n_extra_tokens != 0 - is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) - if IS_CAUSAL: - # There are always at least BLOCK_M // BLOCK_N masked - # blocks. Additionally there might be one more due to - # dissimilar seqlens. - masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) - else: - # Padding on Q does not need to be masked in the FA loop. - masked_blocks = padded_block_k - # if IS_CAUSAL, not is_modulo_mn does not always result in an - # additional block. In this case we might exceed n_blocks so - # pick the min. - masked_blocks = min(masked_blocks, n_blocks) - n_full_blocks = n_blocks - masked_blocks - block_min = 0 - block_max = n_blocks * BLOCK_N - # Compute for full blocks. Here we set causal to false - # regardless of its actual value because there is no masking. - # Similarly we do not need padding. - if n_full_blocks > 0: - block_max = (n_blocks - masked_blocks) * BLOCK_N - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, - start_m, - seqlen_k, - seqlen_q, - philox_seed, - batch_philox_offset, - encoded_sm_ptrs, - # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ - block_min, - block_max, - 0, - 0, - 0, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, - # IS_CAUSAL, .... 
- False, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, SHOULD_MASK_STEPS, ... - SHOULD_PRE_LOAD_V, - False, - SHOULD_RETURN_ENCODED_SOFTMAX, - USE_PADDED_HEAD, - IS_ACTUAL_BLOCK_DMODEL, - QK_SCALE, - IS_EIGHT_BIT_GEMM, - USE_P_SCALE, - IS_EIGHT_BIT_KV, - QUANT_DTYPE) - block_min = block_max - block_max = n_blocks * BLOCK_N - - tl.debug_barrier() - # Remaining blocks, if any, are full / not masked. - if (masked_blocks > 0): - if IS_CAUSAL: - offs_n_causal = offs_n + (seqlen_q - seqlen_k) - else: - offs_n_causal = 0 - k_ptrs += n_full_blocks * BLOCK_N * stride_kn - v_ptrs += n_full_blocks * BLOCK_N * stride_vk - if USE_BIAS: - bias_ptrs += n_full_blocks * BLOCK_N * stride_bn - if SHOULD_RETURN_ENCODED_SOFTMAX: - encoded_sm_ptrs += n_full_blocks * BLOCK_N - acc, l_i, m_i = _attn_fwd_inner( - acc, - l_i, - m_i, - q, - k_ptrs, - v_ptrs, - bias_ptrs, - stride_kn, - stride_vk, - stride_bn, - start_m, - seqlen_k, - seqlen_q, - philox_seed, - batch_philox_offset, - encoded_sm_ptrs, - block_min, - block_max, - offs_n_causal, - masked_blocks, - n_extra_tokens, - alibi_slope, - q_descale, - k_descale, - v_descale, - p_scale, - IS_CAUSAL, - BLOCK_M, - BLOCK_DMODEL, - BLOCK_N, - offs_m, - offs_n, - # _, SHOULD_MASK_STEPS, ... - SHOULD_PRE_LOAD_V, - True, - SHOULD_RETURN_ENCODED_SOFTMAX, - USE_PADDED_HEAD, - IS_ACTUAL_BLOCK_DMODEL, - QK_SCALE, - IS_EIGHT_BIT_GEMM, - USE_P_SCALE, - IS_EIGHT_BIT_KV, - QUANT_DTYPE) - - if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV: - if USE_P_SCALE: - acc *= p_descale - acc *= v_descale - - # epilogue - # This helps the compiler do Newton Raphson on l_i vs on acc - # which is much larger. - l_recip = 1 / l_i[:, None] - acc = acc * l_recip - - # If seqlen_q > seqlen_k but the delta is not a multiple of - # BLOCK_M, then we have one block with a row of all NaNs which - # come from computing softmax over a row of all - # -infs (-inf - inf = NaN). 
We check for that here and store 0s - # where there are NaNs as these rows should've been zeroed out. - end_m_idx = (start_m + 1) * BLOCK_M - start_m_idx = start_m * BLOCK_M - causal_start_idx = seqlen_q - seqlen_k - if IS_EIGHT_BIT and not IS_EIGHT_BIT_KV: # noqa: SIM102 - if o_descale_ptr is not None: - acc = quant_fp8(acc, o_descale) - - acc = acc.to(Out.type.element_ty) - if IS_CAUSAL: # noqa: SIM102 - if (causal_start_idx > start_m_idx - and causal_start_idx < end_m_idx): - out_mask_boundary = tl.full((BLOCK_DMODEL, ), - causal_start_idx, - dtype=tl.int32) - mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] - >= out_mask_boundary[None, :]) - z = tl.zeros((1, ), tl.float32) - acc = tl.where(out_ptrs_mask, acc, - z.to(acc.type.element_ty)) - # write back LSE - l_ptrs = (L + off_z * HQ * MAX_SEQLENS_Q + - off_h_q * MAX_SEQLENS_Q + offs_m) - # If seqlen_q not multiple of BLOCK_M, we need to mask out the - # last few rows. This is only true for the last M block. 
- # For others, overflow_size will be -ve - overflow_size = end_m_idx - seqlen_q - if overflow_size > 0: - boundary = tl.full((BLOCK_M, ), - BLOCK_M - overflow_size, - dtype=tl.int32) - l_ptrs_mask = tl.arange(0, BLOCK_M) < boundary - tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) - else: - tl.store(l_ptrs, m_i + tl.math.log2(l_i)) - - # write back O - o_offset = (Out + off_z * stride_oz + off_h_q * stride_oh + - cu_seqlens_q_start * stride_om) - o_ptrs = (o_offset + offs_m[:, None] * stride_om + - offs_d[None, :] * stride_on) - o_ptrs_mask = tl.full([BLOCK_M, BLOCK_DMODEL], 1, dtype=tl.int1) - if overflow_size > 0: - o_ptrs_mask = o_ptrs_mask & (offs_m[:, None] < seqlen_q) - if USE_PADDED_HEAD: - o_ptrs_mask = o_ptrs_mask & (offs_d[None, :] - < IS_ACTUAL_BLOCK_DMODEL) - tl.store(o_ptrs, acc.to(Out.dtype.element_ty), mask=o_ptrs_mask) - - -def get_shape_from_layout(q, k, metadata): - assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout." - - if metadata.layout == 'thd': - nheads_q, nheads_k = q.shape[1], k.shape[1] - head_size = q.shape[-1] - batch = metadata.num_contexts - elif metadata.layout == 'bhsd': - batch, nheads_q, _, head_size = q.shape - nheads_k = k.shape[1] - elif metadata.layout == 'bshd': - batch, _, nheads_q, head_size = q.shape - nheads_k = k.shape[2] - return batch, nheads_q, nheads_k, head_size - - -def get_strides_from_layout(q, k, v, o, metadata): - assert metadata.layout in SUPPORTED_LAYOUTS, "Got unsupported layout." - - STRIDE_PERMUTATIONS = { - 'thd': (None, 1, 0, 2), - 'bhsd': (0, 1, 2, 3), - 'bshd': (0, 2, 1, 3), - } - - perm = STRIDE_PERMUTATIONS[metadata.layout] - stride = lambda x, p: (0 if p is None else x.stride(p)) - strides = lambda x: (stride(x, p) for p in perm) - - return tuple(strides(x) for x in [q, k, v, o]) + # Now we compute whether we need to exit early due to causal masking. 
+ # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if IS_CAUSAL: + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn + # matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, BLOCK_N) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is + # part of the blocks that are all 0. We exit early. + if n_blocks <= 0: + o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om + + off_h_q * stride_oh) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + # tl.store(O_block_ptr, + # acc.to(Out.type.element_ty), boundary_check=(0,1)) + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + # + offs_m + # We store inf to LSE, not -inf because in the bwd pass, + # we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 + # for these masked blocks. 
+ # l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + # tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here? + return + + # If MQA / GQA, set the K and V head offsets appropriately. + GROUP_SIZE: tl.constexpr = HQ // HK + off_h_k = off_h_q // GROUP_SIZE if GROUP_SIZE != 1 else off_h_q + + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + n_extra_tokens = seqlen_k % BLOCK_N + padded_head = ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL + + # Compute pointers for all the tensors used in this kernel. + q_offset = (off_z * stride_qz + off_h_q * stride_qh + + cu_seqlens_q_start * stride_qm) + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + k_offset = (off_z * stride_kz + off_h_k * stride_kh + + cu_seqlens_k_start * stride_kn) + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1), + ) + v_offset = (off_z * stride_vz + off_h_k * stride_vh + + cu_seqlens_k_start * stride_vk) + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0), + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = philox_offset_base \ + + (off_z * HQ + off_h_q) \ + * seqlen_q * seqlen_k + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any 
dropout. + # In this case, we return an invalid pointer so indicate the mask is not i + # valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, padded_head, "zero") + if not USE_FP8: + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + acc_scale = 1.0 + else: + qk_scale *= q_scale * k_scale + acc_scale = p_scale * v_scale + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional + # block. In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its + # value because there is no masking. 
Similarly we do not need padding. + if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, + block_max, + 0, + 0, + 0, + bias_ptr, + # IS_CAUSAL, .... + False, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, + False, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + padded_head, + USE_FP8, + qk_scale, + p_descale, + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if masked_blocks > 0: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) if IS_CAUSAL else 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks * BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks * BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks * BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, n_full_blocks)) + acc, l_i, m_i = _attn_fwd_inner( + acc, + l_i, + m_i, + q, + K_block_ptr, + V_block_ptr, + start_m, + seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, + block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL, + BLOCK_M, + BLOCK_DMODEL, + BLOCK_N, + offs_m, + offs_n, + # _, MASK_STEPS, ... 
+ PRE_LOAD_V, + True, + ENABLE_DROPOUT, + RETURN_ENCODED_SOFTMAX, + padded_head, + USE_FP8, + qk_scale, + p_descale, + ) + # epilogue + + if USE_FP8: + acc *= acc_scale + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. + end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + if USE_FP8_OUT: + acc *= o_descale + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: # noqa: SIM102 + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full((BLOCK_DMODEL, ), + causal_start_idx, + dtype=tl.int32) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) + z = tl.zeros((1, ), tl.float32) + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + # l_ptrs = L + off_z * HQ * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last + # few rows. This is only true for the last M block. For others, + # overflow_size will be -ve + # overflow_size = end_m_idx - seqlen_q + # if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. 
+ # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + # else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = (off_z * stride_oz + cu_seqlens_q_start * stride_om + + off_h_q * stride_oh) + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0), + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. + tl.store(O_block_ptr, acc, boundary_check=(0, 1)) + + +def check_args( + q, + k, + v, + o, + varlen=True, + max_seqlens=None, + cu_seqlens_q=None, + cu_seqlens_k=None, +): + assert q.dim() == k.dim() and q.dim() == v.dim() + if varlen: + assert q.dim() == 3 + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + assert cu_seqlens_q is not None + assert cu_seqlens_k is not None + assert len(cu_seqlens_q) == len(cu_seqlens_k) + else: + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + assert head_size <= 256 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 class _attention(torch.autograd.Function): @staticmethod - def forward(ctx, q, k, v, o, metadata: MetaData): - # NOTE: a large bias tensor leads to overflow during pointer arithmetic - if (metadata.bias is not None): - assert (metadata.bias.numel() < 2**31) + def forward( + ctx, + q, + k, + v, + o, + cu_seqlens_q, + cu_seqlens_k, + max_seqlens_q, + max_seqlens_k, + causal=False, + sm_scale=1.0, + bias=None, + fp8_scales=None, + 
fp8_out_scale=None, + ): + if fp8_scales is not None: + use_fp8 = True + (q_scale, k_scale, v_scale, p_scale) = fp8_scales + float8 = current_platform.fp8_dtype() + + def check_and_convert(t, scale): + if t.dtype != float8: + descale = 1.0 / scale + ts = (t * descale).clamp(min=float8_info.min, + max=float8_info.max) + return ts.to(float8) + else: + return t - if o is None: - if metadata.eight_bit: - o = torch.empty_like( - q, - dtype=metadata.output_dtype if metadata.output_dtype - is not None else metadata.eight_bit_dtype_torch) - else: - o = torch.empty_like(q, dtype=q.dtype) + q = check_and_convert(q, q_scale) + k = check_and_convert(k, k_scale) + v = check_and_convert(v, v_scale) + else: + use_fp8 = False + q_scale = k_scale = v_scale = p_scale = 1.0 - metadata.check_args(q, k, v, o) + if o is None: + o = torch.empty_like(q, dtype=v.dtype) - batch, nheads_q, nheads_k, head_size = get_shape_from_layout( - q, k, metadata) - q_strides, k_strides, v_strides, o_strides = get_strides_from_layout( - q, k, v, o, metadata) + check_args( + q, + k, + v, + o, + varlen=True, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + ) + if True: # varlen + total_q, nheads_q, head_size = q.shape + total_k, nheads_k, _ = k.shape + batch = len(cu_seqlens_q) - 1 + q_strides = (0, q.stride(1), q.stride(0), q.stride(2)) + k_strides = (0, k.stride(1), k.stride(0), k.stride(2)) + v_strides = (0, v.stride(1), v.stride(0), v.stride(2)) + o_strides = (0, o.stride(1), o.stride(0), o.stride(2)) + else: + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) # Get closest power of 2 over or equal to 32. - padded_d_model = 1 << (head_size - 1).bit_length() - # Smallest head_dim supported is 16. 
If smaller, the tile in the - # kernel is padded - there is no padding in memory for any dims. - padded_d_model = max(padded_d_model, 16) - - # encoded_softmax is used to validate dropout behavior vs the - # PyTorch SDPA math backend reference. We zero this out to give a - # consistent starting point and then populate it with the output of - # softmax with the sign bit set according to the dropout mask. - # The resulting return allows this mask to be fed into the reference - # implementation for testing only. This return holds no useful output - # aside from debugging. - if metadata.return_encoded_softmax: - encoded_softmax = torch.zeros( - (q.shape[0], q.shape[1], q.shape[2], k.shape[2]), - device=q.device, - dtype=torch.float32) + unpadded_head_dims = {32, 64, 128, 256} + if head_size not in unpadded_head_dims: + padded_d_model = None + for i in unpadded_head_dims: + if i > head_size: + padded_d_model = i + break + assert padded_d_model is not None else: - encoded_softmax = None + padded_d_model = head_size + + grid = lambda META: ( + triton.cdiv(max_seqlens_q, META["BLOCK_M"]), + nheads_q, + batch, + ) - M = torch.empty((batch, nheads_q, metadata.max_seqlens_q), - device=q.device, - dtype=torch.float32) + encoded_softmax = None # Seed the RNG so we get reproducible results for testing. 
philox_seed = 0x1BF52 philox_offset = 0x1D4B42 - if metadata.bias is not None: - bias_strides = (metadata.bias.stride(0), metadata.bias.stride(1), - metadata.bias.stride(2), metadata.bias.stride(3)) + if bias is not None: + bias_strides = ( + bias.stride(0), + bias.stride(1), + bias.stride(2), + bias.stride(3), + ) else: bias_strides = (0, 0, 0, 0) - if metadata.alibi_slopes is not None: - alibi_strides = (metadata.alibi_slopes.stride(0), - metadata.alibi_slopes.stride(1)) - else: - alibi_strides = (0, 0) + p_descale = 1.0 / p_scale + o_descale = 1.0 / fp8_out_scale.item( + ) if fp8_out_scale is not None else 1.0 - if metadata.eight_bit: - q_descale, k_descale, p_scale, p_descale, v_descale, o_scale = ( - metadata.q_descale, metadata.k_descale, metadata.p_scale, - metadata.p_descale, metadata.v_descale, metadata.o_scale) - o_descale = 1.0 / o_scale if o_scale is not None else None - else: - q_descale = k_descale = p_scale = None - p_descale = v_descale = o_descale = None - - # number of compute units available - NUM_CU = torch.cuda.get_device_properties("cuda").multi_processor_count - - grid = lambda META: (triton.cdiv(metadata.max_seqlens_q, META[ - 'BLOCK_M']), nheads_q, batch) + arg_max_seqlens_q = 0 if on_gfx1x() else max_seqlens_q + arg_max_seqlens_k = 0 if on_gfx1x() else max_seqlens_k attn_fwd[grid]( q, k, v, - metadata.bias, - metadata.sm_scale, - M, + bias, + sm_scale, + q_scale, + k_scale, + v_scale, + p_scale, + p_descale, + o_descale, + None, o, *q_strides, *k_strides, *v_strides, *o_strides, *bias_strides, - *alibi_strides, - q_descale, - k_descale, - p_scale, - p_descale, - o_descale, - v_descale, - q_descale.numel() == 1 if q_descale is not None else False, - k_descale.numel() == 1 if k_descale is not None else False, - p_descale.numel() == 1 if p_descale is not None else False, - v_descale.numel() == 1 if v_descale is not None else False, - metadata.cu_seqlens_q, - metadata.cu_seqlens_k, + cu_seqlens_q, + cu_seqlens_k, + dropout_p=0.0, 
philox_seed=philox_seed, philox_offset_base=philox_offset, encoded_softmax=encoded_softmax, - alibi_slopes=metadata.alibi_slopes, HQ=nheads_q, HK=nheads_k, - IS_ACTUAL_BLOCK_DMODEL=head_size, - MAX_SEQLENS_Q=metadata.max_seqlens_q, - MAX_SEQLENS_K=metadata.max_seqlens_k, - IS_CAUSAL=metadata.causal, - VARLEN=metadata.varlen, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=arg_max_seqlens_q, + MAX_SEQLENS_K=arg_max_seqlens_k, + IS_CAUSAL=causal, + VARLEN=True, BLOCK_DMODEL=padded_d_model, - USE_BIAS=metadata.bias is not None, - USE_ALIBI=metadata.alibi_slopes is not None, - SHOULD_RETURN_ENCODED_SOFTMAX=metadata.return_encoded_softmax, - IS_EIGHT_BIT=metadata.eight_bit, - USE_P_SCALE=metadata.eight_bit and metadata.use_p_scale, - IS_EIGHT_BIT_KV=metadata.eight_bit and metadata.eight_bit_kv, - NUM_CU=NUM_CU, - B=batch, - QUANT_DTYPE=metadata.eight_bit_dtype_triton) + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False, + USE_FP8=use_fp8, + USE_FP8_OUT=fp8_out_scale is not None, + ) ctx.grid = grid - ctx.sm_scale = metadata.sm_scale + ctx.sm_scale = sm_scale ctx.BLOCK_DMODEL = head_size - ctx.causal = metadata.causal - ctx.alibi_slopes = metadata.alibi_slopes + ctx.causal = causal + ctx.dropout_p = 0.0 ctx.philox_seed = philox_seed ctx.philox_offset = philox_offset ctx.encoded_softmax = encoded_softmax - ctx.return_encoded_softmax = metadata.return_encoded_softmax + ctx.return_encoded_softmax = False return o, encoded_softmax -triton_attention_rocm = _attention.apply - - -def scale_fp8(t, scale=None): - t_scaled, scale_out = ops.scaled_fp8_quant(t.reshape(-1, t.shape[-1]), - scale) - return t_scaled.reshape(t.shape), scale_out - - -def maybe_quantize_fp8(t, scale): - eight_bit_dtype = current_platform.fp8_dtype() - if t.dtype != eight_bit_dtype: - t, _ = scale_fp8(t, scale) - return t - - -def check_and_maybe_quantize_qkv(q, k, v, fp8_scales): - (q_scale, k_scale, v_scale, p_scale) = fp8_scales - - q = maybe_quantize_fp8(q, 
q_scale) - k = maybe_quantize_fp8(k, k_scale) - v = maybe_quantize_fp8(v, v_scale) - - return q, k, v - - -# query - [num_tokens, num_heads, head_size] -# key - [num_tokens, num_kv_heads, head_size] -# value - [num_tokens, num_kv_heads, head_size -# output - [num_tokens, num_heads, head_size] -def triton_attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - o: torch.Tensor, - cu_seqlens_q: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlens_q: int, - max_seqlens_k: int, - causal: bool = False, - sm_scale: float = 1.0, - bias: Optional[torch.Tensor] = None, - fp8_scales: Optional[tuple[float, ...]] = None, - input_scale: Optional[torch.Tensor] = None, -) -> torch.Tensor: - if fp8_scales is not None: - q_descale, k_descale, v_descale, p_scale = fp8_scales - else: - q_descale = k_descale = v_descale = p_scale = None - - attn_metadata = MetaData(sm_scale=sm_scale, - max_seqlens_q=max_seqlens_q, - max_seqlens_k=max_seqlens_k, - causal=causal, - bias=bias, - output_dtype=q.dtype, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - q_descale=q_descale, - k_descale=k_descale, - v_descale=v_descale, - p_scale=p_scale, - o_scale=input_scale) - - if fp8_scales is not None: - q, k, v = check_and_maybe_quantize_qkv(q, k, v, fp8_scales) - - return triton_attention_rocm(q, k, v, o, attn_metadata) +triton_attention = _attention.apply diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d544b4ab4..06ee8614d 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -98,6 +98,12 @@ def with_amdsmi_context(fn): return wrapper +@cache +def on_gfx1x() -> bool: + GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName + return any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) + + @cache def on_mi250_mi300() -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName -- GitLab From c29034037d73e4f774796c661d3fc48038fc1067 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 30 May 2025 00:36:58 +0800 Subject: 
[PATCH 052/274] [Deprecation] Disallow pos-args other than `model` when initializing `LLM` (#18802) Signed-off-by: DarkLight1337 --- tests/entrypoints/llm/test_init.py | 24 ------------------------ vllm/entrypoints/llm.py | 20 +++----------------- 2 files changed, 3 insertions(+), 41 deletions(-) delete mode 100644 tests/entrypoints/llm/test_init.py diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py deleted file mode 100644 index 925bf56a9..000000000 --- a/tests/entrypoints/llm/test_init.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from vllm import LLM - -from ...utils import error_on_warning - -MODEL_NAME = "facebook/opt-125m" - - -def test_pos_args_deprecated(): - with error_on_warning(DeprecationWarning): - LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) - - with error_on_warning(DeprecationWarning): - LLM(MODEL_NAME, tokenizer=MODEL_NAME) - - with pytest.warns(DeprecationWarning, match="'tokenizer'"): - LLM(MODEL_NAME, MODEL_NAME) - - with pytest.warns(DeprecationWarning, - match="'tokenizer', 'tokenizer_mode'"): - LLM(MODEL_NAME, MODEL_NAME, "auto") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f8eeae61f..e05189ef4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -45,8 +45,7 @@ from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, get_cached_tokenizer) from vllm.usage.usage_lib import UsageContext -from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs, - is_list_of) +from vllm.utils import Counter, Device, deprecate_kwargs, is_list_of if TYPE_CHECKING: from vllm.v1.metrics.reader import Metric @@ -143,12 +142,6 @@ class LLM: DEPRECATE_LEGACY: ClassVar[bool] = True """A flag to toggle whether to deprecate the legacy generate/encode API.""" - DEPRECATE_INIT_POSARGS: ClassVar[bool] = True - """ - A flag to toggle whether to 
deprecate positional arguments in - [LLM.__init__][]. - """ - @classmethod @contextmanager def deprecate_legacy_api(cls): @@ -158,16 +151,11 @@ class LLM: cls.DEPRECATE_LEGACY = False - @deprecate_args( - start_index=2, # Ignore self and model - is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS, - additional_message=( - "All positional arguments other than `model` will be " - "replaced with keyword arguments in an upcoming version."), - ) def __init__( self, model: str, + *, + task: TaskOption = "auto", tokenizer: Optional[str] = None, tokenizer_mode: TokenizerMode = "auto", skip_tokenizer_init: bool = False, @@ -189,8 +177,6 @@ class LLM: hf_token: Optional[Union[bool, str]] = None, hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[dict[str, Any]] = None, - # After positional args are removed, move this right below `model` - task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, compilation_config: Optional[Union[int, dict[str, Any]]] = None, **kwargs, -- GitLab From d58f9c7f7a3bd29a6ab3f6a5a740299a4555cbc2 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Fri, 30 May 2025 01:26:07 +0800 Subject: [PATCH 053/274] [Misc] Remove duplicate init for self.vllm_config (#18896) Signed-off-by: googs1025 --- vllm/v1/engine/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 740ba60fe..e6de31ab7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -123,7 +123,6 @@ class EngineCore: logger.info("Batch queue is enabled with size %d", self.batch_queue_size) self.batch_queue = queue.Queue(self.batch_queue_size) - self.vllm_config = vllm_config def _initialize_kv_caches( self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]: -- GitLab From 32ce3cf7c97bc309b5dbffaea5703a89969cfc1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 29 May 2025 19:54:16 +0200 Subject: [PATCH 054/274] 
[V1] Allocate kv_cache with stride order for V1 (#18775) Signed-off-by: nicklucche --- tests/v1/worker/test_gpu_model_runner.py | 71 +++++++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 26 ++++++++- 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index b8c3d8861..c38eb4866 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,7 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 +import random + import pytest +from vllm.attention import Attention from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) from vllm.sampling_params import SamplingParams @@ -13,27 +16,30 @@ from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner +BLOCK_SIZE = 16 +NUM_BLOCKS = 10 + def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ + attn_spec = FullAttentionSpec( + block_size=BLOCK_SIZE, + num_kv_heads=runner.model_config.get_num_kv_heads( + runner.parallel_config), + head_size=runner.model_config.get_head_size(), + dtype=runner.kv_cache_dtype, + use_mla=False, + ) + tensor_size = attn_spec.page_size_bytes * NUM_BLOCKS kv_cache_config = KVCacheConfig( - num_blocks=10, + num_blocks=NUM_BLOCKS, tensors={ - "layer.0": KVCacheTensor(size=1024), + "layer.0": KVCacheTensor(size=tensor_size), }, kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=16, - num_kv_heads=runner.model_config.get_num_kv_heads( - runner.parallel_config), - head_size=runner.model_config.get_head_size(), - dtype=runner.kv_cache_dtype, - use_mla=False, - )) + KVCacheGroupSpec(layer_names=["layer.0"], kv_cache_spec=attn_spec) ]) runner.kv_cache_config = kv_cache_config runner.input_batch = 
InputBatch( @@ -65,7 +71,7 @@ def model_runner(): seed=42, ) cache_config = CacheConfig( - block_size=16, + block_size=BLOCK_SIZE, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", @@ -77,6 +83,10 @@ def model_runner(): scheduler_config=scheduler_config, parallel_config=parallel_config, ) + num_heads = model_config.get_num_kv_heads(parallel_config) + head_size = model_config.get_head_size() + vllm_config.compilation_config.static_forward_context[ + "layer.0"] = Attention(num_heads, head_size, 0.1) device = "cuda" runner = GPUModelRunner(vllm_config, device) @@ -321,3 +331,38 @@ def test_update_states_request_unscheduled(model_runner): assert _is_req_added(model_runner, req_ids[1]) assert not _is_req_scheduled(model_runner, req_ids[1]) + + +def test_kv_cache_stride_order(monkeypatch, model_runner): + # This test checks if GPUModelRunner initializes correctly when an attention + # backend enforces a non-default KV cache stride order. + n_heads = model_runner.model_config.get_num_kv_heads( + model_runner.parallel_config) + expected_kv_cache_shape = [ + 2, NUM_BLOCKS, BLOCK_SIZE, n_heads, + model_runner.model_config.get_head_size() + ] + # TODO mla test + default_stride = list(range(5)) + # Permutation that gets you back to expected kv shape + rnd_stride = tuple(random.sample(default_stride, len(default_stride))) + + def rnd_stride_order(): + return rnd_stride + + # Patch the attention backend class and re-trigger the KV cache creation. 
+ for attn_backend in model_runner.attn_backends: + monkeypatch.setattr(attn_backend, "get_kv_cache_stride_order", + rnd_stride_order) + + model_runner.attn_backends = [] + model_runner.attn_metadata_builders = [] + model_runner.initialize_kv_cache(model_runner.kv_cache_config) + + # Shape is unchanged, but layout may differ + kv_cache_shape = model_runner.kv_caches[0].shape + assert list(kv_cache_shape) == expected_kv_cache_shape + if default_stride == rnd_stride: + assert all(kv.is_contiguous() for kv in model_runner.kv_caches) + else: + assert all(not kv.is_contiguous() for kv in model_runner.kv_caches) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d1195bcfb..60425a4e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2033,9 +2033,29 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype - kv_caches[layer_name] = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) + try: + kv_cache_stride_order = self.attn_backends[ + i].get_kv_cache_stride_order() + assert len(kv_cache_stride_order) == len( + kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple( + range(len(kv_cache_shape))) + # The allocation respects the backend-defined stride order + # to ensure the semantic remains consistent for each + # backend. We first obtain the generic kv cache shape and + # then permute it according to the stride order which could + # result in a non-contiguous tensor. + kv_cache_shape = tuple(kv_cache_shape[i] + for i in kv_cache_stride_order) + # Maintain original KV shape view. 
+ inv_order = [ + kv_cache_stride_order.index(i) + for i in range(len(kv_cache_stride_order)) + ] + kv_caches[layer_name] = torch.zeros( + kv_cache_shape, dtype=dtype, + device=self.device).permute(*inv_order) else: # TODO: add new branches when introducing more types of # KV cache specs. -- GitLab From d1d61f33511d46fda02944d8000a5ab22465c142 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 29 May 2025 11:04:18 -0700 Subject: [PATCH 055/274] [BugFix] Make DP work with connector-delayed new requests (#18559) Signed-off-by: Nick Hill Co-authored-by: Will Eaton --- tests/v1/engine/test_engine_core.py | 18 +++++------ vllm/forward_context.py | 7 ++-- vllm/v1/engine/core.py | 50 +++++++++++++---------------- vllm/v1/engine/core_client.py | 3 +- 4 files changed, 37 insertions(+), 41 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index dcf494825..ae1d8a762 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. - while len(engine_core.step().outputs) > 0: + while len(engine_core.step()[0].outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 @@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req0.request_id = req1.request_id = "test" engine_core.add_request(req0) - while len(engine_core.step().outputs) > 0: + while len(engine_core.step()[0].outputs) > 0: pass engine_core.add_request(req1) - while len(engine_core.step().outputs) > 0: + while len(engine_core.step()[0].outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 @@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. 
- while len(engine_core.step().outputs) > 0: + while len(engine_core.step()[0].outputs) > 0: pass assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 @@ -296,7 +296,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.add_request(req1) # Schedule Batch 1: (10, req0) - assert engine_core.step_with_batch_queue() is None + assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 1 scheduler_output = engine_core.batch_queue.queue[-1][1] assert scheduler_output.num_scheduled_tokens[0] == 10 @@ -305,7 +305,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): req0.request_id].num_computed_tokens == 10 # Schedule Batch 2: (2, req0), (8, req1) - assert engine_core.step_with_batch_queue() is None + assert engine_core.step_with_batch_queue()[0] is None assert engine_core.batch_queue.qsize() == 2 scheduler_output = engine_core.batch_queue.queue[-1][1] assert scheduler_output.num_scheduled_tokens[0] == 2 @@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[1] == 4 # Batch queue is full. Finish Batch 2. Get first token of req0. - output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0] assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 @@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[0] == 1 # Batch queue is full. Finish Batch 3. Get first token of req1. 
- output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0] assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 @@ -358,7 +358,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): engine_core.scheduler.requests[1].num_tokens + 1, ] while engine_core.scheduler.get_num_unfinished_requests() == 2: - output = engine_core.step_with_batch_queue() + output = engine_core.step_with_batch_queue()[0] if step % 2 == 0: # Even steps consumes an output. assert output is not None diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 592ca650a..0af16bbc0 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -101,7 +101,7 @@ def get_forward_context() -> ForwardContext: def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: int = 0): + num_tokens: Optional[int] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. 
@@ -111,9 +111,10 @@ def set_forward_context(attn_metadata: Any, if need_to_track_batchsize: forward_start_time = time.perf_counter() dp_metadata: Optional[DPMetadata] = None - if vllm_config.parallel_config.data_parallel_size > 1: + if vllm_config.parallel_config.data_parallel_size > 1 and ( + attn_metadata is not None or num_tokens is not None): dp_metadata = DPMetadata.make(vllm_config.parallel_config, - attn_metadata, num_tokens) + attn_metadata, num_tokens or 0) global _forward_context prev_context = _forward_context diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index e6de31ab7..2a7680140 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -211,8 +211,12 @@ class EngineCore: # Re-raise exception raise err - def step(self) -> EngineCoreOutputs: - """Schedule, execute, and make output.""" + def step(self) -> tuple[EngineCoreOutputs, bool]: + """Schedule, execute, and make output. + + Returns tuple of outputs and a flag indicating whether the model + was executed. + """ # Check for any requests remaining in the scheduler - unfinished, # or finished and not yet removed from the batch. @@ -220,15 +224,17 @@ class EngineCore: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats(), - ) + ), False scheduler_output = self.scheduler.schedule() model_output = self.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output) # type: ignore - return engine_core_outputs + return (engine_core_outputs, + scheduler_output.total_num_scheduled_tokens > 0) - def step_with_batch_queue(self) -> Optional[EngineCoreOutputs]: + def step_with_batch_queue( + self) -> tuple[Optional[EngineCoreOutputs], bool]: """Schedule and execute batches with the batch queue. Note that if nothing to output in this step, None is returned. 
@@ -273,7 +279,7 @@ class EngineCore: engine_core_outputs = self.scheduler.update_from_output( scheduler_output, model_output) - return engine_core_outputs + return engine_core_outputs, scheduled_batch def shutdown(self): self.structured_output_manager.clear_backend() @@ -537,15 +543,17 @@ class EngineCoreProc(EngineCore): req = self.input_queue.get_nowait() self._handle_client_request(*req) - def _process_engine_step(self): + def _process_engine_step(self) -> bool: """Called only when there are unfinished local requests.""" # Step the engine core. - outputs = self.step_fn() + outputs, model_executed = self.step_fn() # Put EngineCoreOutputs into the output queue. if outputs is not None: self.output_queue.put_nowait(outputs) + return model_executed + def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: """Dispatch request from client.""" @@ -749,30 +757,16 @@ class DPEngineCoreProc(EngineCoreProc): # 1) Poll the input queue until there is work to do. self._process_input_queue() + # 2) Step the engine core. + executed = self._process_engine_step() local_unfinished_reqs = self.scheduler.has_unfinished_requests() - - if local_unfinished_reqs: - # 2) Step the engine core. - self._process_engine_step() - - # Check if we have now finished all requests. - local_unfinished_reqs = ( - self.scheduler.has_unfinished_requests()) - else: - if self.scheduler.has_finished_requests(): - # There are no unfinished requests, but there are some - # finished requests remaining to be removed from the - # batch state. This engine step won't perform a forward - # pass but will flush the finished requests to ensure - # up-to-date state is returned in the engine outputs. - self._process_engine_step() - - if not self.engines_running: + if not executed: + if not local_unfinished_reqs and not self.engines_running: # All engines are idle. continue - # There must be unfinished requests in DP peers, run a - # dummy forward pass. 
+ # We are in a running state and so must execute a dummy pass + # if the model didn't execute any ready requests. self.execute_dummy_batch() # 3) All-reduce operation to determine global unfinished reqs. diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 0d52bc9a6..9f8a9b692 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -206,7 +206,8 @@ class InprocClient(EngineCoreClient): self.engine_core = EngineCore(*args, **kwargs) def get_output(self) -> EngineCoreOutputs: - return self.engine_core.step() + outputs, _ = self.engine_core.step() + return outputs def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) -- GitLab From 64eaf5fe0589a9e9082ce1de6a2e9d22ccb84b6a Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Thu, 29 May 2025 14:08:40 -0400 Subject: [PATCH 056/274] [P/D] NixlConnector DP fixes (#18903) Signed-off-by: Will Eaton --- vllm/distributed/kv_transfer/kv_connector/factory.py | 3 ++- .../kv_transfer/kv_connector/v1/nixl_connector.py | 12 +++++++----- vllm/v1/engine/core.py | 9 +++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 06b3983ed..dce0b545c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -70,7 +70,8 @@ class KVConnectorFactory: connector_module = importlib.import_module(connector_module_path) connector_cls = getattr(connector_module, connector_name) assert issubclass(connector_cls, KVConnectorBase_V1) - logger.info("Creating v1 connector with name: %s", connector_name) + logger.info("Creating v1 connector with name: %s and engine_id: %s", + connector_name, kv_transfer_config.engine_id) # NOTE(Kuntai): v1 connector is explicitly separated into two roles. 
# Scheduler connector: # - Co-locate with scheduler process diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 6303d77ad..f02434aeb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, - get_tp_group) + get_tp_group, get_world_group) from vllm.logger import init_logger from vllm.utils import make_zmq_path, make_zmq_socket, round_down from vllm.v1.core.sched.output import SchedulerOutput @@ -334,6 +334,7 @@ class NixlConnectorWorker: self.engine_id = engine_id self.rank = get_tensor_model_parallel_rank() self.world_size = get_tensor_model_parallel_world_size() + self.world_rank = get_world_group().rank_in_group self.tp_group = get_tp_group() # KV Caches and nixl tracking data. @@ -382,7 +383,8 @@ class NixlConnectorWorker: @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, - ready_event: threading.Event, rank: int): + ready_event: threading.Event, + world_rank: int): """Background thread for getting new NIXL handshakes.""" # NOTE(rob): this is a simple implementation. We will move # to a better approach like an ETCD server in the future. @@ -403,7 +405,7 @@ class NixlConnectorWorker: # NOTE(rob): we need each rank to have a unique port. This # hack to keeps us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. 
- port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank + port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + world_rank path = make_zmq_path("tcp", host, port) logger.debug("Starting listening on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: @@ -422,7 +424,7 @@ class NixlConnectorWorker: # NOTE(rob): we need each rank to have a unique port. This is # a hack to keep us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. - path = make_zmq_path("tcp", host, port + self.rank) + path = make_zmq_path("tcp", host, port + self.world_rank) logger.debug("Querying metadata on path: %s", path) with zmq_ctx(zmq.REQ, path) as sock: # Send query for the request. @@ -529,7 +531,7 @@ class NixlConnectorWorker: ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, - args=(metadata, ready_event, self.rank), + args=(metadata, ready_event, self.world_rank), daemon=True, name="nixl_handshake_listener") self._nixl_handshake_listener_t.start() diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 2a7680140..ed71d9b67 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -707,6 +707,15 @@ class DPEngineCoreProc(EngineCoreProc): assert dp_size > 1 assert 0 <= local_dp_rank <= dp_rank < dp_size + if vllm_config.kv_transfer_config is not None: + # modify the engine_id and append the local_dp_rank to it to ensure + # that the kv_transfer_config is unique for each DP rank. 
+ vllm_config.kv_transfer_config.engine_id = ( + f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}" + ) + logger.debug("Setting kv_transfer_config.engine_id to %s", + vllm_config.kv_transfer_config.engine_id) + from vllm.platforms import current_platform device_control_env_var = current_platform.device_control_env_var world_size = vllm_config.parallel_config.world_size -- GitLab From a521ef06e5eb18a34d665282fa38c4768a855bb8 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Thu, 29 May 2025 18:41:58 -0400 Subject: [PATCH 057/274] Use standalone_compile by default in torch >= 2.8.0 (#18846) Signed-off-by: rzou --- vllm/compilation/backends.py | 5 +++-- vllm/compilation/compiler_interface.py | 2 +- vllm/envs.py | 14 ++++++++------ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0358c9d0d..b724479a9 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -16,7 +16,7 @@ import vllm.envs as envs from vllm.config import CompilationConfig, VllmConfig from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import resolve_obj_by_qualname +from vllm.utils import is_torch_equal_or_newer, resolve_obj_by_qualname from .compiler_interface import (CompilerInterface, EagerAdaptor, InductorAdaptor, InductorStandaloneAdaptor) @@ -29,7 +29,8 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_TEST_STANDALONE_COMPILE: + if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( + "2.8.0"): logger.info("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 7e9186f86..8fa8ce279 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -155,7 +155,7 @@ class 
InductorStandaloneAdaptor(CompilerInterface): This is not on by default yet, but we plan to turn it on by default for PyTorch 2.8. - Use VLLM_TEST_STANDALONE_COMPILE to toggle this on or off. + Use VLLM_USE_STANDALONE_COMPILE to toggle this on or off. """ name = "inductor_standalone" diff --git a/vllm/envs.py b/vllm/envs.py index bd9104afa..785fe7309 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -143,10 +143,10 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: def get_vllm_port() -> Optional[int]: """Get the port from VLLM_PORT environment variable. - + Returns: The port number as an integer if VLLM_PORT is set, None otherwise. - + Raises: ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. """ @@ -308,9 +308,11 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Internal flag to enable/disable Inductor standalone compile - "VLLM_TEST_STANDALONE_COMPILE": - lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0", + # Feature flag to enable/disable Inductor standalone compile. + # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is + # enabled by default. 
+ "VLLM_USE_STANDALONE_COMPILE": + lambda: os.environ.get("VLLM_USE_STANDALONE_COMPILE", "1") == "1", # local rank of the process in the distributed setting, used to determine # the GPU device id @@ -892,7 +894,7 @@ def compute_hash() -> str: "VLLM_USE_TRITON_AWQ", "VLLM_DP_RANK", "VLLM_DP_SIZE", - "VLLM_TEST_STANDALONE_COMPILE", + "VLLM_USE_STANDALONE_COMPILE", ] for key in environment_variables_to_hash: if key in environment_variables: -- GitLab From a1cc9f33a32eef4550daccdc76aefc1baf7bc35d Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Thu, 29 May 2025 16:00:11 -0700 Subject: [PATCH 058/274] [TPU] remove transpose ops in moe kernel (#18923) Signed-off-by: Chengji Yao --- requirements/tpu.txt | 10 +++++----- tests/tpu/test_moe_pallas.py | 2 +- vllm/model_executor/layers/fused_moe/moe_pallas.py | 9 ++------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/requirements/tpu.txt b/requirements/tpu.txt index 3b204a8f9..edc8b2a45 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -18,9 +18,9 @@ setuptools==78.1.0 --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.8.0.dev20250518 -torchvision==0.22.0.dev20250518 -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" +torch==2.8.0.dev20250529 +torchvision==0.22.0.dev20250529 +torch_xla[tpu, pallas] @ 
https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250529-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 13fc8bc8f..19df22f78 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -26,7 +26,7 @@ TOP_KS = [2, 6] # The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16 @pytest.mark.parametrize("m", [8, 16, 64, 2048]) @pytest.mark.parametrize("n", [128, 1024, 2048]) -@pytest.mark.parametrize("k", [128, 511, 1024]) +@pytest.mark.parametrize("k", [128, 512, 1024]) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("ep_size", EP_SIZE) diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 539459992..9d8bd62c6 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -67,15 +67,10 @@ def fused_moe( token_indices = token_indices[topk_argsort_indices] group_sizes = _histogram(topk_indices.to(torch.int32), 0, num_experts - 1) - # NOTE(woosuk): The GMM Pallas kernel requires a different weight layout - # from HF Transformers. 
- w1 = w1.transpose(1, 2) - w2 = w2.transpose(1, 2) - x = hidden_states[token_indices] - x = torch.ops.xla.gmm(x, w1, group_sizes) + x = torch.ops.xla.gmm(x, w1, group_sizes, transpose_rhs=True) x = F.silu(x[..., :intermediate_size]) * x[..., intermediate_size:] - x = torch.ops.xla.gmm(x, w2, group_sizes) + x = torch.ops.xla.gmm(x, w2, group_sizes, transpose_rhs=True) x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size) x = x * topk_weights.unsqueeze(dim=-1) -- GitLab From d54af615d5cf63e111e564b3a2c25405fc16b4fd Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 29 May 2025 22:13:17 -0400 Subject: [PATCH 059/274] [Bugfix] Fix PP default fallback behavior for V1 (#18915) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2a1a34211..13d8a280e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1380,7 +1380,8 @@ class EngineArgs: if (self.pipeline_parallel_size > 1 and self.distributed_executor_backend - not in ("ray", "mp", "external_launcher")): + not in (ParallelConfig.distributed_executor_backend, "ray", + "mp", "external_launcher")): name = "Pipeline Parallelism without Ray distributed executor " \ "or multiprocessing executor or external launcher" _raise_or_fallback(feature_name=name, recommend_to_remove=False) -- GitLab From 1aa2f81b43b286ec9fa66439c151d18f20a662ec Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 30 May 2025 10:17:01 +0800 Subject: [PATCH 060/274] [Misc] Update type annotation for rotary embedding `base` (#18914) Signed-off-by: DarkLight1337 --- benchmarks/kernels/benchmark_rope.py | 2 +- tests/kernels/core/test_pos_encoding.py | 6 ++-- .../model_executor/layers/rotary_embedding.py | 34 +++++++++---------- vllm/model_executor/models/minimax_text_01.py | 7 ++-- 4 files changed, 23 insertions(+), 26 deletions(-) diff --git a/benchmarks/kernels/benchmark_rope.py 
b/benchmarks/kernels/benchmark_rope.py index 110d36db1..944024ca3 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -22,7 +22,7 @@ def benchmark_rope_kernels_multi_lora( seed: int, device: str, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index f327deb0e..8cb56314c 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -70,7 +70,7 @@ def test_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: if rotary_dim is None: rotary_dim = head_size @@ -135,7 +135,7 @@ def test_batched_rotary_embedding( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) @@ -203,7 +203,7 @@ def test_batched_rotary_embedding_multi_lora( device: str, use_key: bool, max_position: int = 8192, - base: int = 10000, + base: float = 10000, ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 70463ecd9..afc059719 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -96,7 +96,7 @@ class RotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, ) -> None: @@ -113,7 +113,7 @@ class RotaryEmbedding(CustomOp): self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: 
float) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): To exactly match the HF implementation, we need to # use CPU to compute the cache and then move it to GPU. However, we @@ -404,7 +404,7 @@ class LinearScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factors: Union[list[float], float], dtype: torch.dtype, @@ -464,7 +464,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -474,7 +474,7 @@ class NTKScalingRotaryEmbedding(RotaryEmbedding): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: base = self.base * (self.scaling_factor if self.mixed_b is None else 1) inv_freq = super()._compute_inv_freq(base) @@ -501,7 +501,7 @@ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -582,7 +582,7 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: float, dtype: torch.dtype, @@ -644,7 +644,7 @@ class Phi3LongRoPEScaledRotaryEmbedding(nn.Module): rotary_dim: int, max_position_embeddings: int, original_max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, short_factor: list[float], @@ -769,7 +769,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, scaling_factor: 
float, dtype: torch.dtype, @@ -877,7 +877,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, scaling_factor: float, @@ -892,7 +892,7 @@ class Llama3RotaryEmbedding(RotaryEmbedding): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: inv_freqs = super()._compute_inv_freq(base) low_freq_wavelen = self.orig_max_position / self.low_freq_factor high_freq_wavelen = self.orig_max_position / self.high_freq_factor @@ -923,14 +923,14 @@ class Llama4VisionRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, ): super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: inv_freqs = super()._compute_inv_freq(base) inv_freqs = inv_freqs[:(self.rotary_dim // 2)] return inv_freqs @@ -989,7 +989,7 @@ class MRotaryEmbedding(RotaryEmbedding): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, mrope_section: Optional[list[int]] = None, @@ -1529,7 +1529,7 @@ class DualChunkRotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position_embeddings: int, - base: int, + base: float, is_neox_style: bool, dtype: torch.dtype, chunk_size: int, @@ -1558,7 +1558,7 @@ class DualChunkRotaryEmbedding(CustomOp): q_inter_cache, persistent=False) - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): The HF 
implementation uses `torch.arange(...).float()`. # However, we use `torch.arange(..., dtype=torch.float)` instead to @@ -1705,7 +1705,7 @@ def get_rope( head_size: int, rotary_dim: int, max_position: int, - base: int, + base: float, is_neox_style: bool = True, rope_scaling: Optional[dict[str, Any]] = None, dtype: Optional[torch.dtype] = None, diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 36bab9ee1..ac0fe7b10 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -141,7 +141,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp): head_size: int, rotary_dim: int, max_position: int, - base: int, + base: float, is_neox_style: bool, cache_dtype: torch.dtype, ) -> None: @@ -155,10 +155,7 @@ class MiniMaxText01RotaryEmbedding(CustomOp): cache = self._compute_cos_sin_cache().to(cache_dtype) self.register_buffer("cos_sin_cache", cache, persistent=False) - def _compute_inv_freq( - self, - base: Union[int, float], - ) -> torch.Tensor: + def _compute_inv_freq(self, base: float) -> torch.Tensor: """Compute the inverse frequency.""" inv_freq = 1.0 / (base**(torch.arange( 0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim)) -- GitLab From 3132290a14a66dc73c9f15ec9cd9f8909c978e11 Mon Sep 17 00:00:00 2001 From: Carol Zheng Date: Fri, 30 May 2025 02:24:19 +0000 Subject: [PATCH 061/274] [TPU][CI/CD] Clean up docker for TPU tests. (#18926) Signed-off-by: Carol Zheng --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 5dd53420d..610243145 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -2,15 +2,46 @@ set -xu -# Build the docker image. -docker build -f docker/Dockerfile.tpu -t vllm-tpu . 
-# Set up cleanup. -remove_docker_container() { docker rm -f tpu-test || true; } +remove_docker_container() { + docker rm -f tpu-test || true; + docker rm -f vllm-tpu || true; +} + trap remove_docker_container EXIT + # Remove the container that might not be cleaned up in the previous run. remove_docker_container +# Build the docker image. +docker build -f docker/Dockerfile.tpu -t vllm-tpu . + +# Set up cleanup. +cleanup_docker() { + # Get Docker's root directory + docker_root=$(docker info -f '{{.DockerRootDir}}') + if [ -z "$docker_root" ]; then + echo "Failed to determine Docker root directory." + exit 1 + fi + echo "Docker root directory: $docker_root" + # Check disk usage of the filesystem where Docker's root directory is located + disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') + # Define the threshold + threshold=70 + if [ "$disk_usage" -gt "$threshold" ]; then + echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." + # Remove dangling images (those that are not tagged and not used by any container) + docker image prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all + echo "Docker images and volumes cleanup completed." + else + echo "Disk usage is below $threshold%. No cleanup needed." + fi +} +cleanup_docker + # For HF_TOKEN. 
source /etc/environment -- GitLab From 3de3eadf5b1c271ccd7140526ffb3f850d6b0189 Mon Sep 17 00:00:00 2001 From: Wenhua Cheng Date: Fri, 30 May 2025 10:24:47 +0800 Subject: [PATCH 062/274] improve the robustness of parsing vlms config in AutoRound (#18894) Signed-off-by: wenhuach21 --- vllm/model_executor/layers/quantization/auto_round.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 2d9f5e52b..eb8ffa378 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -116,8 +116,9 @@ class AutoRoundConfig(QuantizationConfig): quantized = True if self.block_name_to_quantize: - quantized = any(name in layer_name - for name in self.block_name_to_quantize) + quantized = any( + layer_name.startswith(name) + for name in self.block_name_to_quantize) elif isinstance(layer, ParallelLMHead): quantized = False -- GitLab From 77164dad5e7973163d1563f0854644c2864700be Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 30 May 2025 12:44:43 +0800 Subject: [PATCH 063/274] [Bugfix] Consistent ascii handling in tool parsers (#18883) Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/serving_chat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 6a0e3b14d..ea8e187dc 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -988,7 +988,8 @@ class OpenAIServingChat(OpenAIServing): tool_calls=[ tool_call_class(function=FunctionCall( name=tool_call.name, - arguments=json.dumps(tool_call.parameters))) + arguments=json.dumps(tool_call.parameters, + ensure_ascii=False))) for tool_call in tool_calls ]) -- GitLab From 3987e2ae963963b9edb132935deabd16dd5a7468 Mon Sep 17 00:00:00 2001 From: iLeGend Date: Fri, 30 May 2025 12:50:10 +0800 
Subject: [PATCH 064/274] [Model] Use AutoWeightsLoader for mamba2 (#18918) Signed-off-by: iLeGend <824040212@qq.com> --- vllm/model_executor/models/mamba2.py | 43 ++++++++++++++++------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 858a1633b..65c6467bc 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -32,7 +32,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.utils import LayerBlockType -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -167,6 +167,27 @@ class Mamba2Model(nn.Module): return hidden_states + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "A_log" in name: + name = name.replace("A_log", "A") + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, SupportsV0Only): @@ -282,21 +303,5 @@ class Mamba2ForCausalLM(nn.Module, HasInnerState, IsAttentionFree, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "A_log" in name: - name = name.replace("A_log", "A") - - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) -- GitLab From 5acf828d9932fb109f18b9790d0bca011a9ed868 Mon Sep 17 00:00:00 2001 From: H Date: Thu, 29 May 2025 22:20:48 -0700 Subject: [PATCH 065/274] [docs] fix: fix markdown syntax (#18927) --- docs/design/arch_overview.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/design/arch_overview.md b/docs/design/arch_overview.md index 75d3e1b7c..14720a392 100644 --- a/docs/design/arch_overview.md +++ b/docs/design/arch_overview.md @@ -48,8 +48,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -More API details can be found in the [Offline Inference] -(#offline-inference-api) section of the API docs. +More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. The code for the `LLM` class can be found in . -- GitLab From 77b6e74fe2b6259b2ad4dbca55eca50017a68926 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Fri, 30 May 2025 13:33:17 +0800 Subject: [PATCH 066/274] [ROCm] Remove unnecessary assertion of max_model_len in ROCM_AITER_MLA attention backend. 
(#18938) Signed-off-by: vllmellm --- vllm/attention/backends/rocm_aiter_mla.py | 2 -- vllm/v1/attention/backends/mla/rocm_aiter_mla.py | 3 --- 2 files changed, 5 deletions(-) diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index b04822002..c974f2a15 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -132,8 +132,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): super().__init__(input_builder) - assert self.runner.model_config.max_model_len == 32768,\ - "AITER MLA requires max model len to be set to 32768" assert self.block_size == 1, "AITER MLA requires only block size 1." def prepare(self): diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 31980e94a..d1e823bbe 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -66,9 +66,6 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): def __init__(self, runner, kv_cache_spec: AttentionSpec, block_table: BlockTable): super().__init__(runner, kv_cache_spec, block_table) - max_model_len = self.runner.model_config.max_model_len - assert max_model_len == 32768,\ - "AITER MLA requires max_model_len=32768" assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." 
-- GitLab From 4d0a1541bed0a89f71c55b0714e18422688bec53 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 30 May 2025 01:37:36 -0400 Subject: [PATCH 067/274] [Bugfix] Remove NVFP4 scales assertions to fix load_format=dummy (#18861) Signed-off-by: mgoin --- vllm/model_executor/layers/quantization/modelopt.py | 8 +++++--- .../layers/quantization/utils/marlin_utils_fp4.py | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 1c5680f95..2abe16a08 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -585,9 +585,11 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # GEMM 1 - assert torch.allclose( - layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]), ( - "w1_weight_scale_2 must match w3_weight_scale_2") + if not torch.allclose(layer.w13_weight_scale_2[:, 0], + layer.w13_weight_scale_2[:, 1]): + logger.warning_once( + "w1_weight_scale_2 must match w3_weight_scale_2. " + "Accuracy may be affected.") w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0] layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 15177af58..13dcdc00a 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -22,7 +22,12 @@ def is_fp4_marlin_supported(): def fp4_marlin_process_scales(marlin_scales): - assert (marlin_scales >= 0).all() + if not (marlin_scales >= 0).all(): + logger.warning_once( + "NVFP4 Marlin assumes the scales to be >=0, but has encountered " + "negative scales. Accuracy will likely be degraded. 
This is " + "because it changes the scales from FP8-S1E4M3 to a special " + "FP8-S0E5M3 format to speedup the dequantization.") # convert to half first, we would convert to fp8 later marlin_scales = marlin_scales.to(torch.half) -- GitLab From 4f4a6b844a6347a539b9dd87c39f557d91981988 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 30 May 2025 14:53:37 +0800 Subject: [PATCH 068/274] [Deprecation] Remove mean pooling default for `Qwen2EmbeddingModel` (#18913) Signed-off-by: DarkLight1337 --- docs/models/supported_models.md | 2 +- vllm/model_executor/models/qwen2.py | 77 +------------------------- vllm/model_executor/models/registry.py | 2 +- 3 files changed, 5 insertions(+), 76 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0202ba5a6..6b0ceaf21 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -401,7 +401,7 @@ Specified using `--task embed`. !!! note `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. - You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. + You need to manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. !!! note For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. 
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 0d0d98c59..a664864ff 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -34,32 +34,27 @@ from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - extract_layer_index, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -logger = init_logger(__name__) - class Qwen2MLP(nn.Module): @@ -499,69 +494,3 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): if self.config.tie_word_embeddings else None), ) 
return loader.load_weights(weights) - - -class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = Qwen2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - # TODO: Replace this model class with as_embedding_model( - # Qwen2ForCausalLM) after changing the default pooling method - if pooler_config.pooling_type is None: - logger.warning( - "This embedding model will default to last-token pooling in " - "an upcoming version. 
To avoid breaking changes, you should " - "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`" - " explicitly.") - - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.MEAN, - normalize=True, - softmax=False) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> torch.Tensor: - return self.model(input_ids, positions, intermediate_tensors) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - weights = self.hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 97ea12de6..5a6a12fcc 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -142,7 +142,7 @@ _EMBEDDING_MODELS = { "ModernBertModel": ("modernbert", "ModernBertModel"), "NomicBertModel": ("bert_with_rope", "NomicBertModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), + "Qwen2Model": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), -- GitLab From 6acb7a62855a037a9cb50344e692ca23ed8782ea Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Fri, 30 May 2025 13:28:04 +0530 Subject: [PATCH 069/274] [Misc]Fix benchmarks/README.md for speculative decoding (#18897) Signed-off-by: rabi --- benchmarks/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md 
b/benchmarks/README.md index ecab570bb..cbf2f281b 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -146,9 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \ ``` bash VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \ - --ngram_prompt_lookup_min 2 \ - --ngram-prompt-lookup-max 5 \ - --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` bash @@ -273,9 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \ --output-len=100 \ --num-prompts=2048 \ --async-engine \ - --ngram_prompt_lookup_min=2 \ - --ngram-prompt-lookup-max=5 \ - --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5} + --speculative-config $'{"method": "ngram", + "num_speculative_tokens": 5, "prompt_lookup_max": 5, + "prompt_lookup_min": 2}' ``` ``` -- GitLab From 8f8900cee9121572d27463b7d8223fa6ef76e0ef Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 30 May 2025 15:58:44 +0800 Subject: [PATCH 070/274] [doc] add mkdocs doc (#18930) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/contributing/README.md | 53 ++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 2517436af..72c4909b1 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -29,20 +29,67 @@ See . Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source][build-from-source] documentation for details. 
-### Building the docs +### Building the docs with MkDocs -Install the dependencies: +#### Introduction to MkDocs + +[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file. + +#### Install MkDocs and Plugins + +Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies: ```bash pip install -r requirements/docs.txt ``` -Start the autoreloading MkDocs server: +> **Note:** Ensure that your Python version is compatible with the plugins (e.g., mkdocs-awesome-nav requires Python 3.10+) + +#### Verify Installation + +Confirm that MkDocs is correctly installed: + +```bash +mkdocs --version +``` + +Example output: + +```console +mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.9/site-packages/mkdocs (Python 3.9) +``` + +#### Clone the `vLLM` repository + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +``` + +#### Start the Development Server + +MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yaml` configuration file, and then start the server by running the `mkdocs serve` command: ```bash mkdocs serve ``` +Example output: + +```console +INFO - Documentation built in 106.83 seconds +INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml' +INFO - [22:02:02] Serving on http://127.0.0.1:8000/ +``` + +#### View in Your Browser + +Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview. + +#### Learn More + +For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
+ ## Testing ```bash -- GitLab From c3bb9f23315f8cdc5007349717fd58d16a833ff6 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 30 May 2025 10:12:59 +0100 Subject: [PATCH 071/274] [Model] Use in-place adds in SigLIP (#18922) Signed-off-by: Lukas Geiger --- vllm/model_executor/models/siglip.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3b5334afa..4803da295 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -130,11 +130,10 @@ class SiglipVisionEmbeddings(nn.Module): embeddings = patch_embeds.flatten(2).transpose(1, 2) if interpolate_pos_encoding: - embeddings = embeddings + self.interpolate_pos_encoding( + embeddings += self.interpolate_pos_encoding( embeddings, height, width) else: - embeddings = embeddings + self.position_embedding( - self.position_ids) + embeddings += self.position_embedding(self.position_ids) return embeddings @@ -271,12 +270,12 @@ class SiglipEncoderLayer(nn.Module): hidden_states = self.layer_norm1(hidden_states) hidden_states, _ = self.self_attn(hidden_states=hidden_states) - hidden_states = residual + hidden_states + hidden_states += residual residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states + hidden_states += residual return hidden_states, None @@ -354,7 +353,8 @@ class SiglipMultiheadAttentionPoolingHead(nn.Module): residual = hidden_state hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) + hidden_state = self.mlp(hidden_state) + hidden_state += residual return hidden_state[:, 0] -- GitLab From 5f1d0c8118b099ca0a08435f5f0312aa50f21142 Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Fri, 30 May 2025 14:43:47 +0530 Subject: [PATCH 072/274] [Bugfix][Failing Test] Fix test_vllm_port.py (#18618) Signed-off-by: rabi --- 
.buildkite/test-pipeline.yaml | 3 ++- vllm/envs.py | 18 +++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4e7bea25e..46785a8b3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -199,8 +199,9 @@ steps: - tests/test_sequence - tests/test_config - tests/test_logger + - tests/test_vllm_port commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py # OOM in the CI unless we run this separately - pytest -v -s tokenization diff --git a/vllm/envs.py b/vllm/envs.py index 785fe7309..dc52bbd8e 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -159,17 +159,13 @@ def get_vllm_port() -> Optional[int]: return int(port) except ValueError as err: from urllib.parse import urlparse - try: - parsed = urlparse(port) - if parsed.scheme: - raise ValueError( - f"VLLM_PORT '{port}' appears to be a URI. " - "This may be caused by a Kubernetes service discovery issue" - "check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html" - ) - except Exception: - pass - + parsed = urlparse(port) + if parsed.scheme: + raise ValueError( + f"VLLM_PORT '{port}' appears to be a URI. 
" + "This may be caused by a Kubernetes service discovery issue," + "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" + ) from None raise ValueError( f"VLLM_PORT '{port}' must be a valid integer") from err -- GitLab From 4577fc9abb064d74b2082ffc5005cbb82ca91766 Mon Sep 17 00:00:00 2001 From: Always-Naive <97138029+Always-Naive@users.noreply.github.com> Date: Fri, 30 May 2025 17:21:35 +0800 Subject: [PATCH 073/274] [Misc]Fix typo (#18947) --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index c1213d463..25e34446a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -107,7 +107,7 @@ STR_NOT_IMPL_ENC_DEC_LOGIT_SOFTCAP = ( "currently not supported for encoder/decoder " "models.") -STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is currently not currently " +STR_NOT_IMPL_ENC_DEC_LORA = ("LoRA is not currently " "supported with encoder/decoder " "models.") -- GitLab From fba02e3bd1775f28cb18390a9485460a9aabdeec Mon Sep 17 00:00:00 2001 From: Carol Zheng Date: Fri, 30 May 2025 10:04:03 +0000 Subject: [PATCH 074/274] [Bugfix][TPU] Fix tpu model runner testcase failure (#18810) Signed-off-by: Carol Zheng --- tests/v1/tpu/worker/test_tpu_model_runner.py | 31 ++++++++++++++--- vllm/v1/worker/tpu_model_runner.py | 35 ++++++++++++++------ 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 319b38b4c..348f12887 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -81,7 +81,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[0], + block_ids=[[0]], # block_ids should be list[list[int]] num_computed_tokens=0, lora_request=None, )) @@ -112,14 +112,35 @@ def _is_req_added(model_runner, req_id: str) -> bool: def 
_is_req_state_block_table_match(model_runner, req_id: str) -> bool: + """Check if the request state block IDs match the block table. + + This function handles both legacy BlockTable and new MultiGroupBlockTable + structures for backward compatibility. + """ + req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table + multi_group_block_table = model_runner.input_batch.block_table req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): + + # Access the first block table from MultiGroupBlockTable + # This is safe since we currently only use single KV cache groups + block_table = multi_group_block_table[0] + + # req_state.block_ids is now list[list[int]] for MultiGroupBlockTable + # Extract the first group's block IDs + if isinstance(req_state.block_ids[0], list): + # New format: list[list[int]] - extract first group + req_block_ids = req_state.block_ids[0] + else: + # Legacy format: list[int] - use directly + req_block_ids = req_state.block_ids + + if block_table.num_blocks_per_row[req_index] != len(req_block_ids): return False + num_blocks = block_table.num_blocks_per_row[req_index] - return (block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids).all() + block_table_values = block_table.block_table_np[req_index, :num_blocks] + return (block_table_values == req_block_ids).all() def test_update_states_new_request(model_runner): diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 669908cb5..c57ac3138 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -175,11 +175,21 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} - # self.input_batch: InputBatch # Persistent batch. # Request states. 
self.requests: dict[str, CachedRequestState] = {} + # Initialize input batch early to avoid AttributeError in _update_states + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=self.block_size, + ) + # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. # Sometimes the numpy op is faster so we create both. @@ -1286,16 +1296,19 @@ class TPUModelRunner(LoRAModelRunnerMixin): "Hybrid models with more than one KV cache type are not " "supported yet.") - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.model_config.get_vocab_size(), - block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. - block_size, - ) + if kv_cache_config.kv_cache_groups[ + 0].kv_cache_spec.block_size != self.block_size: + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. 
+ block_size, + ) + # Verify dtype compatibility between block_table_cpu and input_batch assert self.block_table_cpu.dtype == self.input_batch.block_table[ 0].get_cpu_tensor().dtype -- GitLab From 43ff405b9059d2991604fb35e23bf212f8f390f3 Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Fri, 30 May 2025 13:02:50 +0200 Subject: [PATCH 075/274] [CI/Build] remove regex from build dependencies (#18945) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò Co-authored-by: Cyrus Leung --- pyproject.toml | 1 - setup.py | 2 +- tools/enforce_regex_import.py | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5286724b5..10f5dbeae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ requires = [ "setuptools-scm>=8.0", "torch == 2.7.0", "wheel", - "regex", "jinja2", ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index b822a4ec3..c190864dd 100644 --- a/setup.py +++ b/setup.py @@ -5,12 +5,12 @@ import importlib.util import json import logging import os +import re import subprocess import sys from pathlib import Path from shutil import which -import regex as re import torch from packaging.version import Version, parse from setuptools import Extension, setup diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py index b55c4a94e..6c201dd25 100644 --- a/tools/enforce_regex_import.py +++ b/tools/enforce_regex_import.py @@ -58,6 +58,9 @@ def main() -> int: if not Path(filepath).exists(): continue + if filepath == "setup.py": + continue + violations = check_file(filepath) if violations: print(f"\n❌ {filepath}:") -- GitLab From e1fadf11976ecc87ed60c9fc1bf8f81271c68c18 Mon Sep 17 00:00:00 2001 From: Shawn Huang <57223022+huangyuxiang03@users.noreply.github.com> Date: Fri, 30 May 2025 21:45:56 +0800 Subject: [PATCH 076/274] [Feature] minicpm eagle support (#18943) 
Signed-off-by: huangyuxiang03 Co-authored-by: huangyuxiang03 --- tests/models/registry.py | 5 + vllm/model_executor/models/minicpm.py | 5 +- vllm/model_executor/models/minicpm_eagle.py | 390 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 4 files changed, 399 insertions(+), 2 deletions(-) create mode 100644 vllm/model_executor/models/minicpm_eagle.py diff --git a/tests/models/registry.py b/tests/models/registry.py index 18342b671..fe49d2427 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -434,6 +434,11 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = { trust_remote_code=True, speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", tokenizer="meta-llama/Llama-3.1-8B-Instruct"), + "EagleMiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-1B-sft-bf16", + trust_remote_code=True, + is_available_online=False, + speculative_model="openbmb/MiniCPM-2B-sft-bf16", + tokenizer="openbmb/MiniCPM-2B-sft-bf16"), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL") diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 0397b552c..f471a86ff 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -242,6 +242,7 @@ class MiniCPMAttention(nn.Module): base=rope_theta, rope_scaling=rope_scaling, ) + self.attn = Attention(self.num_heads, self.head_dim, self.scaling, @@ -444,6 +445,7 @@ class MiniCPMModel(nn.Module): for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -567,7 +569,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) + inputs_embeds) / self.scale_width 
return hidden_states def compute_logits( @@ -575,7 +577,6 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - hidden_states = hidden_states / self.scale_width logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) return logits diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py new file mode 100644 index 000000000..039c3d22d --- /dev/null +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -0,0 +1,390 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only EagleMiniCPM model compatible with HuggingFace weights.""" +import math +from collections.abc import Iterable +from typing import Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsLoRA, SupportsPP +from .minicpm import MiniCPMAttention as EagleMiniCPMAttention +from .minicpm import MiniCPMMLP as EagleMiniCPMMLP +from .minicpm import MiniCPMMoE as EagleMiniCPMMoE +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, maybe_prefix) + + +class EagleMiniCPMDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + self.cache_config = cache_config + self.quant_config = quant_config + self.hidden_size = config.hidden_size + self.rope_theta = getattr(config, "rope_theta", 10000) + self.rope_scaling = getattr(config, "rope_scaling", None) + self.max_position_embeddings = getattr(config, + "max_position_embeddings", 8192) + self.prefix = prefix + self._init_attn_block() + self._init_ffn_block() + + def _init_attn_block(self): + self.input_layernorm = RMSNorm(self.config.hidden_size, + 
eps=self.config.rms_norm_eps) + self.self_attn = EagleMiniCPMAttention( + hidden_size=self.hidden_size, + num_heads=self.config.num_attention_heads, + num_kv_heads=self.config.num_key_value_heads, + rope_theta=self.rope_theta, + rope_scaling=self.rope_scaling, + max_position_embeddings=self.max_position_embeddings, + cache_config=self.cache_config, + quant_config=self.quant_config, + prefix=f"{self.prefix}.self_attn", + ) + + def _init_ffn_block(self): + self.post_attention_layernorm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + self.num_experts = getattr(self.config, "num_experts", 0) + if self.num_experts == 0: + self.mlp = EagleMiniCPMMLP( + hidden_size=self.hidden_size, + intermediate_size=self.config.intermediate_size, + hidden_act=self.config.hidden_act, + hidden_act_param=getattr(self.config, "hidden_act_param", 0.), + quant_config=self.quant_config, + ) + else: + self.mlp = EagleMiniCPMMoE( + num_experts=self.config.num_experts, + top_k=self.config.num_experts_per_tok, + hidden_size=self.config.hidden_size, + intermediate_size=self.config.intermediate_size) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + # Self Attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + hidden_states = residual + hidden_states * \ + (self.config.scale_depth / math.sqrt(self.config.mup_denominator)) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states * \ + (self.config.scale_depth / math.sqrt(self.config.mup_denominator)) + + return hidden_states, None + + +@support_torch_compile +class EagleMiniCPMModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + 
prefix: str = "", + start_layer: int = 0): + super().__init__() + + config = vllm_config.speculative_config.draft_model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.cache_config = cache_config + self.quant_config = quant_config + lora_vocab = (lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0 + self.vocab_size = config.vocab_size + lora_vocab + self.org_vocab_size = config.vocab_size + self.fc = torch.nn.Linear(self.config.hidden_size * 2, + self.config.hidden_size, + bias=False) + self.input_norm1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.input_norm2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + ) + self.num_experts = getattr(self.config, "num_experts", 0) + self._init_layers(prefix, config, cache_config, quant_config, + start_layer) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], self.config.hidden_size)) + + def _init_layers( + self, + prefix: str, + config: PretrainedConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + start_layer: int, + ): + self.eagle_layers = nn.ModuleList([ + EagleMiniCPMDecoderLayer( + config, + cache_config, + quant_config, + f"{prefix}.eagle_layers.{i + start_layer}", + ) for i in range(self.config.num_hidden_layers) + ]) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + embedding = self.embed_tokens(input_ids) + return embedding * self.config.scale_emb + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> Union[torch.Tensor, IntermediateTensors]: + 
input_embeds = self.get_input_embeddings(input_ids) + input_embeds = self.input_norm1(input_embeds) + hidden_states = self.input_norm2(hidden_states) + + hidden_states = self.fc( + torch.cat((input_embeds, hidden_states), dim=-1)) + residual = None + for layer in self.eagle_layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + return hidden_states, hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.num_experts) + for weight_name in ["w1", "w2", "w3"] + ] + params_dict = dict(self.named_parameters()) + + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class EagleMiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.speculative_config.draft_model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.prefix = prefix + self.vllm_config = vllm_config + self.config = config + self.lora_config = lora_config + self.cache_config = cache_config + self.quant_config = quant_config + + target_layer_num = vllm_config.model_config.get_num_layers( + vllm_config.parallel_config) + + self.model = self._init_model(vllm_config=vllm_config, + 
prefix=maybe_prefix(prefix, "model"), + start_layer=target_layer_num) + + unpadded_vocab_size = config.vocab_size + if lora_config: + unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + self.scale_width = self.config.hidden_size / self.config.dim_model_base + + self.logits_processor = LogitsProcessor(unpadded_vocab_size, + config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def _init_model(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + start_layer: int = 0): + return EagleMiniCPMModel(vllm_config=vllm_config, + prefix=prefix, + start_layer=start_layer) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + hidden_states, hidden_states2 = self.model(input_ids, positions, + hidden_states) + hidden_states = hidden_states / self.scale_width + hidden_states2 = hidden_states2 / self.scale_width + return hidden_states, hidden_states2 + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings 
else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 5a6a12fcc..8efd4825b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -223,6 +223,7 @@ _SPECULATIVE_DECODING_MODELS = { "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"), "EAGLEModel": ("eagle", "EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), + "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), -- GitLab From ec6833c5e9f12edf9f08d4d2c226ac43e8089e7a Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Fri, 30 May 2025 21:45:59 +0800 Subject: [PATCH 077/274] [doc] show the count for fork and watch (#18950) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/README.md b/docs/README.md index 57b1d03de..0c6aff5fa 100644 --- a/docs/README.md +++ b/docs/README.md @@ -12,8 +12,8 @@

Star -Watch -Fork +Watch +Fork

vLLM is a fast and easy-to-use library for LLM inference and serving. -- GitLab From b29ca5c4d51ca0129485ed9546d4f94a6a8763a9 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 30 May 2025 10:37:27 -0400 Subject: [PATCH 078/274] [Docs] Update SECURITY.md with link to our security guide (#18961) Signed-off-by: Russell Bryant --- SECURITY.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/SECURITY.md b/SECURITY.md index 47196a1f1..6053cfb41 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form --- +Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations. + Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. -- GitLab From 84ec470fca4f3c1873863086a420a40206f1bf01 Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Fri, 30 May 2025 11:00:54 -0400 Subject: [PATCH 079/274] Improve "failed to get the hash of the compiled graph" error (#18956) Signed-off-by: rzou --- vllm/compilation/compiler_interface.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 8fa8ce279..9293610cc 100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -415,8 +415,14 @@ class InductorAdaptor(CompilerInterface): # compilation cache. So turn off the checks if we disable the # compilation cache. if not envs.VLLM_DISABLE_COMPILE_CACHE: - assert hash_str is not None, ( - "failed to get the hash of the compiled graph") + if hash_str is None: + raise RuntimeError( + "vLLM failed to compile the model. 
The most " + "likely reason for this is that a previous compilation " + "failed, leading to a corrupted compilation artifact. " + "We recommend trying to " + "remove ~/.cache/vllm/torch_compile_cache and try again " + "to see the real issue. ") assert file_path is not None, ( "failed to get the file path of the compiled graph") return compiled_graph, (hash_str, file_path) -- GitLab From 2dbe8c07744cd5b7531c191a734a613f8b797e65 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 30 May 2025 08:17:00 -0700 Subject: [PATCH 080/274] [Perf] API-server scaleout with many-to-many server-engine comms (#17546) --- .buildkite/test-pipeline.yaml | 2 + .../test_api_server_process_manager.py | 268 ++++++++++ tests/utils.py | 5 +- tests/v1/core/test_kv_cache_utils.py | 1 - tests/v1/core/test_prefix_caching.py | 1 - tests/v1/core/test_scheduler.py | 9 +- tests/v1/engine/test_engine_core.py | 14 +- .../openai/test_multi_api_servers.py | 171 +++++++ .../unit/test_remote_decode_lifecycle.py | 4 +- .../unit/test_remote_prefill_lifecycle.py | 6 +- tests/v1/kv_connector/unit/utils.py | 1 - vllm/entrypoints/cli/serve.py | 179 ++++++- vllm/entrypoints/openai/api_server.py | 102 ++-- vllm/lora/worker_manager.py | 5 + vllm/utils.py | 6 +- vllm/v1/core/sched/interface.py | 10 +- vllm/v1/core/sched/scheduler.py | 53 +- vllm/v1/engine/__init__.py | 8 +- vllm/v1/engine/async_llm.py | 14 +- vllm/v1/engine/coordinator.py | 252 ++++++++++ vllm/v1/engine/core.py | 253 ++++++---- vllm/v1/engine/core_client.py | 456 ++++++++++-------- vllm/v1/metrics/loggers.py | 79 +-- vllm/v1/metrics/prometheus.py | 77 +++ vllm/v1/request.py | 5 +- vllm/v1/utils.py | 301 +++++++++++- 26 files changed, 1837 insertions(+), 445 deletions(-) create mode 100644 tests/entrypoints/test_api_server_process_manager.py create mode 100644 tests/v1/entrypoints/openai/test_multi_api_servers.py create mode 100644 vllm/v1/engine/coordinator.py create mode 100644 vllm/v1/metrics/prometheus.py diff --git 
a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 46785a8b3..bff2f69c1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -618,9 +618,11 @@ steps: - vllm/worker/model_runner.py - entrypoints/llm/test_collective_rpc.py - tests/v1/test_async_llm_dp.py + - tests/v1/entrypoints/openai/test_multi_api_servers.py - vllm/v1/engine/ commands: - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/llm/test_collective_rpc.py - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py new file mode 100644 index 000000000..0dd1fdd99 --- /dev/null +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -0,0 +1,268 @@ +# SPDX-License-Identifier: Apache-2.0 + +import multiprocessing +import socket +import threading +import time +from typing import Optional +from unittest.mock import patch + +import pytest + +from vllm.v1.utils import (APIServerProcessManager, + wait_for_completion_or_failure) + +# Global variables to control worker behavior +WORKER_RUNTIME_SECONDS = 0.5 + + +# Mock implementation of run_api_server_worker +def mock_run_api_server_worker(listen_address, sock, args, client_config=None): + """Mock run_api_server_worker that runs for a specific time.""" + print(f"Mock worker started with client_config: {client_config}") + time.sleep(WORKER_RUNTIME_SECONDS) + print("Mock worker completed successfully") + + +@pytest.fixture +def api_server_args(): + """Fixture to provide arguments for APIServerProcessManager.""" + sock = socket.socket() + return { + "target_server_fn": + mock_run_api_server_worker, + "listen_address": + "localhost:8000", + "sock": + sock, + "args": + "test_args", # Simple string to avoid pickling issues + "num_servers": + 3, + 
"input_addresses": [ + "tcp://127.0.0.1:5001", "tcp://127.0.0.1:5002", + "tcp://127.0.0.1:5003" + ], + "output_addresses": [ + "tcp://127.0.0.1:6001", "tcp://127.0.0.1:6002", + "tcp://127.0.0.1:6003" + ], + "stats_update_address": + "tcp://127.0.0.1:7000", + } + + +@pytest.mark.parametrize("with_stats_update", [True, False]) +def test_api_server_process_manager_init(api_server_args, with_stats_update): + """Test initializing the APIServerProcessManager.""" + # Set the worker runtime to ensure tests complete in reasonable time + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 0.5 + + # Copy the args to avoid mutating the + args = api_server_args.copy() + + if not with_stats_update: + args.pop("stats_update_address") + manager = APIServerProcessManager(**args) + + try: + # Verify the manager was initialized correctly + assert len(manager.processes) == 3 + + # Verify all processes are running + for proc in manager.processes: + assert proc.is_alive() + + print("Waiting for processes to run...") + time.sleep(WORKER_RUNTIME_SECONDS / 2) + + # They should still be alive at this point + for proc in manager.processes: + assert proc.is_alive() + + finally: + # Always clean up the processes + print("Cleaning up processes...") + manager.close() + + # Give processes time to terminate + time.sleep(0.2) + + # Verify all processes were terminated + for proc in manager.processes: + assert not proc.is_alive() + + +@patch("vllm.entrypoints.cli.serve.run_api_server_worker", + mock_run_api_server_worker) +def test_wait_for_completion_or_failure(api_server_args): + """Test that wait_for_completion_or_failure works with failures.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 1.0 + + # Create the manager + manager = APIServerProcessManager(**api_server_args) + + try: + assert len(manager.processes) == 3 + + # Create a result capture for the thread + result: dict[str, Optional[Exception]] = {"exception": None} + + def run_with_exception_capture(): + try: + 
wait_for_completion_or_failure(api_server_manager=manager) + except Exception as e: + result["exception"] = e + + # Start a thread to run wait_for_completion_or_failure + wait_thread = threading.Thread(target=run_with_exception_capture, + daemon=True) + wait_thread.start() + + # Let all processes run for a short time + time.sleep(0.2) + + # All processes should still be running + assert all(proc.is_alive() for proc in manager.processes) + + # Now simulate a process failure + print("Simulating process failure...") + manager.processes[0].terminate() + + # Wait for the wait_for_completion_or_failure + # to detect and handle the failure + # This should trigger it to terminate all other processes + wait_thread.join(timeout=1.0) + + # The wait thread should have exited + assert not wait_thread.is_alive() + + # Verify that an exception was raised with appropriate error message + assert result["exception"] is not None + assert "died with exit code" in str(result["exception"]) + + # All processes should now be terminated + for i, proc in enumerate(manager.processes): + assert not proc.is_alive(), f"Process {i} should not be alive" + + finally: + manager.close() + time.sleep(0.2) + + +@pytest.mark.timeout(30) +def test_normal_completion(api_server_args): + """Test that wait_for_completion_or_failure works in normal completion.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 0.1 + + # Create the manager + manager = APIServerProcessManager(**api_server_args) + + try: + # Give processes time to terminate + # wait for processes to complete + remaining_processes = manager.processes.copy() + while remaining_processes: + for proc in remaining_processes: + if not proc.is_alive(): + remaining_processes.remove(proc) + time.sleep(0.1) + + # Verify all processes have terminated + for i, proc in enumerate(manager.processes): + assert not proc.is_alive( + ), f"Process {i} still alive after terminate()" + + # Now call wait_for_completion_or_failure + # since all processes 
have already + # terminated, it should return immediately + # with no error + wait_for_completion_or_failure(api_server_manager=manager) + + finally: + # Clean up just in case + manager.close() + time.sleep(0.2) + + +@pytest.mark.timeout(30) +def test_external_process_monitoring(api_server_args): + """Test that wait_for_completion_or_failure handles additional processes.""" + global WORKER_RUNTIME_SECONDS + WORKER_RUNTIME_SECONDS = 100 + + # Create and start the external process + # (simulates local_engine_manager or coordinator) + spawn_context = multiprocessing.get_context("spawn") + external_proc = spawn_context.Process(target=mock_run_api_server_worker, + name="MockExternalProcess") + external_proc.start() + + # Create the class to simulate a coordinator + class MockCoordinator: + + def __init__(self, proc): + self.proc = proc + + def close(self): + if self.proc.is_alive(): + self.proc.terminate() + self.proc.join(timeout=0.5) + + # Create a mock coordinator with the external process + mock_coordinator = MockCoordinator(external_proc) + + # Create the API server manager + manager = APIServerProcessManager(**api_server_args) + + try: + # Verify manager initialization + assert len(manager.processes) == 3 + + # Create a result capture for the thread + result: dict[str, Optional[Exception]] = {"exception": None} + + def run_with_exception_capture(): + try: + wait_for_completion_or_failure(api_server_manager=manager, + coordinator=mock_coordinator) + except Exception as e: + result["exception"] = e + + # Start a thread to run wait_for_completion_or_failure + wait_thread = threading.Thread(target=run_with_exception_capture, + daemon=True) + wait_thread.start() + + # Terminate the external process to trigger a failure + time.sleep(0.2) + external_proc.terminate() + + # Wait for the thread to detect the failure + wait_thread.join(timeout=1.0) + + # The wait thread should have completed + assert not wait_thread.is_alive( + ), "wait_for_completion_or_failure thread still 
running" + + # Verify that an exception was raised with appropriate error message + assert result["exception"] is not None, "No exception was raised" + error_message = str(result["exception"]) + assert "died with exit code" in error_message, \ + f"Unexpected error message: {error_message}" + assert "MockExternalProcess" in error_message, \ + f"Error doesn't mention external process: {error_message}" + + # Verify that all API server processes were terminated as a result + for i, proc in enumerate(manager.processes): + assert not proc.is_alive( + ), f"API server process {i} was not terminated" + + finally: + # Clean up + manager.close() + mock_coordinator.close() + time.sleep(0.2) diff --git a/tests/utils.py b/tests/utils.py index bf38d7843..d21b18470 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -28,7 +28,7 @@ from tests.models.utils import TextTextLogprobs from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.openai.cli_args import make_arg_parser +from vllm.entrypoints.cli.serve import ServeSubcommand from vllm.model_executor.model_loader import get_model_loader from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer @@ -99,7 +99,8 @@ class RemoteOpenAIServer: parser = FlexibleArgumentParser( description="vLLM's remote OpenAI server.") - parser = make_arg_parser(parser) + subparsers = parser.add_subparsers(required=False, dest="subparser") + parser = ServeSubcommand().subparser_init(subparsers) args = parser.parse_args(["--model", model, *vllm_serve_args]) self.host = str(args.host or 'localhost') self.port = int(args.port) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 43a27da2d..d3d62cf09 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -45,7 +45,6 @@ def make_request(request_id, 
multi_modal_placeholders=mm_positions, sampling_params=SamplingParams(max_tokens=17), eos_token_id=100, - arrival_time=0, lora_request=None, cache_salt=cache_salt, ) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 3da27786b..ba3c0b3cf 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -38,7 +38,6 @@ def make_request(request_id, sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), eos_token_id=100, - arrival_time=0, lora_request=None, cache_salt=cache_salt, ) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f40d477a0..f38454b1b 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -138,7 +138,6 @@ def create_requests(num_requests: int, multi_modal_placeholders=mm_position, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, - arrival_time=0, ) requests.append(request) return requests @@ -744,7 +743,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i]) # No draft or accepted tokens counted yet - assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None + assert not engine_core_outputs or ( + engine_core_outputs[0].scheduler_stats.spec_decoding_stats is None) # Schedule the speculated tokens for validation output = scheduler.schedule() @@ -772,7 +772,8 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): engine_core_outputs = scheduler.update_from_output(output, model_runner_output) - scheduler_stats = engine_core_outputs.scheduler_stats + scheduler_stats = engine_core_outputs[0].scheduler_stats \ + if engine_core_outputs else None if expected[0] == 0: assert scheduler_stats.spec_decoding_stats is None else: @@ -843,7 +844,7 @@ def _step_until_done( # We should be in the decode phase now. 
assert num_scheduled_tokens == 1 assert len(output.kv_connector_metadata.requests) == 0 - ecos = scheduler.update_from_output(output, model_runner_output) + ecos = scheduler.update_from_output(output, model_runner_output)[0] all_done = True for eco in ecos.outputs: if eco.finish_reason is None: diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index ae1d8a762..e78c7480a 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -88,7 +88,7 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.running) == 4 # Loop through until they are all done. - while len(engine_core.step()[0].outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 @@ -163,11 +163,11 @@ def test_engine_core(monkeypatch: pytest.MonkeyPatch): req0.request_id = req1.request_id = "test" engine_core.add_request(req0) - while len(engine_core.step()[0].outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass engine_core.add_request(req1) - while len(engine_core.step()[0].outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 @@ -207,7 +207,7 @@ def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch): assert len(engine_core.scheduler.waiting) == 1 assert len(engine_core.scheduler.running) == 0 # Loop through until they are all done. - while len(engine_core.step()[0].outputs) > 0: + while (outs := engine_core.step()[0].get(0)) and outs.outputs: pass assert len(engine_core.scheduler.waiting) == 0 assert len(engine_core.scheduler.running) == 0 @@ -327,7 +327,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[1] == 4 # Batch queue is full. Finish Batch 2. Get first token of req0. 
- output = engine_core.step_with_batch_queue()[0] + output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req0.request_id].num_tokens == 13 @@ -339,7 +339,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): assert scheduler_output.num_scheduled_tokens[0] == 1 # Batch queue is full. Finish Batch 3. Get first token of req1. - output = engine_core.step_with_batch_queue()[0] + output = engine_core.step_with_batch_queue()[0].get(0) assert output is not None assert len(output.outputs) == 1 assert engine_core.scheduler.requests[req1.request_id].num_tokens == 13 @@ -362,7 +362,7 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch): if step % 2 == 0: # Even steps consumes an output. assert output is not None - assert len(output.outputs) == 1 + assert len(output[0].outputs) == 1 if req_id in engine_core.scheduler.requests: assert engine_core.scheduler.requests[ req_id].num_tokens == expected_num_tokens[req_id] diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py new file mode 100644 index 000000000..7b4583bc3 --- /dev/null +++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py @@ -0,0 +1,171 @@ +# SPDX-License-Identifier: Apache-2.0 +import asyncio +import os + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "ibm-research/PowerMoE-3b" + +DP_SIZE = os.getenv("DP_SIZE", "1") + + +@pytest.fixture(scope="module") +def default_server_args(): + return [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "128", + "--enforce-eager", + "--api-server-count", + "4", + "--data_parallel_size", + DP_SIZE, + ] + + +@pytest.fixture(scope="module") +def 
server(default_server_args): + with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_single_completion(client: openai.AsyncOpenAI, + model_name: str) -> None: + + async def make_request(): + completion = await client.completions.create( + model=model_name, + prompt="Hello, my name is", + max_tokens=10, + temperature=1.0) + + assert completion.id is not None + assert completion.choices is not None and len(completion.choices) == 1 + + choice = completion.choices[0] + # The exact number of tokens can vary slightly with temperature=1.0, + # so we check for a reasonable minimum length. + assert len(choice.text) >= 1 + # Finish reason might not always be 'length' if the model finishes early + # or due to other reasons, especially with high temperature. + # So, we'll accept 'length' or 'stop'. + assert choice.finish_reason in ("length", "stop") + + # Token counts can also vary, so we check they are positive. 
+ assert completion.usage.completion_tokens > 0 + assert completion.usage.prompt_tokens > 0 + assert completion.usage.total_tokens > 0 + return completion + + # Test single request + result = await make_request() + assert result is not None + + await asyncio.sleep(0.5) + + # Send two bursts of requests + num_requests = 100 + tasks = [make_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + await asyncio.sleep(0.5) + + tasks = [make_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + assert len(results) == num_requests + assert all(completion is not None for completion in results) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_completion_streaming(client: openai.AsyncOpenAI, + model_name: str) -> None: + prompt = "What is an LLM?" + + async def make_streaming_request(): + # Perform a non-streaming request to get the expected full output + single_completion = await client.completions.create( + model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + ) + single_output = single_completion.choices[0].text + + # Perform the streaming request + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True) + chunks: list[str] = [] + finish_reason_count = 0 + last_chunk = None + async for chunk in stream: + chunks.append(chunk.choices[0].text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + last_chunk = chunk # Keep track of the last chunk + + # finish reason should only return in the last block for OpenAI API + assert finish_reason_count == 1, ( + "Finish reason should appear exactly once.") + assert last_chunk is not None, ( + "Stream should have yielded at least one chunk.") + assert last_chunk.choices[ + 0].finish_reason == "length", "Finish 
reason should be 'length'." + # Check that the combined text matches the non-streamed version. + assert "".join( + chunks + ) == single_output, "Streamed output should match non-streamed output." + return True # Indicate success for this request + + # Test single request + result = await make_streaming_request() + assert result is not None + + await asyncio.sleep(0.5) + + # Send two bursts of requests + num_requests = 100 + tasks = [make_streaming_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + + assert len( + results + ) == num_requests, f"Expected {num_requests} results, got {len(results)}" + assert all(results), "Not all streaming requests completed successfully." + + await asyncio.sleep(0.5) + + tasks = [make_streaming_request() for _ in range(num_requests)] + results = await asyncio.gather(*tasks) + + assert len( + results + ) == num_requests, f"Expected {num_requests} results, got {len(results)}" + assert all(results), "Not all streaming requests completed successfully." diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index 770981403..dc963251c 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -43,7 +43,7 @@ def test_basic_lifecycle(): # Ensure the request is finished after 1 tokens. 
assert request.is_finished() assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED - output = engine_core_outputs.outputs[0] + output = engine_core_outputs[0].outputs[0] assert output.finish_reason == FinishReason.LENGTH assert output.kv_transfer_params is not None @@ -165,7 +165,7 @@ def test_prefix_cache_lifecycle(): scheduler_output = scheduler.schedule() model_runner_output = create_model_runner_output(reqs=[request_remote]) eco = scheduler.update_from_output(scheduler_output, model_runner_output) - kv_transfer_params = eco.outputs[0].kv_transfer_params + kv_transfer_params = eco[0].outputs[0].kv_transfer_params # Ensure we send all block ids, even if there is a cache hit. assert (len( diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 6fcff0d62..86eacb693 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -61,7 +61,7 @@ def test_basic_lifecycle(): # (1c): update_from_output() engine_core_outputs = scheduler.update_from_output(scheduler_output, model_runner_output) - assert len(engine_core_outputs.outputs) == 0 + assert not engine_core_outputs or not engine_core_outputs[0].outputs # STEP (2): # (2a): schedule(): nothing happens! 
@@ -112,7 +112,7 @@ def test_basic_lifecycle(): model_runner_output) scheduler.schedule() - outputs = engine_core_outputs.outputs + outputs = engine_core_outputs[0].outputs assert len(outputs) == 1 output = outputs[0] assert output.finish_reason == FinishReason.STOP @@ -335,7 +335,7 @@ def test_full_block_prompt(): model_runner_output) scheduler.schedule() - outputs = engine_core_outputs.outputs + outputs = engine_core_outputs[0].outputs assert len(outputs) == 1 output = outputs[0] assert output.finish_reason == FinishReason.STOP diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 53e2d6fda..3c3190b32 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -153,7 +153,6 @@ def create_request( multi_modal_placeholders=None, multi_modal_hashes=None, eos_token_id=EOS_TOKEN_ID, - arrival_time=0, ) req.kv_transfer_params = kv_transfer_params return req diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 957fec290..e65c97073 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,24 +1,35 @@ # SPDX-License-Identifier: Apache-2.0 import argparse +import os import signal +import sys import uvloop +import zmq import vllm.envs as envs from vllm import AsyncEngineArgs from vllm.entrypoints.cli.types import CLISubcommand -from vllm.entrypoints.openai.api_server import run_server +from vllm.entrypoints.openai.api_server import (run_server, run_server_worker, + setup_server) from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG, show_filtered_argument_or_group_from_help) +from vllm.executor.multiproc_worker_utils import _add_prefix from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_tcp_uri +from vllm.utils import FlexibleArgumentParser, get_tcp_uri, zmq_socket_ctx 
+from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCoreProc from vllm.v1.engine.core_client import CoreEngineProcManager from vllm.v1.executor.abstract import Executor +from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus +from vllm.v1.utils import (APIServerProcessManager, CoreEngine, + EngineZmqAddresses, get_engine_client_zmq_addr, + wait_for_completion_or_failure, + wait_for_engine_startup) logger = init_logger(__name__) @@ -36,9 +47,12 @@ class ServeSubcommand(CLISubcommand): if hasattr(args, 'model_tag') and args.model_tag is not None: args.model = args.model_tag - if args.headless: + if args.headless or args.api_server_count < 1: run_headless(args) + elif args.api_server_count > 1: + run_multi_api_server(args) else: + # Single API server (this process). uvloop.run(run_server(args)) def validate(self, args: argparse.Namespace) -> None: @@ -69,6 +83,11 @@ class ServeSubcommand(CLISubcommand): type=int, default=0, help='Starting data parallel rank for secondary nodes.') + serve_parser.add_argument('--api-server-count', + '-asc', + type=int, + default=1, + help='How many API server processes to run.') serve_parser.add_argument( "--config", type=str, @@ -91,23 +110,26 @@ def cmd_init() -> list[CLISubcommand]: def run_headless(args: argparse.Namespace): + if args.api_server_count > 1: + raise ValueError("api_server_count can't be set in headless mode") + # Create the EngineConfig. 
engine_args = AsyncEngineArgs.from_cli_args(args) usage_context = UsageContext.OPENAI_API_SERVER vllm_config = engine_args.create_engine_config(usage_context=usage_context) if not envs.VLLM_USE_V1: - raise RuntimeError("Headless mode is only supported for V1") + raise ValueError("Headless mode is only supported for V1") parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local host = parallel_config.data_parallel_master_ip port = engine_args.data_parallel_rpc_port # add to config too - input_address = get_tcp_uri(host, port) + handshake_address = get_tcp_uri(host, port) if local_engine_count <= 0: - raise RuntimeError("data_parallel_size_local must be > 0 in " - "headless mode") + raise ValueError("data_parallel_size_local must be > 0 in " + "headless mode") # Catch SIGTERM and SIGINT to allow graceful shutdown. def signal_handler(signum, frame): @@ -119,7 +141,7 @@ def run_headless(args: argparse.Namespace): logger.info( "Launching %d data parallel engine(s) in headless mode, " - "with head node address %s.", local_engine_count, input_address) + "with head node address %s.", local_engine_count, handshake_address) # Create the engines. 
engine_manager = CoreEngineProcManager( @@ -129,7 +151,7 @@ def run_headless(args: argparse.Namespace): local_start_index=0, vllm_config=vllm_config, on_head_node=False, - input_address=input_address, + handshake_address=handshake_address, executor_class=Executor.get_class(vllm_config), log_stats=not engine_args.disable_log_stats, ) @@ -139,3 +161,142 @@ def run_headless(args: argparse.Namespace): finally: logger.info("Shutting down.") engine_manager.close() + + +def run_multi_api_server(args: argparse.Namespace): + + assert not args.headless + num_api_servers = args.api_server_count + assert num_api_servers > 0 + + if num_api_servers > 1: + setup_multiprocess_prometheus() + + listen_address, sock = setup_server(args) + + engine_args = AsyncEngineArgs.from_cli_args(args) + usage_context = UsageContext.OPENAI_API_SERVER + vllm_config = engine_args.create_engine_config(usage_context=usage_context) + model_config = vllm_config.model_config + + if num_api_servers > 1: + if not envs.VLLM_USE_V1: + raise ValueError("api_server_count > 1 is only supported for V1") + + if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: + raise ValueError("VLLM_ALLOW_RUNTIME_LORA_UPDATING cannot be used " + "with api_server_count > 1") + + if model_config.is_multimodal_model and not ( + model_config.disable_mm_preprocessor_cache): + logger.warning( + "Multi-model preprocessor cache will be disabled for" + " api_server_count > 1") + model_config.disable_mm_preprocessor_cache = True + + parallel_config = vllm_config.parallel_config + + assert parallel_config.data_parallel_rank == 0 + + dp_size = parallel_config.data_parallel_size + local_engine_count = parallel_config.data_parallel_size_local + host = parallel_config.data_parallel_master_ip + local_only = local_engine_count == dp_size + + # Set up input and output addresses. 
+ input_addresses = [ + get_engine_client_zmq_addr(local_only, host) + for _ in range(num_api_servers) + ] + output_addresses = [ + get_engine_client_zmq_addr(local_only, host) + for _ in range(num_api_servers) + ] + + addresses = EngineZmqAddresses( + inputs=input_addresses, + outputs=output_addresses, + ) + + # Set up coordinator for dp > 1. + coordinator = None + stats_update_address = None + if dp_size > 1: + coordinator = DPCoordinator(parallel_config) + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) + stats_update_address = coordinator.get_stats_publish_address() + logger.info("Started DP Coordinator process (PID: %d)", + coordinator.proc.pid) + + handshake_address = get_engine_client_zmq_addr( + local_only, host, parallel_config.data_parallel_rpc_port) + + with zmq_socket_ctx(handshake_address, zmq.ROUTER, + bind=True) as handshake_socket: + + # Start local engines. + if not local_engine_count: + local_engine_manager = None + else: + local_engine_manager = CoreEngineProcManager( + EngineCoreProc.run_engine_core, + vllm_config=vllm_config, + executor_class=Executor.get_class(vllm_config), + log_stats=not engine_args.disable_log_stats, + handshake_address=handshake_address, + on_head_node=True, + local_engine_count=local_engine_count, + start_index=0, + local_start_index=0) + + # Start API servers using the manager + api_server_manager = APIServerProcessManager( + target_server_fn=run_api_server_worker_proc, + listen_address=listen_address, + sock=sock, + args=args, + num_servers=num_api_servers, + input_addresses=input_addresses, + output_addresses=output_addresses, + stats_update_address=stats_update_address) + + # Wait for engine handshakes to complete. 
+ core_engines = [ + CoreEngine(index=i, local=(i < local_engine_count)) + for i in range(dp_size) + ] + wait_for_engine_startup( + handshake_socket, + addresses, + core_engines, + parallel_config, + vllm_config.cache_config, + local_engine_manager, + coordinator.proc if coordinator else None, + ) + + # Wait for API servers + wait_for_completion_or_failure( + api_server_manager=api_server_manager, + local_engine_manager=local_engine_manager, + coordinator=coordinator) + + +def run_api_server_worker_proc(listen_address, + sock, + args, + client_config=None, + **uvicorn_kwargs) -> None: + """Entrypoint for individual API server worker processes.""" + + # Add process-specific prefix to stdout and stderr. + from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + + uvloop.run( + run_server_worker(listen_address, sock, args, client_config, + **uvicorn_kwargs)) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b991cb3a4..1e7f88a6a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,7 +17,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus from json import JSONDecodeError -from typing import Annotated, Optional +from typing import Annotated, Any, Optional import prometheus_client import regex as re @@ -26,6 +26,8 @@ from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from prometheus_client import make_asgi_app +from prometheus_fastapi_instrumentator import Instrumentator from starlette.concurrency import iterate_in_threadpool from starlette.datastructures import State from 
starlette.routing import Mount @@ -97,6 +99,7 @@ from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path, is_valid_ipv6_address, set_ulimit) +from vllm.v1.metrics.prometheus import get_prometheus_registry from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -142,14 +145,17 @@ async def lifespan(app: FastAPI): @asynccontextmanager async def build_async_engine_client( - args: Namespace) -> AsyncIterator[EngineClient]: + args: Namespace, + client_config: Optional[dict[str, Any]] = None, +) -> AsyncIterator[EngineClient]: # Context manager to handle engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit engine_args = AsyncEngineArgs.from_cli_args(args) async with build_async_engine_client_from_engine_args( - engine_args, args.disable_frontend_multiprocessing) as engine: + engine_args, args.disable_frontend_multiprocessing, + client_config) as engine: yield engine @@ -157,6 +163,7 @@ async def build_async_engine_client( async def build_async_engine_client_from_engine_args( engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, + client_config: Optional[dict[str, Any]] = None, ) -> AsyncIterator[EngineClient]: """ Create EngineClient, either: @@ -179,12 +186,16 @@ async def build_async_engine_client_from_engine_args( from vllm.v1.engine.async_llm import AsyncLLM async_llm: Optional[AsyncLLM] = None + client_index = client_config.pop( + "client_index") if client_config else 0 try: async_llm = AsyncLLM.from_vllm_config( vllm_config=vllm_config, usage_context=usage_context, disable_log_requests=engine_args.disable_log_requests, - disable_log_stats=engine_args.disable_log_stats) + disable_log_stats=engine_args.disable_log_stats, + client_addresses=client_config, + client_index=client_index) # Don't keep the dummy data in memory await 
async_llm.reset_mm_cache() @@ -318,22 +329,9 @@ class PrometheusResponse(Response): def mount_metrics(app: FastAPI): - # Lazy import for prometheus multiprocessing. - # We need to set PROMETHEUS_MULTIPROC_DIR environment variable - # before prometheus_client is imported. - # See https://prometheus.github.io/client_python/multiprocess/ - from prometheus_client import (REGISTRY, CollectorRegistry, make_asgi_app, - multiprocess) - from prometheus_fastapi_instrumentator import Instrumentator - - registry = REGISTRY - - prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) - if prometheus_multiproc_dir_path is not None: - logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", - prometheus_multiproc_dir_path) - registry = CollectorRegistry() - multiprocess.MultiProcessCollector(registry) + """Mount prometheus metrics to a FastAPI app.""" + + registry = get_prometheus_registry() # `response_class=PrometheusResponse` is needed to return an HTTP response # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" @@ -1256,16 +1254,10 @@ def create_server_socket(addr: tuple[str, int]) -> socket.socket: return sock -async def run_server(args, **uvicorn_kwargs) -> None: - logger.info("vLLM API server version %s", VLLM_VERSION) - log_non_default_args(args) - - if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: - ToolParserManager.import_tool_parser(args.tool_parser_plugin) - +def validate_api_server_args(args): valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ - and args.tool_call_parser not in valid_tool_parses: + and args.tool_call_parser not in valid_tool_parses: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " f"(chose from {{ {','.join(valid_tool_parses)} }})") @@ -1276,6 +1268,19 @@ async def run_server(args, **uvicorn_kwargs) -> None: f"invalid reasoning parser: {args.reasoning_parser} " f"(chose from {{ {','.join(valid_reasoning_parses)} }})") + +def 
setup_server(args): + """Validate API server args, set up signal handler, create socket + ready to serve.""" + + logger.info("vLLM API server version %s", VLLM_VERSION) + log_non_default_args(args) + + if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: + ToolParserManager.import_tool_parser(args.tool_parser_plugin) + + validate_api_server_args(args) + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 @@ -1292,22 +1297,41 @@ async def run_server(args, **uvicorn_kwargs) -> None: signal.signal(signal.SIGTERM, signal_handler) - async with build_async_engine_client(args) as engine_client: + addr, port = sock_addr + is_ssl = args.ssl_keyfile and args.ssl_certfile + host_part = f"[{addr}]" if is_valid_ipv6_address( + addr) else addr or "0.0.0.0" + listen_address = f"http{'s' if is_ssl else ''}://{host_part}:{port}" + + return listen_address, sock + + +async def run_server(args, **uvicorn_kwargs) -> None: + """Run a single-worker API server.""" + listen_address, sock = setup_server(args) + await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) + + +async def run_server_worker(listen_address, + sock, + args, + client_config=None, + **uvicorn_kwargs) -> None: + """Run a single API server worker.""" + + if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: + ToolParserManager.import_tool_parser(args.tool_parser_plugin) + + server_index = client_config.get("client_index", 0) if client_config else 0 + + async with build_async_engine_client(args, client_config) as engine_client: app = build_app(args) vllm_config = await engine_client.get_vllm_config() await init_app_state(engine_client, vllm_config, app.state, args) - def _listen_addr(a: str) -> str: - if is_valid_ipv6_address(a): - return '[' + a + ']' - return a or "0.0.0.0" - - is_ssl = args.ssl_keyfile and args.ssl_certfile - logger.info("Starting vLLM API server on 
http%s://%s:%d", - "s" if is_ssl else "", _listen_addr(sock_addr[0]), - sock_addr[1]) - + logger.info("Starting vLLM API server %d on %s", server_index, + listen_address) shutdown_task = await serve_http( app, sock=sock, diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index afc8a8dc3..f1ae03097 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -229,6 +229,11 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager): self.add_adapter(lora) def add_adapter(self, lora_request: LoRARequest) -> bool: + # Note that this method is not thread-safe. It may be invoked multiple + # times for the same adapter when using multiple API servers. + # This is ok because it's currently only called from + # the single-threaded core engine loop. + if lora_request.lora_int_id not in self.list_adapters(): # Load the new adapter first to ensure it is actually valid, before # evicting any existing adapters. diff --git a/vllm/utils.py b/vllm/utils.py index 25e34446a..65d3579d5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2420,6 +2420,7 @@ def make_zmq_socket( socket_type: Any, bind: Optional[bool] = None, identity: Optional[bytes] = None, + linger: Optional[int] = None, ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] """Make a ZMQ socket with the proper bind/connect semantics.""" @@ -2439,7 +2440,7 @@ def make_zmq_socket( buf_size = -1 # Use system default buffer size if bind is None: - bind = socket_type != zmq.PUSH + bind = socket_type not in (zmq.PUSH, zmq.SUB, zmq.XSUB) if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER): socket.setsockopt(zmq.RCVHWM, 0) @@ -2452,6 +2453,9 @@ def make_zmq_socket( if identity is not None: socket.setsockopt(zmq.IDENTITY, identity) + if linger is not None: + socket.setsockopt(zmq.LINGER, linger) + # Determine if the path is a TCP socket with an IPv6 address. # Enable IPv6 on the zmq socket if so. 
scheme, host, _ = split_zmq_path(path) diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index c17f80b6a..055ce4460 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -45,7 +45,7 @@ class SchedulerInterface(ABC): self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> "EngineCoreOutputs": + ) -> dict[int, "EngineCoreOutputs"]: """Update the scheduler state based on the model runner output. This method is called after the model runner has processed the scheduled @@ -55,7 +55,8 @@ class SchedulerInterface(ABC): for each request. Returns: - A EngineCoreOutputs object containing the outputs for each request. + A dict of client index to EngineCoreOutputs object containing the + outputs for each request originating from that client. """ raise NotImplementedError @@ -126,6 +127,11 @@ class SchedulerInterface(ABC): """ raise NotImplementedError + @abstractmethod + def get_request_counts(self) -> tuple[int, int]: + """Returns (num_running_reqs, num_waiting_reqs).""" + raise NotImplementedError + @abstractmethod def make_stats(self) -> Optional["SchedulerStats"]: """Make a SchedulerStats object for logging. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4c6b3eea0..ce16a1ed5 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -58,7 +58,8 @@ class Scheduler(SchedulerInterface): # request ids should be included in the EngineCoreOutputs returned # by update_from_outputs(). This is currently used in the multi-engine # case to track request lifetimes efficiently. - self.include_finished_set = include_finished_set + self.finished_req_ids_dict: Optional[dict[int, set[str]]] = ( + defaultdict(set) if include_finished_set else None) # Scheduling constraints. 
self.max_num_running_reqs = self.scheduler_config.max_num_seqs @@ -693,7 +694,7 @@ class Scheduler(SchedulerInterface): self, scheduler_output: SchedulerOutput, model_runner_output: ModelRunnerOutput, - ) -> EngineCoreOutputs: + ) -> dict[int, EngineCoreOutputs]: sampled_token_ids = model_runner_output.sampled_token_ids spec_token_ids = model_runner_output.spec_token_ids logprobs = model_runner_output.logprobs @@ -701,7 +702,7 @@ class Scheduler(SchedulerInterface): num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: list[Request] = [] - outputs: list[EngineCoreOutput] = [] + outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: Optional[SpecDecodingStats] = None # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below @@ -797,7 +798,7 @@ class Scheduler(SchedulerInterface): if new_token_ids or kv_transfer_params: # Add EngineCoreOutput for this Request. - outputs.append( + outputs[request.client_index].append( EngineCoreOutput( request_id=req_id, new_token_ids=new_token_ids, @@ -828,17 +829,38 @@ class Scheduler(SchedulerInterface): self._cached_reqs_data[req_data.req_id].append(req_data) self.running = new_running - engine_core_outputs = EngineCoreOutputs( - outputs=outputs, - scheduler_stats=self.make_stats(spec_decoding_stats), - ) - if self.include_finished_set: - #TODO currently sending duplicates here, improve this - engine_core_outputs.finished_requests = ( - scheduler_output.finished_req_ids | self.finished_req_ids) + + # Create EngineCoreOutputs for all clients that have requests with + # outputs in this step. + engine_core_outputs = { + client_index: EngineCoreOutputs(outputs=outs) + for client_index, outs in outputs.items() + } + + finished_req_ids = self.finished_req_ids_dict + if finished_req_ids is not None: + # Include ids of requests that finished since last outputs + # were sent. 
+ for client_index, finished_set in finished_req_ids.items(): + # Set finished request set in EngineCoreOutputs for this client. + if (eco := engine_core_outputs.get(client_index)) is not None: + eco.finished_requests = finished_set + else: + engine_core_outputs[client_index] = EngineCoreOutputs( + finished_requests=finished_set) + finished_req_ids.clear() + + if engine_core_outputs: + # Return stats to only one of the front-ends. + next(iter(engine_core_outputs.values())).scheduler_stats = ( + self.make_stats(spec_decoding_stats)) return engine_core_outputs + def get_request_counts(self) -> tuple[int, int]: + """Returns (num_running_reqs, num_waiting_reqs).""" + return len(self.running), len(self.waiting) + def add_request(self, request: Request) -> None: self.waiting.append(request) self.requests[request.request_id] = request @@ -880,8 +902,11 @@ class Scheduler(SchedulerInterface): delay_free_blocks, kv_xfer_params = self._connector_finished(request) self.encoder_cache_manager.free(request) - self._cached_reqs_data.pop(request.request_id, None) - self.finished_req_ids.add(request.request_id) + request_id = request.request_id + self._cached_reqs_data.pop(request_id, None) + self.finished_req_ids.add(request_id) + if self.finished_req_ids_dict is not None: + self.finished_req_ids_dict[request.client_index].add(request_id) if not delay_free_blocks: self._free_blocks(request) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 41db99bea..0c9f61a76 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -44,10 +44,6 @@ class EngineCoreRequest( omit_defaults=True, # type: ignore[call-arg] gc=False): # type: ignore[call-arg] - # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, - # but this object is currently not playing well with msgspec - # due to circular imports and typing we have in data.py - request_id: str prompt_token_ids: list[int] mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] @@ -59,6 +55,10 
@@ class EngineCoreRequest( lora_request: Optional[LoRARequest] cache_salt: Optional[str] + # Index of the client, used to ensure outputs are sent back to the same + # client for this request when scaling out the front-end. + client_index: int = 0 + # Used in DP case to indicate which wave of requests this is expected to # belong to, to cover a race condition where the request is sent before # a wave finished notification is received. diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 74c2251c7..86781e752 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,6 +36,7 @@ from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory, setup_default_loggers) +from vllm.v1.metrics.prometheus import shutdown_prometheus from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -54,6 +55,8 @@ class AsyncLLM(EngineClient): log_requests: bool = True, start_engine_loop: bool = True, stat_loggers: Optional[list[StatLoggerFactory]] = None, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, ) -> None: """ Create an AsyncLLM. 
@@ -124,6 +127,8 @@ class AsyncLLM(EngineClient): vllm_config=vllm_config, executor_class=executor_class, log_stats=self.log_stats, + client_addresses=client_addresses, + client_index=client_index, ) if self.stat_loggers: for stat_logger in self.stat_loggers[0]: @@ -145,6 +150,8 @@ class AsyncLLM(EngineClient): stat_loggers: Optional[list[StatLoggerFactory]] = None, disable_log_requests: bool = False, disable_log_stats: bool = False, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, ) -> "AsyncLLM": if not envs.VLLM_USE_V1: raise ValueError( @@ -162,6 +169,8 @@ class AsyncLLM(EngineClient): log_requests=not disable_log_requests, log_stats=not disable_log_stats, usage_context=usage_context, + client_addresses=client_addresses, + client_index=client_index, ) @classmethod @@ -195,6 +204,8 @@ class AsyncLLM(EngineClient): def shutdown(self): """Shutdown, cleaning up the background proc and IPC.""" + shutdown_prometheus() + if engine_core := getattr(self, "engine_core", None): engine_core.shutdown() @@ -398,7 +409,6 @@ class AsyncLLM(EngineClient): # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. 
if stat_loggers: - assert outputs.scheduler_stats is not None AsyncLLM._record_stats( stat_loggers[outputs.engine_index], scheduler_stats=outputs.scheduler_stats, @@ -422,7 +432,7 @@ class AsyncLLM(EngineClient): @staticmethod def _record_stats( stat_loggers: list[StatLoggerBase], - scheduler_stats: SchedulerStats, + scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats], ): """static so that it can be used from the output_handler task diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py new file mode 100644 index 000000000..b84d4b144 --- /dev/null +++ b/vllm/v1/engine/coordinator.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +import multiprocessing +import time +import weakref +from typing import Optional + +import msgspec.msgpack +import zmq + +from vllm.config import ParallelConfig +from vllm.logger import init_logger +from vllm.utils import get_mp_context, get_open_zmq_ipc_path, make_zmq_socket +from vllm.v1.engine import EngineCoreOutputs, EngineCoreRequestType +from vllm.v1.serial_utils import MsgpackDecoder +from vllm.v1.utils import get_engine_client_zmq_addr, shutdown + +logger = init_logger(__name__) + + +class DPCoordinator: + """Coordinator process used for data-parallel deployments (DP>1). + + Intermediates between multiple DP engine rank processes and one or more + front-end API server processes. + + * Collects stats from each DP engine (currently just waiting and running + queue lengths), and publishes these to all front-ends for use in + load-balancing decisions. + + * Keeps track of the current DP "request wave" number and running state + of the engines. This is received from the DP rank 0 engine and published + to the front-end processes along with the current load stats. + + The engines alternate between a global running/paused state. The global + "request wave" number is a count of the number of times that the workers + collectively move from a running state to a paused state. 
This transition + is synchronized via the all-reduce operation performed in the + DPEngineCoreProc._has_global_unfinished_reqs method. + + * Broadcasts the START_DP_WAVE message to engines to move them from paused + to running state when one engine receives a new request. This can happen + in two cases: + 1) A front-end sending a new request while the engines are paused will + concurrently notify the coordinator. + 2) An engine receiving a request for a stale request wave while in paused + state will notify the coordinator. + + Engines will move into running state when receiving a new request or + START_DP_WAVE message. + """ + + def __init__(self, parallel_config: ParallelConfig): + + # Assume coordinator is colocated with front-end procs. + front_publish_address = get_open_zmq_ipc_path() + + dp_size = parallel_config.data_parallel_size + assert dp_size > 1, "Coordinator only used for data parallel" + + local_only = dp_size == parallel_config.data_parallel_size_local + host = parallel_config.data_parallel_master_ip + back_publish_address = get_engine_client_zmq_addr(local_only, host) + back_output_address = get_engine_client_zmq_addr(local_only, host) + + context = get_mp_context() + self.proc: multiprocessing.Process = context.Process( + target=CoordinatorProc.run_coordinator, + name="VLLM_DP_Coordinator", + kwargs={ + "engine_count": parallel_config.data_parallel_size, + "front_publish_address": front_publish_address, + "back_output_address": back_output_address, + "back_publish_address": back_publish_address, + }, + daemon=True) + self.proc.start() + + self.stats_publish_address = front_publish_address + self.coord_in_address = back_publish_address + self.coord_out_address = back_output_address + self._finalizer = weakref.finalize(self, shutdown, [self.proc]) + + def get_stats_publish_address(self) -> str: + return self.stats_publish_address + + def get_engine_socket_addresses(self) -> tuple[str, str]: + """Returns tuple of ZMQ input address, output address.""" 
+ return self.coord_in_address, self.coord_out_address + + def close(self): + self._finalizer() + + +class EngineState: + + def __init__(self): + self.request_counts = [0, 0] # [waiting, running] + + +class CoordinatorProc: + + def __init__(self, engine_count: int): + + self.ctx = zmq.Context() + + self.engines = [EngineState() for _ in range(engine_count)] + + self.current_wave = 0 + self.engines_running = False + self.stats_changed = False + + @staticmethod + def run_coordinator( + engine_count: int, + front_publish_address: str, + back_output_address: str, + back_publish_address: str, + ): + coordinator = CoordinatorProc(engine_count=engine_count) + try: + coordinator.process_input_socket( + front_publish_address, + back_output_address, + back_publish_address, + ) + except KeyboardInterrupt: + logger.info("DP Coordinator process exiting") + + def process_input_socket(self, front_publish_address: str, + back_output_address: str, + back_publish_address: str): + + decoder = MsgpackDecoder(EngineCoreOutputs) + + with make_zmq_socket( + path=front_publish_address, # IPC + ctx=self.ctx, + socket_type=zmq.XPUB, + bind=True, + ) as publish_front, make_zmq_socket( + path=back_output_address, # IPC or TCP + ctx=self.ctx, + socket_type=zmq.PULL, + bind=True, + ) as output_back, make_zmq_socket( + path=back_publish_address, # IPC or TCP + ctx=self.ctx, + socket_type=zmq.XPUB, + bind=True, + ) as publish_back: + + poller = zmq.Poller() + poller.register(publish_front, zmq.POLLIN) + poller.register(output_back, zmq.POLLIN) + last_publish_time = 0 + while True: + elapsed = int(time.time() * 1000) - last_publish_time + # Send at 100 ms interval if the stats have changed, + # or otherwise every 3 seconds. + wait_for = 100 if self.stats_changed else 3000 + events = poller.poll(timeout=max(0, wait_for - elapsed)) + if not events: + # Poller timeout - publish current stats to front-ends. 
+ engine_req_counts_list = self._get_engine_counts() + to_publish = (engine_req_counts_list, self.current_wave, + self.engines_running) + publish_front.send(msgspec.msgpack.encode(to_publish)) + last_publish_time = int(time.time() * 1000) + self.stats_changed = False + continue + + events = dict(events) + + if publish_front in events: + buffer = publish_front.recv() + if buffer == b'\x01': + # Ignore subscription messages. + continue + + # We received a message on the front-end XPUB socket, + # from an API server sending a new request while the + # engines are paused, so that we can wake the other + # engines. + engine_to_exclude, wave = msgspec.msgpack.decode(buffer) + if wave < self.current_wave: + # If the wave number is stale, ensure the message is + # handled by all the engines. + engine_to_exclude = None + if not self.engines_running: + self.engines_running = True + self.stats_changed = True + self._send_start_wave(publish_back, self.current_wave, + engine_to_exclude) + + if output_back in events: + # We received a message from one of the engines. + + buffer = output_back.recv() + outputs: EngineCoreOutputs = decoder.decode(buffer) + + assert not outputs.outputs + assert outputs.utility_output is None + + eng_index = outputs.engine_index + if outputs.scheduler_stats: + # 1. Updated request load stats - update our local + # state with these. + stats = self.engines[eng_index].request_counts + stats[0] = outputs.scheduler_stats.num_waiting_reqs + stats[1] = outputs.scheduler_stats.num_running_reqs + self.stats_changed = True + + if (wave := outputs.wave_complete) is not None: + # 2. 
Notification from rank 0 engine that we've + # moved into the global paused state + # (engines_running==False) + if self.current_wave <= wave: + logger.debug("Moving DP wave from %d to %d.", + self.current_wave, wave) + self.current_wave = wave + 1 + self.engines_running = False + self.stats_changed = True + elif (wave := outputs.start_wave) is not None and ( + wave > self.current_wave or + (wave == self.current_wave + and not self.engines_running)): + # 3. The engine received request for a non-current wave + # so we must ensure that other engines progress to the + # next wave (race condition handling). + logger.debug( + "Starting wave %d after notification of " + "stale wave request from engine.", wave) + self.current_wave = wave + self.engines_running = True + self.stats_changed = True + self._send_start_wave(publish_back, wave, eng_index) + + @staticmethod + def _send_start_wave(socket: zmq.Socket, wave: int, + exclude_engine_index: Optional[int]): + """Broadcast the START_DP_WAVE message to all the engines. + It includes the current wave number and index of engine which + has already received a request with this wave number and so doesn't + require additional notification. 
+ """ + wave_encoded = msgspec.msgpack.encode((wave, exclude_engine_index)) + socket.send_multipart( + (EngineCoreRequestType.START_DP_WAVE.value, wave_encoded)) + + def _get_engine_counts(self) -> list[list[int]]: + """Return list of [waiting, running] count lists for each engine.""" + return [e.request_counts for e in self.engines] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index ed71d9b67..a02abb62b 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -7,6 +7,7 @@ import threading import time from collections import deque from concurrent.futures import Future +from contextlib import ExitStack from inspect import isclass, signature from logging import DEBUG from typing import Any, Callable, Optional, TypeVar, Union @@ -22,7 +23,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.utils import make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx +from vllm.utils import make_zmq_socket, resolve_obj_by_qualname from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, unify_kv_cache_configs) from vllm.v1.core.sched.interface import SchedulerInterface @@ -33,10 +34,12 @@ from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.structured_output import StructuredOutputManager +from vllm.v1.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -211,7 +214,7 @@ class EngineCore: # Re-raise 
exception raise err - def step(self) -> tuple[EngineCoreOutputs, bool]: + def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: """Schedule, execute, and make output. Returns tuple of outputs and a flag indicating whether the model @@ -221,10 +224,7 @@ class EngineCore: # Check for any requests remaining in the scheduler - unfinished, # or finished and not yet removed from the batch. if not self.scheduler.has_requests(): - return EngineCoreOutputs( - outputs=[], - scheduler_stats=self.scheduler.make_stats(), - ), False + return {}, False scheduler_output = self.scheduler.schedule() model_output = self.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( @@ -234,7 +234,7 @@ class EngineCore: scheduler_output.total_num_scheduled_tokens > 0) def step_with_batch_queue( - self) -> tuple[Optional[EngineCoreOutputs], bool]: + self) -> tuple[Optional[dict[int, EngineCoreOutputs]], bool]: """Schedule and execute batches with the batch queue. Note that if nothing to output in this step, None is returned. @@ -276,8 +276,8 @@ class EngineCore: # Blocking until the first result is available. model_output = future.result() self.batch_queue.task_done() - engine_core_outputs = self.scheduler.update_from_output( - scheduler_output, model_output) + engine_core_outputs = (self.scheduler.update_from_output( + scheduler_output, model_output)) return engine_core_outputs, scheduled_batch @@ -362,7 +362,7 @@ class EngineCoreProc(EngineCore): self, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, engine_index: int = 0, @@ -375,65 +375,70 @@ class EngineCoreProc(EngineCore): # Create input socket. 
input_ctx = zmq.Context() identity = engine_index.to_bytes(length=2, byteorder="little") - input_socket = make_zmq_socket(input_ctx, - input_address, - zmq.DEALER, - identity=identity, - bind=False) - try: + with make_zmq_socket(input_ctx, + handshake_address, + zmq.DEALER, + identity=identity, + linger=5000, + bind=False) as handshake_socket: + # Register engine with front-end. - output_address = self.startup_handshake( - input_socket, on_head_node, vllm_config.parallel_config) + addresses = self.startup_handshake(handshake_socket, on_head_node, + vllm_config.parallel_config) + self.client_count = len(addresses.outputs) # Update config which may have changed from the handshake. vllm_config.__post_init__() # Set up data parallel environment. + self.has_coordinator = addresses.coordinator_output is not None self._init_data_parallel(vllm_config) # Initialize engine core and model. super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) + self.engine_index = engine_index self.step_fn = (self.step if self.batch_queue is None else self.step_with_batch_queue) self.engines_running = False + self.last_counts = (0, 0) # Send ready message. num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks - input_socket.send( + handshake_socket.send( msgspec.msgpack.encode({ "status": "READY", "local": on_head_node, "num_gpu_blocks": num_gpu_blocks, })) - # Background Threads and Queues for IO. These enable us to - # overlap ZMQ socket IO with GPU since they release the GIL, - # and to overlap some serialization/deserialization with the - # model forward pass. - # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
- self.input_queue = input_queue - self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() - threading.Thread(target=self.process_input_socket, - args=(input_socket, ), - daemon=True).start() - input_socket = None - self.output_thread = threading.Thread( - target=self.process_output_socket, - args=(output_address, engine_index), - daemon=True) - self.output_thread.start() - finally: - if input_socket is not None: - input_socket.close(linger=0) + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue = input_queue + self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], + bytes]]() + threading.Thread(target=self.process_input_sockets, + args=(addresses.inputs, addresses.coordinator_input, + identity), + daemon=True).start() + self.output_thread = threading.Thread( + target=self.process_output_sockets, + args=(addresses.outputs, addresses.coordinator_output, + engine_index), + daemon=True) + self.output_thread.start() @staticmethod - def startup_handshake(input_socket: zmq.Socket, on_head_node: bool, - parallel_config: ParallelConfig) -> str: + def startup_handshake( + handshake_socket: zmq.Socket, on_head_node: bool, + parallel_config: ParallelConfig) -> EngineZmqAddresses: # Send registration message. - input_socket.send( + handshake_socket.send( msgspec.msgpack.encode({ "status": "HELLO", "local": on_head_node, @@ -441,22 +446,20 @@ class EngineCoreProc(EngineCore): # Receive initialization message. 
logger.info("Waiting for init message from front-end.") - if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000): + if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000): raise RuntimeError("Did not receive response from front-end " f"process within {HANDSHAKE_TIMEOUT_MINS} " f"minutes") - init_bytes = input_socket.recv() - init_message = msgspec.msgpack.decode(init_bytes) + init_bytes = handshake_socket.recv() + init_message: EngineHandshakeMetadata = msgspec.msgpack.decode( + init_bytes, type=EngineHandshakeMetadata) logger.debug("Received init message: %s", init_message) - output_socket_address = init_message["output_socket_address"] - #TBD(nick) maybe replace IP with configured head node address - - received_parallel_config = init_message["parallel_config"] + received_parallel_config = init_message.parallel_config for key, value in received_parallel_config.items(): setattr(parallel_config, key, value) - return output_socket_address + return init_message.addresses @staticmethod def run_engine_core(*args, @@ -528,7 +531,7 @@ class EngineCoreProc(EngineCore): """Exits when an engine step needs to be performed.""" waited = False - while not self.engines_running and not (self.scheduler.has_requests()): + while not self.engines_running and not self.scheduler.has_requests(): if logger.isEnabledFor(DEBUG) and self.input_queue.empty(): logger.debug("EngineCore waiting for work.") waited = True @@ -549,8 +552,8 @@ class EngineCoreProc(EngineCore): # Step the engine core. outputs, model_executed = self.step_fn() # Put EngineCoreOutputs into the output queue. 
- if outputs is not None: - self.output_queue.put_nowait(outputs) + for output in (outputs.items() if outputs else ()): + self.output_queue.put_nowait(output) return model_executed @@ -563,7 +566,7 @@ class EngineCoreProc(EngineCore): elif request_type == EngineCoreRequestType.ABORT: self.abort_requests(request) elif request_type == EngineCoreRequestType.UTILITY: - call_id, method_name, args = request + client_idx, call_id, method_name, args = request output = UtilityOutput(call_id) try: method = getattr(self, method_name) @@ -574,7 +577,7 @@ class EngineCoreProc(EngineCore): output.failure_message = (f"Call to {method_name} method" f" failed: {str(e)}") self.output_queue.put_nowait( - EngineCoreOutputs(utility_output=output)) + (client_idx, EngineCoreOutputs(utility_output=output))) elif request_type == EngineCoreRequestType.EXECUTOR_FAILED: raise RuntimeError("Executor failed.") else: @@ -607,27 +610,68 @@ class EngineCoreProc(EngineCore): logger.fatal("vLLM shutdown signal from EngineCore failed " "to send. Please report this issue.") - def process_input_socket(self, input_socket: zmq.Socket): + def process_input_sockets(self, input_addresses: list[str], + coord_input_address: Optional[str], + identity: bytes): """Input socket IO thread.""" # Msgpack serialization decoding. add_request_decoder = MsgpackDecoder(EngineCoreRequest) generic_decoder = MsgpackDecoder() - while True: - # (RequestType, RequestData) - type_frame, *data_frames = input_socket.recv_multipart(copy=False) - request_type = EngineCoreRequestType(bytes(type_frame.buffer)) - - # Deserialize the request data. - decoder = add_request_decoder if ( - request_type == EngineCoreRequestType.ADD) else generic_decoder - request = decoder.decode(data_frames) - - # Push to input queue for core busy loop. 
- self.input_queue.put_nowait((request_type, request)) + with ExitStack() as stack, zmq.Context() as ctx: + input_sockets = [ + stack.enter_context( + make_zmq_socket(ctx, + input_address, + zmq.DEALER, + identity=identity, + bind=False)) + for input_address in input_addresses + ] + if coord_input_address is None: + coord_socket = None + else: + coord_socket = stack.enter_context( + make_zmq_socket(ctx, + coord_input_address, + zmq.XSUB, + identity=identity, + bind=False)) + # Send subscription message to coordinator. + coord_socket.send(b'\x01') + + # Register sockets with poller. + poller = zmq.Poller() + for input_socket in input_sockets: + # Send initial message to each input socket - this is required + # before the front-end ROUTER socket can send input messages + # back to us. + input_socket.send(b'') + poller.register(input_socket, zmq.POLLIN) + if coord_socket is not None: + poller.register(coord_socket, zmq.POLLIN) - def process_output_socket(self, output_path: str, engine_index: int): + while True: + for input_socket, _ in poller.poll(): + # (RequestType, RequestData) + type_frame, *data_frames = input_socket.recv_multipart( + copy=False) + request_type = EngineCoreRequestType( + bytes(type_frame.buffer)) + + # Deserialize the request data. + decoder = add_request_decoder if ( + request_type + == EngineCoreRequestType.ADD) else generic_decoder + request = decoder.decode(data_frames) + + # Push to input queue for core busy loop. + self.input_queue.put_nowait((request_type, request)) + + def process_output_sockets(self, output_paths: list[str], + coord_output_path: Optional[str], + engine_index: int): """Output socket IO thread.""" # Msgpack serialization encoding. @@ -641,30 +685,49 @@ class EngineCoreProc(EngineCore): # We must set linger to ensure the ENGINE_CORE_DEAD # message is sent prior to closing the socket. 
- with zmq_socket_ctx(output_path, zmq.constants.PUSH, - linger=4000) as socket: + with ExitStack() as stack, zmq.Context() as ctx: + sockets = [ + stack.enter_context( + make_zmq_socket(ctx, output_path, zmq.PUSH, linger=4000)) + for output_path in output_paths + ] + coord_socket = stack.enter_context( + make_zmq_socket( + ctx, coord_output_path, zmq.PUSH, bind=False, + linger=4000)) if coord_output_path is not None else None + max_reuse_bufs = len(sockets) + 1 + while True: - outputs = self.output_queue.get() - if outputs == EngineCoreProc.ENGINE_CORE_DEAD: - socket.send(outputs, copy=False) + output = self.output_queue.get() + if output == EngineCoreProc.ENGINE_CORE_DEAD: + for socket in sockets: + socket.send(output) break - assert not isinstance(outputs, bytes) + assert not isinstance(output, bytes) + client_index, outputs = output outputs.engine_index = engine_index + if client_index == -1: + # Don't reuse buffer for coordinator message + # which will be very small. + assert coord_socket is not None + coord_socket.send_multipart(encoder.encode(outputs)) + continue + # Reclaim buffers that zmq is finished with. while pending and pending[-1][0].done: reuse_buffers.append(pending.pop()[2]) buffer = reuse_buffers.pop() if reuse_buffers else bytearray() buffers = encoder.encode_into(outputs, buffer) - tracker = socket.send_multipart(buffers, - copy=False, - track=True) + tracker = sockets[client_index].send_multipart(buffers, + copy=False, + track=True) if not tracker.done: ref = outputs if len(buffers) > 1 else None pending.appendleft((tracker, ref, buffer)) - elif len(reuse_buffers) < 2: - # Keep at most 2 buffers to reuse. + elif len(reuse_buffers) < max_reuse_bufs: + # Limit the number of buffers to reuse. 
reuse_buffers.append(buffer) @@ -676,7 +739,7 @@ class DPEngineCoreProc(EngineCoreProc): self, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, ): @@ -691,10 +754,11 @@ class DPEngineCoreProc(EngineCoreProc): # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. self.counter = 0 + self.current_wave = 0 # Initialize the engine. dp_rank = vllm_config.parallel_config.data_parallel_rank - super().__init__(vllm_config, on_head_node, input_address, + super().__init__(vllm_config, on_head_node, handshake_address, executor_class, log_stats, dp_rank) def _init_data_parallel(self, vllm_config: VllmConfig): @@ -726,7 +790,6 @@ class DPEngineCoreProc(EngineCoreProc): self.dp_rank = dp_rank self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() - self.current_wave = 0 def shutdown(self): super().shutdown() @@ -734,22 +797,23 @@ class DPEngineCoreProc(EngineCoreProc): stateless_destroy_torch_distributed_process_group(dp_group) def add_request(self, request: EngineCoreRequest): - if request.current_wave != self.current_wave: + if self.has_coordinator and request.current_wave != self.current_wave: if request.current_wave > self.current_wave: self.current_wave = request.current_wave elif not self.engines_running: # Request received for an already-completed wave, notify # front-end that we need to start the next one. 
self.output_queue.put_nowait( - EngineCoreOutputs(start_wave=self.current_wave)) + (-1, EngineCoreOutputs(start_wave=self.current_wave))) super().add_request(request) def _handle_client_request(self, request_type: EngineCoreRequestType, request: Any) -> None: if request_type == EngineCoreRequestType.START_DP_WAVE: - new_wave: int = request - if new_wave >= self.current_wave: + new_wave, exclude_eng_index = request + if exclude_eng_index != self.engine_index and ( + new_wave >= self.current_wave): self.current_wave = new_wave if not self.engines_running: logger.debug("EngineCore starting idle loop for wave %d.", @@ -758,6 +822,18 @@ class DPEngineCoreProc(EngineCoreProc): else: super()._handle_client_request(request_type, request) + def _maybe_publish_request_counts(self): + if not self.has_coordinator: + return + + # Publish our request counts (if they've changed). + counts = self.scheduler.get_request_counts() + if counts != self.last_counts: + self.last_counts = counts + stats = SchedulerStats(*counts) + self.output_queue.put_nowait( + (-1, EngineCoreOutputs(scheduler_stats=stats))) + def run_busy_loop(self): """Core busy loop of the EngineCore for data parallel case.""" @@ -768,6 +844,8 @@ class DPEngineCoreProc(EngineCoreProc): # 2) Step the engine core. 
executed = self._process_engine_step() + self._maybe_publish_request_counts() + local_unfinished_reqs = self.scheduler.has_unfinished_requests() if not executed: if not local_unfinished_reqs and not self.engines_running: @@ -788,7 +866,8 @@ class DPEngineCoreProc(EngineCoreProc): logger.debug("Wave %d finished, pausing engine loop.", self.current_wave) self.output_queue.put_nowait( - EngineCoreOutputs(wave_complete=self.current_wave)) + (-1, + EngineCoreOutputs(wave_complete=self.current_wave))) self.current_wave += 1 def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 9f8a9b692..e9e2d2d8d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -2,6 +2,7 @@ import asyncio import contextlib import queue +import sys import uuid import weakref from abc import ABC, abstractmethod @@ -9,26 +10,28 @@ from collections import deque from collections.abc import Awaitable, Sequence from concurrent.futures import Future from dataclasses import dataclass -from enum import Enum, auto from threading import Thread from typing import Any, Callable, Optional, TypeVar, Union -import msgspec +import msgspec.msgpack import zmq import zmq.asyncio -from vllm.config import ParallelConfig, VllmConfig +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.utils import (get_open_port, get_open_zmq_inproc_path, - get_open_zmq_ipc_path, get_tcp_uri, make_zmq_socket) +from vllm.utils import (get_open_zmq_inproc_path, make_zmq_socket, + zmq_socket_ctx) from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType, UtilityOutput) +from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import 
MsgpackDecoder, MsgpackEncoder, bytestr -from vllm.v1.utils import CoreEngineProcManager +from vllm.v1.utils import (CoreEngine, CoreEngineProcManager, + EngineZmqAddresses, get_engine_client_zmq_addr, + wait_for_engine_startup) logger = init_logger(__name__) @@ -36,8 +39,6 @@ AnyFuture = Union[asyncio.Future[Any], Future[Any]] _R = TypeVar('_R') # Return type for collective_rpc -STARTUP_POLL_PERIOD_MS = 10000 - class EngineCoreClient(ABC): """ @@ -207,7 +208,7 @@ class InprocClient(EngineCoreClient): def get_output(self) -> EngineCoreOutputs: outputs, _ = self.engine_core.step() - return outputs + return outputs.get(0) or EngineCoreOutputs() def add_request(self, request: EngineCoreRequest) -> None: self.engine_core.add_request(request) @@ -266,24 +267,6 @@ class InprocClient(EngineCoreClient): return self.engine_core.collective_rpc(method, timeout, args, kwargs) -class CoreEngineState(Enum): - NEW = auto() - CONNECTED = auto() - READY = auto() - - -class CoreEngine: - """One per data parallel rank.""" - - def __init__(self, index: int = 0, local: bool = True): - self.local = local - self.index = index - self.identity = index.to_bytes(length=2, byteorder="little") - - self.state = CoreEngineState.NEW - self.num_reqs_in_flight = 0 - - @dataclass class BackgroundResources: """Used as a finalizer for clean shutdown, avoiding @@ -291,9 +274,12 @@ class BackgroundResources: ctx: Union[zmq.Context] local_engine_manager: Optional[CoreEngineProcManager] = None + coordinator: Optional[DPCoordinator] = None output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + first_req_send_socket: Optional[zmq.asyncio.Socket] = None output_queue_task: Optional[asyncio.Task] = None + stats_update_task: Optional[asyncio.Task] = None shutdown_path: Optional[str] = None # Set if any of the engines are dead. 
Here so that the output @@ -306,16 +292,21 @@ class BackgroundResources: self.engine_dead = True if self.local_engine_manager is not None: self.local_engine_manager.close() + if self.coordinator is not None: + self.coordinator.close() if self.output_queue_task is not None: self.output_queue_task.cancel() + if self.stats_update_task is not None: + self.stats_update_task.cancel() # ZMQ context termination can hang if the sockets # aren't explicitly closed first. - if self.output_socket is not None: - self.output_socket.close(linger=0) - if self.input_socket is not None: - self.input_socket.close(linger=0) + for socket in (self.output_socket, self.input_socket, + self.first_req_send_socket): + if socket is not None: + socket.close(linger=0) + if self.shutdown_path is not None: # We must ensure that the sync output socket is # closed cleanly in its own thread. @@ -350,6 +341,7 @@ class MPClient(EngineCoreClient): vllm_config: VllmConfig, executor_class: type[Executor], log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, ): self.vllm_config = vllm_config # Serialization setup. 
@@ -369,8 +361,8 @@ class MPClient(EngineCoreClient): try: parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local - start_index = parallel_config.data_parallel_rank local_start_index = parallel_config.data_parallel_rank_local + dp_size = parallel_config.data_parallel_size # SPMD mode is where there is an LLM instance per DP rank and # one core engine per LLM, see @@ -382,42 +374,53 @@ class MPClient(EngineCoreClient): CoreEngine(index=local_start_index, local=True) ] else: - assert start_index == 0 + assert parallel_config.data_parallel_rank == 0 local_start_index = 0 self.core_engines = [ CoreEngine(index=i, local=(i < local_engine_count)) - for i in range(parallel_config.data_parallel_size) + for i in range(dp_size) ] - input_address, output_address = self._get_zmq_addresses( - parallel_config, spmd_mode) + local_only = spmd_mode or local_engine_count == dp_size + + self.stats_update_address: Optional[str] = None + if client_addresses is not None: + input_address = client_addresses["input_address"] + output_address = client_addresses["output_address"] + self.stats_update_address = client_addresses.get( + "stats_update_address") + else: + host = parallel_config.data_parallel_master_ip + input_address = get_engine_client_zmq_addr(local_only, host) + output_address = get_engine_client_zmq_addr(local_only, host) # Create input and output sockets. self.input_socket = self.resources.input_socket = make_zmq_socket( self.ctx, input_address, zmq.ROUTER, bind=True) - self.resources.output_socket = make_zmq_socket( - self.ctx, output_address, zmq.constants.PULL) - # Start local engines. - if local_engine_count: - # In server mode, start_index and local_start_index will - # both be 0. 
- self.resources.local_engine_manager = CoreEngineProcManager( - EngineCoreProc.run_engine_core, - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=log_stats, - input_address=input_address, - on_head_node=True, - local_engine_count=local_engine_count, - start_index=start_index, - local_start_index=local_start_index) + self.ctx, output_address, zmq.PULL) + + if client_addresses is None: + self._init_engines_direct(vllm_config, local_only, + local_start_index, input_address, + output_address, executor_class, + log_stats) + coordinator = self.resources.coordinator + if coordinator: + self.stats_update_address = ( + coordinator.get_stats_publish_address()) + + # Wait for ready messages from each engine on the input socket. + identities = set(e.identity for e in self.core_engines) + sync_input_socket = zmq.Socket.shadow(self.input_socket) + while identities: + if not sync_input_socket.poll(timeout=600_000): + raise TimeoutError("Timed out waiting for engines to send" + "initial message on input socket.") + identity, _ = sync_input_socket.recv_multipart() + identities.remove(identity) self.core_engine = self.core_engines[0] - - # Wait for engine core process(es) to start. 
- self._wait_for_engine_startup(output_address, parallel_config) - self.utility_results: dict[int, AnyFuture] = {} # Request objects which may contain pytorch-allocated tensors @@ -430,116 +433,67 @@ class MPClient(EngineCoreClient): if not success: self._finalizer() - @staticmethod - def _get_zmq_addresses(parallel_config: ParallelConfig, - spmd_mode: bool) -> tuple[str, str]: - """Returns (input_address, output_address).""" - dp_size = parallel_config.data_parallel_size + def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool, + local_start_index: int, input_address: str, + output_address: str, + executor_class: type[Executor], log_stats: bool): + """Self-contained client mode, launch engine and coordinator process + as needed.""" + + parallel_config = vllm_config.parallel_config local_engine_count = parallel_config.data_parallel_size_local + start_index = parallel_config.data_parallel_rank + host = parallel_config.data_parallel_master_ip - if local_engine_count == dp_size or spmd_mode: - input_address = get_open_zmq_ipc_path() - output_address = get_open_zmq_ipc_path() - else: - host = parallel_config.data_parallel_master_ip - input_port = parallel_config.data_parallel_rpc_port - output_port = get_open_port() - input_address = get_tcp_uri(host, input_port) - output_address = get_tcp_uri(host, output_port) - - return input_address, output_address - - def _wait_for_engine_startup(self, output_address: str, - parallel_config: ParallelConfig): - # Get a sync handle to the socket which can be sync or async. - sync_input_socket = zmq.Socket.shadow(self.input_socket) - - # Wait for engine core process(es) to send ready messages. 
- local_count = parallel_config.data_parallel_size_local - remote_count = len(self.core_engines) - local_count - # [local, remote] counts - conn_pending, start_pending = [local_count, remote_count], [0, 0] - - poller = zmq.Poller() - poller.register(sync_input_socket, zmq.POLLIN) - proc_manager = self.resources.local_engine_manager - if proc_manager is not None: - for sentinel in proc_manager.sentinels(): - poller.register(sentinel, zmq.POLLIN) - while any(conn_pending) or any(start_pending): - events = poller.poll(STARTUP_POLL_PERIOD_MS) - if not events: - if any(conn_pending): - logger.debug( - "Waiting for %d local, %d remote core engine proc(s) " - "to connect.", *conn_pending) - if any(start_pending): - logger.debug( - "Waiting for %d local, %d remote core engine proc(s) " - "to start.", *start_pending) - continue - if len(events) > 1 or events[0][0] != sync_input_socket: - # One of the local core processes exited. - finished = proc_manager.finished_procs( - ) if proc_manager else {} - raise RuntimeError("Engine core initialization failed. " - "See root cause above. " - f"Failed core proc(s): {finished}") - - # Receive HELLO and READY messages from the input socket. - eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart() - eng_index = int.from_bytes(eng_identity, byteorder="little") - engine = next( - (e for e in self.core_engines if e.identity == eng_identity), - None) - if engine is None: - raise RuntimeError(f"Message from engine with unexpected data " - f"parallel rank: {eng_index}") - msg = msgspec.msgpack.decode(ready_msg_bytes) - status, local = msg["status"], msg["local"] - if local != engine.local: - raise RuntimeError(f"{status} message from " - f"{'local' if local else 'remote'} " - f"engine {eng_index}, expected it to be " - f"{'local' if engine.local else 'remote'}") - - if status == "HELLO" and engine.state == CoreEngineState.NEW: - - # Send init message with DP config info. 
- init_message = self.encoder.encode({ - "output_socket_address": output_address, - "parallel_config": { - "data_parallel_master_ip": - parallel_config.data_parallel_master_ip, - "data_parallel_master_port": - parallel_config.data_parallel_master_port, - "data_parallel_size": - parallel_config.data_parallel_size, - }, - }) - sync_input_socket.send_multipart((eng_identity, *init_message), - copy=False) - conn_pending[0 if local else 1] -= 1 - start_pending[0 if local else 1] += 1 - engine.state = CoreEngineState.CONNECTED - elif status == "READY" and (engine.state - == CoreEngineState.CONNECTED): - # Setup KV cache config with initialization state from - # engine core process. Sum values from all engines in DP case. - cache_config = self.vllm_config.cache_config - num_gpu_blocks = cache_config.num_gpu_blocks or 0 - num_gpu_blocks += msg['num_gpu_blocks'] - cache_config.num_gpu_blocks = num_gpu_blocks - - start_pending[0 if local else 1] -= 1 - engine.state = CoreEngineState.READY - else: - raise RuntimeError(f"Unexpected {status} message for " - f"{'local' if local else 'remote'} engine " - f"{eng_index} in {engine.state} state.") + if len(self.core_engines) > 1: + self.resources.coordinator = DPCoordinator(parallel_config) + + handshake_address = get_engine_client_zmq_addr( + local_only, host, parallel_config.data_parallel_rpc_port) - logger.debug("%s from %s core engine process %s.", status, - "local" if local else "remote", eng_index) + with zmq_socket_ctx(handshake_address, zmq.ROUTER, + bind=True) as handshake_socket: + + # Start local engines. + if local_engine_count: + # In server mode, start_index and local_start_index will + # both be 0. 
+ self.resources.local_engine_manager = CoreEngineProcManager( + EngineCoreProc.run_engine_core, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + handshake_address=handshake_address, + on_head_node=True, + local_engine_count=local_engine_count, + start_index=start_index, + local_start_index=local_start_index) + + # Wait for engine core process(es) to start. + self._wait_for_engine_startup(handshake_socket, input_address, + output_address) + + def _wait_for_engine_startup(self, handshake_socket: zmq.Socket, + input_address: str, output_address: str): + addresses = EngineZmqAddresses( + inputs=[input_address], + outputs=[output_address], + ) + + coordinator = self.resources.coordinator + if coordinator is not None: + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) + + wait_for_engine_startup( + handshake_socket, + addresses, + self.core_engines, + self.vllm_config.parallel_config, + self.vllm_config.cache_config, + self.resources.local_engine_manager, + coordinator.proc if coordinator else None, + ) def shutdown(self): # Terminate background resources. 
@@ -605,8 +559,8 @@ class SyncMPClient(MPClient): try: shutdown_socket.bind(shutdown_path) poller = zmq.Poller() - poller.register(shutdown_socket) - poller.register(out_socket) + poller.register(shutdown_socket, zmq.POLLIN) + poller.register(out_socket, zmq.POLLIN) while True: socks = poller.poll() if not socks: @@ -668,7 +622,7 @@ class SyncMPClient(MPClient): future: Future[Any] = Future() self.utility_results[call_id] = future self._send_input(EngineCoreRequestType.UTILITY, - (call_id, method, args)) + (0, call_id, method, args)) return future.result() @@ -730,15 +684,21 @@ class SyncMPClient(MPClient): class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], - log_stats: bool): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0): super().__init__( asyncio_mode=True, vllm_config=vllm_config, executor_class=executor_class, log_stats=log_stats, + client_addresses=client_addresses, ) + self.client_index = client_index self.outputs_queue = asyncio.Queue[Union[EngineCoreOutputs, Exception]]() try: @@ -854,12 +814,13 @@ class AsyncMPClient(MPClient): future = asyncio.get_running_loop().create_future() self.utility_results[call_id] = future message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( - (call_id, method, args))) + (self.client_index, call_id, method, args))) await self._send_input_message(message, engine, args) self._ensure_output_queue_task() return await future async def add_request_async(self, request: EngineCoreRequest) -> None: + request.client_index = self.client_index await self._send_input(EngineCoreRequestType.ADD, request) self._ensure_output_queue_task() @@ -921,17 +882,120 @@ class DPAsyncMPClient(AsyncMPClient): """Asyncio-compatible client for multi-proc, multi-engine (data parallel) 
EngineCore.""" - def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor], - log_stats: bool): + def __init__(self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0): self.current_wave = 0 self.engines_running = False + # To route aborts to the correct engine. self.reqs_in_flight: dict[str, CoreEngine] = {} - super().__init__(vllm_config, executor_class, log_stats) + super().__init__(vllm_config, executor_class, log_stats, + client_addresses, client_index) assert len(self.core_engines) > 1 + # List of [waiting, running] pair per engine. + self.lb_engines: list[list[int]] = [] + + self.first_req_sock_addr = get_open_zmq_inproc_path() + self.first_req_send_socket = self.resources.first_req_send_socket = ( + make_zmq_socket(self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=True)) + try: + # If we are running in an asyncio event loop, start the stats task. + # Otherwise, it will be started lazily. + asyncio.get_running_loop() + self._ensure_stats_update_task() + except RuntimeError: + pass + + def _ensure_stats_update_task(self): + resources = self.resources + if resources.stats_update_task is not None: + return + + assert self.stats_update_address is not None + + async def run_engine_stats_update_task(): + with make_zmq_socket(self.ctx, self.stats_update_address, + zmq.XSUB) as socket, make_zmq_socket( + self.ctx, + self.first_req_sock_addr, + zmq.PAIR, + bind=False) as first_req_rcv_socket: + # Send subscription message. 
+ await socket.send(b'\x01') + + poller = zmq.asyncio.Poller() + poller.register(socket, zmq.POLLIN) + poller.register(first_req_rcv_socket, zmq.POLLIN) + + while True: + events = await poller.poll() + if not self.engines_running and len(events) == 2 or ( + events[0][0] == first_req_rcv_socket): + # Send a message to notify the coordinator that + # we're sending a request while the engines are + # paused, so that it can wake the others up + # (to run dummy EP loop). + self.engines_running = True + buf = first_req_rcv_socket.recv( + flags=zmq.NOBLOCK).result() + target_eng_index = int.from_bytes(buf, "little") + msg = msgspec.msgpack.encode( + (target_eng_index, self.current_wave)) + await socket.send(msg) + + buf = None + while True: + # Drain all stats events (we only care about latest). + future: asyncio.Future[bytes] = socket.recv( + flags=zmq.NOBLOCK) + if isinstance(future.exception(), zmq.Again): + break + buf = future.result() + if buf is None: + continue + + # Update local load-balancing state. + counts, wave, running = msgspec.msgpack.decode(buf) + self.current_wave = wave + self.engines_running = running + self.lb_engines = counts + + resources.stats_update_task = asyncio.create_task( + run_engine_stats_update_task()) + + def get_core_engine_for_request(self) -> CoreEngine: + if not self.lb_engines: + return self.core_engines[0] + # TODO use P2C alg for larger DP sizes + num_engines = len(self.lb_engines) + min_counts = [sys.maxsize, sys.maxsize] + eng_index = 0 + for i in range(num_engines): + # Start from client_index to help with balancing when engines + # are empty. + idx = (self.client_index + i) % num_engines + counts = self.lb_engines[idx] + if counts < min_counts: + min_counts = counts + eng_index = idx + # Adjust local counts for better balancing between stats updates + # from the coordinator (which happen every 100ms). 
+ if min_counts[0]: + min_counts[0] += 1 + else: + min_counts[1] += 1 + return self.core_engines[eng_index] + async def call_utility_async(self, method: str, *args) -> Any: # Only the result from the first engine is returned. return (await asyncio.gather(*[ @@ -940,62 +1004,30 @@ class DPAsyncMPClient(AsyncMPClient): ]))[0] async def add_request_async(self, request: EngineCoreRequest) -> None: + self._ensure_stats_update_task() + request.current_wave = self.current_wave + request.client_index = self.client_index chosen_engine = self.get_core_engine_for_request() self.reqs_in_flight[request.request_id] = chosen_engine - chosen_engine.num_reqs_in_flight += 1 to_await = self._send_input(EngineCoreRequestType.ADD, request, chosen_engine) if not self.engines_running: - # Send request to chosen engine and dp start loop - # control message to all other engines. - self.engines_running = True - to_await = asyncio.gather( - to_await, # type: ignore[assignment] - *self._start_wave_coros(exclude_index=chosen_engine.index)) + # Notify coordinator that we're sending a request + await self.first_req_send_socket.send(chosen_engine.identity) await to_await self._ensure_output_queue_task() - def get_core_engine_for_request(self) -> CoreEngine: - return min(self.core_engines, key=lambda e: e.num_reqs_in_flight) - @staticmethod async def process_engine_outputs(self: "DPAsyncMPClient", outputs: EngineCoreOutputs): - if self.reqs_in_flight: - for req_id in outputs.finished_requests or (): - if engine := self.reqs_in_flight.pop(req_id, None): - engine.num_reqs_in_flight -= 1 - - if outputs.wave_complete is not None: - # Current wave is complete, move to next wave number - # and mark engines as paused. 
- if self.current_wave <= outputs.wave_complete: - self.current_wave = outputs.wave_complete + 1 - self.engines_running = False - - elif outputs.start_wave is not None and ( - outputs.start_wave > self.current_wave or - (outputs.start_wave == self.current_wave - and not self.engines_running)): - # Engine received request for a non-current wave so we must ensure - # that other engines progress to the next wave. - self.current_wave = outputs.start_wave - self.engines_running = True - await asyncio.gather(*self._start_wave_coros( - exclude_index=outputs.engine_index)) - - def _start_wave_coros(self, exclude_index: int) -> list[Awaitable[None]]: - logger.debug("Sending start DP wave %d.", self.current_wave) - return [ - self._send_input(EngineCoreRequestType.START_DP_WAVE, - self.current_wave, engine) - for engine in self.core_engines if engine.index != exclude_index - ] + if outputs.finished_requests and self.reqs_in_flight: + for req_id in outputs.finished_requests: + self.reqs_in_flight.pop(req_id, None) async def abort_requests_async(self, request_ids: list[str]) -> None: if not request_ids: diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 3dc2f7744..665e5873d 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -12,13 +12,12 @@ from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.logger import init_logger from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics from vllm.v1.engine import FinishReason +from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import IterationStats, SchedulerStats from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm logger = init_logger(__name__) -_LOCAL_LOGGING_INTERVAL_SEC = 5.0 - StatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"] @@ -35,7 +34,7 @@ class StatLoggerBase(ABC): ... 
@abstractmethod - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): ... @@ -78,20 +77,22 @@ class LoggingStatLogger(StatLoggerBase): # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): """Log Stats to standard output.""" if iteration_stats: self._track_iteration_stats(iteration_stats) - self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats) + if scheduler_stats is not None: + self.prefix_caching_metrics.observe( + scheduler_stats.prefix_cache_stats) - if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_logging.observe( - scheduler_stats.spec_decoding_stats) + if scheduler_stats.spec_decoding_stats is not None: + self.spec_decoding_logging.observe( + scheduler_stats.spec_decoding_stats) - self.last_scheduler_stats = scheduler_stats + self.last_scheduler_stats = scheduler_stats def log(self): now = time.monotonic() @@ -131,10 +132,11 @@ class LoggingStatLogger(StatLoggerBase): self.spec_decoding_logging.log(log_fn=log_fn) def log_engine_initialized(self): - logger.info( - "vllm cache_config_info with initialization " \ - "after num_gpu_blocks is: %d", - self.vllm_config.cache_config.num_gpu_blocks) + if self.vllm_config.cache_config.num_gpu_blocks: + logger.info( + "Engine %03d: vllm cache_config_info with initialization " + "after num_gpu_blocks is: %d", self.engine_index, + self.vllm_config.cache_config.num_gpu_blocks) class PrometheusStatLogger(StatLoggerBase): @@ -144,7 +146,8 @@ class PrometheusStatLogger(StatLoggerBase): _spec_decoding_cls = SpecDecodingProm def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - self._unregister_vllm_metrics() + + unregister_vllm_metrics() self.vllm_config = 
vllm_config self.engine_index = engine_index # Use this flag to hide metrics that were deprecated in @@ -169,11 +172,13 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) self.gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) # @@ -182,6 +187,7 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 1 means 100 percent usage.", + multiprocess_mode="mostrecent", labelnames=labelnames).labels(*labelvalues) self.counter_gpu_prefix_cache_queries = self._counter_cls( @@ -242,6 +248,9 @@ class PrometheusStatLogger(StatLoggerBase): buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) + # TODO: This metric might be incorrect in case of using multiple + # api_server counts which uses prometheus mp. + # See: https://github.com/vllm-project/vllm/pull/18053 self.histogram_iteration_tokens = \ self._histogram_cls( name="vllm:iteration_tokens_total", @@ -340,6 +349,9 @@ class PrometheusStatLogger(StatLoggerBase): # # LoRA metrics # + + # TODO: This metric might be incorrect in case of using multiple + # api_server counts which uses prometheus mp. 
self.gauge_lora_info: Optional[prometheus_client.Gauge] = None if vllm_config.lora_config is not None: self.labelname_max_lora = "max_lora" @@ -350,13 +362,16 @@ class PrometheusStatLogger(StatLoggerBase): self._gauge_cls( name="vllm:lora_requests_info", documentation="Running stats on lora requests.", + multiprocess_mode="sum", labelnames=[ self.labelname_max_lora, self.labelname_waiting_lora_adapters, self.labelname_running_lora_adapters, - ]) + ], + ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): + metrics_info = config_obj.metrics_info() metrics_info["engine"] = self.engine_index @@ -372,25 +387,28 @@ class PrometheusStatLogger(StatLoggerBase): info_gauge = self._gauge_cls( name=name, documentation=documentation, - labelnames=metrics_info.keys()).labels(**metrics_info) + multiprocess_mode="mostrecent", + labelnames=metrics_info.keys(), + ).labels(**metrics_info) info_gauge.set(1) - def record(self, scheduler_stats: SchedulerStats, + def record(self, scheduler_stats: Optional[SchedulerStats], iteration_stats: Optional[IterationStats]): """Log to prometheus.""" - self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) - self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + if scheduler_stats is not None: + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) + self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage) - self.counter_gpu_prefix_cache_queries.inc( - scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits.inc( - scheduler_stats.prefix_cache_stats.hits) + self.counter_gpu_prefix_cache_queries.inc( + scheduler_stats.prefix_cache_stats.queries) + self.counter_gpu_prefix_cache_hits.inc( + scheduler_stats.prefix_cache_stats.hits) - if scheduler_stats.spec_decoding_stats is not None: - self.spec_decoding_prom.observe( - 
scheduler_stats.spec_decoding_stats) + if scheduler_stats.spec_decoding_stats is not None: + self.spec_decoding_prom.observe( + scheduler_stats.spec_decoding_stats) if iteration_stats is None: return @@ -445,13 +463,6 @@ class PrometheusStatLogger(StatLoggerBase): self.gauge_lora_info.labels(**lora_info_labels)\ .set_to_current_time() - @staticmethod - def _unregister_vllm_metrics(): - # Unregister any existing vLLM collectors (for CI/CD - for collector in list(prometheus_client.REGISTRY._collector_to_names): - if hasattr(collector, "_name") and "vllm" in collector._name: - prometheus_client.REGISTRY.unregister(collector) - def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py new file mode 100644 index 000000000..f12568535 --- /dev/null +++ b/vllm/v1/metrics/prometheus.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import tempfile +from typing import Optional + +from prometheus_client import REGISTRY, CollectorRegistry, multiprocess + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +# Global temporary directory for prometheus multiprocessing +_prometheus_multiproc_dir: Optional[tempfile.TemporaryDirectory] = None + + +def setup_multiprocess_prometheus(): + """Set up prometheus multiprocessing directory if not already configured. + + """ + global _prometheus_multiproc_dir + + if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: + # Make TemporaryDirectory for prometheus multiprocessing + # Note: global TemporaryDirectory will be automatically + # cleaned up upon exit. + _prometheus_multiproc_dir = tempfile.TemporaryDirectory() + os.environ["PROMETHEUS_MULTIPROC_DIR"] = _prometheus_multiproc_dir.name + logger.debug("Created PROMETHEUS_MULTIPROC_DIR at %s", + _prometheus_multiproc_dir.name) + else: + logger.warning("Found PROMETHEUS_MULTIPROC_DIR was set by user. 
" + "This directory must be wiped between vLLM runs or " + "you will find inaccurate metrics. Unset the variable " + "and vLLM will properly handle cleanup.") + + +def get_prometheus_registry(): + """Get the appropriate prometheus registry based on multiprocessing + configuration. + + Returns: + Registry: A prometheus registry + """ + if os.getenv("PROMETHEUS_MULTIPROC_DIR") is not None: + logger.debug("Using multiprocess registry for prometheus metrics") + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + return registry + + return REGISTRY + + +def unregister_vllm_metrics(): + """Unregister any existing vLLM collectors from the prometheus registry. + + This is useful for testing and CI/CD where metrics may be registered + multiple times across test runs. + + Also, in case of multiprocess, we need to unregister the metrics from the + global registry. + """ + registry = REGISTRY + # Unregister any existing vLLM collectors + for collector in list(registry._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + registry.unregister(collector) + + +def shutdown_prometheus(): + """Shutdown prometheus metrics.""" + try: + pid = os.getpid() + multiprocess.mark_process_dead(pid) + logger.debug("Marked Prometheus metrics for process %d as dead", pid) + except Exception as e: + logger.error("Error during metrics cleanup: %s", str(e)) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index b4c845075..42c75ef96 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -26,12 +26,13 @@ class Request: multi_modal_placeholders: Optional[list[PlaceholderRange]], sampling_params: SamplingParams, eos_token_id: Optional[int], - arrival_time: float, + client_index: int = 0, lora_request: Optional["LoRARequest"] = None, structured_output_request: Optional["StructuredOutputRequest"] = None, cache_salt: Optional[str] = None, ) -> None: self.request_id = request_id + self.client_index = client_index self.sampling_params = 
sampling_params # Because of LoRA, the eos token id can be different for each request. self.eos_token_id = eos_token_id @@ -90,13 +91,13 @@ class Request: return cls( request_id=request.request_id, + client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, multi_modal_inputs=request.mm_inputs, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, - arrival_time=request.arrival_time, lora_request=request.lora_request, structured_output_request=StructuredOutputRequest( sampling_params=request.sampling_params), diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 0758747a8..a26794561 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,31 +1,41 @@ # SPDX-License-Identifier: Apache-2.0 -import os +import argparse +import multiprocessing import time import weakref from collections import defaultdict from collections.abc import Sequence +from dataclasses import dataclass +from enum import Enum, auto from multiprocessing import Process, connection -from typing import (TYPE_CHECKING, Callable, Generic, Optional, TypeVar, Union, - overload) +from multiprocessing.process import BaseProcess +from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) +import msgspec import torch +import zmq -from vllm.config import VllmConfig +from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.utils import extract_layer_index from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import get_mp_context, kill_process_tree +from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path, + get_tcp_uri, kill_process_tree) from vllm.v1.executor.abstract import Executor if TYPE_CHECKING: from vllm.attention.layer import Attention + from vllm.v1.engine.coordinator import DPCoordinator 
logger = init_logger(__name__) T = TypeVar("T") +STARTUP_POLL_PERIOD_MS = 10000 + class ConstantList(Generic[T], Sequence): @@ -95,6 +105,78 @@ class ConstantList(Generic[T], Sequence): return f"ConstantList({self._x})" +def get_engine_client_zmq_addr(local_only: bool, + host: str, + port: int = 0) -> str: + return get_open_zmq_ipc_path() if local_only else (get_tcp_uri( + host, port or get_open_port())) + + +class APIServerProcessManager: + """Manages a group of API server processes. + + Handles creation, monitoring, and termination of API server worker + processes. Also monitors extra processes to check if they are healthy. + """ + + def __init__( + self, + target_server_fn: Callable, + listen_address: str, + sock: Any, + args: argparse.Namespace, + num_servers: int, + input_addresses: list[str], + output_addresses: list[str], + stats_update_address: Optional[str] = None, + ): + """Initialize and start API server worker processes. + + Args: + target_server_fn: Function to call for each API server process + listen_address: Address to listen for client connections + sock: Socket for client connections + args: Command line arguments + num_servers: Number of API server processes to start + input_addresses: Input addresses for each API server + output_addresses: Output addresses for each API server + stats_update_address: Optional stats update address + """ + self.listen_address = listen_address + self.sock = sock + self.args = args + + # Start API servers + spawn_context = multiprocessing.get_context("spawn") + self.processes: list[BaseProcess] = [] + + for i, in_addr, out_addr in zip(range(num_servers), input_addresses, + output_addresses): + client_config = { + "input_address": in_addr, + "output_address": out_addr, + "client_index": i + } + if stats_update_address is not None: + client_config["stats_update_address"] = stats_update_address + + proc = spawn_context.Process(target=target_server_fn, + name=f"ApiServer_{i}", + args=(listen_address, sock, args, + 
client_config)) + self.processes.append(proc) + proc.start() + + logger.info("Started %d API server processes", len(self.processes)) + + # Shutdown only the API server processes on garbage collection + # The extra processes are managed by their owners + self._finalizer = weakref.finalize(self, shutdown, self.processes) + + def close(self) -> None: + self._finalizer() + + class CoreEngineProcManager: """ Utility class to handle creation, readiness, and shutdown @@ -109,7 +191,7 @@ class CoreEngineProcManager: local_start_index: int, vllm_config: VllmConfig, on_head_node: bool, - input_address: str, + handshake_address: str, executor_class: type[Executor], log_stats: bool, ): @@ -117,12 +199,12 @@ class CoreEngineProcManager: common_kwargs = { "vllm_config": vllm_config, "on_head_node": on_head_node, - "input_address": input_address, + "handshake_address": handshake_address, "executor_class": executor_class, "log_stats": log_stats, } - self.processes: list[Process] = [] + self.processes: list[BaseProcess] = [] for index in range(local_engine_count): local_index = local_start_index + index global_index = start_index + index @@ -135,8 +217,7 @@ class CoreEngineProcManager: "local_dp_rank": local_index, })) - self._finalizer = weakref.finalize(self, shutdown, self.processes, - input_address) + self._finalizer = weakref.finalize(self, shutdown, self.processes) try: for proc in self.processes: proc.start() @@ -164,9 +245,199 @@ class CoreEngineProcManager: } +class CoreEngineState(Enum): + NEW = auto() + CONNECTED = auto() + READY = auto() + + +class CoreEngine: + """One per data parallel rank.""" + + def __init__(self, index: int = 0, local: bool = True): + self.local = local + self.index = index + self.identity = index.to_bytes(2, "little") + + self.state = CoreEngineState.NEW + + +@dataclass +class EngineZmqAddresses: + # ZMQ input socket addresses for each front-end client (requests) + inputs: list[str] + # ZMQ output socket addresses for each front-end client 
(responses) + outputs: list[str] + # ZMQ input socket address of DP coordinator if applicable + coordinator_input: Optional[str] = None + # ZMQ output socket address of DP coordinator if applicable + coordinator_output: Optional[str] = None + + +@dataclass +class EngineHandshakeMetadata: + """Metadata sent to each engine process during startup handshake, + including addresses of the front-end ZMQ queues that they should + connect to. + """ + addresses: EngineZmqAddresses + parallel_config: dict[str, Union[int, str]] + + +def wait_for_engine_startup( + handshake_socket: zmq.Socket, + addresses: EngineZmqAddresses, + core_engines: list[CoreEngine], + parallel_config: ParallelConfig, + cache_config: CacheConfig, + proc_manager: Optional[CoreEngineProcManager], + coord_process: Optional[Process], +): + + # Wait for engine core process(es) to send ready messages. + local_count = parallel_config.data_parallel_size_local + remote_count = len(core_engines) - local_count + # [local, remote] counts + conn_pending, start_pending = [local_count, remote_count], [0, 0] + poller = zmq.Poller() + poller.register(handshake_socket, zmq.POLLIN) + + if proc_manager is not None: + for sentinel in proc_manager.sentinels(): + poller.register(sentinel, zmq.POLLIN) + if coord_process is not None: + poller.register(coord_process.sentinel, zmq.POLLIN) + while any(conn_pending) or any(start_pending): + events = poller.poll(STARTUP_POLL_PERIOD_MS) + if not events: + if any(conn_pending): + logger.debug( + "Waiting for %d local, %d remote core engine proc(s) " + "to connect.", *conn_pending) + if any(start_pending): + logger.debug( + "Waiting for %d local, %d remote core engine proc(s) " + "to start.", *start_pending) + continue + if len(events) > 1 or events[0][0] != handshake_socket: + # One of the local core processes exited. 
+ finished = proc_manager.finished_procs() if proc_manager else {} + if coord_process is not None and coord_process.exitcode is not None: + finished[coord_process.name] = coord_process.exitcode + raise RuntimeError("Engine core initialization failed. " + "See root cause above. " + f"Failed core proc(s): {finished}") + + # Receive HELLO and READY messages from the input socket. + eng_identity, ready_msg_bytes = handshake_socket.recv_multipart() + eng_index = int.from_bytes(eng_identity, "little") + engine = next((e for e in core_engines if e.identity == eng_identity), + None) + if engine is None: + raise RuntimeError(f"Message from engine with unexpected data " + f"parallel rank: {eng_index}") + msg = msgspec.msgpack.decode(ready_msg_bytes) + status, local = msg["status"], msg["local"] + if local != engine.local: + raise RuntimeError(f"{status} message from " + f"{'local' if local else 'remote'} " + f"engine {eng_index}, expected it to be " + f"{'local' if engine.local else 'remote'}") + + if status == "HELLO" and engine.state == CoreEngineState.NEW: + + # Send init message with DP config info. + init_message = msgspec.msgpack.encode( + EngineHandshakeMetadata( + addresses=addresses, + parallel_config={ + "data_parallel_master_ip": + parallel_config.data_parallel_master_ip, + "data_parallel_master_port": + parallel_config.data_parallel_master_port, + "data_parallel_size": + parallel_config.data_parallel_size, + })) + handshake_socket.send_multipart((eng_identity, init_message), + copy=False) + conn_pending[0 if local else 1] -= 1 + start_pending[0 if local else 1] += 1 + engine.state = CoreEngineState.CONNECTED + elif status == "READY" and (engine.state == CoreEngineState.CONNECTED): + # Setup KV cache config with initialization state from + # engine core process. Sum values from all engines in DP case. 
+ num_gpu_blocks = cache_config.num_gpu_blocks or 0 + num_gpu_blocks += msg["num_gpu_blocks"] + cache_config.num_gpu_blocks = num_gpu_blocks + + start_pending[0 if local else 1] -= 1 + engine.state = CoreEngineState.READY + else: + raise RuntimeError(f"Unexpected {status} message for " + f"{'local' if local else 'remote'} engine " + f"{eng_index} in {engine.state} state.") + + logger.debug("%s from %s core engine process %s.", status, + "local" if local else "remote", eng_index) + + +def wait_for_completion_or_failure( + api_server_manager: APIServerProcessManager, + local_engine_manager: Optional[CoreEngineProcManager] = None, + coordinator: Optional["DPCoordinator"] = None) -> None: + """Wait for all processes to complete or detect if any fail. + + Raises an exception if any process exits with a non-zero status. + """ + + try: + logger.info("Waiting for API servers to complete ...") + # Create a mapping of sentinels to their corresponding processes + # for efficient lookup + sentinel_to_proc: dict[Any, BaseProcess] = { + proc.sentinel: proc + for proc in api_server_manager.processes + } + + if coordinator: + sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc + + if local_engine_manager: + for proc in local_engine_manager.processes: + sentinel_to_proc[proc.sentinel] = proc + + # Check if any process terminates + while sentinel_to_proc: + # Wait for any process to terminate + ready_sentinels: list[Any] = connection.wait(sentinel_to_proc) + + # Process any terminated processes + for sentinel in ready_sentinels: + proc = sentinel_to_proc.pop(sentinel) + + # Check if process exited with error + if proc.exitcode != 0: + raise RuntimeError( + f"Process {proc.name} (PID: {proc.pid}) " + f"died with exit code {proc.exitcode}") + except KeyboardInterrupt: + logger.info("Received KeyboardInterrupt, shutting down API servers...") + except Exception as e: + logger.exception("Exception occurred while running API servers: %s", + str(e)) + raise + finally: + 
logger.info("Terminating remaining processes ...") + api_server_manager.close() + if coordinator: + coordinator.close() + if local_engine_manager: + local_engine_manager.close() + + # Note(rob): shutdown function cannot be a bound method, -# else the gc cannot collect the objedecoupct. -def shutdown(procs: list[Process], input_address: str): +# else the gc cannot collect the object. +def shutdown(procs: list[BaseProcess]): # Shutdown the process. for proc in procs: if proc.is_alive(): @@ -185,12 +456,6 @@ def shutdown(procs: list[Process], input_address: str): if proc.is_alive() and (pid := proc.pid) is not None: kill_process_tree(pid) - # Remove zmq ipc socket files. - if input_address.startswith("ipc://"): - socket_file = input_address[len("ipc://"):] - if os and os.path.exists(socket_file): - os.remove(socket_file) - def bind_kv_cache( kv_caches: dict[str, torch.Tensor], -- GitLab From f49239cb454b0ab1169fadd6a700812f33a0eafa Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 30 May 2025 12:56:11 -0400 Subject: [PATCH 081/274] Benchmark script for fp8 vs bf16 gemm (#17126) Signed-off-by: mgoin --- benchmarks/kernels/bench_fp8_gemm.py | 222 +++++++++++++++++++++++++++ benchmarks/kernels/weight_shapes.py | 46 ++++++ 2 files changed, 268 insertions(+) create mode 100644 benchmarks/kernels/bench_fp8_gemm.py diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py new file mode 100644 index 000000000..36d03e40e --- /dev/null +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +import argparse +import copy +import itertools + +import torch +import triton +from weight_shapes import WEIGHT_SHAPES + +from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm +from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant + + +@triton.testing.perf_report( + triton.testing.Benchmark( + x_names=["batch_size"], + x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], 
+ x_log=False, + line_arg="provider", + line_vals=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + line_names=[ + "torch-bf16", + # "fp8-tensor-w-token-a", + "fp8-tensor-w-tensor-a", + "fp8-channel-w-token-a", + # "fp8-channel-w-tensor-a", + # "fp8-tensor-w-token-a-noquant", + "fp8-tensor-w-tensor-a-noquant", + "fp8-channel-w-token-a-noquant", + # "fp8-channel-w-tensor-a-noquant", + ], + ylabel="TFLOP/s (larger is better)", + plot_name="BF16 vs FP8 GEMMs", + args={}, + ) +) +def benchmark(batch_size, provider, N, K): + M = batch_size + device = "cuda" + dtype = torch.bfloat16 + + # Create input tensors + a = torch.randn((M, K), device=device, dtype=dtype) + b = torch.randn((N, K), device=device, dtype=dtype) + + quantiles = [0.5, 0.2, 0.8] + + if "torch-bf16" in provider: + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: torch.nn.functional.linear(a, b), quantiles=quantiles + ) + + elif "fp8" in provider: + # Weights are always quantized ahead of time + if "noquant" in provider: + # For no quantization, we just measure the GEMM + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + a_fp8, 
scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + + def run_quant(): + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + else: + # In these cases, we quantize the activations during the GEMM call + if "tensor-w-token-a" in provider: + # Dynamic per-token quant for A, per-tensor quant for B + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "tensor-w-tensor-a" in provider: + # Static per-tensor quantization with fixed scales + # for both A and B + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor([1.0], device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + assert scale_b_fp8.numel() == 1 + + def run_quant(): + a_fp8, 
scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-token-a" in provider: + # Static per-channel quantization for weights, per-token + # quant for A + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant( + a, use_per_token_if_dynamic=True + ) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + elif "channel-w-tensor-a" in provider: + # Static per-channel quantization for weights, per-tensor + # quant for A + scale_a = torch.tensor([1.0], device=device, dtype=torch.float32) + scale_b = torch.tensor((N,), device=device, dtype=torch.float32) + b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b) + scale_b_fp8 = scale_b_fp8.expand(N).contiguous() + assert scale_b_fp8.numel() == N + + def run_quant(): + a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a) + return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype) + + b_fp8 = b_fp8.t() + + ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( + lambda: run_quant(), quantiles=quantiles + ) + + # Calculate TFLOP/s, two flops per multiply-add + tflops = lambda ms: (2 * M * N * K) * 1e-12 / (ms * 1e-3) + return tflops(ms), tflops(max_ms), tflops(min_ms) + + +def prepare_shapes(args): + KN_model_names = [] + models_tps = list(itertools.product(args.models, args.tp_sizes)) + for model, tp_size in models_tps: + assert model in WEIGHT_SHAPES + for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model]): + KN[tp_split_dim] = KN[tp_split_dim] // tp_size + KN.append(model) + KN_model_names.append(KN) + return KN_model_names + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--models", + nargs="+", + type=str, + 
default=["meta-llama/Llama-3.1-8B-Instruct"], + choices=[*WEIGHT_SHAPES.keys()], + help="List of models to benchmark", + ) + parser.add_argument( + "--tp-sizes", + nargs="+", + type=int, + default=[1], + help="List of tensor parallel sizes", + ) + args = parser.parse_args() + + KN_model_names = prepare_shapes(args) + for K, N, model_name in KN_model_names: + print(f"{model_name}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:") + benchmark.run( + print_data=True, + show_plots=True, + save_path=f"bench_fp8_res_n{N}_k{K}", + N=N, + K=K, + ) + + print("Benchmark finished!") diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 89b05d588..afe159ddd 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -48,4 +48,50 @@ WEIGHT_SHAPES = { ([16384, 106496], 1), ([53248, 16384], 0), ], + "meta-llama/Llama-3.1-8B-Instruct": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], + "meta-llama/Llama-3.3-70B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 57344], 1), + ([28672, 8192], 0), + ], + "mistralai/Mistral-Large-Instruct-2407": [ + ([12288, 14336], 1), + ([12288, 12288], 0), + ([12288, 57344], 1), + ([28672, 12288], 0), + ], + "Qwen/Qwen2.5-7B-Instruct": [ + ([3584, 4608], 1), + ([3584, 3584], 0), + ([3584, 37888], 1), + ([18944, 3584], 0), + ], + "Qwen/Qwen2.5-32B-Instruct": [ + ([5120, 7168], 1), + ([5120, 5120], 0), + ([5120, 55296], 1), + ([27648, 5120], 0), + ], + "Qwen/Qwen2.5-72B-Instruct": [ + ([8192, 10240], 1), + ([8192, 8192], 0), + ([8192, 59136], 1), + ([29568, 8192], 0), + ], + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [ + ([2048, 3072], 1), + ([2048, 4096], 1), + ([2048, 2048], 0), + ([2048, 576], 0), + ([2048, 21888], 1), + ([10944, 2048], 0), + ([2048, 2816], 1), + ([1408, 2048], 0), + ], } -- GitLab From 5a8641638a5ec6cf089490d6b7f474b135ff590e Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 31 May 2025 01:11:44 +0800 
Subject: [PATCH 082/274] [VLM] Add PP support and fix GPTQ inference for Ovis models (#18958) Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/models/supported_models.md | 2 +- tests/distributed/test_pipeline_parallel.py | 1 + vllm/model_executor/models/aimv2.py | 194 ++++++++++++-------- vllm/model_executor/models/clip.py | 5 - vllm/model_executor/models/ovis.py | 34 ++-- 5 files changed, 145 insertions(+), 91 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 6b0ceaf21..b60fefdda 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -538,7 +538,7 @@ Specified using `--task generate`. | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. | | | | | `MolmoForCausalLM` | Molmo | T + I+ | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ | | `NVLM_D_Model` | NVLM-D 1.0 | T + I+ | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ | -| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | | ✅︎ | +| `Ovis` | Ovis2, Ovis1.6 | T + I+ | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ | | `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + IE | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ | | `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + IE+ | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ | | `Phi4MMForCausalLM` | Phi-4-multimodal | T + I+ / T + A+ / I+ + A+ | `microsoft/Phi-4-multimodal-instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5346d67b1..e6410ab06 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -227,6 +227,7 @@ MULTIMODAL_MODELS = { "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(), "allenai/Molmo-7B-D-0924": PPTestSettings.fast(), + "AIDC-AI/Ovis2-1B": PPTestSettings.fast(), "microsoft/Phi-3.5-vision-instruct": PPTestSettings.fast(), "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"), "Qwen/Qwen-VL-Chat": PPTestSettings.fast(), diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index aefd6c973..2e2a18abd 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -2,16 +2,23 @@ # A modified implementation of the AIMv2 Transformer # inserted here also the image tokenizer used by Ovis2 +from collections.abc import Iterable from typing import Optional import torch import torch.nn as nn -from torch.nn import functional as F +from vllm.attention.layer import MultiHeadAttention +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed.utils import divide +from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.transformers_utils.configs.ovis import AIMv2Config @@ -24,29 +31,27 @@ class AIMv2SwiGLUFFN(nn.Module): in_features = config.hidden_size bias = config.use_bias - # TODO(Isotr0py): investigate if we 
can add TP to visual tokenizer - self.fc1 = ReplicatedLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc1") - self.fc2 = ReplicatedLinear(hidden_features, - in_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc2") - self.fc3 = ReplicatedLinear(in_features, - hidden_features, - bias=bias, - quant_config=quant_config, - prefix=f"{prefix}.fc3") + self.fc13 = MergedColumnParallelLinear( + in_features, + [hidden_features] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc13", + ) + self.fc2 = RowParallelLinear( + input_size=hidden_features, + output_size=in_features, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + self.act_fn = SiluAndMul() def forward(self, x: torch.Tensor) -> torch.Tensor: - x_parallel, _ = self.fc1(x) - gate, _ = self.fc3(x) - x_parallel = F.silu(x_parallel) * gate - out, _ = self.fc2(x_parallel) - return out + x, _ = self.fc13(x) + x = self.act_fn(x) + x, _ = self.fc2(x) + return x class AIMv2PatchEmbed(nn.Module): @@ -90,39 +95,45 @@ class AIMv2Attention(nn.Module): def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, prefix: str): super().__init__() - dim = config.hidden_size - - # TODO(Isotr0py): investigate if we can add TP to visual tokenizer + self.config = config + self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads - self.qkv = ReplicatedLinear(dim, dim * 3, bias=config.qkv_bias) - # self.qkv = QKVParallelLinear( - # hidden_size=dim, - # head_size=dim // config.num_attention_heads, - # total_num_heads=config.num_attention_heads, - # bias=config.qkv_bias, - # quant_config=quant_config, - # prefix=f"{prefix}.qkv") - self.proj = ReplicatedLinear(dim, dim, bias=config.use_bias) - # self.proj = RowParallelLinear(input_size=dim, - # output_size=dim, - # bias = config.use_bias, - # quant_config=quant_config, - # prefix=f"{prefix}.proj") - - def forward( # todo might implement 
multiple attn implementations - self, - x: torch.Tensor, - mask: Optional[torch.Tensor] = None) -> torch.Tensor: - B, N, C = x.shape - qkv, _ = self.qkv(x) + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + + self.qkv = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv", + ) + + self.proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + bias=config.use_bias, + quant_config=quant_config, + prefix=f"{prefix}.proj", + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - qkv = qkv.reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) - q, k, v = qkv.unbind(0) + def forward(self, x: torch.Tensor) -> torch.Tensor: + qkv, _ = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) - x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask) - x = x.transpose(1, 2).contiguous().reshape(B, N, C) + x = self.attn(q, k, v) x, _ = self.proj(x) return x @@ -141,37 +152,40 @@ class AIMv2Block(nn.Module): prefix=f"{prefix}.mlp") self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - def forward(self, - x: torch.Tensor, - mask: Optional[torch.Tensor] = None) -> torch.Tensor: - x = x + self.attn(self.norm_1.forward_native(x), mask) + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.norm_1.forward_native(x)) x = x + self.mlp(self.norm_2.forward_native(x)) return x class AIMv2Transformer(nn.Module): - def __init__(self, config: AIMv2Config, quant_config: 
QuantizationConfig, - prefix: str): + def __init__( + self, + config: AIMv2Config, + quant_config: QuantizationConfig, + *, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ): super().__init__() self.blocks = nn.ModuleList([ AIMv2Block(config, quant_config, prefix=f"{prefix}.blocks.{i}") for i in range(config.num_hidden_layers) ]) - self.post_trunk_norm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) + if require_post_norm: + self.post_trunk_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + else: + self.post_trunk_norm = None - def forward( - self, - tokens: torch.Tensor, - mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def forward(self, tokens: torch.Tensor) -> torch.Tensor: # they take the -1 as the ref embeddings, like a clip skip for block in self.blocks: - tokens = block(tokens, mask) - # NO NORM IN THE OG IMPLEMENTATION - # tokens = self.post_trunk_norm(tokens) + tokens = block(tokens) + if self.post_trunk_norm is not None: + tokens = self.post_trunk_norm(tokens) return tokens @@ -180,20 +194,52 @@ class AIMv2Model(torch.nn.Module): def __init__(self, config: AIMv2Config, quant_config: QuantizationConfig, + *, + require_post_norm: Optional[bool] = None, prefix: str = ""): super().__init__() self.preprocessor = AIMv2ViTPreprocessor(config) self.trunk = AIMv2Transformer(config, quant_config=quant_config, + require_post_norm=require_post_norm, prefix=f"{prefix}.trunk") - def forward( - self, - pixel_values: torch.Tensor, - mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: x = self.preprocessor(pixel_values) - x = self.trunk(x, mask) + x = self.trunk(x) return x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".fc13", ".fc1", 0), + (".fc13", ".fc3", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = 
set() + + for name, loaded_weight in weights: + # post_layernorm is optional in SiglipVisionModel + if (name.startswith("trunk.post_trunk_norm") + and self.trunk.post_trunk_norm is None): + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index e8f3ae215..9fd528fd7 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -106,7 +106,6 @@ class CLIPAttention(nn.Module): f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") self.scale = self.head_dim**-0.5 - self.dropout = config.attention_dropout self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, @@ -129,10 +128,6 @@ class CLIPAttention(nn.Module): self.attn = MultiHeadAttention(self.num_heads_per_partition, self.head_dim, self.scale) - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, - self.head_dim).transpose(1, 2).contiguous() - def forward( self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index e03705d48..232a63c50 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -30,6 +30,9 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from 
vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) from vllm.model_executor.models.aimv2 import AIMv2Model from vllm.model_executor.models.siglip import SiglipVisionModel from vllm.model_executor.models.utils import (AutoWeightsLoader, flatten_bn, @@ -48,7 +51,7 @@ from vllm.transformers_utils.configs.ovis import (BaseVisualTokenizerConfig, OvisConfig) from vllm.transformers_utils.processors.ovis import OvisProcessor -from .interfaces import MultiModalEmbeddings, SupportsMultiModal +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .utils import merge_multimodal_embeddings # Cannot find the following number from hf config. @@ -106,12 +109,14 @@ class VisualTokenizer(torch.nn.Module): config: BaseVisualTokenizerConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - ): + ) -> nn.Module: model_type = config.backbone_config.model_type if model_type == "aimv2": + # No post rms_norm in Ovis2's AIMv2 ViT. return AIMv2Model( config=config.backbone_config, quant_config=quant_config, + require_post_norm=False, prefix=prefix, ) elif model_type == "siglip_vision_model": @@ -124,14 +129,14 @@ class VisualTokenizer(torch.nn.Module): f"Unsupported visual tokenizer model_type: {model_type}") @property - def dtype(self): + def dtype(self) -> torch.dtype: return next(self.head.parameters()).dtype @property - def device(self): + def device(self) -> torch.device: return next(self.head.parameters()).device - def tokenize(self, logits): + def tokenize(self, logits: torch.Tensor) -> torch.Tensor: if self.config.tokenize_function == 'softmax': tokens = softmax(logits, dim=-1) elif self.config.tokenize_function == 'gumbel_argmax': @@ -144,7 +149,7 @@ class VisualTokenizer(torch.nn.Module): f'or st_argmax, but got {self.config.tokenize_function}') return tokens - def encode(self, pixel_values): + def encode(self, pixel_values: torch.Tensor) -> torch.Tensor: features = self.backbone(pixel_values) if 
self.config.drop_cls_token: features = features[:, 1:, :] @@ -395,7 +400,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]): @MULTIMODAL_REGISTRY.register_processor(OvisMultiModalProcessor, info=OvisProcessingInfo, dummy_inputs=OvisDummyInputsBuilder) -class Ovis(nn.Module, SupportsMultiModal): +class Ovis(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -410,7 +415,7 @@ class Ovis(nn.Module, SupportsMultiModal): self.visual_tokenizer = VisualTokenizer( config=config.visual_tokenizer_config, - quant_config=quant_config, + quant_config=self._maybe_ignore_quant_config(quant_config), prefix=f"{prefix}.visual_tokenizer", ) @@ -421,9 +426,16 @@ class Ovis(nn.Module, SupportsMultiModal): text_model_type = self.config.get_text_config().model_type self.image_pad_token_id = IMAGE_PAD_TOKEN_ID_MAP[text_model_type] - # TODO(Isotr0py): PP support - # self.make_empty_intermediate_tensors = ( - # self.language_model.make_empty_intermediate_tensors) + self.make_empty_intermediate_tensors = ( + self.get_language_model().make_empty_intermediate_tensors) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. 
+ # See: https://huggingface.co/AIDC-AI/Ovis2-2B-GPTQ-Int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[OvisImagePatchInputs]: -- GitLab From 7f21e8052b5f3948c8a59514a8dc1e9c5eef70d6 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Sat, 31 May 2025 01:34:22 +0800 Subject: [PATCH 083/274] [Misc] add group_size is -1 in awq quantization (#18910) Signed-off-by: rongfu.leng --- vllm/model_executor/layers/quantization/awq.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 4660c28c8..87afdb623 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -101,7 +101,13 @@ class AWQLinearMethod(LinearMethodBase): output_partition_sizes: list[int], input_size: int, output_size: int, params_dtype: torch.dtype, **extra_weight_attrs): - if input_size_per_partition % self.quant_config.group_size != 0: + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + if input_size_per_partition % group_size != 0: raise ValueError( "The input size is not aligned with the quantized " "weight shape. 
This can be caused by too large " @@ -127,9 +133,11 @@ class AWQLinearMethod(LinearMethodBase): packed_factor=self.quant_config.pack_factor, weight_loader=weight_loader) + num_groups = input_size_per_partition // group_size + qzeros = PackedvLLMParameter( data=torch.empty( - input_size_per_partition // self.quant_config.group_size, + num_groups, output_size_per_partition // self.quant_config.pack_factor, dtype=torch.int32, ), @@ -140,7 +148,7 @@ class AWQLinearMethod(LinearMethodBase): weight_loader=weight_loader) scales = GroupQuantScaleParameter(data=torch.empty( - input_size_per_partition // self.quant_config.group_size, + num_groups, output_size_per_partition, dtype=params_dtype, ), -- GitLab From 1dab4d5718243687acd9edc0ccaa0c94ddc43934 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Fri, 30 May 2025 17:02:54 -0400 Subject: [PATCH 084/274] Tool parser regex timeout handling (#18960) Signed-off-by: Will Eaton --- .../test_llama4_pythonic_tool_parser.py | 26 ++++++++++++++++++- .../tool_parsers/test_pythonic_tool_parser.py | 26 ++++++++++++++++++- .../llama4_pythonic_tool_parser.py | 15 ++++++++++- .../tool_parsers/pythonic_tool_parser.py | 15 +++++++++-- vllm/envs.py | 5 ++++ 5 files changed, 82 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index 92ba1376e..f5f327ea0 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -191,3 +191,27 @@ def test_streaming_tool_call_with_large_steps(): assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL assert 
reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL + + +@pytest.mark.parametrize("streaming", [False]) +def test_regex_timeout_handling(streaming: bool): + """test regex timeout is handled gracefully""" + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + fake_problematic_input = "hello world[A(A=" + "\t)A(A=,\t" * 2 + + # create a mock regex that raises TimeoutError + mock_regex = MagicMock() + mock_regex.match.side_effect = TimeoutError("Regex timeout") + + with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): + content, tool_calls = run_tool_extraction(tool_parser, + fake_problematic_input, + streaming=streaming) + + # should treat as regular text when regex times out + assert content == fake_problematic_input + assert len(tool_calls) == 0 + mock_regex.match.assert_called_once() diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index fbbbc1fb2..71f41ea7d 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -159,3 +159,27 @@ def test_streaming_tool_call_with_large_steps(): assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL + + +@pytest.mark.parametrize("streaming", [False]) +def test_regex_timeout_handling(streaming: bool): + """test regex timeout is handled gracefully""" + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser( + "llama4_pythonic")(mock_tokenizer) + + fake_problematic_input = "hello world[A(A=" + 
"\t)A(A=,\t" * 2 + + # create a mock regex that raises TimeoutError + mock_regex = MagicMock() + mock_regex.match.side_effect = TimeoutError("Regex timeout") + + with patch.object(tool_parser, 'TOOL_CALL_REGEX', mock_regex): + content, tool_calls = run_tool_extraction(tool_parser, + fake_problematic_input, + streaming=streaming) + + # should treat as regular text when regex times out + assert content == fake_problematic_input + assert len(tool_calls) == 0 + mock_regex.match.assert_called_once() diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index 858c8db99..323fb1441 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -7,6 +7,7 @@ from typing import Any, Union import regex as re from transformers import PreTrainedTokenizerBase +import vllm.envs as envs from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -64,7 +65,19 @@ class Llama4PythonicToolParser(ToolParser): if model_output.startswith("<|python_start|>"): model_output = model_output[len("<|python_start|>"):] model_output = model_output.replace("<|python_end|>", "") - if not (self.TOOL_CALL_REGEX.match(model_output)): + + is_tool_call_pattern = False + try: + is_tool_call_pattern = self.TOOL_CALL_REGEX.match( + model_output, + timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None + except TimeoutError: + logger.warning( + "Regex timeout occurred when matching tool call pattern.") + logger.debug("Regex timeout occurred when matching user input: %s", + model_output) + + if not is_tool_call_pattern: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 
548ff39d1..bc5d15dcb 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -8,6 +8,7 @@ from typing import Any, Union import regex as re from transformers import PreTrainedTokenizerBase +import vllm.envs as envs from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -61,8 +62,18 @@ class PythonicToolParser(ToolParser): """ Extract the tool calls from a complete model response. """ - - if not (self.TOOL_CALL_REGEX.match(model_output)): + is_tool_call_pattern = False + try: + is_tool_call_pattern = self.TOOL_CALL_REGEX.match( + model_output, + timeout=envs.VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS) is not None + except TimeoutError: + logger.warning( + "Regex timeout occurred when matching tool call pattern.") + logger.debug("Regex timeout occurred when matching user input: %s", + model_output) + + if not is_tool_call_pattern: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) diff --git a/vllm/envs.py b/vllm/envs.py index dc52bbd8e..44baf5a18 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -119,6 +119,7 @@ if TYPE_CHECKING: VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557 VLLM_ALL2ALL_BACKEND: str = "naive" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840 + VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1 def get_default_cache_root(): @@ -828,6 +829,10 @@ environment_variables: dict[str, Callable[[], Any]] = { # This is used to prevent the kernel from running out of memory. "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE": lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")), + + # Regex timeout for use by the vLLM tool parsing plugins. 
+ "VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS": + lambda: int(os.getenv("VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS", "1")), } # --8<-- [end:env-vars-definition] -- GitLab From 0f71e24034263363e48cdc6ae036e7ca057a4f44 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 31 May 2025 02:30:15 +0100 Subject: [PATCH 085/274] [Docs] Correct multiprocessing design doc (#18964) Signed-off-by: Lukas Geiger --- docs/design/multiprocessing.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/design/multiprocessing.md b/docs/design/multiprocessing.md index 412c42fd5..4d58fae20 100644 --- a/docs/design/multiprocessing.md +++ b/docs/design/multiprocessing.md @@ -22,13 +22,13 @@ This document describes how vLLM deals with these challenges. [Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: -- `spawn` - spawn a new Python process. This will be the default as of Python - 3.14. In macOS, this is already the default. +- `spawn` - spawn a new Python process. The default on Windows and macOS. -- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default - in Python versions prior to 3.14. +- `fork` - Use `os.fork()` to fork the Python interpreter. The default on + Linux for Python versions prior to 3.14. - `forkserver` - Spawn a server process that will fork a new process on request. + The default on Linux for Python version 3.14 and newer. 
### Tradeoffs -- GitLab From 7782464a1714f6081ca06f47b75e824b14316c72 Mon Sep 17 00:00:00 2001 From: Yu Guo <82124926+yuguo68@users.noreply.github.com> Date: Fri, 30 May 2025 22:50:38 -0700 Subject: [PATCH 086/274] create util function for batched arange (#18937) --- vllm/v1/worker/gpu_model_runner.py | 64 ++++++++++++++++-------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 60425a4e1..b6fa68ab0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -500,6 +500,26 @@ class GPUModelRunner(LoRAModelRunnerMixin): if batch_changed or batch_reordered: self.input_batch.refresh_sampling_metadata() + def _get_cumsum_and_arange( + self, + num_tokens: np.ndarray, + cumsum_dtype: Optional[np.dtype] = None, + ) -> tuple[np.ndarray, np.ndarray]: + """Get the cumulative sum and batched arange of the given array. + # E.g., [2, 5, 3] -> ([2, 7, 10], [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_tokens]) + """ + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_tokens, dtype=cumsum_dtype) + total_num_tokens = cu_num_tokens[-1] + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_tokens, num_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_tokens] - cumsums_offsets + + return cu_num_tokens, arange + def _prepare_inputs( self, scheduler_output: "SchedulerOutput", @@ -525,17 +545,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): req_indices = np.repeat(self.arange_np[:num_reqs], num_scheduled_tokens) - # Get batched arange. - # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # Equivalent to but faster than: - # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) - # Step 1. [2, 5, 3] -> [2, 7, 10] - cu_num_tokens = np.cumsum(num_scheduled_tokens) - # Step 2. 
[2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] - cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, - num_scheduled_tokens) - # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets + # cu_num_tokens: [2, 5, 3] -> [2, 7, 10] + # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + cu_num_tokens, arange = self._get_cumsum_and_arange( + num_scheduled_tokens) # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -841,32 +854,25 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Compute the logits indices. # [4, 1, 3, 1, 2] num_sampled_tokens = num_draft_tokens + 1 - # Step 1. [4, 5, 8, 9, 11] - cu_num_sampled_tokens = np.cumsum(num_sampled_tokens, dtype=np.int32) - total_num_sampled_tokens = cu_num_sampled_tokens[-1] - # Step 2. [0, 0, 0, 0, 4, 5, 5, 5, 8, 9, 9] - cumsums_offsets = np.repeat(cu_num_sampled_tokens - num_sampled_tokens, - num_sampled_tokens) - # Step 3. [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] - arange = self.arange_np[:total_num_sampled_tokens] - cumsums_offsets - # Step 4. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] + + # Step 1. cu_num_sampled_tokens: [4, 5, 8, 9, 11] + # arange: [0, 1, 2, 3, 0, 0, 1, 2, 0, 0, 1] + cu_num_sampled_tokens, arange = self._get_cumsum_and_arange( + num_sampled_tokens, cumsum_dtype=np.int32) + # Step 2. [0, 0, 0, 0, 103, 104, 104, 104, 206, 207, 207] logits_indices = np.repeat( cu_num_scheduled_tokens - num_sampled_tokens, num_sampled_tokens) - # Step 5. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] + # Step 3. [0, 1, 2, 3, 103, 104, 105, 106, 206, 207, 208] logits_indices += arange # Compute the bonus logits indices. bonus_logits_indices = cu_num_sampled_tokens - 1 # Compute the draft logits indices. 
- # [3, 3, 5, 5, 6] - cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32) - total_num_draft_tokens = cu_num_draft_tokens[-1] - # [0, 0, 0, 3, 3, 5] - cumsums_offsets = np.repeat(cu_num_draft_tokens - num_draft_tokens, - num_draft_tokens) - # [0, 1, 2, 0, 1, 0] - arange = self.arange_np[:total_num_draft_tokens] - cumsums_offsets + # cu_num_draft_tokens: [3, 3, 5, 5, 6] + # arange: [0, 1, 2, 0, 1, 0] + cu_num_draft_tokens, arange = self._get_cumsum_and_arange( + num_draft_tokens, cumsum_dtype=np.int32) # [0, 0, 0, 5, 5, 9] target_logits_indices = np.repeat( cu_num_sampled_tokens - num_sampled_tokens, num_draft_tokens) -- GitLab From dff80b0e42d5d2deebc0a6b0e3d2f6f06bd01c78 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Sat, 31 May 2025 00:40:01 -0700 Subject: [PATCH 087/274] [Frontend] Add rerank support to run_batch endpoint (#16278) Signed-off-by: Pooya Davoodi --- tests/entrypoints/openai/test_run_batch.py | 14 +++++++-- vllm/entrypoints/openai/protocol.py | 17 +++++++---- vllm/entrypoints/openai/run_batch.py | 33 +++++++++++++++++----- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 27802945a..99639ce51 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -4,6 +4,8 @@ import json import subprocess import tempfile +import pytest + from vllm.entrypoints.openai.protocol import BatchRequestOutput # ruff: noqa: E501 @@ -23,9 +25,13 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": " {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" -INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": 
{"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" +INPUT_RERANK_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v2/rerank", "body": {"model": "BAAI/bge-reranker-v2-m3", "query": "What is the capital of France?", "documents": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -105,11 +111,13 @@ def test_embeddings(): BatchRequestOutput.model_validate_json(line) -def test_score(): +@pytest.mark.parametrize("input_batch", + [INPUT_SCORE_BATCH, INPUT_RERANK_BATCH]) +def test_score(input_batch): with tempfile.NamedTemporaryFile( "w") as input_file, tempfile.NamedTemporaryFile( "r") as output_file: - input_file.write(INPUT_SCORE_BATCH) + input_file.write(input_batch) input_file.flush() proc = subprocess.Popen([ "vllm", diff --git a/vllm/entrypoints/openai/protocol.py 
b/vllm/entrypoints/openai/protocol.py index a7f85e9ee..2f641079e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1481,6 +1481,10 @@ class TranscriptionStreamResponse(OpenAIBaseModel): usage: Optional[UsageInfo] = Field(default=None) +BatchRequestInputBody = Union[ChatCompletionRequest, EmbeddingRequest, + ScoreRequest, RerankRequest] + + class BatchRequestInput(OpenAIBaseModel): """ The per-line object of the batch input file. @@ -1501,21 +1505,22 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. - body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] + body: BatchRequestInputBody @field_validator('body', mode='plain') @classmethod def check_type_for_url(cls, value: Any, info: ValidationInfo): # Use url to disambiguate models - url = info.data['url'] + url: str = info.data["url"] if url == "/v1/chat/completions": return ChatCompletionRequest.model_validate(value) if url == "/v1/embeddings": return TypeAdapter(EmbeddingRequest).validate_python(value) - if url == "/v1/score": + if url.endswith("/score"): return ScoreRequest.model_validate(value) - return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest, - ScoreRequest]).validate_python(value) + if url.endswith("/rerank"): + return RerankRequest.model_validate(value) + return TypeAdapter(BatchRequestInputBody).validate_python(value) class BatchResponseData(OpenAIBaseModel): @@ -1527,7 +1532,7 @@ class BatchResponseData(OpenAIBaseModel): # The body of the response. 
body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, - ScoreResponse]] = None + ScoreResponse, RerankResponse]] = None class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f38465b22..ac250b3cb 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -21,7 +21,7 @@ from vllm.entrypoints.openai.protocol import (BatchRequestInput, BatchResponseData, ChatCompletionResponse, EmbeddingResponse, ErrorResponse, - ScoreResponse) + RerankResponse, ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding @@ -274,8 +274,11 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, - (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): + if isinstance( + response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse, + RerankResponse), + ): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -397,7 +400,7 @@ async def main(args): response_futures.append( run_request(embed_handler_fn, request, tracker)) tracker.submitted() - elif request.url == "/v1/score": + elif request.url.endswith("/score"): score_handler_fn = openai_serving_scores.create_score if \ openai_serving_scores is not None else None if score_handler_fn is None: @@ -411,13 +414,29 @@ async def main(args): response_futures.append( run_request(score_handler_fn, request, tracker)) tracker.submitted() + elif request.url.endswith("/rerank"): + rerank_handler_fn = openai_serving_scores.do_rerank if \ + openai_serving_scores is not None else None + if rerank_handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support 
Rerank API", + )) + continue + + response_futures.append( + run_request(rerank_handler_fn, request, tracker)) + tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg= - "Only /v1/chat/completions, /v1/embeddings, and /v1/score " - "are supported in the batch endpoint.", + error_msg=f"URL {request.url} was used. " + "Supported endpoints: /v1/chat/completions, /v1/embeddings," + " /score, /rerank ." + "See vllm/entrypoints/openai/api_server.py for supported " + "score/rerank versions.", )) with tracker.pbar(): -- GitLab From 1e123529d7df1ff8f868b19aeced6a64e67bd618 Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Sat, 31 May 2025 01:43:44 -0700 Subject: [PATCH 088/274] [Misc] Fix estimated max model len msg (#18966) Signed-off-by: Yong Hoon Shin --- vllm/v1/core/kv_cache_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 403b5401b..a41fe4881 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -544,16 +544,17 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, available_memory) estimated_msg = "" if estimated_max_len > 0: - estimated_msg = " Based on the available memory," - f" the estimated maximum model length is {estimated_max_len}." + estimated_msg = ( + "Based on the available memory, " + f"the estimated maximum model length is {estimated_max_len}.") raise ValueError( f"To serve at least one request with the models's max seq len " f"({max_model_len}), ({needed_memory/GiB_bytes:.2f} GiB KV " f"cache is needed, which is larger than the available KV cache " - f"memory ({available_memory/GiB_bytes:.2f} GiB)." + f"memory ({available_memory/GiB_bytes:.2f} GiB). 
" f"{estimated_msg} " - f" Try increasing `gpu_memory_utilization` or decreasing " + f"Try increasing `gpu_memory_utilization` or decreasing " f"`max_model_len` when initializing the engine.") -- GitLab From ba5111f2372678fb03e96ee69decd0febd03f13e Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 31 May 2025 17:20:54 +0800 Subject: [PATCH 089/274] [Bugfix]: Fix the incompatibility issue with Structured Outputs when Thinking is disabled (#18879) Signed-off-by: chaunceyjiang --- vllm/v1/structured_output/__init__.py | 30 ++++++++++++++++----------- vllm/v1/structured_output/request.py | 2 +- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index c701ab1d3..07b422814 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -149,31 +149,37 @@ class StructuredOutputManager: # NOTE: This outer loop can likely be parallelized to improve # performance of bitmask generation for large batches. 
for req_id, _ in ordered_seq: - request = requests[req_id].structured_output_request - if TYPE_CHECKING: - assert request is not None - assert request.grammar is not None + request = requests[req_id] + structured_output_request = request.structured_output_request - apply_bitmask = ( - request.reasoning_ended if self.reasoner is not None else True - ) # noqa: E501 + if TYPE_CHECKING: + assert structured_output_request is not None + assert structured_output_request.grammar is not None + apply_bitmask: bool = True + if self.reasoner is not None: + if structured_output_request.reasoning_ended is None: + structured_output_request.reasoning_ended = \ + self.reasoner.is_reasoning_end(request.prompt_token_ids) + apply_bitmask = structured_output_request.reasoning_ended state_advancements = 0 req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None] for i, token in enumerate(req_tokens): - if apply_bitmask and not request.grammar.is_terminated(): - request.grammar.fill_bitmask(bitmask_tensor, - cumulative_index) + if apply_bitmask and not \ + structured_output_request.grammar.is_terminated(): + structured_output_request.grammar.fill_bitmask( + bitmask_tensor, cumulative_index) if token is not None: # In order to generate the correct bitmask for each # position in the speculative sequence, we advance # the FSM state for each speculative token and rollback # to restore the previous state when we are finished. 
- assert request.grammar.accept_tokens(req_id, [token]) + assert structured_output_request.grammar.accept_tokens( + req_id, [token]) state_advancements += 1 cumulative_index += 1 if state_advancements > 0: - request.grammar.rollback(state_advancements) + structured_output_request.grammar.rollback(state_advancements) if cumulative_index < bitmask_tensor.shape[0]: bitmask_tensor = bitmask_tensor[:cumulative_index] diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index c16320b9e..9a7e30d41 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -20,7 +20,7 @@ class StructuredOutputRequest: sampling_params: SamplingParams _grammar: Optional[Union[Future[StructuredOutputGrammar], StructuredOutputGrammar]] = None - reasoning_ended: bool = False + reasoning_ended: Optional[bool] = None def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports -- GitLab From b8b904795d3033c21bcb8f5c36e135e75dc1baf2 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Sat, 31 May 2025 03:38:56 -0700 Subject: [PATCH 090/274] fix security issue of logging llm output (#18980) Signed-off-by: Lu Fang Co-authored-by: Lucia (Lu) Fang --- vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index b403a1467..00690ad79 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -68,8 +68,8 @@ class Phi4MiniJsonToolParser(ToolParser): len(function_call_arr)) except json.JSONDecodeError as e: logger.error( - "Failed to parse function calls from model output: %s. " - "Error: %s", model_output, str(e)) + "Failed to parse function calls from model output. 
" + "Error: %s", str(e)) tool_calls: list[ToolCall] = [ ToolCall( -- GitLab From 2a50ef57605ffe332a73e50597276b71e9d52676 Mon Sep 17 00:00:00 2001 From: Satyajith Chilappagari Date: Sat, 31 May 2025 03:39:11 -0700 Subject: [PATCH 091/274] [Neuron] Add Multi-Modal model support for Neuron (#18921) Signed-off-by: Satyajith Chilappagari Co-authored-by: Ashraf Mahgoub Co-authored-by: Rohith Nallamaddi Co-authored-by: FeliciaLuo Co-authored-by: Elaine Zhao --- .../offline_inference/neuron_multimodal.py | 105 ++++++++++++++++++ vllm/config.py | 10 ++ .../model_loader/neuronx_distributed.py | 65 ++++++++++- vllm/worker/neuron_model_runner.py | 13 +++ .../neuronx_distributed_model_runner.py | 88 ++++++++------- 5 files changed, 235 insertions(+), 46 deletions(-) create mode 100644 examples/offline_inference/neuron_multimodal.py diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py new file mode 100644 index 000000000..a9478650b --- /dev/null +++ b/examples/offline_inference/neuron_multimodal.py @@ -0,0 +1,105 @@ +# SPDX-License-Identifier: Apache-2.0 +import requests +import torch +from neuronx_distributed_inference.models.mllama.utils import add_instruct +from PIL import Image + +from vllm import LLM, SamplingParams, TextPrompt + + +def get_image(image_url): + image = Image.open(requests.get(image_url, stream=True).raw) + return image + + +# Model Inputs +PROMPTS = [ + "What is in this image? 
Tell me a story", + "What is the recipe of mayonnaise in two sentences?", + "Describe this image", + "What is the capital of Italy famous for?", +] +IMAGES = [ + get_image( + "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" + ), + None, + get_image( + "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500" + ), + None, +] +SAMPLING_PARAMS = [ + dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16) + for _ in range(len(PROMPTS)) +] + + +def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params): + # Prepare all inputs for mllama generation, including: + # 1. put text prompt into instruct chat template + # 2. compose single text and single image prompt into Vllm's prompt class + # 3. prepare sampling parameters + input_image = single_image + has_image = torch.tensor([1]) + if isinstance(single_image, torch.Tensor) and single_image.numel() == 0: + has_image = torch.tensor([0]) + + instruct_prompt = add_instruct(prompt, has_image) + inputs = TextPrompt(prompt=instruct_prompt) + + if input_image is not None: + inputs["multi_modal_data"] = {"image": input_image} + + sampling_params = SamplingParams(**sampling_params) + return inputs, sampling_params + + +def print_outputs(outputs): + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == "__main__": + assert ( + len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS) + ), f"""Text, image prompts and sampling parameters should have the + same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, + and {len(SAMPLING_PARAMS)}""" + + # Create an LLM. 
+ llm = LLM( + model="meta-llama/Llama-3.2-11B-Vision-Instruct", + max_num_seqs=1, + max_model_len=4096, + block_size=4096, + device="neuron", + tensor_parallel_size=32, + override_neuron_config={ + "sequence_parallel_enabled": False, + "skip_warmup": True, + "save_sharded_checkpoint": True, + "on_device_sampling_config": { + "global_topk": 1, + "dynamic": False, + "deterministic": False, + }, + }, + ) + + batched_inputs = [] + batched_sample_params = [] + for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS): + inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params) + # test batch-size = 1 + outputs = llm.generate(inputs, sampling_params) + print_outputs(outputs) + batched_inputs.append(inputs) + batched_sample_params.append(sampling_params) + + # test batch-size = 4 + outputs = llm.generate(batched_inputs, batched_sample_params) + print_outputs(outputs) diff --git a/vllm/config.py b/vllm/config.py index 6cec97a5f..dfa44b044 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1360,6 +1360,16 @@ class ModelConfig: @property def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" + """ + For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to + True to enable cross-attention + Neuron needs all multimodal data to be in the decoder and does not + need to explicitly enable cross-attention + """ + if (current_platform.is_neuron() + and self.hf_config.model_type == "mllama"): + return False + return is_encoder_decoder(self.hf_config) @property diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index 624bd476c..72ad4da29 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -204,6 +204,11 @@ class NeuronMllamaForCausalLM(nn.Module): config: PretrainedConfig, on_device_sampling_disabled: bool = False) -> None: super().__init__() + # has_image is 
the only multimodal input that is used in + # token-generation + # This is a cache (on CPU) that saves has_image data per sequence id + # The number of entries in this cache is <= Batch-Size + self.has_image_cache: dict[int, torch.Tensor] = {} self.config = config self.logits_processor = LogitsProcessor( config.get_text_config().vocab_size, logits_as_input=True) @@ -215,11 +220,57 @@ class NeuronMllamaForCausalLM(nn.Module): # Lazy initialized self.model: nn.Module + self.is_reorder_needed: bool = True + + def read_from_has_image_cache(self, seq_ids: torch.Tensor): + has_image_list = [] + for index in range(len(seq_ids)): + seq_id = seq_ids[index].item() + if seq_id in self.has_image_cache: + has_image_list.append(self.has_image_cache[seq_id]) + else: + has_image_list.append(torch.tensor([0])) + return torch.tensor(has_image_list) + + def write_to_has_image_cache(self, seq_ids: torch.Tensor, + has_image: torch.Tensor): + for index in range(len(seq_ids)): + seq_id = seq_ids[index].item() + if index < len(has_image): + self.has_image_cache[seq_id] = has_image[index] + else: + self.has_image_cache[seq_id] = torch.zeros(1) def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, seq_ids: torch.Tensor, pixel_values: torch.Tensor, aspect_ratios: torch.Tensor, num_chunks: torch.Tensor, has_image: torch.Tensor, sampling_params) -> torch.Tensor: + + # We update the has_image cache during prefill + # and read the has_image cache during decode + if input_ids.shape[-1] > 1: # prefill + self.write_to_has_image_cache(seq_ids, has_image) + else: + has_image = self.read_from_has_image_cache(seq_ids) + bs = input_ids.shape[0] + num_chunks = torch.zeros((bs, 1)) + aspect_ratios = torch.zeros((bs, 1, 2)) + + input_block_ids = seq_ids + origin_input_block_ids = seq_ids + if self.is_reorder_needed: + # sort block ids sequentially for perf/neuron support reasons + input_block_ids, sorted_indices = torch.sort(input_block_ids) + input_ids = torch.index_select(input_ids, 0, 
sorted_indices) + positions = torch.index_select(positions, 0, sorted_indices) + sampling_params = torch.index_select(sampling_params, 0, + sorted_indices) + pixel_values = torch.index_select(pixel_values, 0, sorted_indices) + aspect_ratios = torch.index_select(aspect_ratios, 0, + sorted_indices) + num_chunks = torch.index_select(num_chunks, 0, sorted_indices) + has_image = torch.index_select(has_image, 0, sorted_indices) + self.vision_mask = create_vision_mask(input_ids, self.vision_token_id) output = self.model( input_ids.to(torch.int32), @@ -235,8 +286,14 @@ class NeuronMllamaForCausalLM(nn.Module): has_image=has_image.to(torch.int32), ) if self.config.neuron_config.on_device_sampling_config: - return output.hidden_states - return output.logits[:, -1, :] + output = output.hidden_states + else: + output = output.logits[:, -1, :] + + if self.is_reorder_needed and origin_input_block_ids.shape[0] != 1: + restored_indices = torch.argsort(sorted_indices) + output = torch.index_select(output, 0, restored_indices) + return output def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata) -> torch.Tensor: @@ -299,7 +356,7 @@ class NeuronMllamaForCausalLM(nn.Module): self.model = neuronx_model_cls(compiled_model_path) tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) self.vision_token_id = tokenizer( - "<|image|>", add_special_tokens=False).input_ids + "<|image|>", add_special_tokens=False).input_ids[0] self.model.load(compiled_model_path) return except (FileNotFoundError, ValueError): @@ -326,7 +383,7 @@ class NeuronMllamaForCausalLM(nn.Module): # Read "<|image|>" token_id from the tokenizer self.vision_token_id = tokenizer("<|image|>", - add_special_tokens=False).input_ids + add_special_tokens=False).input_ids[0] logger.info("\nLoading model from compiled checkpoint...") self.model.load(compiled_model_path) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 292fe57f3..3aff3e01a 100644 
--- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -169,6 +169,7 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): mm_kwargs = seq_group_metadata.multi_modal_data if mm_kwargs: + mm_kwargs = self.process_multi_modal_data_neuron(mm_kwargs) multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) @@ -274,6 +275,14 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): sampling_params.top_p = top_p sampling_params.temperature = temperature + # we need multi_modal_data for later tokens as well + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + for seq_group_metadata in seq_group_metadata_list: + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + multi_modal_kwargs_list.append(mm_data) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + sampling_metadata = SamplingMetadata.prepare( seq_group_metadata_list, seq_lens, @@ -422,6 +431,10 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): def vocab_size(self) -> int: return self.model_config.get_vocab_size() + def process_multi_modal_data_neuron(self, mm_data): + # this is a no-op for NeuronModelRunner + return mm_data + def remove_all_loras(self): raise NotImplementedError( "LoRAs are not supported for Transformers NeuronX framework") diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py index aa94706c8..9cd4f88d3 100644 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ b/vllm/worker/neuronx_distributed_model_runner.py @@ -3,6 +3,8 @@ from typing import List, Optional, Set import torch +from neuronx_distributed_inference.models.mllama.aspect_ratio_utils import ( + get_all_supported_aspect_ratios) from neuronx_distributed_inference.modules.generation.sampling import ( prepare_sampling_params) from neuronx_distributed_inference.modules.lora_serving import ( @@ -17,7 +19,7 @@ from vllm.model_executor import SamplingMetadata from 
vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuronx_distributed import ( _get_model_architecture, get_neuron_model) -from vllm.platforms import current_platform +from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.worker.neuron_model_runner import (ModelInputForNeuron, NeuronModelRunner) @@ -121,42 +123,28 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): sampling_params = self.get_nxd_sampling_params( model_input.sampling_metadata) - if model_input.multi_modal_kwargs.get('image') is not None: - pixel_values = [] - aspect_ratios = [] - num_chunks = [] - has_image = [] - for multi_modal_input in model_input.multi_modal_kwargs.get( - 'image'): - image_tensors = self.get_multi_modal_data_neuron( - multi_modal_input.squeeze(0)) - pixel_values.append(image_tensors[0]) - aspect_ratios.append(image_tensors[1]) - num_chunks.append(image_tensors[2]) - has_image.append(image_tensors[3]) - - pixel_values = torch.cat(pixel_values, dim=0) - aspect_ratios = torch.cat(aspect_ratios, dim=0) - num_chunks = torch.cat(num_chunks, dim=0) - has_image = torch.cat(has_image, dim=0) - + if model_input.multi_modal_kwargs.get('pixel_values') is not None: hidden_states = self.model( input_ids=model_input.input_tokens, positions=model_input.input_positions, seq_ids=model_input.input_block_ids, - pixel_values=pixel_values, - aspect_ratios=aspect_ratios, + pixel_values=model_input.multi_modal_kwargs.get( + 'pixel_values'), + aspect_ratios=model_input.multi_modal_kwargs.get( + 'aspect_ratios'), sampling_params=sampling_params, - num_chunks=num_chunks, - has_image=has_image, + num_chunks=model_input.multi_modal_kwargs.get('num_chunks'), + has_image=model_input.multi_modal_kwargs.get( + 'has_image').squeeze(1), ) else: - empty_pixel_values = torch.zeros([1, 1, 4, 3, 560, 560], + bs = model_input.input_tokens.shape[0] if (model_input.input_tokens + is not None) else 1 + 
empty_pixel_values = torch.zeros([bs, 1, 4, 3, 560, 560], dtype=torch.bfloat16) - empty_aspect_ratios = torch.ones([1, 1, 2], dtype=torch.int64) - num_chunks = torch.tensor([[1] - ]) # dummy num_chunks, will not be used - has_image = torch.tensor([0]) + empty_aspect_ratios = torch.ones([bs, 1, 2], dtype=torch.int64) + num_chunks = torch.zeros((bs, 1), dtype=torch.int32) + has_image = torch.zeros([bs], dtype=torch.int32) hidden_states = self.model( input_ids=model_input.input_tokens, positions=model_input.input_positions, @@ -175,6 +163,27 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): return [output] + def process_multi_modal_data_neuron(self, mm_data): + # Neuron uses aspect_ratios instead of aspect_ratio_ids + all_supported_aspect_ratios = get_all_supported_aspect_ratios( + self.model.config.vision_config.max_num_tiles) + aspect_ratio_ids = mm_data.get("aspect_ratio_ids") + mm_data["aspect_ratios"] = torch.tensor( + all_supported_aspect_ratios[aspect_ratio_ids]).unsqueeze(0) + + # Neuron's num_chunks is HF's num_tiles + mm_data["num_chunks"] = mm_data.get("num_tiles") + + # Input has an image if it has pixel_values + bs = mm_data["num_chunks"].shape[0] + pixel_values = mm_data.get("pixel_values") + if pixel_values is not None and not torch.all(pixel_values == 0): + mm_data["has_image"] = torch.ones(bs) + + else: + mm_data["has_image"] = torch.zeros(bs) + return mm_data + def _get_lora_adapter_ids(self, seq_group_metadata_list): # set LoRA adapter IDs for multi-lora serving batch_size = len(seq_group_metadata_list) @@ -200,7 +209,6 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> ModelInputForNeuron: - multi_modal_kwargs = None # NOTE: We assume that all sequences in the group are all prompts or # all decodes. 
is_prompt = seq_group_metadata_list[0].is_prompt @@ -223,6 +231,14 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): sampling_params.top_p = top_p sampling_params.temperature = temperature + # we need multi_modal_data for later tokens as well + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + for seq_group_metadata in seq_group_metadata_list: + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + multi_modal_kwargs_list.append(mm_data) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + lora_adapter_ids = self._get_lora_adapter_ids(seq_group_metadata_list) sampling_metadata = SamplingMetadata.prepare( @@ -236,18 +252,6 @@ class NeuronxDistributedModelRunner(NeuronModelRunner): self.pin_memory, generators=self.get_generators(finished_requests_ids)) - if current_platform.use_transformers_neuronx( - ) and not self._on_device_sampling_disabled: - # Once the request IDs are changed in current iteration, we will - # update the on-device sampling parameters. 
- current_batch_request_ids = [ - seq_group_meta_data.request_id - for seq_group_meta_data in seq_group_metadata_list - ] - if current_batch_request_ids != self._previous_batch_request_ids: - self._update_neuron_sampling_params(seq_group_metadata_list) - self._previous_batch_request_ids = current_batch_request_ids - return ModelInputForNeuron(input_tokens=input_tokens, input_positions=input_positions, input_block_ids=input_block_ids, -- GitLab From 749f5bdd3879e8e7ebce1c56f9960481a6cf63bb Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sat, 31 May 2025 18:39:21 +0800 Subject: [PATCH 092/274] [doc] fix the list rendering issue - security.md (#18982) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/usage/security.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/usage/security.md b/docs/usage/security.md index f1661828d..1209cc8dd 100644 --- a/docs/usage/security.md +++ b/docs/usage/security.md @@ -12,14 +12,14 @@ All communications between nodes in a multi-node vLLM deployment are **insecure The following options control inter-node communications in vLLM: -1. **Environment Variables:** +#### 1. **Environment Variables:** - `VLLM_HOST_IP`: Sets the IP address for vLLM processes to communicate on -2. **KV Cache Transfer Configuration:** +#### 2. **KV Cache Transfer Configuration:** - `--kv-ip`: The IP address for KV cache transfer communications (default: 127.0.0.1) - `--kv-port`: The port for KV cache transfer communications (default: 14579) -3. **Data Parallel Configuration:** +#### 3. **Data Parallel Configuration:** - `data_parallel_master_ip`: IP of the data parallel master (default: 127.0.0.1) - `data_parallel_master_port`: Port of the data parallel master (default: 29500) @@ -39,16 +39,16 @@ Key points from the PyTorch security guide: ### Security Recommendations -1. **Network Isolation:** +#### 1. 
**Network Isolation:** - Deploy vLLM nodes on a dedicated, isolated network - Use network segmentation to prevent unauthorized access - Implement appropriate firewall rules -2. **Configuration Best Practices:** +#### 2. **Configuration Best Practices:** - Always set `VLLM_HOST_IP` to a specific IP address rather than using defaults - Configure firewalls to only allow necessary ports between nodes -3. **Access Control:** +#### 3. **Access Control:** - Restrict physical and network access to the deployment environment - Implement proper authentication and authorization for management interfaces - Follow the principle of least privilege for all system components -- GitLab From c55d8046723325e09521a24ac076a8a7e64eaa52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Sat, 31 May 2025 06:39:28 -0400 Subject: [PATCH 093/274] [BugFix] Pydantic part 2 (#18911) Signed-off-by: luka --- requirements/test.in | 1 + requirements/test.txt | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 87af61769..e906752ff 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -51,3 +51,4 @@ numpy runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 +pydantic>=2.10 # 2.9 leads to error on python 3.10 \ No newline at end of file diff --git a/requirements/test.txt b/requirements/test.txt index 89d477017..60dcaca81 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -480,12 +480,13 @@ pycparser==2.22 # via cffi pycryptodomex==3.22.0 # via blobfile -pydantic==2.9.2 +pydantic==2.11.5 # via + # -r requirements/test.in # datamodel-code-generator # mistral-common # mteb -pydantic-core==2.23.4 +pydantic-core==2.33.2 # via pydantic pygments==2.18.0 # via rich @@ -784,6 +785,9 @@ typing-extensions==4.12.2 # pydantic-core # torch # typer + # typing-inspection +typing-inspection==0.4.1 + # via pydantic tzdata==2024.2 # via pandas uri-template==1.3.0 -- GitLab 
From 0f5e0d567e9949ffee4f1677bc9dd767804eadd0 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Sat, 31 May 2025 18:39:31 +0800 Subject: [PATCH 094/274] [FEAT][ROCm] Add AITER grouped topk for DeepSeekV2 (#18825) Signed-off-by: vllmellm --- tests/kernels/moe/test_rocm_aiter_topk.py | 93 +++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 2 +- .../layers/fused_moe/rocm_aiter_fused_moe.py | 78 +++++++++++++--- 3 files changed, 157 insertions(+), 16 deletions(-) diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py index b0d34ddfd..922fd66db 100644 --- a/tests/kernels/moe/test_rocm_aiter_topk.py +++ b/tests/kernels/moe/test_rocm_aiter_topk.py @@ -35,6 +35,15 @@ def test_rocm_aiter_biased_grouped_topk_custom_op_registration(): assert callable(torch.ops.vllm.rocm_aiter_biased_grouped_topk) +def test_rocm_aiter_grouped_topk_custom_op_registration(): + """Test that the custom op is correctly registered.""" + # Check if the op exists in torch.ops.vllm + assert hasattr(torch.ops.vllm, 'rocm_aiter_grouped_topk') + + # Check if the op is callable + assert callable(torch.ops.vllm.rocm_aiter_grouped_topk) + + def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility(): """Test that the op can be used with torch.compile.""" # Create test tensors @@ -120,3 +129,87 @@ def test_rocm_aiter_biased_grouped_topk_torch_compile_compatibility(): rtol=1e-2, atol=1e-2) assert torch.allclose(topk_ids_original, topk_ids_compiled) + + +def test_rocm_aiter_grouped_topk_torch_compile_compatibility(): + """Test that the op can be used with torch.compile.""" + # Create test tensors + token = 64 + expert = 256 + num_expert_group = 8 + topk = 8 + topk_group = 4 + renormalize = True + scoring_func = "softmax" + scale_factor = 1.0 + + gating_output = torch.randn((token, expert), + dtype=torch.bfloat16, + device="cuda") + + device = gating_output.device + topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) + 
topk_weights = torch.empty((token, topk), + dtype=torch.float32, + device=device) + + # Define a function that uses the op + def grouped_topk_fn(gating_output, topk_weights, topk_ids, scoring_func): + return torch.ops.vllm.rocm_aiter_grouped_topk( + gating_output, topk_weights, topk_ids, num_expert_group, + topk_group, renormalize, scoring_func, scale_factor) + + # Verify the op's fake implementation + torch.library.opcheck(torch.ops.vllm.rocm_aiter_grouped_topk, + (gating_output, topk_weights, topk_ids), + kwargs={ + "num_expert_group": num_expert_group, + "topk_group": topk_group, + "need_renorm": renormalize, + "scoring_func": scoring_func, + "routed_scaling_factor": scale_factor + }, + test_utils=("test_faketensor")) + + # Compile the function with appropriate settings + compiled_fn = torch.compile(grouped_topk_fn, + fullgraph=True, + backend="inductor", + mode="reduce-overhead", + dynamic=False) + + topk_weights_original = torch.empty((token, topk), + dtype=torch.float32, + device=device) + topk_ids_original = torch.empty((token, topk), + dtype=torch.int32, + device=device) + + topk_weights_compiled = torch.empty((token, topk), + dtype=torch.float32, + device=device) + topk_ids_compiled = torch.empty((token, topk), + dtype=torch.int32, + device=device) + + # Run both compiled (V1 graph mode) and uncompiled versions (V1 eager mode) + grouped_topk_fn(gating_output, topk_weights_original, topk_ids_original, + scoring_func) + compiled_fn(gating_output, topk_weights_compiled, topk_ids_compiled, + scoring_func) + + # Sort the results for comparison since the order might not be deterministic + topk_ids_original, indices_original = torch.sort(topk_ids_original) + topk_weights_original = torch.gather(topk_weights_original, 1, + indices_original) + + topk_ids_compiled, indices_compiled = torch.sort(topk_ids_compiled) + topk_weights_compiled = torch.gather(topk_weights_compiled, 1, + indices_compiled) + + # Verify results match + assert 
torch.allclose(topk_weights_original, + topk_weights_compiled, + rtol=1e-2, + atol=1e-2) + assert torch.allclose(topk_ids_original, topk_ids_compiled) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 838a7c24b..af7b98e14 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -45,7 +45,7 @@ else: FusedMoEPrepareAndFinalize = None # type: ignore if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 - rocm_aiter_biased_group_topk as grouped_topk) + rocm_aiter_grouped_topk as grouped_topk) else: from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk if current_platform.is_tpu(): diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 10b61fcda..824062491 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -140,6 +140,36 @@ def rocm_aiter_biased_grouped_topk_fake( pass +def rocm_aiter_grouped_topk_impl( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0 # mul to topk_weights +) -> None: + + from aiter import grouped_topk + + grouped_topk(gating_output, topk_weights, topk_ids, num_expert_group, + topk_group, need_renorm, scoring_func, routed_scaling_factor) + + +def rocm_aiter_grouped_topk_fake( + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_expert_group: int, + topk_group: int, + need_renorm: bool, + scoring_func: str = "softmax", + routed_scaling_factor: float = 1.0 # mul to topk_weights +) -> None: + pass + + def rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: 
torch.Tensor, @@ -218,36 +248,54 @@ if current_platform.is_rocm(): dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_grouped_topk", + op_func=rocm_aiter_grouped_topk_impl, + mutates_args=["topk_weights", "topk_ids"], + fake_impl=rocm_aiter_grouped_topk_fake, + dispatch_key=current_platform.dispatch_key, + ) + -def rocm_aiter_biased_group_topk( +def rocm_aiter_grouped_topk( hidden_states: torch.Tensor, gating_output: torch.Tensor, topk: int, renormalize: bool, num_expert_group: int = 0, topk_group: int = 0, - scoring_func: str = "sigmoid", + scoring_func: str = "softmax", e_score_correction_bias: Optional[torch.Tensor] = None ) -> tuple[torch.Tensor, torch.Tensor]: - assert scoring_func == "sigmoid", ( - "rocm_aiter_biased_group_topk only supports 'sigmoid' scoring_func.") - assert e_score_correction_bias is not None, ( - "'e_score_correction_bias' must not be None.") token = hidden_states.shape[0] device = hidden_states.device topk_ids = torch.empty((token, topk), dtype=torch.int32, device=device) topk_weights = torch.empty((token, topk), dtype=torch.float32, device=device) - torch.ops.vllm.rocm_aiter_biased_grouped_topk( - gating_output, - e_score_correction_bias, - topk_weights, - topk_ids, - num_expert_group, - topk_group, - renormalize, - ) + + if e_score_correction_bias is not None: + torch.ops.vllm.rocm_aiter_biased_grouped_topk( + gating_output, + e_score_correction_bias, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + renormalize, + ) + else: + assert (scoring_func == "softmax" or scoring_func == "sigmoid") + torch.ops.vllm.rocm_aiter_grouped_topk( + gating_output, + topk_weights, + topk_ids, + num_expert_group, + topk_group, + renormalize, + scoring_func, + ) + return topk_weights, topk_ids -- GitLab From f2c3f66d59f9e38aa94985b54f370219222e7bd1 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Sat, 31 May 2025 04:58:17 -0700 Subject: [PATCH 095/274] [Bugfix] Fix for issue 17396 (#18773) 
Signed-off-by: Fred Reiss --- vllm/lora/ops/torch_ops/lora_ops.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index af79f9841..ab65faceb 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -36,10 +36,13 @@ def bgmv_expand(inputs: torch.Tensor, if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: limit = 1 + # LoRA adapter and model may add different amounts of padding to output + common_len = min(outputs.shape[1], output_tensor.shape[1]) + if add_inputs: - output_tensor[:, :outputs.shape[1]] += outputs[:limit, :] + output_tensor[:, :common_len] += outputs[:limit, :common_len] else: - output_tensor[:, :outputs.shape[1]] = outputs[:limit, :] + output_tensor[:, :common_len] = outputs[:limit, :common_len] def sgmv_shrink( -- GitLab From 306d60401dbd066f64298e02ca73d4f2075d7bf6 Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Sat, 31 May 2025 09:40:05 -0500 Subject: [PATCH 096/274] [ROCm][Kernel] Add gfx950 support for skinny gemms (#18010) Signed-off-by: charlifu --- csrc/rocm/skinny_gemms.cu | 113 +++++++++++------- tests/kernels/quant_utils.py | 14 ++- .../layers/quantization/utils/w8a8_utils.py | 4 +- vllm/model_executor/layers/utils.py | 4 +- vllm/platforms/rocm.py | 10 +- 5 files changed, 91 insertions(+), 54 deletions(-) diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index b3717892d..e31aa0162 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -13,14 +13,34 @@ #include "dispatch_utils.h" #include "quantization/fp8/common.cuh" -#if defined(__HIPCC__) && (defined(__gfx90a__) || defined(__gfx942__)) - #define __HIP__MI300_MI250__ +#if defined(__HIPCC__) && \ + (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) + #define __HIP__GFX9__ #endif -#if defined(__HIPCC__) && defined(__gfx942__) - #define __HIP__MI300__ +#if defined(__HIPCC__) && (defined(__gfx942__) || 
defined(__gfx950__)) + #define __HIP__MI3XX__ #endif +#if defined(__gfx950__) + #define LDS_SIZE 160 * 1024 +#else + #define LDS_SIZE 64 * 1024 +#endif + +int get_lds_size() { + static bool is_cached = false; + static int result; + if (is_cached == false) { + auto dprops = at::cuda::getCurrentDeviceProperties(); + std::string device_arch = dprops->gcnArchName; + size_t substring = device_arch.find("gfx95"); + result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024); + is_cached = true; + } + return result; +} + #if defined(NDEBUG) #undef NDEBUG #include @@ -267,7 +287,7 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, V0 += (s.x + s.y); \ } -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI support // This version targets cases where A[] fits LDS capacity template @@ -275,7 +295,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; @@ -295,13 +316,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) }; //---------------------------------------------------- - // Reserving 64 KB of LDS to have 1 WG / CU + // Reserving 64/160 KB of LDS to have 1 WG / CU // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! 
//---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Fetch the activation matrix to LDS @@ -312,11 +333,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // - Then the WG will move to another 8 K elements // TODO: Logic below will only work when K is multiple of 8 //---------------------------------------------------- - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -517,7 +538,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, @@ -525,9 +546,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI support // This version targets cases where A[] marginally exceeds LDS capacity template @@ -535,7 +556,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; 
@@ -561,7 +583,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! //---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Computation of columns that need to be committed to memory! @@ -598,11 +620,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // - Then the WG will move to another 8 K elements // TODO: Logic below will only work when K is multiple of 8 //---------------------------------------------------- - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -686,7 +708,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) // Fetch A activation matrix in interleaved fashion from LDS or memory for (int n = 0; n < N; n++) { - if (k_ + K * n < 32 * 1024) + if (k_ + K * n < max_lds_len) bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); else bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); @@ -817,7 +839,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, @@ -825,9 +847,9 @@ __global__ void wvSplitK_hf_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support -#if defined(__HIP__MI300_MI250__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) // TODO: Add NAVI 
support // This version targets big A[] cases, where it is much larger than LDS capacity template @@ -835,7 +857,8 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, const scalar_t* __restrict__ A, scalar_t* C, const int _WvPrGrp, const int CuCount) { - #if defined(__HIP__MI300__) + constexpr int max_lds_len = LDS_SIZE / 2; + #if defined(__HIP__MI3XX__) constexpr bool use_mfma = (std::is_same_v); #else constexpr bool use_mfma = false; @@ -855,13 +878,13 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) }; //---------------------------------------------------- - // Reserving 64 KB of LDS to have 1 WG / CU + // Reserving 64/160 KB of LDS to have 1 WG / CU // Goal is to bring the activation matrix A to the LDS // and use it across the lifetime of the work group // TODO: When activation matrix is larger than 64 KB // then this is not goint to work! //---------------------------------------------------- - __shared__ scalar_t s[1024 * 32]; + __shared__ scalar_t s[max_lds_len]; //---------------------------------------------------- // Computation of columns that need to be committed to memory! @@ -902,11 +925,11 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) //---------------------------------------------------- #define PCML #ifndef PCML - for (uint32_t k = 0; k < min(K * N, 32 * 1024); + for (uint32_t k = 0; k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { uint32_t k_in = k + ((threadIdx.y * THRDS + threadIdx.x) * A_CHUNK); - if (k_in >= min(K * N, 32 * 1024)) break; + if (k_in >= min(K * N, max_lds_len)) break; *((bigType*)(&s[k_in])) = *((bigType*)(&A[k_in])); } @@ -916,7 +939,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #define TUC (THRDS * UNRL * A_CHUNK) uint32_t kBase = 0; // find biggest k size that fits in LDS - uint32_t kFit = (32 * 1024) / N; + uint32_t kFit = (max_lds_len) / N; // kFit = (kFit%TWC==0) ? 
kFit : (kFit-kFit%TWC+TWC); //round up to multiple // of TUC kFit = (kFit % TUC == 0) @@ -1164,7 +1187,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } -#else // !defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#else // !defined(__HIP__GFX9__) TODO: Add NAVI support template __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, @@ -1172,7 +1195,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const scalar_t* B, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support +#endif // defined(__HIP__GFX9__) TODO: Add NAVI support int mindiv(int N, int div1, int div2) { int nPrRnd = div1 * div2; @@ -1222,17 +1245,18 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int max_lds_len = get_lds_size() / 2; #define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ _N) \ { \ dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= 32 * 1024) && (M_in % _YTILEs == 0)) { \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitK_hf_sml_ \ <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ CuCount); \ - } else if (K_in * N_in <= 32 * 1024 * 1.2) { \ + } else if (K_in * N_in <= max_lds_len * 1.2) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ wvSplitK_hf_ \ <<>>(K_in, M_in, af4, bf4, c, __wvPrGrp, \ @@ -1272,7 +1296,7 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, return out_c; } -#if defined(__HIP__MI300__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) // TODO: Add NAVI support template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1281,6 +1305,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, 
const int CuCount) { + constexpr int max_lds_len = LDS_SIZE; using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; @@ -1296,10 +1321,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) scalar8 h8; }; - __shared__ fp8_t s[1024 * 64]; + __shared__ fp8_t s[max_lds_len]; for (uint32_t k = (threadIdx.y * THRDS + threadIdx.x) * A_CHUNK; - k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { *((bigType*)(&s[k])) = *((bigType*)(&A[k])); } __syncthreads(); @@ -1436,7 +1461,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support template __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, @@ -1446,9 +1471,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kp, const int M, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support -#if defined(__HIP__MI300__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) // TODO: Add NAVI support template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1456,6 +1481,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) const fp8_t* __restrict__ A, scalar_t* C, const float* __restrict__ s_A, const float* __restrict__ s_B, const int _WvPrGrp, const int CuCount) { + constexpr int max_lds_len = LDS_SIZE; using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float; using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int; @@ -1471,10 +1497,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) scalar8 h8; }; - __shared__ fp8_t s[1024 * 64]; + __shared__ fp8_t s[max_lds_len]; for (uint32_t k = (threadIdx.y * THRDS + 
threadIdx.x) * A_CHUNK; - k < min(K * N, 64 * 1024); k += THRDS * WvPrGrp * A_CHUNK) { + k < min(K * N, max_lds_len); k += THRDS * WvPrGrp * A_CHUNK) { *((bigType*)(&s[k])) = *((bigType*)(&A[k])); } __syncthreads(); @@ -1517,7 +1543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) uint32_t k_ = k + threadIdx.x * A_CHUNK; if (k_ >= K) break; for (int n = 0; n < N; n++) { - if (k_ + K * n < 64 * 1024) + if (k_ + K * n < max_lds_len) bigA[n][k2] = *((const bigType*)(&(s[k_ + K * n]))); else bigA[n][k2] = *((const bigType*)(&(A[k_ + K * n]))); @@ -1608,7 +1634,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI300__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support template __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, @@ -1618,7 +1644,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kp, const int M, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI300__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, at::Tensor& scale_a, at::Tensor& scale_b, @@ -1638,12 +1664,13 @@ void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, dim3 grid(CuCount); const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + const int max_lds_len = get_lds_size(); #define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ _N) \ { \ dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= 64 * 1024) && (M_in % _YTILEs == 0)) { \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ wvSplitKQ_hf_sml_ \ <<>>(K_in, Kp_in, M_in, a_ptr, b_ptr, c_ptr, \ diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 764924f26..892309a01 100644 --- 
a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. -ROCM_FP8_MAX = 224.0 +ROCM_FP8FNUZ_MAX = 224.0 FP8_DTYPE = current_platform.fp8_dtype() @@ -26,9 +26,11 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + qtype_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + qtype_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) @@ -70,9 +72,11 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + fp8_traits_max = ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + fp8_traits_min = -ROCM_FP8FNUZ_MAX if current_platform.is_rocm() \ + and current_platform.is_fp8_fnuz() \ else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 4b041cff2..eed8998fe 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -155,8 +155,8 @@ def rocm_per_tensor_w8a8_scaled_mm(*, qinput: torch.Tensor, scale_b: torch.Tensor, 
bias: torch.Tensor, input_2d: torch.Tensor, output_shape: list) -> torch.Tensor: - from vllm.platforms.rocm import on_mi250_mi300 - if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300( + from vllm.platforms.rocm import on_mi3xx + if envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi3xx( ) and qinput.shape[0] == 1 and qinput.shape[1] % 16 == 0: output = ops.wvSplitKQ(weight.t(), qinput, out_dtype, scale_a, scale_b, current_platform.get_cu_count()) diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 18783d0d7..001e6aaf0 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -70,9 +70,9 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, def rocm_unquantized_gemm(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): - from vllm.platforms.rocm import on_mi250_mi300 + from vllm.platforms.rocm import on_gfx9 k = weight.shape[1] - use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_mi250_mi300() and \ + use_skinny = (envs.VLLM_ROCM_USE_SKINNY_GEMM and on_gfx9() and \ x.dtype in [torch.float16, torch.bfloat16] \ and k % 8 == 0 and bias is None) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 06ee8614d..ef1c632a5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -105,9 +105,15 @@ def on_gfx1x() -> bool: @cache -def on_mi250_mi300() -> bool: +def on_mi3xx() -> bool: GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName - return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942"]) + return any(arch in GPU_ARCH for arch in ["gfx942", "gfx950"]) + + +@cache +def on_gfx9() -> bool: + GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName + return any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) @cache -- GitLab From 8bf507d766894a8104946e026a2eeb84cbf34242 Mon Sep 17 00:00:00 2001 From: ptarasiewiczNV <104908264+ptarasiewiczNV@users.noreply.github.com> Date: Sat, 31 May 2025 17:19:18 
+0200 Subject: [PATCH 097/274] [P/D] NixlConnector use cache device index for memory registration (#18969) Signed-off-by: Piotr Tarasiewicz --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index f02434aeb..6a3472157 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -488,7 +488,8 @@ class NixlConnectorWorker: for cache in cache_list: base_addr = cache.data_ptr() region_len = self.num_blocks * self.block_len - caches_data.append((base_addr, region_len, self.rank, "")) + caches_data.append( + (base_addr, region_len, cache.device.index, "")) kv_caches_base_addr.append(base_addr) self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr self.num_regions = len(caches_data) -- GitLab From 9a1b9b99d7d21a65280fc68c82f5bb6152fdf9dd Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 31 May 2025 08:34:52 -0700 Subject: [PATCH 098/274] [BugFix] Fix multi-node offline data-parallel (#18981) Signed-off-by: Nick Hill Co-authored-by: Yizhou Liu --- examples/offline_inference/data_parallel.py | 12 ++++++++---- vllm/v1/engine/core_client.py | 7 +++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index bf60d883c..15906e1a2 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -97,10 +97,14 @@ def main( # with DP, each rank should process different prompts. # usually all the DP ranks process a full dataset, # and each rank processes a different part of the dataset. 
- promts_per_rank = len(prompts) // dp_size - start = global_dp_rank * promts_per_rank - end = start + promts_per_rank - prompts = prompts[start:end] + floor = len(prompts) // dp_size + remainder = len(prompts) % dp_size + + # Distribute prompts into even groups. + def start(rank): + return rank * floor + min(rank, remainder) + + prompts = prompts[start(global_dp_rank) : start(global_dp_rank + 1)] if len(prompts) == 0: # if any rank has no prompts to process, # we need to set a placeholder prompt diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e9e2d2d8d..232d6742b 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -363,6 +363,7 @@ class MPClient(EngineCoreClient): local_engine_count = parallel_config.data_parallel_size_local local_start_index = parallel_config.data_parallel_rank_local dp_size = parallel_config.data_parallel_size + dp_rank = parallel_config.data_parallel_rank # SPMD mode is where there is an LLM instance per DP rank and # one core engine per LLM, see @@ -370,11 +371,9 @@ class MPClient(EngineCoreClient): spmd_mode = local_start_index is not None if spmd_mode: assert local_engine_count == 1 - self.core_engines = [ - CoreEngine(index=local_start_index, local=True) - ] + self.core_engines = [CoreEngine(index=dp_rank, local=True)] else: - assert parallel_config.data_parallel_rank == 0 + assert dp_rank == 0 local_start_index = 0 self.core_engines = [ CoreEngine(index=i, local=(i < local_engine_count)) -- GitLab From 20079c6e365752713c05de0eb719ac0307979c3b Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 1 Jun 2025 02:00:11 +0800 Subject: [PATCH 099/274] [Misc] add return token strs for tokenize (#18941) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- tests/entrypoints/openai/test_tokenization.py | 65 ++++++++++++++----- vllm/entrypoints/openai/protocol.py | 11 ++++ .../openai/serving_tokenization.py | 5 ++ 3 files changed, 66 insertions(+), 
15 deletions(-) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 9773f3e45..7d823542e 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -76,11 +76,11 @@ async def test_tokenize_completions( }) response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192 - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None @pytest.mark.asyncio @@ -138,11 +138,11 @@ async def test_tokenize_chat( }) response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192 - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None @pytest.mark.asyncio @@ -215,11 +215,46 @@ async def test_tokenize_chat_with_tools( ) response.raise_for_status() - assert response.json() == { - "tokens": tokens, - "count": len(tokens), - "max_model_len": 8192, - } + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name, tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], +) +async def test_tokenize_with_return_token_strs( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") + + prompt = "This is a token_strs test prompt! 
vllm1" + response = requests.post( + server.url_for("tokenize"), + json={ + "prompt": prompt, + "model": model_name, + "return_token_strs": True + }, + ) + response.raise_for_status() + + tokens = tokenizer.encode(prompt, add_special_tokens=True) + tokens_str = tokenizer.convert_ids_to_tokens(tokens) + + result = response.json() + assert result["tokens"] == tokens + assert result["count"] == len(tokens) + assert result["max_model_len"] == 8192 + assert result["token_strs"] == tokens_str @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 2f641079e..e72c23993 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1563,6 +1563,11 @@ class TokenizeCompletionRequest(OpenAIBaseModel): "If true (the default), special tokens (e.g. BOS) will be added to " "the prompt."), ) + return_token_strs: Optional[bool] = Field( + default=False, + description=("If true, also return the token strings " + "corresponding to the token ids."), + ) class TokenizeChatRequest(OpenAIBaseModel): @@ -1576,6 +1581,11 @@ class TokenizeChatRequest(OpenAIBaseModel): "This is a parameter used by chat template in tokenizer config of the " "model."), ) + return_token_strs: Optional[bool] = Field( + default=False, + description=("If true, also return the token strings " + "corresponding to the token ids."), + ) continue_final_message: bool = Field( default=False, description= @@ -1633,6 +1643,7 @@ class TokenizeResponse(OpenAIBaseModel): count: int max_model_len: int tokens: list[int] + token_strs: Optional[list[str]] = None class DetokenizeRequest(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 5ef1a486d..0d739bbf9 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -110,7 +110,12 @@ class OpenAIServingTokenization(OpenAIServing): dict) and 
"prompt_token_ids" in engine_prompt: input_ids.extend(engine_prompt["prompt_token_ids"]) + token_strs = None + if request.return_token_strs: + token_strs = tokenizer.convert_ids_to_tokens(input_ids) + return TokenizeResponse(tokens=input_ids, + token_strs=token_strs, count=len(input_ids), max_model_len=self.max_model_len) -- GitLab From bbfa0c61d195968dd0e9a6169243019109098a2c Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Sat, 31 May 2025 15:07:38 -0400 Subject: [PATCH 100/274] [Misc][Benchmark] Add support for CustomDataset (#18511) --- benchmarks/README.md | 48 ++++++++++++++++ benchmarks/benchmark_dataset.py | 94 +++++++++++++++++++++++++++++++- benchmarks/benchmark_serving.py | 30 +++++++++- vllm/benchmarks/datasets.py | 97 ++++++++++++++++++++++++++++++++- vllm/benchmarks/serve.py | 3 + 5 files changed, 264 insertions(+), 8 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index cbf2f281b..6f9fbb91c 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -64,6 +64,12 @@ become available. ✅ lmms-lab/LLaVA-OneVision-Data, Aeala/ShareGPT_Vicuna_unfiltered + + Custom + ✅ + ✅ + Local file: data.jsonl + @@ -124,6 +130,38 @@ P99 ITL (ms): 8.39 ================================================== ``` +### Custom Dataset +If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. 
Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl + +``` +{"prompt": "What is the capital of India?"} +{"prompt": "What is the capital of Iran?"} +{"prompt": "What is the capital of China?"} +``` + +```bash +# start server +VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests +``` + +```bash +# run benchmarking script +python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \ + --backend vllm \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --endpoint /v1/completions \ + --dataset-name custom \ + --dataset-path \ + --custom-skip-chat-template \ + --num-prompts 80 \ + --max-concurrency 1 \ + --temperature=0.3 \ + --top-p=0.75 \ + --result-dir "./log/" +``` + +You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`. + ### VisionArena Benchmark for Vision Language Models ```bash @@ -203,6 +241,16 @@ python3 vllm/benchmarks/benchmark_serving.py \ --seed 42 ``` +**`philschmid/mt-bench`** + +``` bash +python3 vllm/benchmarks/benchmark_serving.py \ + --model Qwen/QwQ-32B \ + --dataset-name hf \ + --dataset-path philschmid/mt-bench \ + --num-prompts 80 +``` + ### Running With Sampling Parameters When using OpenAI-compatible backends such as `vllm`, optional sampling diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 5513a5f78..d86bf045e 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -9,9 +9,6 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. 
""" import base64 @@ -442,6 +439,97 @@ class ShareGPTDataset(BenchmarkDataset): return samples +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. + # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset." 
+ ) + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + ) + ) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 79024a9d6..6bd9f1b49 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -60,6 +60,7 @@ from benchmark_dataset import ( ASRDataset, BurstGPTDataset, ConversationDataset, + CustomDataset, HuggingFaceDataset, InstructCoderDataset, MTBenchDataset, @@ -627,7 +628,16 @@ def main(args: argparse.Namespace): "'--dataset-path' if required." 
) - if args.dataset_name == "sonnet": + if args.dataset_name == "custom": + dataset = CustomDataset(dataset_path=args.dataset_path) + input_requests = dataset.sample( + num_requests=args.num_prompts, + tokenizer=tokenizer, + output_len=args.custom_output_len, + skip_chat_template=args.custom_skip_chat_template, + ) + + elif args.dataset_name == "sonnet": dataset = SonnetDataset(dataset_path=args.dataset_path) # For the "sonnet" dataset, formatting depends on the backend. if args.backend == "openai-chat": @@ -838,6 +848,8 @@ def main(args: argparse.Namespace): ]: if field in result_json: del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file base_model_id = model_id.split("/")[-1] @@ -850,6 +862,7 @@ def main(args: argparse.Namespace): if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) with open( file_name, mode="a+" if args.append_result else "w", encoding="utf-8" @@ -890,7 +903,7 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"], + choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"], help="Name of the dataset to benchmark on.", ) parser.add_argument( @@ -1060,6 +1073,19 @@ if __name__ == "__main__": ) # group for dataset specific arguments + custom_group = parser.add_argument_group("custom dataset options") + custom_group.add_argument( + "--custom-output-len", + type=int, + default=256, + help="Number of output tokens per request, used only for custom dataset.", + ) + custom_group.add_argument( + "--custom-skip-chat-template", + action="store_true", + help="Skip applying chat template to prompt, used only for custom dataset.", + ) + sonnet_group = parser.add_argument_group("sonnet dataset options") sonnet_group.add_argument( "--sonnet-input-len", diff --git a/vllm/benchmarks/datasets.py 
b/vllm/benchmarks/datasets.py index 712e83528..35cc303f6 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -9,9 +9,6 @@ generation. Supported dataset types include: - BurstGPT - HuggingFace - VisionArena - -TODO: Implement CustomDataset to parse a JSON file and convert its contents into -SampleRequest instances, similar to the approach used in ShareGPT. """ import base64 import io @@ -26,6 +23,7 @@ from io import BytesIO from typing import Any, Callable, Optional, Union import numpy as np +import pandas as pd from PIL import Image from transformers import PreTrainedTokenizerBase @@ -443,6 +441,99 @@ class ShareGPTDataset(BenchmarkDataset): return samples +# ----------------------------------------------------------------------------- +# Custom Dataset Implementation +# ----------------------------------------------------------------------------- + + +class CustomDataset(BenchmarkDataset): + """ + Implements the Custom dataset. Loads data from a JSONL file and generates + sample requests based on conversation turns. E.g., + ``` + {"prompt": "What is the capital of India?"} + {"prompt": "What is the capital of Iran?"} + {"prompt": "What is the capital of China?"} + ``` + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + self.load_data() + + def load_data(self) -> None: + if self.dataset_path is None: + raise ValueError("dataset_path must be provided for loading data.") + + # self.data will be a list of dictionaries + # e.g., [{"prompt": "What is the capital of India?"}, ...] + # This will be the standardized format which load_data() + # has to convert into depending on the filetype of dataset_path. 
+ # sample() will assume this standardized format of self.data + self.data = [] + + # Load the JSONL file + if self.dataset_path.endswith(".jsonl"): + jsonl_data = pd.read_json(path_or_buf=self.dataset_path, + lines=True) + + # check if the JSONL file has a 'prompt' column + if "prompt" not in jsonl_data.columns: + raise ValueError("JSONL file must contain a 'prompt' column.") + + # Convert each row to a dictionary and append to self.data + # This will convert the DataFrame to a list of dictionaries + # where each dictionary corresponds to a row in the DataFrame. + # This is the standardized format we want for self.data + for _, row in jsonl_data.iterrows(): + self.data.append(row.to_dict()) + else: + raise NotImplementedError( + "Only JSONL format is supported for CustomDataset.") + + random.seed(self.random_seed) + random.shuffle(self.data) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + lora_path: Optional[str] = None, + max_loras: Optional[int] = None, + output_len: Optional[int] = None, + enable_multimodal_chat: bool = False, + skip_chat_template: bool = False, + **kwargs, + ) -> list: + sampled_requests = [] + for item in self.data: + if len(sampled_requests) >= num_requests: + break + prompt = item["prompt"] + + # apply template + if not skip_chat_template: + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + + prompt_len = len(tokenizer(prompt).input_ids) + sampled_requests.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + )) + self.maybe_oversample_requests(sampled_requests, num_requests) + + return sampled_requests + + # ----------------------------------------------------------------------------- # Sonnet Dataset Implementation # ----------------------------------------------------------------------------- diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 
040815e87..858a0c6a0 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1110,6 +1110,8 @@ def main(args: argparse.Namespace): ]: if field in result_json: del result_json[field] + if field in benchmark_result: + del benchmark_result[field] # Save to file base_model_id = model_id.split("/")[-1] @@ -1120,6 +1122,7 @@ def main(args: argparse.Namespace): if args.result_filename: file_name = args.result_filename if args.result_dir: + os.makedirs(args.result_dir, exist_ok=True) file_name = os.path.join(args.result_dir, file_name) with open(file_name, mode="a+" if args.append_result else "w", -- GitLab From 1bc86a3da1bd45e7d43347d6532a515950a438f0 Mon Sep 17 00:00:00 2001 From: Benjamin Chislett Date: Sat, 31 May 2025 22:58:07 -0400 Subject: [PATCH 101/274] [Bugfix] Fix EAGLE3 broken logits (#18909) Signed-off-by: Benjamin Chislett --- vllm/model_executor/models/llama_eagle3.py | 23 +++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index f211bfe54..1e40017fc 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -215,6 +215,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) if self.draft_id_to_target_id is None: + assert logits.shape[1] == self.config.vocab_size, \ + "Expected logits to have shape " \ + f"(*, {self.config.vocab_size}), but got {logits.shape}" return logits base = torch.arange(self.config.draft_vocab_size, device=logits.device) @@ -234,24 +237,22 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): return self.model.fc(hidden_states) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader( - self, - skip_prefixes=None, - ) - model_weights = {} + includes_draft_id_mapping = False for name, loaded_weight in weights: if "t2d" in name: continue if 
"d2t" in name: name = name.replace("d2t", "draft_id_to_target_id") + includes_draft_id_mapping = True elif "lm_head" not in name: name = "model." + name model_weights[name] = loaded_weight - loaded_weights = loader.load_weights(model_weights.items()) - - if 'd2t' not in loaded_weights: - self.draft_id_to_target_id = None - - return loaded_weights + loader = AutoWeightsLoader( + self, + skip_prefixes=None, + skip_substrs=["draft_id_to_target_id"] \ + if not includes_draft_id_mapping else None, + ) + loader.load_weights(model_weights.items()) -- GitLab From 6aa8f9a4e7ed7af459476afe4f293f383e35f3e8 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Jun 2025 11:04:23 +0800 Subject: [PATCH 102/274] [Core] Rework dtype resolution (#18751) Signed-off-by: DarkLight1337 --- .../test_basic_correctness.py | 5 +- tests/conftest.py | 7 +- tests/models/language/pooling/mteb_utils.py | 11 +- .../language/pooling/test_classification.py | 2 +- .../models/language/pooling/test_embedding.py | 6 +- .../multimodal/generation/test_whisper.py | 1 + .../multimodal/processing/test_common.py | 2 +- tests/samplers/test_no_bad_words.py | 2 +- tests/test_utils.py | 102 +++++++-- vllm/config.py | 202 ++++++++++++------ vllm/platforms/cpu.py | 2 +- vllm/transformers_utils/config.py | 40 +++- vllm/utils.py | 51 ++++- 13 files changed, 314 insertions(+), 119 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 86b5e1e0a..11c8e7a4b 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -60,7 +60,6 @@ def _fix_prompt_embed_outputs( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN"]) -@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("enable_prompt_embeds", [True, False]) @@ -69,7 +68,6 @@ def 
test_models( hf_runner, model: str, backend: str, - dtype: str, max_tokens: int, enforce_eager: bool, enable_prompt_embeds: bool, @@ -97,7 +95,7 @@ def test_models( str(i) for i in range(1024)) + " are:" example_prompts = [prompt] - with hf_runner(model, dtype=dtype) as hf_model: + with hf_runner(model) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) if enable_prompt_embeds: with torch.no_grad(): @@ -106,7 +104,6 @@ def test_models( with VllmRunner(model, max_model_len=8192, - dtype=dtype, enforce_eager=enforce_eager, enable_prompt_embeds=enable_prompt_embeds, gpu_memory_utilization=0.7) as vllm_model: diff --git a/tests/conftest.py b/tests/conftest.py index 26674483f..6336c6c2c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -324,7 +324,12 @@ class HfRunner: trust_remote_code=trust_remote_code, ) self.device = self.get_default_device() - self.dtype = torch_dtype = _get_and_verify_dtype(self.config, dtype) + self.dtype = torch_dtype = _get_and_verify_dtype( + self.model_name, + self.config, + dtype=dtype, + is_pooling_model=is_sentence_transformer or is_cross_encoder, + ) model_kwargs = model_kwargs if model_kwargs is not None else {} model_kwargs.setdefault("torch_dtype", torch_dtype) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index f4837ae95..f45168bc0 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -102,21 +102,18 @@ def mteb_test_embed_models(hf_runner, vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - model_dtype = getattr( - vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", - vllm_dtype) - with set_default_torch_dtype(model_dtype) and hf_runner( + with set_default_torch_dtype(vllm_dtype) and hf_runner( model_info.name, is_sentence_transformer=True, - dtype=model_dtype) as hf_model: + 
dtype=vllm_dtype) as hf_model: if hf_model_callback is not None: hf_model_callback(hf_model) st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) - print("VLLM:", vllm_dtype, vllm_main_score) - print("SentenceTransformer:", model_dtype, st_main_score) + print("VLLM:", vllm_main_score) + print("SentenceTransformers:", st_main_score) print("Difference:", st_main_score - vllm_main_score) assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 44af3df08..57b3cb58d 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -43,6 +43,6 @@ def test_models( # the tolerance value of 1e-2 is selected based on the # half datatype tests in - # tests/models/embedding/language/test_embedding.py + # tests/models/language/pooling/test_embedding.py assert torch.allclose(hf_output, vllm_output, 1e-3 if dtype == "float" else 1e-2) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 306cfdf37..8f82c8091 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -30,13 +30,11 @@ from ...utils import check_embeddings_close pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) -@pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, vllm_runner, example_prompts, model, - dtype: str, monkeypatch, ) -> None: @@ -58,13 +56,11 @@ def test_models( # So we need to strip the input texts to avoid test failing. 
example_prompts = [str(s).strip() for s in example_prompts] - with hf_runner(model, dtype=dtype, - is_sentence_transformer=True) as hf_model: + with hf_runner(model, is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) with vllm_runner(model, task="embed", - dtype=dtype, max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 4e48bdbd0..d0b85842a 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -100,6 +100,7 @@ def run_test( with vllm_runner( model, + dtype="half", max_model_len=448, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 572fa366d..d7f950c23 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -40,7 +40,7 @@ def _test_processing_correctness( tokenizer_mode=model_info.tokenizer_mode, trust_remote_code=model_info.trust_remote_code, seed=0, - dtype="float16", + dtype="auto", revision=None, hf_overrides=model_info.hf_overrides, ) diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 355e3adcf..f9688b4b9 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -103,7 +103,7 @@ class TestTwoTokenBadWord: add_special_tokens=False)[0] def test_two_token_bad_word(self, vllm_runner): - with vllm_runner(self.MODEL) as llm: + with vllm_runner(self.MODEL, dtype="half") as llm: output_token_ids = self._generate(llm) assert output_token_ids[:2] == [ self.target_token_id1, self.target_token_id2 diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b88d05ef..dd8777f06 
100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,7 +17,8 @@ from vllm_test_utils.monitor import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, - bind_kv_cache, deprecate_kwargs, get_open_port, + bind_kv_cache, common_broadcastable_dtype, + deprecate_kwargs, get_open_port, is_lossless_cast, make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_zmq_path, supports_kw, swap_dict_values) @@ -567,12 +568,65 @@ def test_lru_cache(): assert 6 in cache +# yapf: disable +@pytest.mark.parametrize( + ("src_dtype", "tgt_dtype", "expected_result"), + [ + # Different precision_levels + (torch.bool, torch.int8, True), + (torch.bool, torch.float16, True), + (torch.bool, torch.complex32, True), + (torch.int64, torch.bool, False), + (torch.int64, torch.float16, True), + (torch.int64, torch.complex32, True), + (torch.float64, torch.bool, False), + (torch.float64, torch.int8, False), + (torch.float64, torch.complex32, True), + (torch.complex128, torch.bool, False), + (torch.complex128, torch.int8, False), + (torch.complex128, torch.float16, False), + # precision_level=0 + (torch.bool, torch.bool, True), + # precision_level=1 + (torch.int8, torch.int16, True), + (torch.int16, torch.int8, False), + (torch.uint8, torch.int8, False), + (torch.int8, torch.uint8, False), + # precision_level=2 + (torch.float16, torch.float32, True), + (torch.float32, torch.float16, False), + (torch.bfloat16, torch.float32, True), + (torch.float32, torch.bfloat16, False), + # precision_level=3 + (torch.complex32, torch.complex64, True), + (torch.complex64, torch.complex32, False), + ], +) +# yapf: enable +def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result): + assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result + + +# yapf: disable +@pytest.mark.parametrize( + ("dtypes", 
"expected_result"), + [ + ([torch.bool], torch.bool), + ([torch.bool, torch.int8], torch.int8), + ([torch.bool, torch.int8, torch.float16], torch.float16), + ([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501 + ], +) +# yapf: enable +def test_common_broadcastable_dtype(dtypes, expected_result): + assert common_broadcastable_dtype(dtypes) == expected_result + + def test_placeholder_module_error_handling(): placeholder = PlaceholderModule("placeholder_1234") def build_ctx(): - return pytest.raises(ModuleNotFoundError, - match="No module named") + return pytest.raises(ModuleNotFoundError, match="No module named") with build_ctx(): int(placeholder) @@ -608,6 +662,7 @@ def test_placeholder_module_error_handling(): _ = placeholder_attr.module +# yapf: disable @pytest.mark.parametrize( "obj,key1,key2", [ @@ -618,6 +673,7 @@ def test_placeholder_module_error_handling(): # Tests for both keys do not exist ({1: "a", 2: "b"}, 3, 4), ]) +# yapf: enable def test_swap_dict_values(obj, key1, key2): original_obj = obj.copy() swap_dict_values(obj, key1, key2) @@ -631,19 +687,19 @@ def test_swap_dict_values(obj, key1, key2): assert key1 not in obj -def test_model_specification(parser_with_config, - cli_config_file, +def test_model_specification(parser_with_config, cli_config_file, cli_config_file_with_model): # Test model in CLI takes precedence over config - args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model - ]) + args = parser_with_config.parse_args( + ['serve', 'cli-model', '--config', cli_config_file_with_model]) assert args.model_tag == 'cli-model' assert args.served_model_name == 'mymodel' # Test model from config file works args = parser_with_config.parse_args([ - 'serve', '--config', cli_config_file_with_model, + 'serve', + '--config', + cli_config_file_with_model, ]) assert args.model == 'config-model' assert args.served_model_name == 'mymodel' @@ -654,17 +710,19 @@ def 
test_model_specification(parser_with_config, # Test using --model option raises error with pytest.raises( - ValueError, - match=( - "With `vllm serve`, you should provide the model as a positional " - "argument or in a config file instead of via the `--model` option." - ), + ValueError, + match= + ("With `vllm serve`, you should provide the model as a positional " + "argument or in a config file instead of via the `--model` option."), ): parser_with_config.parse_args(['serve', '--model', 'my-model']) # Test other config values are preserved args = parser_with_config.parse_args([ - 'serve', 'cli-model', '--config', cli_config_file_with_model, + 'serve', + 'cli-model', + '--config', + cli_config_file_with_model, ]) assert args.tensor_parallel_size == 2 assert args.trust_remote_code is True @@ -673,7 +731,7 @@ def test_model_specification(parser_with_config, @pytest.mark.parametrize("input", [(), ("abc", ), (None, ), - (None, bool, [1, 2, 3])]) + (None, bool, [1, 2, 3])]) @pytest.mark.parametrize("output", [0, 1, 2]) def test_sha256(input: tuple, output: int): hash = sha256(input) @@ -682,7 +740,8 @@ def test_sha256(input: tuple, output: int): assert hash != 0 bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL) - assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big") + assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), + byteorder="big") # hashing again, returns the same value assert hash == sha256(input) @@ -698,8 +757,7 @@ def test_sha256(input: tuple, output: int): ("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")), ("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address ("inproc://some_identifier", ("inproc", "some_identifier", "")), - ] -) + ]) def test_split_zmq_path(path, expected): assert split_zmq_path(path) == expected @@ -711,8 +769,7 @@ def test_split_zmq_path(path, expected): "tcp://127.0.0.1", # Missing port "tcp://[::1]", # Missing port for IPv6 "tcp://:5555", # Missing host - ] -) + ]) def 
test_split_zmq_path_invalid(invalid_path): with pytest.raises(ValueError): split_zmq_path(invalid_path) @@ -734,7 +791,8 @@ def test_make_zmq_socket_ipv6(): zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type) # Verify that the IPV6 option is set - assert zsock.getsockopt(zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses" + assert zsock.getsockopt( + zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses" # Clean up zsock.close() diff --git a/vllm/config.py b/vllm/config.py index dfa44b044..f400e9875 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -24,6 +24,7 @@ import torch from pydantic import (ConfigDict, SkipValidation, TypeAdapter, field_validator, model_validator) from pydantic.dataclasses import dataclass +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE from torch.distributed import ProcessGroup, ReduceOp from transformers import PretrainedConfig from typing_extensions import deprecated, runtime_checkable @@ -42,15 +43,16 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, - try_get_generation_config, uses_mrope) + try_get_generation_config, try_get_safetensors_metadata, uses_mrope) from vllm.transformers_utils.s3_utils import S3Model from vllm.transformers_utils.utils import is_s3, maybe_model_redirect from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes, - LayerBlockType, cuda_device_count_stateless, - get_cpu_memory, get_open_port, is_torch_equal_or_newer, - random_uuid, resolve_obj_by_qualname) + LayerBlockType, common_broadcastable_dtype, + cuda_device_count_stateless, get_cpu_memory, + get_open_port, is_torch_equal_or_newer, random_uuid, + resolve_obj_by_qualname) if TYPE_CHECKING: from _typeshed import DataclassInstance @@ -540,7 +542,24 @@ class 
ModelConfig: self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, hf_token=self.hf_token, revision=self.revision) - self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype) + + supported_tasks, task = self._resolve_task(self.task) + self.supported_tasks = supported_tasks + self.task = task + if self.task in ("draft", "generate"): + self.truncation_side = "left" + else: + self.truncation_side = "right" + + self.pooler_config = self._init_pooler_config() + + self.dtype = _get_and_verify_dtype( + self.model, + self.hf_config, + self.dtype, + is_pooling_model=self.runner_type == "pooling", + revision=self.revision, + ) # Workaround for Gemma 2 which uses interleaved sliding window # attention, but it's not specified in its config. TODO: remove this @@ -597,16 +616,6 @@ class ModelConfig: raise ValueError( "`override_neuron_config` is only supported on Neuron.") - supported_tasks, task = self._resolve_task(self.task) - self.supported_tasks = supported_tasks - self.task = task - if self.task in ("draft", "generate"): - self.truncation_side = "left" - else: - self.truncation_side = "right" - - self.pooler_config = self._init_pooler_config() - self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -692,7 +701,6 @@ class ModelConfig: self.model, self.revision) def _init_pooler_config(self) -> Optional["PoolerConfig"]: - if self.runner_type == "pooling": if isinstance(self.override_pooler_config, dict): self.override_pooler_config = PoolerConfig( @@ -3074,13 +3082,37 @@ _STR_DTYPE_TO_TORCH_DTYPE = { "bfloat16": torch.bfloat16, } -_ROCM_NOT_SUPPORTED_DTYPE: list[str] = [] # +# model_type -> reason +_FLOAT16_NOT_SUPPORTED_MODELS = { + "gemma2": "Numerical instability. Please use bfloat16 or float32 instead.", + "gemma3": "Numerical instability. Please use bfloat16 or float32 instead.", + "plamo2": "Numerical instability. 
Please use bfloat16 or float32 instead.", + "glm4": "Numerical instability. Please use bfloat16 or float32 instead.", +} -def _get_and_verify_dtype( +def _is_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: # noqa: E501, SIM103 + return False + + return True + + +def _check_valid_dtype(model_type: str, dtype: torch.dtype): + if model_type in _FLOAT16_NOT_SUPPORTED_MODELS and dtype == torch.float16: + reason = _FLOAT16_NOT_SUPPORTED_MODELS[model_type] + raise ValueError(f"The model type {model_type!r} " + f"does not support float16. Reason: {reason}") + + return True + + +def _find_dtype( + model_id: str, config: PretrainedConfig, - dtype: Union[str, torch.dtype], -) -> torch.dtype: + *, + revision: Optional[str], +): # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct # because config.torch_dtype can be None. config_dtype = getattr(config, "torch_dtype", None) @@ -3092,75 +3124,111 @@ def _get_and_verify_dtype( if config_dtype is None and hasattr(config, "vision_config"): config_dtype = getattr(config.vision_config, "torch_dtype", None) + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + if config_dtype is None: config_dtype = torch.float32 - if isinstance(dtype, str): - dtype = dtype.lower() - if dtype == "auto": - # Set default dtype from model config - if config_dtype == torch.float32: - # Following common practice, we use float16 for float32 models - torch_dtype = torch.float16 - else: - torch_dtype = config_dtype + 
return config_dtype - if config.model_type == "plamo2": - logger.warning( - "For PLaMo2, we cast models to bfloat16 instead of using " - "float16 by default. This is because float16 does not work." - ) - torch_dtype = torch.bfloat16 - # Deal with torch dtype fallback for device compatibility. - from vllm.platforms import current_platform - if torch_dtype not in current_platform.supported_dtypes: - device_name = current_platform.get_device_name() +def _resolve_auto_dtype( + model_type: str, + config_dtype: torch.dtype, + *, + is_pooling_model: bool, +): + from vllm.platforms import current_platform - if ((capability := current_platform.get_device_capability()) - is None): - compute_str = "" - else: - version_str = capability.as_version_str() - compute_str = f" (with compute capability {version_str})" - fallback_dtype = current_platform.supported_dtypes[0] - logger.warning( - "Your %s device%s doesn't support %s. " \ - "Falling back to %s for compatibility.", - device_name, compute_str, torch_dtype, fallback_dtype - ) - torch_dtype = fallback_dtype + supported_dtypes = [ + dtype for dtype in current_platform.supported_dtypes + if _is_valid_dtype(model_type, dtype) + ] - if current_platform.is_hpu() and torch_dtype == torch.float16: - logger.warning( - "For HPU, we cast models to bfloat16 instead of " - "using float16 by default. Please specify `dtype` if you " - "want to use float16.") - torch_dtype = torch.bfloat16 - elif dtype == "float16" and config.model_type == "plamo2": - logger.warning( - "For PLaMo2, using float16 is unstable and might cause " - "unexpected behavior. 
Please use bfloat16 or float32 instead.") - torch_dtype = torch.float16 + if is_pooling_model and torch.float16 in supported_dtypes: + preferred_dtype = torch.float16 + else: + preferred_dtype = supported_dtypes[0] + + # Downcast for float32 models + if config_dtype == torch.float32: + config_dtype = preferred_dtype + + if config_dtype in supported_dtypes: + return config_dtype + + # Ensure device compatibility + device_name = current_platform.get_device_name() + device_capability = current_platform.get_device_capability() + + if device_capability is None: + device_str = f"{device_name!r}" + else: + version_str = device_capability.as_version_str() + device_str = f"{device_name!r} (with compute capability {version_str})" + + logger.warning( + "Your device %s doesn't support %s. " + "Falling back to %s for compatibility.", + device_str, + config_dtype, + preferred_dtype, + ) + + return preferred_dtype + + +def _get_and_verify_dtype( + model_id: str, + config: PretrainedConfig, + dtype: Union[str, torch.dtype], + *, + is_pooling_model: bool, + revision: Optional[str] = None, +) -> torch.dtype: + config_dtype = _find_dtype(model_id, config, revision=revision) + model_type = config.model_type + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + # Set default dtype from model config + torch_dtype = _resolve_auto_dtype( + model_type, + config_dtype, + is_pooling_model=is_pooling_model, + ) else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: - raise ValueError(f"Unknown dtype: {dtype}") + raise ValueError(f"Unknown dtype: {dtype!r}") torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] elif isinstance(dtype, torch.dtype): torch_dtype = dtype else: raise ValueError(f"Unknown dtype: {dtype}") - # Verify the dtype. + _check_valid_dtype(model_type, torch_dtype) + if torch_dtype != config_dtype: if torch_dtype == torch.float32: # Upcasting to float32 is allowed. 
logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) - pass elif config_dtype == torch.float32: # Downcasting from float32 to float16 or bfloat16 is allowed. logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) - pass else: # Casting between float16 and bfloat16 is allowed with a warning. logger.warning("Casting %s to %s.", config_dtype, torch_dtype) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c79c603c0..eaffaac78 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -28,7 +28,7 @@ class CpuPlatform(Platform): dispatch_key: str = "CPU" @property - def supported_dtypes(self) -> list: + def supported_dtypes(self) -> list[torch.dtype]: if self.get_cpu_architecture() == CpuArchEnum.POWERPC: return [torch.bfloat16, torch.float32] elif sys.platform.startswith( diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2ed71a4d3..8774f95a2 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -4,12 +4,12 @@ import enum import json import os import time -from functools import cache +from functools import cache, partial from pathlib import Path -from typing import Any, Callable, Literal, Optional, Union +from typing import Any, Callable, Literal, Optional, TypeVar, Union import huggingface_hub -from huggingface_hub import hf_hub_download +from huggingface_hub import get_safetensors_metadata, hf_hub_download from huggingface_hub import list_repo_files as hf_list_repo_files from huggingface_hub import try_to_load_from_cache from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError, @@ -93,10 +93,15 @@ class ConfigFormat(str, enum.Enum): MISTRAL = "mistral" -def with_retry(func: Callable[[], Any], - log_msg: str, - max_retries: int = 2, - retry_delay: int = 2): +_R = TypeVar("_R") + + +def with_retry( + func: Callable[[], _R], + log_msg: str, + max_retries: int = 2, + retry_delay: int = 2, +) -> _R: for attempt in range(max_retries): try: return 
func() @@ -109,6 +114,8 @@ def with_retry(func: Callable[[], Any], time.sleep(retry_delay) retry_delay *= 2 + raise AssertionError("Should not be reached") + # @cache doesn't cache exceptions @cache @@ -840,3 +847,22 @@ def get_cross_encoder_activation_function(config: PretrainedConfig): return resolve_obj_by_qualname(function_name)() else: return nn.Sigmoid() if config.num_labels == 1 else nn.Identity() + + +def try_get_safetensors_metadata( + model: str, + *, + revision: Optional[str] = None, +): + get_safetensors_metadata_partial = partial( + get_safetensors_metadata, + model, + revision=revision, + token=os.getenv('HF_TOKEN', None), + ) + + try: + return with_retry(get_safetensors_metadata_partial, + "Error retrieving safetensors") + except Exception: + return None diff --git a/vllm/utils.py b/vllm/utils.py index 65d3579d5..c879b38d0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -37,8 +37,8 @@ from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser, _ArgumentGroup) from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import UserDict, defaultdict -from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable, - Iterable, Iterator, KeysView, Mapping) +from collections.abc import (AsyncGenerator, Awaitable, Collection, Generator, + Hashable, Iterable, Iterator, KeysView, Mapping) from concurrent.futures.process import ProcessPoolExecutor from dataclasses import dataclass, field from functools import cache, lru_cache, partial, wraps @@ -979,6 +979,53 @@ def get_dtype_size(dtype: torch.dtype) -> int: return torch.tensor([], dtype=dtype).element_size() +# bool = 0, int = 1, float = 2, complex = 3 +def _get_precision_level(dtype: torch.dtype) -> int: + # NOTE: Complex dtypes return `is_floating_point=False` + return ((dtype != torch.bool) + dtype.is_floating_point + + dtype.is_complex * 2) + + +def is_lossless_cast(src_dtype: torch.dtype, tgt_dtype: torch.dtype): + """ + Test whether it is lossless to cast 
a tensor from + `src_dtype` to `tgt_dtype`. + """ + if src_dtype == tgt_dtype: + return True + + src_level = _get_precision_level(src_dtype) + tgt_level = _get_precision_level(tgt_dtype) + + if src_level < tgt_level: + return True + if src_level > tgt_level: + return False + + # Compare integral types + if not src_dtype.is_floating_point and not src_dtype.is_complex: + src_info = torch.iinfo(src_dtype) + tgt_info = torch.iinfo(tgt_dtype) + return src_info.min >= tgt_info.min and src_info.max <= tgt_info.max + + # Compare floating-point types + src_info = torch.finfo(src_dtype) + tgt_info = torch.finfo(tgt_dtype) + return (src_info.min >= tgt_info.min and src_info.max <= tgt_info.max + and src_info.resolution >= tgt_info.resolution) + + +def common_broadcastable_dtype(dtypes: Collection[torch.dtype]): + """ + Get the common `dtype` where all of the other `dtypes` can be + cast to it without losing any information. + """ + return max( + dtypes, + key=lambda dtype: sum(is_lossless_cast(dt, dtype) for dt in dtypes), + ) + + # `collections` helpers def is_list_of( value: object, -- GitLab From a35ca765a52fff242edf0e9fd3203ea2534aed58 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 1 Jun 2025 11:06:57 +0800 Subject: [PATCH 103/274] [LoRA] Support dynamically initialize `packed_modules_mapping` for VLM with arbitrary components (#18987) Signed-off-by: isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/lora/models.py | 6 +++--- .../model_loader/bitsandbytes_loader.py | 8 +++---- vllm/model_executor/models/intern_vit.py | 4 ++++ vllm/model_executor/models/internvl.py | 9 -------- vllm/model_executor/models/qwen2_5_vl.py | 11 ---------- vllm/model_executor/models/qwen2_vl.py | 11 ---------- vllm/model_executor/utils.py | 21 +++++++++++++++++++ 7 files changed, 32 insertions(+), 38 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d3b1374a9..dfdc908d7 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,6 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import copy import math import os from collections.abc import Sequence @@ -34,6 +33,7 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal from vllm.model_executor.models.interfaces import is_pooling_model from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper +from vllm.model_executor.utils import get_packed_modules_mapping from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -364,8 +364,8 @@ class LoRAModelManager(AdapterModelManager): # We need to replace rotary emb layer to do batch computation # for long lora. self.supported_lora_modules.append("rotary_emb") - self.packed_modules_mapping = copy.deepcopy( - self.model.packed_modules_mapping) + + self.packed_modules_mapping = get_packed_modules_mapping(self.model) # Used to indicate whether the model is a multimodal model self.supports_mm: bool = ( supports_multimodal(self.model) diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 0d83c8d53..8996ea266 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 -import copy import fnmatch import glob import itertools @@ -36,7 +35,8 @@ from vllm.model_executor.model_loader.weight_utils import ( filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, pt_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.models import is_pooling_model -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import (get_packed_modules_mapping, + set_weight_attrs) from vllm.platforms import current_platform logger = init_logger(__name__) @@ -420,8 +420,8 @@ class BitsAndBytesModelLoader(BaseModelLoader): 
f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet. No 'packed_modules_mapping' found.") self.is_pool_model=is_pooling_model(model) - self.modules_mapping = ParamMapping( - copy.deepcopy(model.packed_modules_mapping)) + + self.modules_mapping = ParamMapping(get_packed_modules_mapping(model)) # For some models like Molmo, we need to use hf_to_vllm_mapper # to ensure correct loading of weights. diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index d9d9002bd..538e9de4f 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -415,6 +415,10 @@ class InternVisionEncoder(nn.Module): class InternVisionModel(nn.Module): + packed_modules_mapping = { + "qkv": ["qkv"], + } + def __init__( self, config: PretrainedConfig, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 71be2b48d..c37d3afb4 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1019,15 +1019,6 @@ class InternVLMultiModalProcessor( class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - packed_modules_mapping = { - "wqkv": ["wqkv"], - "qkv": ["qkv"], - "gate_up_proj": [ - "w1", - "w3", - ], - } - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 68dd07820..e3fa9f67c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -821,17 +821,6 @@ class Qwen2_5_VLMultiModalProcessor(Qwen2VLMultiModalProcessor): dummy_inputs=Qwen2_5_VLDummyInputsBuilder) class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - 
], - } # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0ff0836b0..873baa56f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1069,17 +1069,6 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo] dummy_inputs=Qwen2VLDummyInputsBuilder) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index f9d89e64b..1b120c354 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Utils for model executor.""" +import copy from typing import Any, Optional import torch @@ -51,3 +52,23 @@ def _make_synced_weight_loader(original_weight_loader): torch._sync(param) return _synced_weight_loader + + +def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: + parent_map = copy.deepcopy(getattr(model, "packed_modules_mapping", {})) + + # don't infer mapping if the model has defined it explicitly. 
+ if parent_map: + return parent_map + + # We only check main components instead of whole model submodules + for child in model.children(): + child_map = getattr(child, "packed_modules_mapping", {}) + if any((k in parent_map and parent_map[k] != v) + for k, v in child_map.items()): + raise ValueError( + f"Can't update {type(model).__name__}'s packed_modules_mapping " + f"safely because of conflicts from {type(child).__name__}.") + else: + parent_map.update(child_map) + return parent_map \ No newline at end of file -- GitLab From c594cbf56546f74e8042018477ca25d1ee95613a Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 1 Jun 2025 11:23:43 +0800 Subject: [PATCH 104/274] [doc] small fix - mkdocs (#18996) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/contributing/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 72c4909b1..6216a3d16 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -43,11 +43,12 @@ Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blo pip install -r requirements/docs.txt ``` -> **Note:** Ensure that your Python version is compatible with the plugins (e.g., mkdocs-awesome-nav requires Python 3.10+) +!!! 
note + Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+) #### Verify Installation -Confirm that MkDocs is correctly installed:: +Confirm that MkDocs is correctly installed: ```bash mkdocs --version -- GitLab From 2ad6194a0250932da7354e4d9dbebaf610f90202 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 31 May 2025 23:41:29 -0400 Subject: [PATCH 105/274] Let max_num_batched_tokens use human_readable_int for large numbers (#18968) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 13d8a280e..555532526 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -224,7 +224,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: elif contains_type(type_hints, int): kwargs[name]["type"] = int # Special case for large integers - if name in {"max_model_len"}: + if name in {"max_model_len", "max_num_batched_tokens"}: kwargs[name]["type"] = human_readable_int elif contains_type(type_hints, float): kwargs[name]["type"] = float -- GitLab From aa54a7bf7bae7e1db43693470ebe93e3dcd30f9d Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Sun, 1 Jun 2025 11:42:10 +0800 Subject: [PATCH 106/274] [BugFix] fix data parallel construct ipv6 url addres (#18991) Signed-off-by: rongfu.leng --- vllm/distributed/parallel_state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b674d05a7..6e48c02da 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -41,8 +41,8 @@ from vllm.distributed.device_communicators.base_device_communicator import ( DeviceCommunicatorBase) from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger -from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, - supports_custom_op) 
+from vllm.utils import (direct_register_custom_op, get_distributed_init_method, + resolve_obj_by_qualname, supports_custom_op) @dataclass @@ -929,7 +929,7 @@ def init_distributed_environment( world_size = parallel_config.world_size_across_dp ip = parallel_config.data_parallel_master_ip port = parallel_config.get_next_dp_init_port() - distributed_init_method = f"tcp://{ip}:{port}" # noqa + distributed_init_method = get_distributed_init_method(ip, port) logger.info( "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP", world_size, rank, distributed_init_method) -- GitLab From 2b102d51ad357903c502d72c1264fb2f724353f3 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 31 May 2025 20:42:23 -0700 Subject: [PATCH 107/274] [BugFix] Fix incorrect metrics shutdown error log message (#18992) Signed-off-by: Nick Hill --- vllm/v1/metrics/prometheus.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index f12568535..a364b286d 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -69,9 +69,13 @@ def unregister_vllm_metrics(): def shutdown_prometheus(): """Shutdown prometheus metrics.""" + + path = _prometheus_multiproc_dir + if path is None: + return try: pid = os.getpid() - multiprocess.mark_process_dead(pid) + multiprocess.mark_process_dead(pid, path) logger.debug("Marked Prometheus metrics for process %d as dead", pid) except Exception as e: logger.error("Error during metrics cleanup: %s", str(e)) -- GitLab From 432ec9926ebd3ce826f0d49df0a2a5ae3cc81ec0 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 1 Jun 2025 19:26:14 +0800 Subject: [PATCH 108/274] [doc] wrong output (#19000) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/contributing/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index 
6216a3d16..f84fb242f 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -57,7 +57,7 @@ mkdocs --version Example output: ```console -mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.9/site-packages/mkdocs (Python 3.9) +mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10) ``` #### Clone the `vLLM` repository -- GitLab From d6fd3a33b863003929fc3eef5dd9828219e04ab0 Mon Sep 17 00:00:00 2001 From: zhrrr <43847754+izhuhaoran@users.noreply.github.com> Date: Mon, 2 Jun 2025 03:41:18 +0800 Subject: [PATCH 109/274] [Misc] reuse num_tokens_across_dp of get_dp_padding to avoid unnecessary dp all reduce in set_forward_context (#18935) Signed-off-by: Tyler Michael Smith Co-authored-by: zhuhaoran Co-authored-by: Tyler Michael Smith --- vllm/forward_context.py | 27 ++++++++++++++------- vllm/v1/worker/gpu_model_runner.py | 38 ++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 0af16bbc0..f192be1c4 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -47,8 +47,12 @@ class DPMetadata: return num_tokens_tensor @staticmethod - def make(parallel_config: ParallelConfig, attn_metadata: Any, - num_tokens: int) -> "DPMetadata": + def make( + parallel_config: ParallelConfig, + attn_metadata: Any, + num_tokens: int, + num_tokens_across_dp: Optional[torch.Tensor] = None + ) -> "DPMetadata": assert parallel_config.data_parallel_size > 1 dp_size = parallel_config.data_parallel_size @@ -62,10 +66,15 @@ class DPMetadata: # for v1 attention backends or no attn_metadata batchsize = num_tokens - num_tokens_tensor = DPMetadata.num_tokens_across_dp( - batchsize, dp_size, dp_rank) - max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) - cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) + # If num_tokens_across_dp is None, it will be computed by all_reduce + # Otherwise, 
num_tokens_across_dp[dp_rank] should be equal to batchsize + assert (num_tokens_across_dp is None + or num_tokens_across_dp[dp_rank] == batchsize) + if num_tokens_across_dp is None: + num_tokens_across_dp = DPMetadata.num_tokens_across_dp( + batchsize, dp_size, dp_rank) + max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp) + cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_across_dp, dim=0) return DPMetadata(max_tokens_across_dp_cpu, cu_tokens_across_dp_cpu) @@ -101,7 +110,8 @@ def get_forward_context() -> ForwardContext: def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, - num_tokens: Optional[int] = None): + num_tokens: Optional[int] = None, + num_tokens_across_dp: Optional[torch.Tensor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. @@ -114,7 +124,8 @@ def set_forward_context(attn_metadata: Any, if vllm_config.parallel_config.data_parallel_size > 1 and ( attn_metadata is not None or num_tokens is not None): dp_metadata = DPMetadata.make(vllm_config.parallel_config, - attn_metadata, num_tokens or 0) + attn_metadata, num_tokens or 0, + num_tokens_across_dp) global _forward_context prev_context = _forward_context diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b6fa68ab0..4bc825ccb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1111,17 +1111,30 @@ class GPUModelRunner(LoRAModelRunnerMixin): for k, v in self.intermediate_tensors.items() }) - def get_dp_padding(self, num_tokens: int): + def get_dp_padding(self, + num_tokens: int) -> tuple[int, Optional[torch.Tensor]]: dp_size = self.vllm_config.parallel_config.data_parallel_size dp_rank = self.vllm_config.parallel_config.data_parallel_rank - if dp_size == 1: + + # For DP: Don't pad when setting enforce_eager. 
+ # This lets us set enforce_eager on the prefiller in a P/D setup and + # still use CUDA graphs (enabled by this padding) on the decoder. + # + # TODO(tms) : There are many cases where padding is enabled for + # prefills, causing unnecessary and excessive padding of activations. + + if dp_size == 1 or self.vllm_config.model_config.enforce_eager: # Early exit. - return 0 + return 0, None num_tokens_across_dp = DPMetadata.num_tokens_across_dp( num_tokens, dp_size, dp_rank) max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item() - return max_tokens_across_dp_cpu - num_tokens + num_tokens_after_padding = torch.tensor([max_tokens_across_dp_cpu] * + dp_size, + device="cpu", + dtype=torch.int32) + return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding @torch.inference_mode() def execute_model( @@ -1161,7 +1174,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): num_input_tokens = num_scheduled_tokens # Padding for DP - num_input_tokens += self.get_dp_padding(num_input_tokens) + num_pad, num_tokens_across_dp = self.get_dp_padding(num_input_tokens) + num_input_tokens += num_pad # _prepare_inputs may reorder the batch, so we must gather multi # modal outputs after that to ensure the correct order @@ -1208,7 +1222,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Use persistent buffers for CUDA graphs. 
with set_forward_context(attn_metadata, self.vllm_config, - num_tokens=num_input_tokens): + num_tokens=num_input_tokens, + num_tokens_across_dp=num_tokens_across_dp): self.maybe_setup_kv_connector(scheduler_output) model_output = self.model( @@ -1681,7 +1696,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): ) -> torch.Tensor: # Padding for DP - num_tokens += self.get_dp_padding(num_tokens) + num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) + num_tokens += num_pad # Set num_scheduled_tokens based on num_tokens and max_num_seqs # for dummy run with LoRA so that the num_reqs collectively @@ -1747,9 +1763,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) - with set_forward_context(attn_metadata, - self.vllm_config, - num_tokens=num_tokens): + with set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp): outputs = model( input_ids=input_ids, positions=positions, -- GitLab From b9f61e13875e1682d3982829006bec26981fde4d Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Sun, 1 Jun 2025 23:30:41 -0400 Subject: [PATCH 110/274] [Bugfix][Nixl] Fix DP Metadata Handshake (#19008) Signed-off-by: rshaw@neuralmagic.com --- .../kv_connector/v1/nixl_connector.py | 68 ++++++++++--------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 6a3472157..4d228dbc9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -19,7 +19,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.parallel_state import ( get_tensor_model_parallel_rank, 
get_tensor_model_parallel_world_size, - get_tp_group, get_world_group) + get_tp_group) from vllm.logger import init_logger from vllm.utils import make_zmq_path, make_zmq_socket, round_down from vllm.v1.core.sched.output import SchedulerOutput @@ -172,6 +172,11 @@ class NixlConnectorScheduler: self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size self.engine_id = engine_id + self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST + self.side_channel_port = ( + envs.VLLM_NIXL_SIDE_CHANNEL_PORT + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) logger.info("Initializing NIXL Scheduler %s", engine_id) # Requests that need to start recv. @@ -310,8 +315,8 @@ class NixlConnectorScheduler: do_remote_decode=False, remote_block_ids=computed_block_ids, remote_engine_id=self.engine_id, - remote_host=envs.VLLM_NIXL_SIDE_CHANNEL_HOST, - remote_port=envs.VLLM_NIXL_SIDE_CHANNEL_PORT, + remote_host=self.side_channel_host, + remote_port=self.side_channel_port, ) @@ -330,11 +335,19 @@ class NixlConnectorWorker: # Map of engine_id -> agent_name. self._remote_agents: dict[str, str] = {} + # NIXL handshake port. + # NOTE(rob): Within a DP group, each DP rank gets its own + # base port (which is sent in the KVTransferParams). + # Each TP rank listens/queries on the base_port + tp_rank. + self.side_channel_port = ( + envs.VLLM_NIXL_SIDE_CHANNEL_PORT + + vllm_config.parallel_config.data_parallel_rank_local * + vllm_config.parallel_config.tensor_parallel_size) + # Metadata. self.engine_id = engine_id - self.rank = get_tensor_model_parallel_rank() + self.tp_rank = get_tensor_model_parallel_rank() self.world_size = get_tensor_model_parallel_world_size() - self.world_rank = get_world_group().rank_in_group self.tp_group = get_tp_group() # KV Caches and nixl tracking data. 
@@ -383,16 +396,11 @@ class NixlConnectorWorker: @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, - ready_event: threading.Event, - world_rank: int): + ready_event: threading.Event, base_port: int, + tp_rank: int): """Background thread for getting new NIXL handshakes.""" # NOTE(rob): this is a simple implementation. We will move - # to a better approach like an ETCD server in the future. - - # NOTE(rob): to support heterogeneous TP, we will have to - # move this into the scheduler rather than worker, since - # each rank needs the metadata of all other ranks (whereas - # in this setup, each rank only gets one other rank's meta. + # to a better approach via HTTP endpoint soon. encoder = msgspec.msgpack.Encoder() encoded_data = encoder.encode(metadata) @@ -402,11 +410,7 @@ class NixlConnectorWorker: # Listen for new requests for metadata. host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST - # NOTE(rob): we need each rank to have a unique port. This - # hack to keeps us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + world_rank - path = make_zmq_path("tcp", host, port) + path = make_zmq_path("tcp", host, base_port + tp_rank) logger.debug("Starting listening on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() @@ -421,10 +425,10 @@ class NixlConnectorWorker: """Do a NIXL handshake with a remote instance.""" start_time = time.perf_counter() - # NOTE(rob): we need each rank to have a unique port. This is - # a hack to keep us moving. We will switch when moving to etcd - # or where we have a single ZMQ socket in the scheduler. - path = make_zmq_path("tcp", host, port + self.world_rank) + # NOTE(rob): we need each tp_rank to have a unique port. + # This is a hack to keep us moving. We will switch when + # we switch to HTTP-based NIXL metadata exchange. 
+ path = make_zmq_path("tcp", host, port + self.tp_rank) logger.debug("Querying metadata on path: %s", path) with zmq_ctx(zmq.REQ, path) as sock: # Send query for the request. @@ -532,7 +536,7 @@ class NixlConnectorWorker: ready_event = threading.Event() self._nixl_handshake_listener_t = threading.Thread( target=self._nixl_handshake_listener, - args=(metadata, ready_event, self.world_rank), + args=(metadata, ready_event, self.side_channel_port, self.tp_rank), daemon=True, name="nixl_handshake_listener") self._nixl_handshake_listener_t.start() @@ -556,9 +560,9 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len # (addr, len, device id) blocks_data.append( - (base_addr + block_offset, self.block_len, self.rank)) - logger.debug("Created %s blocks for src engine %s and rank %s", - len(blocks_data), self.engine_id, self.rank) + (base_addr + block_offset, self.block_len, self.tp_rank)) + logger.debug("Created %s blocks for src engine %s and tp_rank %s", + len(blocks_data), self.engine_id, self.tp_rank) # Register with NIXL. descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") @@ -573,9 +577,9 @@ class NixlConnectorWorker: block_offset = block_id * self.block_len # (addr, len, device id) blocks_data.append( - (base_addr + block_offset, self.block_len, self.rank)) - logger.debug("Created %s blocks for dst engine %s and rank %s", - len(blocks_data), engine_id, self.rank) + (base_addr + block_offset, self.block_len, self.tp_rank)) + logger.debug("Created %s blocks for dst engine %s and tp_rank %s", + len(blocks_data), engine_id, self.tp_rank) # Register with NIXL. 
descs = self.nixl_wrapper.get_xfer_descs(blocks_data, "VRAM") @@ -600,14 +604,14 @@ class NixlConnectorWorker: if len(done_sending) > 0 or len(done_recving) > 0: logger.debug( "Rank %s, get_finished: %s requests done sending " - "and %s requests done recving", self.rank, len(done_sending), - len(done_recving)) + "and %s requests done recving", self.tp_rank, + len(done_sending), len(done_recving)) if self.world_size == 1: return done_sending, done_recving # Rank 0: get finished from all other ranks. - if self.rank == 0: + if self.tp_rank == 0: for req_id in done_sending: self._done_sending_count[req_id] += 1 for req_id in done_recving: -- GitLab From 9760fd8f6acc752168333a25f5da80a6d2097608 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Mon, 2 Jun 2025 02:38:50 -0700 Subject: [PATCH 111/274] [Core] Support inplace model weights loading (#18745) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/tensorizer_loader/test_tensorizer.py | 17 -- tests/v1/worker/test_gpu_model_runner.py | 18 ++ .../model_loader/base_loader.py | 22 ++- .../model_loader/bitsandbytes_loader.py | 18 +- .../model_loader/default_loader.py | 50 ++---- .../model_loader/dummy_loader.py | 23 +-- .../model_loader/gguf_loader.py | 10 +- .../model_loader/runai_streamer_loader.py | 30 +--- .../model_loader/sharded_state_loader.py | 99 +++++------ .../model_executor/model_loader/tensorizer.py | 165 +++++++----------- .../model_loader/tensorizer_loader.py | 58 +++--- vllm/v1/worker/gpu_model_runner.py | 15 +- vllm/v1/worker/tpu_model_runner.py | 21 ++- 13 files changed, 249 insertions(+), 297 deletions(-) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index b6286e148..747ec56ad 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -4,7 +4,6 @@ import gc import os import pathlib import subprocess -from unittest.mock import MagicMock, 
patch import pytest import torch @@ -16,7 +15,6 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, is_vllm_tensorized, - load_with_tensorizer, open_stream, tensorize_vllm_model) # yapf: enable @@ -61,21 +59,6 @@ def write_keyfile(keyfile_path: str): f.write(encryption_params.key) -@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent') -def test_load_with_tensorizer(mock_agent, tensorizer_config): - mock_linear_method = MagicMock() - mock_agent_instance = mock_agent.return_value - mock_agent_instance.deserialize.return_value = MagicMock() - - result = load_with_tensorizer(tensorizer_config, - quant_method=mock_linear_method) - - mock_agent.assert_called_once_with(tensorizer_config, - quant_method=mock_linear_method) - mock_agent_instance.deserialize.assert_called_once() - assert result == mock_agent_instance.deserialize.return_value - - @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index c38eb4866..6ba6d1f6f 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -94,6 +94,9 @@ def model_runner(): return runner +model_runner_2 = model_runner + + def _schedule_new_request(*req_ids: str) -> SchedulerOutput: new_reqs = [] num_scheduled_tokens = {} @@ -366,3 +369,18 @@ def test_kv_cache_stride_order(monkeypatch, model_runner): assert all(kv.is_contiguous() for kv in model_runner.kv_caches) else: assert all(not kv.is_contiguous() for kv in model_runner.kv_caches) + + +def test_load_model_weights_inplace(dist_init, model_runner, model_runner_2): + # In this test, model_runner loads model + weights in one go, while + # model_runner_2 loads dummy weights first then load real weights inplace + model_runner.load_model() + 
original_load_format = model_runner_2.load_config.load_format + model_runner_2.load_config.load_format = "dummy" + model_runner_2.load_model() # Initial model loading with dummy weights + assert str(model_runner.get_model().state_dict()) != str( + model_runner_2.get_model().state_dict()) + model_runner_2.load_config.load_format = original_load_format + model_runner_2.load_model() # Load real weights inplace + assert str(model_runner.get_model().state_dict()) == str( + model_runner_2.get_model().state_dict()) diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index 010dd5157..d619d9f25 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod +import torch import torch.nn as nn from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.model_executor.model_loader.utils import ( + initialize_model, process_weights_after_loading, set_default_torch_dtype) class BaseModelLoader(ABC): @@ -18,7 +21,22 @@ class BaseModelLoader(ABC): raise NotImplementedError @abstractmethod - def load_model(self, *, vllm_config: VllmConfig, + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load weights into a model. 
This standalone API allows + inplace weights loading for an already-initialized model""" + raise NotImplementedError + + def load_model(self, vllm_config: VllmConfig, model_config: ModelConfig) -> nn.Module: """Load a model with the given configurations.""" - raise NotImplementedError + device_config = vllm_config.device_config + target_device = torch.device(device_config.device) + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model(vllm_config=vllm_config, + model_config=model_config) + # Quantization does not happen in `load_weights` but after it + self.load_weights(model, model_config) + process_weights_after_loading(model, model_config, target_device) + return model.eval() diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 8996ea266..3df835a93 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -14,7 +14,7 @@ from huggingface_hub import HfApi from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) # yapf: enable @@ -28,7 +28,6 @@ from vllm.model_executor.layers.linear import (LinearBase, RowParallelLinear) from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.utils import (ParamMapping, - initialize_model, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, @@ -408,8 +407,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" - def _load_weights(self, model_config: ModelConfig, - model: 
nn.Module) -> None: + def load_weights(self, model: nn.Module, model_config: ModelConfig) -> None: if not hasattr(model, "load_weights"): raise AttributeError( "The required method 'load_weights' is not defined in class" @@ -568,15 +566,3 @@ class BitsAndBytesModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - - def load_model(self, vllm_config: VllmConfig, - model_config: ModelConfig) -> nn.Module: - device_config = vllm_config.device_config - with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): - - model = initialize_model(vllm_config=vllm_config) - - self._load_weights(model_config, model) - - return model.eval() diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 29a6e0af4..6946627a5 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -12,11 +12,9 @@ from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm import envs -from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig +from vllm.config import LoadConfig, LoadFormat, ModelConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, fastsafetensors_weights_iterator, filter_duplicate_safetensors_files, @@ -264,32 +262,20 @@ class DefaultModelLoader(BaseModelLoader): fall_back_to_pt=True, allow_patterns_overrides=None) - def load_model(self, vllm_config: VllmConfig, - model_config: ModelConfig) -> nn.Module: - device_config = vllm_config.device_config - target_device = 
torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config, - model_config=model_config) - - weights_to_load = {name for name, _ in model.named_parameters()} - loaded_weights = model.load_weights( - self.get_all_weights(model_config, model)) - self.counter_after_loading_weights = time.perf_counter() - logger.info( - "Loading weights took %.2f seconds", - self.counter_after_loading_weights - - self.counter_before_loading_weights) - # We only enable strict check for non-quantized models - # that have loaded weights tracking currently. - if model_config.quantization is None and loaded_weights is not None: - weights_not_loaded = weights_to_load - loaded_weights - if weights_not_loaded: - raise ValueError( - "Following weights were not initialized from " - f"checkpoint: {weights_not_loaded}") - - process_weights_after_loading(model, model_config, target_device) - - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights( + self.get_all_weights(model_config, model)) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights - + self.counter_before_loading_weights) + # We only enable strict check for non-quantized models + # that have loaded weights tracking currently. 
+ if model_config.quantization is None and loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError("Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index 0e2f0be1e..64fa2be76 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -1,11 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 -import torch import torch.nn as nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( initialize_dummy_weights) @@ -22,16 +19,8 @@ class DummyModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: pass # Nothing to download - def load_model(self, vllm_config: VllmConfig, - model_config: ModelConfig) -> nn.Module: - device_config = vllm_config.device_config - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - # NOTE(woosuk): For accurate performance evaluation, we assign - # random values to the weights. - initialize_dummy_weights(model) - - process_weights_after_loading(model, model_config, target_device) - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + # NOTE(woosuk): For accurate performance evaluation, we assign + # random values to the weights. 
+ initialize_dummy_weights(model) diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 806004bf9..1eac50422 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -92,6 +92,13 @@ class GGUFModelLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + local_model_path = self._prepare_weights(model_config.model) + gguf_weights_map = self._get_gguf_weights_map(model_config) + model.load_weights( + self._get_weights_iterator(local_model_path, gguf_weights_map)) + def load_model(self, vllm_config: VllmConfig, model_config: ModelConfig) -> nn.Module: device_config = vllm_config.device_config @@ -106,8 +113,7 @@ class GGUFModelLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with target_device: model = initialize_model(vllm_config=vllm_config) - model.load_weights( - self._get_weights_iterator(local_model_path, gguf_weights_map)) + self.load_weights(model, model_config) process_weights_after_loading(model, model_config, target_device) return model diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 9f1022c25..a39e26c6d 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -9,10 +9,8 @@ import torch from torch import nn from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - initialize_model, process_weights_after_loading, set_default_torch_dtype) from 
vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, runai_safetensors_weights_iterator) @@ -100,21 +98,11 @@ class RunaiModelStreamerLoader(BaseModelLoader): """Download model if necessary""" self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig, - model_config: ModelConfig) -> nn.Module: - """Perform streaming of the model to destination""" - device_config = vllm_config.device_config - target_device = torch.device(device_config.device) - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - - model_weights = model_config.model - if hasattr(model_config, "model_weights"): - model_weights = model_config.model_weights - model.load_weights( - self._get_weights_iterator(model_weights, - model_config.revision)) - - process_weights_after_loading(model, model_config, target_device) - return model.eval() + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load weights into a model.""" + model_weights = model_config.model + if hasattr(model_config, "model_weights"): + model_weights = model_config.model_weights + model.load_weights( + self._get_weights_iterator(model_weights, model_config.revision)) diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 78bca89f0..b5a5031bb 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -9,11 +9,9 @@ from typing import Any, Optional import torch from torch import nn -from vllm.config import LoadConfig, ModelConfig, VllmConfig +from vllm.config import LoadConfig, ModelConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader -from vllm.model_executor.model_loader.utils import ( - 
initialize_model, process_weights_after_loading, set_default_torch_dtype) from vllm.model_executor.model_loader.weight_utils import ( download_weights_from_hf, runai_safetensors_weights_iterator) from vllm.transformers_utils.s3_utils import glob as s3_glob @@ -100,11 +98,8 @@ class ShardedStateLoader(BaseModelLoader): def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, vllm_config: VllmConfig, - model_config: ModelConfig) -> nn.Module: - device_config = vllm_config.device_config - target_device = torch.device(device_config.device) - + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: from vllm.distributed import get_tensor_model_parallel_rank model_weights = model_config.model @@ -112,53 +107,47 @@ class ShardedStateLoader(BaseModelLoader): model_weights = model_config.model_weights local_model_path = model_weights - with set_default_torch_dtype(model_config.dtype): - with target_device: - model = initialize_model(vllm_config=vllm_config) - process_weights_after_loading(model, model_config, - target_device) - rank = get_tensor_model_parallel_rank() - pattern = os.path.join( - local_model_path, - self.pattern.format(rank=rank, part="*"), - ) - - filepaths = [] - if is_s3(local_model_path): - file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}" - filepaths = s3_glob(path=local_model_path, - allow_pattern=[file_pattern]) - else: - filepaths = glob.glob(pattern) - if not filepaths: - # TODO: support un-sharded checkpoints too - raise ValueError( - f"Could not find checkpoint files '{pattern}', only " - f"pre-sharded checkpoints are currently supported!") - state_dict = self._filter_subtensors(model.state_dict()) - for key, tensor in self.iterate_over_files(filepaths): - # If loading with LoRA enabled, additional padding may - # be added to certain parameters. We only load into a - # narrowed view of the parameter data. 
- param_data = state_dict[key].data - param_shape = state_dict[key].shape - for dim, size in enumerate(tensor.shape): - if size < param_shape[dim]: - param_data = param_data.narrow(dim, 0, size) - if tensor.shape != param_shape: - logger.warning( - "loading tensor of shape %s into " - "parameter '%s' of shape %s", - tensor.shape, - key, - param_shape, - ) - param_data.copy_(tensor) - state_dict.pop(key) - if state_dict: - raise ValueError( - f"Missing keys {tuple(state_dict)} in loaded state!") - return model.eval() + rank = get_tensor_model_parallel_rank() + pattern = os.path.join( + local_model_path, + self.pattern.format(rank=rank, part="*"), + ) + + filepaths = [] + if is_s3(local_model_path): + file_pattern = f"*{self.pattern.format(rank=rank, part=' * ')}" + filepaths = s3_glob(path=local_model_path, + allow_pattern=[file_pattern]) + else: + filepaths = glob.glob(pattern) + if not filepaths: + # TODO: support un-sharded checkpoints too + raise ValueError( + f"Could not find checkpoint files '{pattern}', only " + f"pre-sharded checkpoints are currently supported!") + state_dict = self._filter_subtensors(model.state_dict()) + for key, tensor in self.iterate_over_files(filepaths): + # If loading with LoRA enabled, additional padding may + # be added to certain parameters. We only load into a + # narrowed view of the parameter data. 
+ param_data = state_dict[key].data + param_shape = state_dict[key].shape + for dim, size in enumerate(tensor.shape): + if size < param_shape[dim]: + param_data = param_data.narrow(dim, 0, size) + if tensor.shape != param_shape: + logger.warning( + "loading tensor of shape %s into " + "parameter '%s' of shape %s", + tensor.shape, + key, + param_shape, + ) + param_data.copy_(tensor) + state_dict.pop(key) + if state_dict: + raise ValueError( + f"Missing keys {tuple(state_dict)} in loaded state!") def iterate_over_files( self, paths) -> Generator[tuple[str, torch.Tensor], None, None]: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 4c4502284..90c0bdf08 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -21,7 +21,8 @@ from torch.utils._python_dispatch import TorchDispatchMode from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config +from vllm.config import (ModelConfig, ParallelConfig, VllmConfig, + set_current_vllm_config) from vllm.engine.arg_utils import EngineArgs from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -208,12 +209,6 @@ class TensorizerConfig: **tensorizer_args.stream_params) -def load_with_tensorizer(tensorizer_config: TensorizerConfig, - **extra_kwargs) -> nn.Module: - tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs) - return tensorizer.deserialize() - - @dataclass class TensorizerArgs: tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, BinaryIO, str, @@ -366,100 +361,72 @@ class TensorizerArgs: return tensorizer_args -class TensorizerAgent: - """ - A class for performing tensorizer deserializations specifically for - vLLM models using plaid_mode. 
Uses TensorizerArgs to configure the - behavior of the TensorDeserializer when loading tensors from a serialized - model. For deserializations of HuggingFace models, TensorDeserializer is - instead used as an iterator directly in the func hf_model_weights_iterator - in vllm/model_executor/model_loader/weight_utils.py - """ - - def __init__(self, tensorizer_config: TensorizerConfig, vllm_config): - self.tensorizer_config = tensorizer_config - self.tensorizer_args = ( - self.tensorizer_config._construct_tensorizer_args()) - self.vllm_config = vllm_config - self.model = self._init_model() - - def _init_model(self): - assert self.tensorizer_config.hf_config is not None - model_args = self.tensorizer_config.hf_config - model_args.torch_dtype = self.tensorizer_config.dtype - assert self.tensorizer_config.model_class is not None - # TODO: Do we need to consider old-style model class? - with meta_tensor_mode(), set_current_vllm_config(self.vllm_config, - check_compile=True): - return self.tensorizer_config.model_class( - vllm_config=self.vllm_config) - - def _resize_lora_embeddings(self): - """Modify LoRA embedding layers to use bigger tensors - to allow for adapter added tokens.""" - for child in self.model.modules(): - if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] - < child.num_embeddings_per_partition): - new_weight = torch.empty(child.num_embeddings_per_partition, - child.embedding_dim, - dtype=child.weight.dtype, - device=child.weight.device) - new_weight[:child.weight.shape[0]].copy_(child.weight.data) - new_weight[child.weight.shape[0]:].fill_(0) - child.weight.data = new_weight - - def _check_tensors_on_meta_device(self): - for tensor in self.model.state_dict().values(): - if tensor.device.type == 'meta': - raise ValueError( - "The serialized model contains tensors on the meta device," - " indicating that some tensors were not loaded properly." 
- " Please check that the parameters of the model being" - " specified match that of the serialized model, such as" - " its quantization.") - - def deserialize(self): - """ - Deserialize the model using the TensorDeserializer. This method is - specifically for vLLM models using tensorizer's plaid_mode. - - The deserializer makes use of tensorizer_args.stream_params - to configure the behavior of the stream when loading tensors from a - serialized model. The deserializer_params are used to configure the - behavior of the TensorDeserializer when loading tensors themselves. - Documentation on these params can be found in TensorizerArgs - - Returns: - nn.Module: The deserialized model. - """ - before_mem = get_mem_usage() - start = time.perf_counter() - with _read_stream( - self.tensorizer_config.tensorizer_uri, - **self.tensorizer_args.stream_params - ) as stream, TensorDeserializer( +def _check_tensors_on_meta_device(model: nn.Module) -> None: + for tensor in model.state_dict().values(): + if tensor.device.type == 'meta': + raise ValueError( + "The serialized model contains tensors on the meta device," + " indicating that some tensors were not loaded properly." 
+ " Please check that the parameters of the model being" + " specified match that of the serialized model, such as" + " its quantization.") + + +def _resize_lora_embeddings(model: nn.Module): + """Modify LoRA embedding layers to use bigger tensors + to allow for adapter added tokens.""" + for child in model.modules(): + if (isinstance(child, VocabParallelEmbedding) and child.weight.shape[0] + < child.num_embeddings_per_partition): + new_weight = torch.empty(child.num_embeddings_per_partition, + child.embedding_dim, + dtype=child.weight.dtype, + device=child.weight.device) + new_weight[:child.weight.shape[0]].copy_(child.weight.data) + new_weight[child.weight.shape[0]:].fill_(0) + child.weight.data = new_weight + + +def init_tensorizer_model(tensorizer_config: TensorizerConfig, + vllm_config: VllmConfig) -> nn.Module: + assert tensorizer_config.hf_config is not None + model_args = tensorizer_config.hf_config + model_args.torch_dtype = tensorizer_config.dtype + assert tensorizer_config.model_class is not None + # TODO: Do we need to consider old-style model class? 
+ with meta_tensor_mode(), set_current_vllm_config(vllm_config, + check_compile=True): + return tensorizer_config.model_class(vllm_config=vllm_config) + + +def deserialize_tensorizer_model(model: nn.Module, + tensorizer_config: TensorizerConfig) -> None: + tensorizer_args = tensorizer_config._construct_tensorizer_args() + before_mem = get_mem_usage() + start = time.perf_counter() + with _read_stream( + tensorizer_config.tensorizer_uri, + **tensorizer_args.stream_params) as stream, TensorDeserializer( stream, - dtype=self.tensorizer_config.dtype, + dtype=tensorizer_config.dtype, device=f'cuda:{torch.cuda.current_device()}', - **self.tensorizer_args.deserializer_params) as deserializer: - deserializer.load_into_module(self.model) - end = time.perf_counter() - - total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) - duration = end - start - per_second = convert_bytes(deserializer.total_tensor_bytes / duration) - after_mem = get_mem_usage() - deserializer.close() - logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str, - end - start, per_second) - logger.info("Memory usage before: %s", before_mem) - logger.info("Memory usage after: %s", after_mem) - - self._check_tensors_on_meta_device() - self._resize_lora_embeddings() - del self.model.vllm_tensorized_marker - return self.model.eval() + **tensorizer_args.deserializer_params) as deserializer: + deserializer.load_into_module(model) + end = time.perf_counter() + + total_bytes_str = convert_bytes(deserializer.total_tensor_bytes) + duration = end - start + per_second = convert_bytes(deserializer.total_tensor_bytes / duration) + after_mem = get_mem_usage() + deserializer.close() + logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str, + end - start, per_second) + logger.info("Memory usage before: %s", before_mem) + logger.info("Memory usage after: %s", after_mem) + + _check_tensors_on_meta_device(model) + _resize_lora_embeddings(model) + del model.vllm_tensorized_marker def 
tensorizer_weights_iterator( diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 2afe2b59e..1923e040a 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -11,8 +11,8 @@ from vllm.config import LoadConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.base_loader import BaseModelLoader from vllm.model_executor.model_loader.tensorizer import ( - TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, - serialize_vllm_model, tensorizer_weights_iterator) + TensorizerConfig, deserialize_tensorizer_model, init_tensorizer_model, + is_vllm_tensorized, serialize_vllm_model, tensorizer_weights_iterator) from vllm.model_executor.model_loader.utils import (get_model_architecture, initialize_model, set_default_torch_dtype) @@ -61,38 +61,34 @@ class TensorizerLoader(BaseModelLoader): model.load_weights(self._get_weights_iterator()) return model.eval() - def _load_model_serialized( - self, - vllm_config: VllmConfig, - ) -> nn.Module: - """Load a serialized model with tensorizer. - - Expects a vLLM-tensorized model. 
See the - examples/others/tensorize_vllm_model.py example script - for serializing vLLM models.""" - - device_config = vllm_config.device_config - model_config = vllm_config.model_config - - with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): - model_class = get_model_architecture(model_config)[0] - - tensorizer_config = copy.copy(self.tensorizer_config) - tensorizer_config.model_class = model_class - tensorizer_config.hf_config = model_config.hf_config - tensorizer_config.dtype = model_config.dtype - - model = load_with_tensorizer(tensorizer_config, - vllm_config=vllm_config) - return model.eval() - def download_model(self, model_config: ModelConfig) -> None: self.tensorizer_config.verify_with_model_config(model_config) with self.tensorizer_config.open_stream(): pass + def _patch_tensorizer_config( + self, model_config: ModelConfig) -> TensorizerConfig: + model_class = get_model_architecture(model_config)[0] + tensorizer_config = copy.copy(self.tensorizer_config) + tensorizer_config.model_class = model_class + tensorizer_config.hf_config = model_config.hf_config + tensorizer_config.dtype = model_config.dtype + return tensorizer_config + + def load_weights(self, model: nn.Module, + model_config: ModelConfig) -> None: + """Load serialized model weights with tensorizer. + + Expects a vLLM-tensorized model. 
See the + examples/others/tensorize_vllm_model.py example script + for serializing vLLM models.""" + if is_vllm_tensorized(self.tensorizer_config): + tensorizer_config = self._patch_tensorizer_config(model_config) + deserialize_tensorizer_model(model, tensorizer_config) + else: + model.load_weights(self._get_weights_iterator()) + def load_model(self, vllm_config: VllmConfig, model_config: ModelConfig) -> nn.Module: parallel_config = vllm_config.parallel_config @@ -106,7 +102,11 @@ class TensorizerLoader(BaseModelLoader): get_tensor_model_parallel_rank()) if is_vllm_tensorized(self.tensorizer_config): - return self._load_model_serialized(vllm_config=vllm_config) + tensorizer_config = self._patch_tensorizer_config(model_config) + model = init_tensorizer_model(tensorizer_config=tensorizer_config, + vllm_config=vllm_config) + self.load_weights(model, model_config) + return model return self._load_model_serialized_cpu(vllm_config=vllm_config) @staticmethod diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4bc825ccb..9f7c474c7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,7 +28,7 @@ from vllm.forward_context import (DPMetadata, get_forward_context, set_forward_context) from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.model_loader import TensorizerLoader, get_model +from vllm.model_executor.model_loader import TensorizerLoader, get_model_loader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality @@ -1564,7 +1564,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 time_before_load = time.perf_counter() - self.model = get_model(vllm_config=self.vllm_config) + 
model_loader = get_model_loader(self.load_config) + if not hasattr(self, "model"): + logger.info("Loading model from scratch...") + self.model = model_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info( + "Model was already initialized. Loading weights inplace..." + ) + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config: self.model = self.load_lora_model(self.model, self.model_config, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index c57ac3138..5de92351e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -21,7 +21,7 @@ from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA -from vllm.model_executor.model_loader import get_model +from vllm.model_executor.model_loader import get_model_loader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) @@ -171,7 +171,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.encoder_cache_size = encoder_cache_size # Lazy initialization - # self.model: nn.Module # Set after load_model + self.model: nn.Module # Set after load_model self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -419,7 +419,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0 def get_model(self) -> nn.Module: - assert self.model is not None return self.model def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: @@ -936,7 +935,18 @@ class TPUModelRunner(LoRAModelRunnerMixin): "vllm.model_executor.layers.vocab_parallel_embedding." 
"get_tensor_model_parallel_rank", return_value=xm_tp_rank): - model = get_model(vllm_config=self.vllm_config) + # model = get_model(vllm_config=self.vllm_config) + model_loader = get_model_loader(self.load_config) + if not hasattr(self, "model"): + logger.info("Loading model from scratch...") + model = model_loader.load_model(vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info( + "Model was already initialized. Loading weights inplace..." + ) + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config is not None: model = self.load_lora_model(model, self.model_config, self.scheduler_config, @@ -947,7 +957,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): # loading. xm.mark_step() xm.wait_device_ops() - self.model = model + if not hasattr(self, "model"): + self.model = model self.sampler = TPUSampler() @torch.no_grad() -- GitLab From 5b168b6d7a36877bc57db75cfb9ff6c9d850ced4 Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Mon, 2 Jun 2025 19:07:26 +0800 Subject: [PATCH 112/274] [doc] add pytest tips (#19010) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/contributing/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/contributing/README.md b/docs/contributing/README.md index f84fb242f..65ae9cc96 100644 --- a/docs/contributing/README.md +++ b/docs/contributing/README.md @@ -108,6 +108,9 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files # Unit tests pytest tests/ + +# Run tests for a single test file with detailed output +pytest -s -v tests/test_logger.py ``` !!! 
tip -- GitLab From ebb1ec931871ee1baca23673049865856c44ce4e Mon Sep 17 00:00:00 2001 From: jennyyyyzhen <47012288+jennyyyyzhen@users.noreply.github.com> Date: Mon, 2 Jun 2025 04:22:54 -0700 Subject: [PATCH 113/274] [Model] enable data parallel for Llama4 vision encoder (#18368) Signed-off-by: yzhen Co-authored-by: yZhen Co-authored-by: yzhen --- vllm/config.py | 4 + vllm/engine/arg_utils.py | 8 + vllm/model_executor/models/mllama4.py | 235 ++++++++++++++++++-------- vllm/multimodal/utils.py | 35 ++++ 4 files changed, 214 insertions(+), 68 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index f400e9875..d7bae52cb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1790,6 +1790,10 @@ class ParallelConfig: rank: int = 0 """Global rank in distributed setup.""" + enable_multimodal_encoder_data_parallel: bool = False + """ Use data parallelism instead of tensor parallelism for vision encoder. + Only support LLama4 for now""" + @property def world_size_across_dp(self) -> int: """world_size_across_dp is TPxPPxDP, it is the size of the world diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 555532526..299c8347f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -423,6 +423,9 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location + enable_multimodal_encoder_data_parallel: bool = \ + ParallelConfig.enable_multimodal_encoder_data_parallel + def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a @@ -637,6 +640,9 @@ class EngineArgs: **parallel_kwargs["worker_cls"]) parallel_group.add_argument("--worker-extension-cls", **parallel_kwargs["worker_extension_cls"]) + parallel_group.add_argument( + "--enable-multimodal-encoder-data-parallel", + **parallel_kwargs["enable_multimodal_encoder_data_parallel"]) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -1078,6 +1084,8 @@ class 
EngineArgs: distributed_executor_backend=self.distributed_executor_backend, worker_cls=self.worker_cls, worker_extension_cls=self.worker_extension_cls, + enable_multimodal_encoder_data_parallel=self. + enable_multimodal_encoder_data_parallel, ) speculative_config = self.create_speculative_config( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 8c98492c0..58549b10e 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -34,6 +34,7 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope @@ -49,6 +50,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.multimodal.utils import run_dp_sharded_vision_model from vllm.sequence import IntermediateTensors from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP @@ -84,23 +86,29 @@ class Llama4ImagePatchInputs(TypedDict): class Llama4VisionMLP(nn.Module): - def __init__(self, - input_size: int, - intermediate_size: int, - output_size: int, - bias: bool, - output_activation: bool, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + input_size: int, + intermediate_size: int, + output_size: int, + bias: bool, + output_activation: bool, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ): super().__init__() - self.fc1 = ColumnParallelLinear( + cls_fc1 = (ReplicatedLinear + if use_data_parallel else ColumnParallelLinear) + self.fc1 = 
cls_fc1( input_size=input_size, output_size=intermediate_size, bias=bias, quant_config=quant_config, prefix=f"{prefix}.fc1", ) - self.fc2 = RowParallelLinear( + cls_fc2 = ReplicatedLinear if use_data_parallel else RowParallelLinear + self.fc2 = cls_fc2( input_size=intermediate_size, output_size=output_size, bias=bias, @@ -155,10 +163,12 @@ def pixel_shuffle(input_tensor, shuffle_ratio): int(channels / shuffle_ratio)) reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous() - reshaped_tensor = reshaped_tensor.view(batch_size, - int(height * shuffle_ratio), - int(width * shuffle_ratio), - int(channels / (shuffle_ratio**2))) + reshaped_tensor = reshaped_tensor.view( + batch_size, + int(height * shuffle_ratio), + int(width * shuffle_ratio), + int(channels / (shuffle_ratio**2)), + ) reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous() output_tensor = reshaped_tensor.view(batch_size, -1, @@ -173,6 +183,7 @@ class Llama4VisionPixelShuffleMLP(nn.Module): config, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.pixel_shuffle_ratio = config.pixel_shuffle_ratio @@ -186,7 +197,9 @@ class Llama4VisionPixelShuffleMLP(nn.Module): bias=config.multi_modal_projector_bias, output_activation=True, quant_config=quant_config, - prefix=f"{prefix}.mlp") + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, + ) def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor: encoded_patches = pixel_shuffle(encoded_patches, @@ -201,10 +214,12 @@ class Llama4VisionAttention(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config - self.tp_size = get_tensor_model_parallel_world_size() + self.tp_size = (1 if use_data_parallel else + get_tensor_model_parallel_world_size()) self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads 
self.head_dim = config.hidden_size // self.num_heads @@ -217,22 +232,39 @@ class Llama4VisionAttention(nn.Module): self.attn = MultiHeadAttention(self.num_local_heads, self.head_dim, self.scaling) - self.qkv_proj = QKVParallelLinear( - self.embed_dim, - self.head_dim, - self.num_heads, - bias=True, - quant_config=quant_config, - prefix=f"{prefix}.qkv_proj", - ) - self.o_proj = RowParallelLinear( - self.num_heads * self.head_dim, - self.embed_dim, - bias=True, - input_is_parallel=True, - quant_config=quant_config, - prefix=f"{prefix}.o_proj", - ) + + if use_data_parallel: + self.qkv_proj = ReplicatedLinear( + self.embed_dim, + self.q_size + 2 * self.kv_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = ReplicatedLinear( + self.num_heads * self.head_dim, + self.embed_dim, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + else: + self.qkv_proj = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + self.num_heads * self.head_dim, + self.embed_dim, + bias=True, + input_is_parallel=True, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) self.rotary_emb = get_rope( head_size=self.head_dim, @@ -275,22 +307,29 @@ class Llama4VisionEncoderLayer(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.hidden_size = config.hidden_size self.num_attention_heads = config.num_attention_heads self.intermediate_size = config.intermediate_size - self.self_attn = Llama4VisionAttention(config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") - self.mlp = Llama4VisionMLP(input_size=config.hidden_size, - intermediate_size=config.intermediate_size, - output_size=config.hidden_size, - bias=True, - output_activation=False, - quant_config=quant_config, - 
prefix=f"{prefix}.mlp") + self.self_attn = Llama4VisionAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + use_data_parallel=use_data_parallel, + ) + self.mlp = Llama4VisionMLP( + input_size=config.hidden_size, + intermediate_size=config.intermediate_size, + output_size=config.hidden_size, + bias=True, + output_activation=False, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + use_data_parallel=use_data_parallel, + ) self.input_layernorm = nn.LayerNorm(config.hidden_size) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size) @@ -322,6 +361,7 @@ class Llama4VisionEncoder(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig], prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config @@ -330,6 +370,7 @@ class Llama4VisionEncoder(nn.Module): config, quant_config=quant_config, prefix=f"{prefix}.layers.{layer_idx}", + use_data_parallel=use_data_parallel, ) for layer_idx in range(config.num_hidden_layers) ]) @@ -357,23 +398,33 @@ class Llama4VisionEncoder(nn.Module): class Llama4UnfoldConvolution(nn.Module): - def __init__(self, - config: Llama4VisionConfig, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + config: Llama4VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + use_data_parallel: bool = False, + ): super().__init__() kernel_size = config.patch_size if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size) - self.linear = ColumnParallelLinear(config.num_channels * - kernel_size[0] * kernel_size[1], - config.hidden_size, - bias=False, - quant_config=quant_config, - gather_output=True, - prefix=f"{prefix}.linear") + params = { + "input_size": + config.num_channels * kernel_size[0] * kernel_size[1], + "output_size": config.hidden_size, + "bias": False, + 
"quant_config": quant_config, + "prefix": f"{prefix}.linear", + } + if use_data_parallel: + cls = ReplicatedLinear + else: + cls = ColumnParallelLinear + params["gather_output"] = True + self.linear = cls(**params) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.unfold(hidden_states) @@ -389,6 +440,7 @@ class Llama4VisionModel(nn.Module): config: Llama4VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + use_data_parallel: bool = False, ): super().__init__() self.config = config @@ -403,7 +455,9 @@ class Llama4VisionModel(nn.Module): self.patch_embedding = Llama4UnfoldConvolution( config, quant_config=quant_config, - prefix=f"{prefix}.patch_embedding") + prefix=f"{prefix}.patch_embedding", + use_data_parallel=use_data_parallel, + ) self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size)) @@ -415,11 +469,18 @@ class Llama4VisionModel(nn.Module): self.layernorm_post = nn.LayerNorm(self.hidden_size, eps=1e-5) # encoders - self.model = Llama4VisionEncoder(config, - quant_config=quant_config, - prefix=f"{prefix}.model") + self.model = Llama4VisionEncoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.model", + use_data_parallel=use_data_parallel, + ) self.vision_adapter = Llama4VisionPixelShuffleMLP( - config, quant_config, prefix=f"{prefix}.vision_adapter") + config, + quant_config, + prefix=f"{prefix}.vision_adapter", + use_data_parallel=use_data_parallel, + ) def forward( self, @@ -528,8 +589,9 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] vision_config = self.info.get_hf_config().vision_config if processed_outputs.get("pixel_values") is not None: - assert "images" in mm_data, \ - "images expected to be in mm_data when pixel_values is present" + assert ( + "images" in mm_data + ), "images expected to be in mm_data when pixel_values is present" images = mm_data["images"] parsed_images = (self._get_data_parser().parse_mm_data({ 
@@ -546,8 +608,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo] get_best_fit( (image.size[1], image.size[0]), torch.tensor(possible_resolutions), - resize_to_max_canvas=image_processor.resize_to_max_canvas) - for image in parsed_images + resize_to_max_canvas=image_processor.resize_to_max_canvas, + ) for image in parsed_images ] # TODO tile height/width do not necessarily need to match aspect_ratios = [(image_size[0] // tile_size, @@ -659,13 +721,17 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + self.use_data_parallel = (vllm_config.parallel_config. + enable_multimodal_encoder_data_parallel) self.config = config self.quant_config = quant_config self.multimodal_config = multimodal_config - self.vision_model = Llama4VisionModel(config.vision_config, - None, - prefix=maybe_prefix( - prefix, "vision_model")) + self.vision_model = Llama4VisionModel( + config.vision_config, + None, + prefix=maybe_prefix(prefix, "vision_model"), + use_data_parallel=self.use_data_parallel, + ) self.multi_modal_projector = Llama4MultiModalProjector( self.config, None, @@ -709,7 +775,13 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, flat_data = image_input["flat_data"] patches_per_image = image_input["patches_per_image"].tolist() - vision_embeddings_flat = self.vision_model(flat_data) + # shard image input + if self.use_data_parallel: + vision_embeddings_flat = run_dp_sharded_vision_model( + flat_data, self.vision_model) + else: + vision_embeddings_flat = self.vision_model(flat_data) + vision_embeddings_flat = self.multi_modal_projector( vision_embeddings_flat) @@ -796,6 +868,30 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, return get_prefix_weights(), get_other_weights() + def _consolidate_qkv_weights( + self, weights: Iterable[tuple[str, 
torch.Tensor]] + ) -> Iterable[tuple[str, torch.Tensor]]: + qkv_idx_mappings = { + ".self_attn.q_proj": 0, + ".self_attn.k_proj": 1, + ".self_attn.v_proj": 2, + } + qkv_weights = {} + for name, loaded_weight in weights: + for weight_name, idx in qkv_idx_mappings.items(): + if weight_name not in name: + continue + new_name = name.replace(weight_name, ".self_attn.qkv_proj") + if new_name not in qkv_weights: + qkv_weights[new_name] = [None] * 3 + qkv_weights[new_name][idx] = loaded_weight + break + else: + yield name, loaded_weight + for key, weight in qkv_weights.items(): + qkv_weight = torch.cat(weight, dim=0) + yield key, qkv_weight + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -818,9 +914,12 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, assert loaded_language_model_params is not None updated_params.update(loaded_language_model_params) + if self.use_data_parallel: + other_weights = self._consolidate_qkv_weights(other_weights) + for name, loaded_weight in other_weights: for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: + if weight_name not in name or self.use_data_parallel: continue name = name.replace(weight_name, param_name) param = params_dict[name] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 9ddba67bf..1d838f66f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -12,6 +12,9 @@ from PIL import Image import vllm.envs as envs from vllm.connections import HTTPConnection, global_http_connection +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather) from .audio import AudioMediaIO from .base import MediaIO @@ -390,3 +393,35 @@ def group_mm_inputs_by_modality( return [ list(group) for _, group in groupby(mm_inputs, key=modality_group_func) ] + + +def run_dp_sharded_vision_model(image_input: torch.Tensor, + vision_model: 
torch.nn.Module) -> torch.Tensor: + """Run a vision model with data parallelism (DP) sharding. The function + will shard the input image tensor on the first dimension and run the vision + model + + Args: + image_input (torch.Tensor): Image input tensor. + vision_model (torch.nn.Module): Vision model. + + Returns: + torch.Tensor: Output image embeddings + """ + + num_chunks = image_input.shape[0] + mp_world_size = get_tensor_model_parallel_world_size() + num_chunks_per_rank = (num_chunks + mp_world_size - 1) // mp_world_size + num_padded_chunks = num_chunks_per_rank * mp_world_size - num_chunks + pad = (0, ) * (2 * (image_input.dim() - 1)) + (0, num_padded_chunks) + image_input_padded = torch.nn.functional.pad(image_input, pad) + rank = get_tensor_model_parallel_rank() + image_input_per_rank = image_input_padded[rank * + num_chunks_per_rank:(rank + 1) * + num_chunks_per_rank, ...] + + vision_embeddings = vision_model(image_input_per_rank) + vision_embeddings = tensor_model_parallel_all_gather(vision_embeddings, + dim=0) + vision_embeddings = vision_embeddings[:num_chunks, ...] 
+ return vision_embeddings -- GitLab From 20133cfee2e8a21d18be08f90b611cf0424cce96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=CE=B1n=C3=A7ois?= Date: Mon, 2 Jun 2025 17:04:23 +0200 Subject: [PATCH 114/274] [Frontend] enable custom logging for the uvicorn server (OpenAI API server) (#18403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: François Paupier Co-authored-by: Cyrus Leung --- vllm/entrypoints/openai/api_server.py | 21 +++++++++++++++++++-- vllm/entrypoints/openai/cli_args.py | 8 ++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1e7f88a6a..5a4295ff7 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,6 +5,7 @@ import atexit import gc import importlib import inspect +import json import multiprocessing import os import signal @@ -16,7 +17,6 @@ from collections.abc import AsyncIterator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from json import JSONDecodeError from typing import Annotated, Any, Optional import prometheus_client @@ -930,7 +930,7 @@ async def invocations(raw_request: Request): """ try: body = await raw_request.json() - except JSONDecodeError as e: + except json.JSONDecodeError as e: raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, detail=f"JSON decode error: {e}") from e @@ -1003,6 +1003,18 @@ if envs.VLLM_ALLOW_RUNTIME_LORA_UPDATING: return Response(status_code=200, content=response) +def load_log_config(log_config_file: Optional[str]) -> Optional[dict]: + if not log_config_file: + return None + try: + with open(log_config_file) as f: + return json.load(f) + except Exception as e: + logger.warning("Failed to load log config from file %s: error %s", + log_config_file, e) + return None + + def build_app(args: Namespace) -> FastAPI: if args.disable_fastapi_docs: app = 
FastAPI(openapi_url=None, @@ -1324,6 +1336,11 @@ async def run_server_worker(listen_address, server_index = client_config.get("client_index", 0) if client_config else 0 + # Load logging config for uvicorn if specified + log_config = load_log_config(args.log_config_file) + if log_config is not None: + uvicorn_kwargs['log_config'] = log_config + async with build_async_engine_client(args, client_config) as engine_client: app = build_app(args) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index d01af5e42..f196ff6ed 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -11,6 +11,7 @@ import ssl from collections.abc import Sequence from typing import Optional, Union, get_args +import vllm.envs as envs from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) @@ -243,6 +244,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: " into OpenAI API format, the name register in this plugin can be used " "in ``--tool-call-parser``.") + parser.add_argument( + "--log-config-file", + type=str, + default=envs.VLLM_LOGGING_CONFIG_PATH, + help="Path to logging config JSON file for both vllm and uvicorn", + ) + parser = AsyncEngineArgs.add_cli_args(parser) parser.add_argument('--max-log-len', -- GitLab From ca2f6b9c301df6dbe2e5c83c705051f478140695 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 2 Jun 2025 11:15:53 -0400 Subject: [PATCH 115/274] [Bugfix][Model] Attempt to fix eagle in V0. 
(#18978) Signed-off-by: Gregory Shtrasberg --- vllm/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index d7bae52cb..d0891d670 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2595,7 +2595,8 @@ class SpeculativeConfig: else: eagle_config = EAGLEConfig( self.draft_model_config.hf_config, - method=self.method) + method=self.method, + model_type="eagle") self.draft_model_config.hf_config = eagle_config if (self.num_speculative_tokens is not None -- GitLab From c57d577e8dc41f270a3ce0d604f5d8ac51b08ed7 Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Tue, 3 Jun 2025 03:38:23 +0800 Subject: [PATCH 116/274] add an absolute path for run.sh (#18258) Signed-off-by: calvin chen <120380290@qq.com> --- .../disaggregated-prefill-v1/run.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/offline_inference/disaggregated-prefill-v1/run.sh b/examples/offline_inference/disaggregated-prefill-v1/run.sh index 0ebf45a15..c1dcc95a2 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/run.sh +++ b/examples/offline_inference/disaggregated-prefill-v1/run.sh @@ -1,5 +1,11 @@ rm -rf local_storage/ -rm output.txt -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py -VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py +if [ -f "output.txt" ]; then + rm output.txt +fi + +# The directory of current script +SCRIPT_DIR=$(dirname "$(readlink -f "$0")") + +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/prefill_example.py" +VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 "$SCRIPT_DIR/decode_example.py" -- GitLab From 9112b443a042d8d815880b8780633882ad32b183 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Mon, 2 Jun 2025 17:06:20 -0700 Subject: [PATCH 117/274] [Hardware][TPU] Initial support of model parallelism with single worker using 
SPMD (#18011) Signed-off-by: Siyuan Liu Co-authored-by: Hossein Sarshar Co-authored-by: Chengji Yao --- .../scripts/hardware_ci/run-tpu-v1-test.sh | 4 + examples/offline_inference/tpu.py | 29 ++- .../v1/tpu/test_spmd_model_weight_loading.py | 67 +++++++ tests/v1/tpu/test_tpu_qkv_linear.py | 89 +++++++++ vllm/config.py | 2 + vllm/distributed/tpu_distributed_utils.py | 177 ++++++++++++++++++ vllm/envs.py | 5 + vllm/model_executor/model_loader/tpu.py | 112 +++++++++++ vllm/model_executor/utils.py | 4 +- vllm/v1/worker/tpu_model_runner.py | 101 ++++++---- vllm/v1/worker/tpu_worker.py | 87 +++++---- 11 files changed, 605 insertions(+), 72 deletions(-) create mode 100644 tests/v1/tpu/test_spmd_model_weight_loading.py create mode 100644 tests/v1/tpu/test_tpu_qkv_linear.py create mode 100644 vllm/distributed/tpu_distributed_utils.py create mode 100644 vllm/model_executor/model_loader/tpu.py diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 610243145..3212b660e 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -155,6 +155,10 @@ run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 13 "test_lora.py" \ "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" +run_and_track_test 14 "test_tpu_qkv_linear.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" +run_and_track_test 15 "test_spmd_model_weight_loading.py" \ + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" # After all tests have been attempted, exit with the overall status. 
if [ "$overall_script_exit_code" -ne 0 ]; then diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index e4a75b3f9..f3c2859d4 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +import argparse +import os + from vllm import LLM, SamplingParams prompts = [ @@ -18,14 +21,28 @@ sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) def main(): + parser = argparse.ArgumentParser(description="TPU offline inference example") + parser.add_argument("--use-spmd", action="store_true", help="Enable SPMD mode") + args = parser.parse_args() + + llm_args = { + "model": "Qwen/Qwen2-1.5B-Instruct", + "max_num_batched_tokens": 64, + "max_num_seqs": 4, + "max_model_len": 128, + } + if args.use_spmd: + os.environ["VLLM_XLA_USE_SPMD"] = "1" + # Can only hardcode the number of chips for now. + # calling xr.global_runtime_device_count() beforeing init SPMD env in + # torch_xla will mess up the distributed env. + llm_args["tensor_parallel_size"] = 8 + # Use Llama, for num_kv_heads = 8. + llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" + # Set `enforce_eager=True` to avoid ahead-of-time compilation. # In real workloads, `enforace_eager` should be `False`. 
- llm = LLM( - model="Qwen/Qwen2-1.5B-Instruct", - max_num_batched_tokens=64, - max_num_seqs=4, - max_model_len=128, - ) + llm = LLM(**llm_args) outputs = llm.generate(prompts, sampling_params) print("-" * 50) for output, answer in zip(outputs, answers): diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py new file mode 100644 index 000000000..d36edfc3f --- /dev/null +++ b/tests/v1/tpu/test_spmd_model_weight_loading.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +import gc +import tempfile + +import numpy as np +import pytest +import torch_xla.distributed.spmd as xs +import torch_xla.runtime as xr + +from vllm.config import set_current_vllm_config +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.model_loader.tpu import TPUModelLoader + + +def _setup_environment(model): + engine_args = EngineArgs(model=model, ) + vllm_config = engine_args.create_engine_config() + with set_current_vllm_config(vllm_config): + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + 1, + 0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + backend="gloo") + # Under single worker mode, full model is init first and then + # partitioned using GSPMD. 
+ ensure_model_parallel_initialized(1, 1) + return vllm_config + + +MESH = None + + +def _get_spmd_mesh(): + global MESH + if MESH is None: + xr.use_spmd() + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + return MESH + + +@pytest.mark.parametrize("model", [ + "Qwen/Qwen2-1.5B-Instruct", + "meta-llama/Llama-3.1-8B-Instruct", + "meta-llama/Llama-3.1-70B-Instruct", +]) +def test_tpu_model_loader(model): + # Skip the 70B test if there are less than 8 chips + # TODO: Query using torch xla API, the query API is not working + # with SPMD now. However, This test is running under SPMD mode. + if '70B' in model and xr.global_runtime_device_count() < 8: + pytest.skip( + "Skipping 70B model if the TPU VM has less than 8 chips to \ + avoid OOM.") + + vllm_config = _setup_environment(model) + loader = TPUModelLoader(load_config=vllm_config.load_config) + mesh = _get_spmd_mesh() + model = loader.load_model(vllm_config, vllm_config.model_config, mesh) + del model + gc.collect() diff --git a/tests/v1/tpu/test_tpu_qkv_linear.py b/tests/v1/tpu/test_tpu_qkv_linear.py new file mode 100644 index 000000000..b98570f01 --- /dev/null +++ b/tests/v1/tpu/test_tpu_qkv_linear.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +import tempfile + +import numpy as np +import pytest +import torch +import torch_xla.distributed.spmd as xs +import torch_xla.runtime as xr + +from vllm.config import set_current_vllm_config +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.distributed.tpu_distributed_utils import XlaQKVParallelLinear +from vllm.engine.arg_utils import EngineArgs +from vllm.model_executor.layers.linear import QKVParallelLinear + + +@pytest.fixture(autouse=True) +def setup_environment(): + # This is a fake config used for init dist env. 
+ # QKVParallelLinear needs dist env to be initialized. + engine_args = EngineArgs( + model="Qwen/Qwen2-1.5B-Instruct", + max_model_len=64, + max_num_batched_tokens=64, + max_num_seqs=4, + ) + + vllm_config = engine_args.create_engine_config() + + with set_current_vllm_config(vllm_config): + temp_file = tempfile.mkstemp()[1] + init_distributed_environment( + 1, + 0, + local_rank=0, + distributed_init_method=f"file://{temp_file}", + backend="gloo") + ensure_model_parallel_initialized(1, 1) + yield + + +MESH = None + + +def _get_spmd_mesh(): + global MESH + if MESH is None: + xr.use_spmd() + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + MESH = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + return MESH + + +@pytest.mark.parametrize("bias", [False, True]) +# `xr.use_spmd()` will set a global state, and this state is not reversible. +# Therefore, non-SPMD tests should be run before SPMD tests. +@pytest.mark.parametrize("mesh", [None, _get_spmd_mesh()]) +@pytest.mark.parametrize("device", ['cpu', 'xla']) +@torch.no_grad() +def test_xla_qkv_linear(bias, mesh, device): + torch.manual_seed(123) + + qkv_linear = QKVParallelLinear( + hidden_size=4096, + head_size=128, + total_num_heads=32, + total_num_kv_heads=8, + bias=bias, + params_dtype=torch.bfloat16, + return_bias=False, + ) + + qkv_linear.weight.data = torch.rand_like(qkv_linear.weight.data) / 10 + if bias: + qkv_linear.bias.data = torch.rand_like(qkv_linear.bias.data) + + xla_qkv_linear = XlaQKVParallelLinear(qkv_linear, mesh=mesh) + + qkv_linear = qkv_linear.to(device) + xla_qkv_linear = xla_qkv_linear.to(device) + input_tensor = torch.rand(10, 4096, dtype=torch.bfloat16) / 10 + input_tensor = input_tensor.to(device) + + output = qkv_linear(input_tensor) + xla_output = xla_qkv_linear(input_tensor) + assert torch.allclose(output.cpu(), xla_output.cpu()) diff --git a/vllm/config.py b/vllm/config.py index d0891d670..1bd53e35b 100644 --- 
a/vllm/config.py +++ b/vllm/config.py @@ -1901,6 +1901,8 @@ class ParallelConfig: if current_platform.is_neuron(): # neuron uses single process to control multiple devices backend = "uni" + elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD: + backend = "uni" elif (current_platform.is_cuda() and cuda_device_count_stateless() < self.world_size): if not ray_found: diff --git a/vllm/distributed/tpu_distributed_utils.py b/vllm/distributed/tpu_distributed_utils.py new file mode 100644 index 000000000..36ab2eb3a --- /dev/null +++ b/vllm/distributed/tpu_distributed_utils.py @@ -0,0 +1,177 @@ +# SPDX-License-Identifier: Apache-2.0 +from collections import OrderedDict +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch_xla.distributed.spmd as xs +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) + +logger = init_logger(__name__) + + +class XlaQKVParallelLinear(nn.Module): + + def __init__(self, + qkv_linear: nn.Module, + mesh: Optional["xs.Mesh"] = None): + super().__init__() + assert isinstance(qkv_linear, QKVParallelLinear) + self.skip_bias_add = qkv_linear.skip_bias_add + self.return_bias = qkv_linear.return_bias + assert qkv_linear.tp_size == 1, "TP > 1 is only supported under SPMD." 
+ + self.q_weight: Parameter + self.k_weight: Parameter + self.v_weight: Parameter + self.q_bias: Optional[Parameter] + self.k_bias: Optional[Parameter] + self.v_bias: Optional[Parameter] + self._load_weights_from_qkv_linear(qkv_linear) + if mesh is not None: + self._shard_weight(mesh) + + def _shard_weight(self, mesh: "xs.Mesh"): + self.q_weight = Parameter(self.q_weight.to('xla'), requires_grad=False) + self.k_weight = Parameter(self.k_weight.to('xla'), requires_grad=False) + self.v_weight = Parameter(self.v_weight.to('xla'), requires_grad=False) + xs.mark_sharding(self.q_weight, mesh, ('x', None)) + xs.mark_sharding(self.k_weight, mesh, ('x', None)) + xs.mark_sharding(self.v_weight, mesh, ('x', None)) + if self.q_bias is not None: + assert self.k_bias is not None and self.v_bias is not None, \ + "QKVParallelLinear should have q, k, and v biases together." + self.q_bias = Parameter(self.q_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.q_bias, mesh, ('x', )) + self.k_bias = Parameter(self.k_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.k_bias, mesh, ('x', )) + self.v_bias = Parameter(self.v_bias.to('xla'), requires_grad=False) + xs.mark_sharding(self.v_bias, mesh, ('x', )) + + def _load_weights_from_qkv_linear(self, qkv_linear: nn.Module): + q_proj_size, k_proj_size, _ = qkv_linear.output_sizes + # The weight of qkv linear is a concatenation of q, k, and v weights + # along the output dimension. 
+ qkv_weight = qkv_linear.weight.data.cpu() + q_weight = Parameter(qkv_weight[:q_proj_size], requires_grad=False) + k_weight = Parameter(qkv_weight[q_proj_size:q_proj_size + k_proj_size], + requires_grad=False) + v_weight = Parameter(qkv_weight[q_proj_size + k_proj_size:], + requires_grad=False) + self.register_parameter("q_weight", q_weight) + self.register_parameter("k_weight", k_weight) + self.register_parameter("v_weight", v_weight) + + if qkv_linear.bias is not None: + q_bias = Parameter(qkv_linear.bias[:q_proj_size], + requires_grad=False) + k_bias = Parameter(qkv_linear.bias[q_proj_size:q_proj_size + + k_proj_size], + requires_grad=False) + v_bias = Parameter(qkv_linear.bias[q_proj_size + k_proj_size:], + requires_grad=False) + self.register_parameter("q_bias", q_bias) + self.register_parameter("k_bias", k_bias) + self.register_parameter("v_bias", v_bias) + else: + self.register_parameter("q_bias", None) + self.register_parameter("k_bias", None) + self.register_parameter("v_bias", None) + + def forward(self, input): + # Same forward functionality as QKVParallelLinear, but doing qkv porj + # separately. + q_bias = self.q_bias if not self.skip_bias_add else None + k_bias = self.k_bias if not self.skip_bias_add else None + v_bias = self.v_bias if not self.skip_bias_add else None + q_proj = F.linear(input, self.q_weight, q_bias) + k_proj = F.linear(input, self.k_weight, k_bias) + v_proj = F.linear(input, self.v_weight, v_bias) + # The q/k/v projections will be split outside of the QKVParallelLinear. + # Because we are replacing XlaQKVParallelLinear with the + # QKVParallelLinear, we need to concatenate q, k, and v projections to + # match the output shape of the QKVParallelLinear implementation even if + # it seems to be redundant. + # The concat and the following split will be noop, and should be + # optimized away by the compiler. 
+ qkv_proj = torch.cat([q_proj, k_proj, v_proj], dim=-1) + output_bias = torch.cat([q_bias, k_bias, v_bias], dim=-1) if \ + self.skip_bias_add else None + if not self.return_bias: + return qkv_proj + return qkv_proj, output_bias + + +def partition_column_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, ColumnParallelLinear) + xs.mark_sharding(layer.weight, mesh, ('x', None)) + logger.debug("Applied column-parallel sharding to %s", layer) + return layer + + +def partition_row_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, RowParallelLinear) + xs.mark_sharding(layer.weight, mesh, (None, 'x')) + logger.debug("Applied row-parallel sharding to %s", layer) + return layer + + +def partition_qkv_parallel_linear(layer: torch.nn.Module, + mesh: xs.Mesh) -> torch.nn.Module: + assert isinstance(layer, QKVParallelLinear) + xla_layer = XlaQKVParallelLinear(layer, mesh) + logger.debug("Applied qkv parallel sharding to %s", layer) + return xla_layer + + +MODULE_TYPE_TO_WRAPPING_FUNC = OrderedDict([ + ("QKVParallelLinear", partition_qkv_parallel_linear), + ("ColumnParallelLinear", partition_column_parallel_linear), + ("RowParallelLinear", partition_row_parallel_linear), +]) + + +def get_fqn(module): + # Get the fully qualified name of the module + return module.__class__.__qualname__ + + +def shard_model(model: torch.nn.Module, mesh: "xs.Mesh") -> None: + """ + Recursively check a PyTorch model and apply appropriate sharding based on + the MODULE_TYPE_TO_WRAPPING_FUNC mapping. 
+ + Args: + model: torch.nn.Module to process + mesh: An XLA SPMD mesh object used for sharding + """ + + def _process_module(module, name=None, parent=None): + for module_type, wrapping_func in MODULE_TYPE_TO_WRAPPING_FUNC.items(): + if get_fqn(module) == module_type: + wrapped_module = wrapping_func(module, mesh) + + assert parent is not None and name is not None, ( + "Top Level module is not expected to be wrapped.") + if wrapped_module is not module: + # Wrapped module and module are different py object. + # The original module should be replaced by the + # wrapped_module. + logger.debug("replace %s with %s", module, wrapped_module) + setattr(parent, name, wrapped_module) + + module = wrapped_module + break + + for child_name, child_module in list(module.named_children()): + _process_module(child_module, child_name, module) + + _process_module(model) diff --git a/vllm/envs.py b/vllm/envs.py index 44baf5a18..3dd0d9045 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -51,6 +51,7 @@ if TYPE_CHECKING: VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: str = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False + VLLM_XLA_USE_SPMD: bool = False VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 @@ -513,6 +514,10 @@ environment_variables: dict[str, Callable[[], Any]] = { # If set, assert on XLA recompilation after each execution step. "VLLM_XLA_CHECK_RECOMPILATION": lambda: bool(int(os.getenv("VLLM_XLA_CHECK_RECOMPILATION", "0"))), + + # Enable SPMD mode for TPU backend. 
+ "VLLM_XLA_USE_SPMD": + lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), diff --git a/vllm/model_executor/model_loader/tpu.py b/vllm/model_executor/model_loader/tpu.py new file mode 100644 index 000000000..6197bcdba --- /dev/null +++ b/vllm/model_executor/model_loader/tpu.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +import time +from typing import Optional + +import torch +import torch.nn as nn +import torch_xla.core.xla_model as xm +import torch_xla.distributed.spmd as xs + +from vllm.config import ModelConfig, VllmConfig +from vllm.distributed.tpu_distributed_utils import get_fqn, shard_model +from vllm.logger import init_logger +from vllm.model_executor.model_loader.default_loader import DefaultModelLoader +from vllm.model_executor.model_loader.utils import ( + initialize_model, process_weights_after_loading, set_default_torch_dtype) + +logger = init_logger(__name__) + + +class TPUModelLoader(DefaultModelLoader): + """ + A TPU model loader for model loading under SPMD mode. + """ + + def load_model( + self, + vllm_config: VllmConfig, + model_config: ModelConfig, + mesh: Optional[xs.Mesh] = None, + ) -> nn.Module: + # Initialize model and load weights on CPU. Then, during SPMD partition, + # weights are sharded and transferred to TPUs. 
+ self.counter_before_loading_weights = time.perf_counter() + model_config = vllm_config.model_config + assert model_config.quantization is None, "Quantization not supported" + target_device = torch.device('cpu') + with set_default_torch_dtype(model_config.dtype): + with target_device: + model = initialize_model(vllm_config=vllm_config) + + load_format = vllm_config.load_config.load_format + if load_format != "dummy": + weights_to_load = { + name + for name, _ in model.named_parameters() + } + all_weights = self.get_all_weights(model_config, model) + loaded_weights = model.load_weights(all_weights) + self.counter_after_loading_weights = time.perf_counter() + logger.info( + "Loading weights took %.2f seconds", + self.counter_after_loading_weights - + self.counter_before_loading_weights) + # We only enable strict check for non-quantized models + # that have loaded weights tracking currently. + if model_config.quantization is None and \ + loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError( + "Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") + else: + logger.info("Use dummy weight during weight loading.") + + process_weights_after_loading(model, model_config, target_device) + + counter_before_partition = time.perf_counter() + model = model.eval() + model = model.to('xla') + shard_model(model, mesh) + counter_after_partition = time.perf_counter() + logger.info("Partition model took %.2f seconds", + counter_after_partition - counter_before_partition) + + # Ensure the model is properly loaded. + self._check_model_is_loaded(mesh, model) + + # Need to torch compile after model sharding are done. Because the + # compiler hints ('xs.mark_sharding') are torch ops. 
+ if not model_config.is_multimodal_model: + model.model = torch.compile(model.model, backend="openxla") + else: + model.language_model.model = \ + torch.compile(model.language_model.model, backend="openxla") + return model + + def _check_model_is_loaded(self, mesh: Optional[xs.Mesh], + model: nn.Module) -> None: + """ + Ensure the model is properly loaded. + 1. All model parameters and buffers are on XLA device. + 2. Non-SPMD friendly layers are replaced as expected. + """ + device = xm.xla_device() + device_type = str(device.type) + + # Check parameters + for name, param in model.named_parameters(): + assert param.device.type == device_type, f"Parameter {name} is on \ + {param.device.type} instead of {device_type}" + + # Check buffers + for name, buffer in model.named_buffers(): + assert buffer.device.type == device_type, \ + f"Buffer {name} is on {buffer.device.type} instead of \ + {device_type}" + + for module in model.modules(): + if (mesh is not None) and (get_fqn(module) == 'QKVParallelLinear'): + raise AssertionError("QKVParallelLinear should be replaced by \ + XlaQKVParallelLinear under SPMD mode.") diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 1b120c354..27cea6521 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -49,7 +49,9 @@ def _make_synced_weight_loader(original_weight_loader): def _synced_weight_loader(param, *args, **kwargs): original_weight_loader(param, *args, **kwargs) - torch._sync(param) + # torch._sync doesn't support, is not needed for CPU tensors. 
+ if param.device != torch.device("cpu"): + torch._sync(param) return _synced_weight_loader diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 5de92351e..c5171b973 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -7,21 +7,22 @@ from unittest.mock import patch import numpy as np import torch -import torch.distributed import torch.nn as nn # TPU XLA related import torch_xla.core.xla_model as xm +import torch_xla.distributed.spmd as xs import torch_xla.runtime as xr import vllm.envs as envs from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.config import ParallelConfig, VllmConfig, get_layers_from_vllm_config from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.lora.layers import BaseLayerWithLoRA from vllm.model_executor.model_loader import get_model_loader +from vllm.model_executor.model_loader.tpu import TPUModelLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (BatchedTensorInputs, MultiModalKwargs, PlaceholderRange) @@ -98,6 +99,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): self, vllm_config: VllmConfig, device: torch.device, + original_parallel_config: Optional[ParallelConfig] = None, ): self.vllm_config = vllm_config self.model_config = vllm_config.model_config @@ -105,6 +107,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.lora_config = vllm_config.lora_config self.load_config = vllm_config.load_config self.parallel_config = vllm_config.parallel_config + self.original_parallel_config = original_parallel_config self.scheduler_config = vllm_config.scheduler_config self.speculative_config = vllm_config.speculative_config self.prompt_adapter_config = vllm_config.prompt_adapter_config @@ 
-118,6 +121,14 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.device = device self.check_recompilation = envs.VLLM_XLA_CHECK_RECOMPILATION + # SPMD Related + self.use_spmd = envs.VLLM_XLA_USE_SPMD + if self.use_spmd: + num_devices = xr.global_runtime_device_count() + mesh_shape = (num_devices, 1) + device_ids = np.array(range(num_devices)) + self.mesh = xs.Mesh(device_ids, mesh_shape, ('x', 'y')) + self.enforce_eager = model_config.enforce_eager self.num_xla_graphs = 0 @@ -271,6 +282,15 @@ class TPUModelRunner(LoRAModelRunnerMixin): max_num_mm_items_decoder_budget) self.max_num_mm_items_by_modality[modality] = max_num_mm_items + if not self.use_spmd: + self.sample_from_logits_func = torch.compile( + self.sample_from_logits, + backend="openxla", + fullgraph=True, + dynamic=False) + else: + self.sample_from_logits_func = self.sample_from_logits + def _update_num_xla_graphs(self, case_str): check_comp = self.check_recompilation and not self.enforce_eager if not check_comp: @@ -825,9 +845,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): logits = self.structured_decode(require_struct_decoding, grammar_bitmask_padded, logits, arange) - selected_token_ids = self.sample_from_logits(logits, - tpu_sampling_metadata) - + selected_token_ids = self.sample_from_logits_func( + logits, tpu_sampling_metadata) # NOTE (NickLucche) Use the original logits (before any penalties or # temperature scaling) for the top-k logprobs. We can't enforce it due # to recompilations outside torch.compiled code, so just make sure @@ -935,18 +954,26 @@ class TPUModelRunner(LoRAModelRunnerMixin): "vllm.model_executor.layers.vocab_parallel_embedding." 
"get_tensor_model_parallel_rank", return_value=xm_tp_rank): - # model = get_model(vllm_config=self.vllm_config) - model_loader = get_model_loader(self.load_config) - if not hasattr(self, "model"): - logger.info("Loading model from scratch...") - model = model_loader.load_model(vllm_config=self.vllm_config, - model_config=self.model_config) + if self.use_spmd: + tpu_loader = TPUModelLoader( + load_config=self.vllm_config.load_config) + model = tpu_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.vllm_config.model_config, + mesh=self.mesh) else: - logger.info( - "Model was already initialized. Loading weights inplace..." - ) - model_loader.load_weights(self.model, - model_config=self.model_config) + # model = get_model(vllm_config=self.vllm_config) + model_loader = get_model_loader(self.load_config) + if not hasattr(self, "model"): + logger.info("Loading model from scratch...") + model = model_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.model_config) + else: + logger.info("Model was already initialized. 
\ + Loading weights inplace...") + model_loader.load_weights(self.model, + model_config=self.model_config) if self.lora_config is not None: model = self.load_lora_model(model, self.model_config, self.scheduler_config, @@ -970,31 +997,25 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device) else: input_ids = torch.zeros((num_tokens), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) inputs_embeds = None actual_num_reqs = min(num_tokens, self.max_num_reqs) position_ids = torch.zeros(num_tokens, - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) slot_mapping = torch.zeros(num_tokens, - dtype=torch.int64, - device=self.device) + dtype=torch.int64).to(self.device) block_tables = torch.zeros( (self.max_num_reqs, self.block_table_cpu.shape[1]), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) query_lens = [1] * self.max_num_reqs query_start_loc = torch.cumsum(torch.tensor([0] + query_lens, dtype=torch.int32), dim=0, dtype=torch.int32).to(self.device) context_lens = torch.ones((self.max_num_reqs, ), - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) num_seqs = torch.tensor([actual_num_reqs], - dtype=torch.int32, - device=self.device) + dtype=torch.int32).to(self.device) attn_metadata = PallasMetadata( slot_mapping=slot_mapping, block_tables=block_tables, @@ -1198,7 +1219,8 @@ class TPUModelRunner(LoRAModelRunnerMixin): with self.maybe_select_dummy_loras( self.lora_config, np.array([num_reqs], dtype=np.int32)): - self.sample_from_logits(dummy_logits, sampling_metadata) + self.sample_from_logits_func(dummy_logits, + sampling_metadata) logger.info(" -- num_seqs: %d", num_reqs) xm.wait_device_ops() end = time.perf_counter() @@ -1332,14 +1354,22 @@ class TPUModelRunner(LoRAModelRunnerMixin): assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes if isinstance(kv_cache_spec, 
AttentionSpec): + if self.use_spmd: + num_kv_heads = kv_cache_spec.num_kv_heads + assert self.original_parallel_config is not None + tp_size = \ + self.original_parallel_config.tensor_parallel_size + # TODO: Handle kv cache duplication under SPMD mode. + assert num_kv_heads % tp_size == 0, ( + f"num_kv_heads {num_kv_heads} must be divisible by " + f"tp_size {tp_size} under SPMD mode") kv_cache_shape = PallasAttentionBackend.get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype tpu_kv_cache = torch.zeros(kv_cache_shape, - dtype=dtype, - device=self.device) + dtype=dtype).to(self.device) kv_caches[layer_name] = tpu_kv_cache else: @@ -1350,6 +1380,11 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.vllm_config.compilation_config.static_forward_context, self.kv_caches) + if self.use_spmd: + # Shard KV Cache + for cache in self.kv_caches: + xs.mark_sharding(cache, self.mesh, (None, 'x', None, None)) + def reset_dynamo_cache(self): if self.is_multimodal_model: compiled_model = self.model.get_language_model().model @@ -1370,7 +1405,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): sample_hidden_states: torch.Tensor) -> torch.Tensor: return self.model.compute_logits(sample_hidden_states, None) - @torch.compile(backend="openxla", fullgraph=True, dynamic=False) + # TODO: Under SPMD mode, sample_from_logits has correctness issue. + # Re-enable the torch.compile once the issue is fixed in torchxla. 
+ # @torch.compile(backend="openxla", fullgraph=True, dynamic=False) def sample_from_logits( self, logits: torch.Tensor, sampling_metadata: TPUSupportedSamplingMetadata) -> torch.Tensor: diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 0707e17af..bf0a5777c 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -45,6 +45,15 @@ class TPUWorker: self.lora_config = vllm_config.lora_config self.load_config = vllm_config.load_config self.parallel_config = vllm_config.parallel_config + self.use_spmd = envs.VLLM_XLA_USE_SPMD + self.original_parallel_config = None + if self.use_spmd: + # Under SPMD mode, distributed env is initialized as if there is + # only one worker/device. + self.original_parallel_config = self.parallel_config + self.parallel_config.tensor_parallel_size = 1 + self.parallel_config.pipeline_parallel_size = 1 + self.parallel_config.world_size = 1 self.scheduler_config = vllm_config.scheduler_config self.device_config = vllm_config.device_config self.speculative_config = vllm_config.speculative_config @@ -95,10 +104,9 @@ class TPUWorker: torch.set_default_dtype(self.model_config.dtype) # Initialize the distributed environment. - init_tpu_worker_distributed_environment(self.parallel_config, - self.rank, - self.distributed_init_method, - self.local_rank) + self._init_tpu_worker_distributed_environment( + self.parallel_config, self.rank, self.distributed_init_method, + self.local_rank) # Device initialization should happen after initializing # the distributed runtime. @@ -132,7 +140,9 @@ class TPUWorker: xr.initialize_cache(per_rank_path, readonly=False) # Init ModelRunner here, so that we have access to self.device. - self.model_runner = TPUModelRunner(self.vllm_config, self.device) + self.model_runner = \ + TPUModelRunner(self.vllm_config, self.device, + self.original_parallel_config) if rank == 0: # If usage stat is enabled, collect relevant info. 
@@ -147,9 +157,7 @@ class TPUWorker: # Use an empty tensor instead of `None`` to force Dynamo to pass # it by reference, rather by specializing on the value ``None``. - tpu_kv_cache = torch.tensor([], - dtype=dtype, - device=self.device) + tpu_kv_cache = torch.tensor([], dtype=dtype).to(self.device) kv_caches[layer_name] = tpu_kv_cache else: raise NotImplementedError( @@ -178,9 +186,20 @@ class TPUWorker: # Get the maximum amount of memory used by the model weights and # intermediate activations. - m = xm.get_memory_info(self.device) - total_memory_size = m["bytes_limit"] - current_mem = m["bytes_used"] + if self.use_spmd: + # This is a workaround for the TPU SPMD mode. The get_memory_info + # API doesn't work with SPMD mode in PyTorch/XLA. + # TODO: use xm.get_memory_info for SPMD once it's supported in + # PyTorch/XLA. + import tpu_info + chip_type, _ = tpu_info.device.get_local_chips() + device_usage = tpu_info.metrics.get_chip_usage(chip_type) + total_memory_size = device_usage[0].total_memory + current_mem = device_usage[0].memory_usage + else: + m = xm.get_memory_info(self.device) + total_memory_size = m["bytes_limit"] + current_mem = m["bytes_used"] # Ideally we would use profiled = m["peak_bytes_used"] to # get weights + activations. But there is memory used during # compilation / weight loading that impacts the peak and @@ -241,28 +260,30 @@ class TPUWorker: # worker will always be healthy as long as it's running. return - -def init_tpu_worker_distributed_environment( - parallel_config: ParallelConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - - # NOTE(woosuk): This is just to initialize the TP group and broadcast - # the input objects on CPU. The all-reduce and all-gather ops on TPU - # are invoked by `xm.all_reduce` and `xm.all_gather` which use their - # own context. 
- init_distributed_environment( - world_size=parallel_config.world_size, - rank=rank, - local_rank=local_rank, - distributed_init_method=distributed_init_method, - backend="gloo", - ) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + def _init_tpu_worker_distributed_environment( + self, + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, + ) -> None: + """Initialize the distributed environment.""" + if self.use_spmd: + xr.use_spmd() + # NOTE(woosuk): This is just to initialize the TP group and broadcast + # the input objects on CPU. The all-reduce and all-gather ops on TPU + # are invoked by `xm.all_reduce` and `xm.all_gather` which use their + # own context. + init_distributed_environment( + world_size=parallel_config.world_size, + rank=rank, + local_rank=local_rank, + distributed_init_method=distributed_init_method, + backend="gloo", + ) + ensure_model_parallel_initialized( + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) try: -- GitLab From 5bc1ad6cee754405464a9957e86cf3a9302e4986 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hyogeun=20Oh=20=28=EC=98=A4=ED=9A=A8=EA=B7=BC=29?= Date: Tue, 3 Jun 2025 11:49:48 +0900 Subject: [PATCH 118/274] [Doc] Remove duplicate TOCs during MkDocs migration (#19021) Signed-off-by: Zerohertz --- docs/cli/README.md | 13 ------------- docs/deployment/nginx.md | 10 ---------- 2 files changed, 23 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index 5feb316d6..f43ce7663 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -12,19 +12,6 @@ Available Commands: vllm {chat,complete,serve,bench,collect-env,run-batch} ``` -## Table of Contents - -- [serve](#serve) -- [chat](#chat) -- [complete](#complete) -- [bench](#bench) - - [latency](#latency) - - [serve](#serve-1) - - [throughput](#throughput) -- [collect-env](#collect-env) -- [run-batch](#run-batch) 
-- [More Help](#more-help) - ## serve Start the vLLM OpenAI Compatible API server. diff --git a/docs/deployment/nginx.md b/docs/deployment/nginx.md index 80242919b..f0ff5c1d0 100644 --- a/docs/deployment/nginx.md +++ b/docs/deployment/nginx.md @@ -5,16 +5,6 @@ title: Using Nginx This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. -Table of contents: - -1. [Build Nginx Container][nginxloadbalancer-nginx-build] -2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf] -3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container] -4. [Create Docker Network][nginxloadbalancer-nginx-docker-network] -5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container] -6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx] -7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx] - [](){ #nginxloadbalancer-nginx-build } ## Build Nginx Container -- GitLab From 8a57872b2ac9b01004ae1d3a3a689de218ea5be5 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 2 Jun 2025 23:36:51 -0400 Subject: [PATCH 119/274] [Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (#19034) Signed-off-by: Tyler Michael Smith Signed-off-by: Tyler Michael Smith --- vllm/distributed/device_communicators/all2all.py | 4 ++++ vllm/model_executor/layers/fused_moe/layer.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index a250ec89c..7177754a3 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -83,6 +83,10 @@ class PPLXAll2AllManager(All2AllManagerBase): assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." 
# noqa super().__init__(cpu_group) + # TODO(tms): Disable pplx-a2a intranode as it fails with the error: + # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa + self.internode = True + if self.internode: # inter-node communication needs nvshmem, # intra-node communication uses p2p mapping directly diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index af7b98e14..1e193c909 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -269,9 +269,13 @@ class FusedMoEMethodBase(QuantizeMethodBase): hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else ( (moe.hidden_dim + moe.block_size - 1) // moe.block_size * torch.float32.itemsize)), - group_name=all2all_manager.cpu_group.group_name, ) + # Intranode pplx a2a takes a group name while internode does not. + if not all2all_manager.internode: + all_to_all_args[ + "group_name"] = all2all_manager.cpu_group.group_name + handle = all2all_manager.get_handle(all_to_all_args) prepare_finalize = PplxPrepareAndFinalize( -- GitLab From 4ce42f92042ef8a24e925fc7121f7c98e51f73ba Mon Sep 17 00:00:00 2001 From: Concurrensee Date: Mon, 2 Jun 2025 22:46:44 -0500 Subject: [PATCH 120/274] Adding "LoRA Test %N" to AMD production tests (#18929) Signed-off-by: Yida Wu --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 4 ++++ .buildkite/test-pipeline.yaml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index bbc896ec6..6e9af1e72 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s 
compile/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s lora"* ]]; then + commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} +fi + #ignore certain kernels tests if [[ $commands == *" kernels/core"* ]]; then commands="${commands} \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff2f69c1..5fb8ceaac 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -287,7 +287,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/lora - tests/lora -- GitLab From 8655f47f37750eb5d00992d39305d6705659983f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 3 Jun 2025 11:46:47 +0800 Subject: [PATCH 121/274] [CPU][CI] Re-enable the CPU CI tests (#19046) Signed-off-by: jiang.li --- .../scripts/hardware_ci/run-cpu-test.sh | 42 +++++++++---------- docker/Dockerfile.cpu | 10 +++-- vllm/distributed/parallel_state.py | 3 +- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 40f3df960..0a1193560 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -8,67 +8,65 @@ set -ex CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} +export CMAKE_BUILD_PARALLEL_LEVEL=32 + # Setup cleanup remove_docker_container() { set -e; - docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; - docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; + docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Try building the docker image -numactl -C 
"$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu . -numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER" + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e export NUMA_NODE=$2 - export BUILDKITE_BUILD_NUMBER=$3 # offline inference - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" # Run basic model test - docker exec 
cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -v -s tests/kernels/test_cache.py -m cpu_model - pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model - pytest -v -s tests/models/decoder_only/language -m cpu_model - pytest -v -s tests/models/embedding/language -m cpu_model - pytest -v -s tests/models/encoder_decoder/language -m cpu_model - pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model + pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model + pytest -v -s tests/models/language/generation -m cpu_model + pytest -v -s tests/models/language/pooling -m cpu_model + pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model" # Run compressed-tensor test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v -k cpu_model \ tests/basic_correctness/test_chunked_prefill.py" # online serving - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 @@ -83,7 +81,7 @@ function cpu_tests() { 
--tokenizer facebook/opt-125m" # Run multi-lora tests - docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/lora/test_qwen2vl.py" @@ -91,4 +89,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER" +timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 5395b3884..6db2f307a 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \ --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel @@ -85,7 +86,7 @@ WORKDIR /workspace/vllm RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ - apt-get install -y --no-install-recommends vim numactl + apt-get install -y --no-install-recommends vim numactl xz-utils # install development dependencies (for testing) RUN --mount=type=cache,target=/root/.cache/uv \ @@ -108,8 +109,11 @@ FROM base AS vllm-test WORKDIR /workspace/ RUN --mount=type=cache,target=/root/.cache/uv \ - --mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \ - uv pip install -r requirements/test.txt + --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ + cp requirements/test.in requirements/test-cpu.in && \ + sed -i '/mamba_ssm/d' requirements/test-cpu.in && \ + uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \ + uv pip install -r requirements/cpu-test.txt RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6e48c02da..32c9301bf 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1203,7 +1203,8 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): if empty_cache is not None: empty_cache() try: - torch._C._host_emptyCache() + if not current_platform.is_cpu(): + torch._C._host_emptyCache() except AttributeError: logger.warning( "torch._C._host_emptyCache() only available in Pytorch >=2.5") -- GitLab From 9e6f61e8c3df833537e4bea6c33f85eca5d73b15 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Mon, 2 Jun 2025 23:47:47 -0400 Subject: [PATCH 122/274] [ROCm][Build] Clean up the ROCm build (#19040) Signed-off-by: Gregory Shtrasberg --- CMakeLists.txt | 4 ---- docker/Dockerfile.rocm | 17 ----------------- .../installation/gpu/rocm.inc.md | 2 -- requirements/rocm.txt | 2 ++ 4 files changed, 2 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6536e9a57..87aa23c08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,9 +182,6 @@ include(FetchContent) file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") -# -# Set rocm version dev int. -# if(VLLM_GPU_LANG STREQUAL "HIP") # # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info @@ -192,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP") set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3") - # # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates # a lot of warnings that always mask real issues. Suppressing until this is properly addressed. 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index e60cf5e69..b186f88d2 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,7 +1,5 @@ # default base image ARG REMOTE_VLLM="0" -ARG USE_CYTHON="0" -ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app ARG BASE_IMAGE=rocm/vllm-dev:base @@ -36,12 +34,10 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -ARG USE_CYTHON # Build vLLM RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ - && if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \ && python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm ARG COMMON_WORKDIR @@ -90,13 +86,6 @@ RUN case "$(which python3)" in \ *) ;; esac RUN python3 -m pip install --upgrade huggingface-hub[cli] -ARG BUILD_RPD -RUN if [ ${BUILD_RPD} -eq "1" ]; then \ - git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ - && cd rocmProfileData/rpd_tracer \ - && pip install -r requirements.txt && cd ../ \ - && make && make install \ - && cd hipMarker && python3 setup.py install ; fi # Install vLLM RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ @@ -117,12 +106,6 @@ ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time ENV SAFETENSORS_FAST_GPU=1 -# User-friendly environment setting for multi-processing to avoid below RuntimeError. -# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, -# you must use the 'spawn' start method -# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing -ENV VLLM_WORKER_MULTIPROC_METHOD=spawn - # Performance environment variable. 
ENV HIP_FORCE_DEV_KERNARG=1 diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 0029b3a24..8b7dc6dd0 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -179,8 +179,6 @@ It is important that the user kicks off the docker build using buildkit. Either It provides flexibility to customize the build of docker image using the following arguments: - `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using -- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build -- `BUILD_RPD`: Include RocmProfileData profiling tool in the image - `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image Their values can be passed in when running `docker build` with `--build-arg` options. 
diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 8a84f2ff1..fb1febdac 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -12,5 +12,7 @@ ray>=2.10.0,<2.45.0 peft pytest-asyncio tensorizer>=2.9.0 +setuptools-scm>=8 +setuptools>=77.0.3,<80.0.0 runai-model-streamer==0.11.0 runai-model-streamer-s3==0.11.0 -- GitLab From bdce64f2365b39335141f8efcb3a0a8ecc559153 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Mon, 2 Jun 2025 21:15:13 -0700 Subject: [PATCH 123/274] [V1] Support DP with Ray (#18779) --- requirements/test.in | 2 +- requirements/test.txt | 50 +++++++ tests/v1/test_async_llm_dp.py | 13 +- vllm/config.py | 6 + vllm/engine/arg_utils.py | 29 +++- vllm/entrypoints/cli/serve.py | 35 ++++- vllm/v1/engine/async_llm.py | 13 +- vllm/v1/engine/core.py | 168 ++++++++++++++++------ vllm/v1/engine/core_client.py | 74 ++++++++-- vllm/v1/utils.py | 257 +++++++++++++++++++++++++++++----- 10 files changed, 539 insertions(+), 108 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index e906752ff..9b574a09f 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -17,7 +17,7 @@ vector_quantize_pytorch # required for minicpmo_26 test vocos # required for minicpmo_26 test peft pqdm -ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests +ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests sentence-transformers # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.txt b/requirements/test.txt index 60dcaca81..03aec80ac 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -10,9 +10,13 @@ aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.11 # via + # aiohttp-cors # datasets # fsspec # lm-eval + # ray +aiohttp-cors==0.8.1 + # via ray aiosignal==1.3.1 # via # aiohttp @@ -57,6 +61,8 @@ 
bounded-pool-executor==0.0.3 # via pqdm buildkite-test-collector==0.1.9 # via -r requirements/test.in +cachetools==5.5.2 + # via google-auth certifi==2024.8.30 # via # httpcore @@ -81,6 +87,8 @@ colorama==0.4.6 # sacrebleu # schemathesis # tqdm-multiprocess +colorful==0.5.6 + # via ray contourpy==1.3.0 # via matplotlib cramjam==2.9.0 @@ -108,6 +116,8 @@ dill==0.3.8 # evaluate # lm-eval # multiprocess +distlib==0.3.9 + # via virtualenv dnspython==2.7.0 # via email-validator docopt==0.6.2 @@ -143,6 +153,7 @@ filelock==3.16.1 # ray # torch # transformers + # virtualenv fonttools==4.54.1 # via matplotlib fqdn==1.5.1 @@ -165,8 +176,16 @@ genai-perf==0.0.8 # via -r requirements/test.in genson==1.3.0 # via datamodel-code-generator +google-api-core==2.24.2 + # via opencensus +google-auth==2.40.2 + # via google-api-core +googleapis-common-protos==1.70.0 + # via google-api-core graphql-core==3.2.6 # via hypothesis-graphql +grpcio==1.71.0 + # via ray h11==0.14.0 # via httpcore harfile==0.3.0 @@ -392,6 +411,10 @@ nvidia-nvjitlink-cu12==12.8.61 # torch nvidia-nvtx-cu12==12.8.55 # via torch +opencensus==0.11.4 + # via ray +opencensus-context==0.1.3 + # via opencensus opencv-python-headless==4.11.0.86 # via # -r requirements/test.in @@ -445,6 +468,7 @@ platformdirs==4.3.6 # via # black # pooch + # virtualenv plotly==5.24.1 # via genai-perf pluggy==1.5.0 @@ -457,10 +481,17 @@ portalocker==2.10.1 # via sacrebleu pqdm==0.2.0 # via -r requirements/test.in +prometheus-client==0.22.0 + # via ray propcache==0.2.0 # via yarl +proto-plus==1.26.1 + # via google-api-core protobuf==5.28.3 # via + # google-api-core + # googleapis-common-protos + # proto-plus # ray # tensorizer psutil==6.1.0 @@ -470,10 +501,18 @@ psutil==6.1.0 # tensorizer py==1.11.0 # via pytest-forked +py-spy==0.4.0 + # via ray pyarrow==18.0.0 # via # datasets # genai-perf +pyasn1==0.6.1 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 + # via google-auth pybind11==2.13.6 # via lm-eval pycparser==2.22 @@ -486,6 
+525,7 @@ pydantic==2.11.5 # datamodel-code-generator # mistral-common # mteb + # ray pydantic-core==2.33.2 # via pydantic pygments==2.18.0 @@ -573,6 +613,7 @@ requests==2.32.3 # buildkite-test-collector # datasets # evaluate + # google-api-core # huggingface-hub # lm-eval # mistral-common @@ -601,6 +642,8 @@ rpds-py==0.20.1 # via # jsonschema # referencing +rsa==4.9.1 + # via google-auth runai-model-streamer==0.11.0 # via -r requirements/test.in runai-model-streamer-s3==0.11.0 @@ -648,9 +691,12 @@ shellingham==1.5.4 six==1.16.0 # via # junit-xml + # opencensus # python-dateutil # rfc3339-validator # rouge-score +smart-open==7.1.0 + # via ray sniffio==1.3.1 # via # anyio @@ -801,6 +847,8 @@ urllib3==2.2.3 # tritonclient vector-quantize-pytorch==1.21.2 # via -r requirements/test.in +virtualenv==20.31.2 + # via ray vocos==0.1.0 # via -r requirements/test.in webcolors==24.11.1 @@ -809,6 +857,8 @@ werkzeug==3.1.3 # via schemathesis word2number==1.1 # via lm-eval +wrapt==1.17.2 + # via smart-open xxhash==3.5.0 # via # datasets diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index ce4c4d198..366fa3b25 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -59,14 +59,22 @@ async def generate(engine: AsyncLLM, @pytest.mark.parametrize( - "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) + "output_kind", + [ + RequestOutputKind.DELTA, + RequestOutputKind.FINAL_ONLY, + ], +) +@pytest.mark.parametrize("data_parallel_backend", ["mp", "ray"]) @pytest.mark.asyncio -async def test_load(output_kind: RequestOutputKind): +async def test_load(output_kind: RequestOutputKind, + data_parallel_backend: str): with ExitStack() as after: prompt = "This is a test of data parallel" + engine_args.data_parallel_backend = data_parallel_backend engine = AsyncLLM.from_engine_args(engine_args) after.callback(engine.shutdown) @@ -82,7 +90,6 @@ async def test_load(output_kind: RequestOutputKind): asyncio.create_task( 
generate(engine, request_id, prompt, output_kind, NUM_EXPECTED_TOKENS))) - # Confirm that we got all the EXPECTED tokens from the requests. done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_EXCEPTION) diff --git a/vllm/config.py b/vllm/config.py index 1bd53e35b..8aa1b5610 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1742,6 +1742,8 @@ class ParallelConfig: """Port for data parallel messaging.""" data_parallel_master_port: int = 29500 """Port of the data parallel master.""" + data_parallel_backend: str = "mp" + """Backend to use for data parallel, either "mp" or "ray".""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" max_parallel_loading_workers: Optional[int] = None @@ -1911,6 +1913,10 @@ class ParallelConfig: "please install Ray with `pip install " "ray`.") from ray_utils.ray_import_err backend = "ray" + elif self.data_parallel_backend == "ray": + logger.info("Using ray distributed inference because " + "data_parallel_backend is ray") + backend = "ray" elif ray_found: if self.placement_group: backend = "ray" diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 299c8347f..a5b155024 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -39,7 +39,7 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3 from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import (STR_DUAL_CHUNK_FLASH_ATTN_VAL, FlexibleArgumentParser, - GiB_bytes, is_in_ray_actor) + GiB_bytes, get_ip, is_in_ray_actor) # yapf: enable @@ -292,6 +292,7 @@ class EngineArgs: data_parallel_size_local: Optional[int] = None data_parallel_address: Optional[str] = None data_parallel_rpc_port: Optional[int] = None + data_parallel_backend: str = ParallelConfig.data_parallel_backend enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel max_parallel_loading_workers: Optional[ int] = 
ParallelConfig.max_parallel_loading_workers @@ -624,6 +625,12 @@ class EngineArgs: type=int, help='Port for data parallel RPC ' 'communication.') + parallel_group.add_argument('--data-parallel-backend', + '-dpb', + type=str, + default='mp', + help='Backend for data parallel, either ' + '"mp" or "ray".') parallel_group.add_argument( "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"]) @@ -1059,9 +1066,20 @@ class EngineArgs: # DP address, used in multi-node case for torch distributed group # and ZMQ sockets. - data_parallel_address = self.data_parallel_address if ( - self.data_parallel_address - is not None) else ParallelConfig.data_parallel_master_ip + if self.data_parallel_address is None: + if self.data_parallel_backend == "ray": + host_ip = get_ip() + logger.info( + "Using host IP %s as ray-based data parallel address", + host_ip) + data_parallel_address = host_ip + else: + assert self.data_parallel_backend == "mp", ( + "data_parallel_backend can only be ray or mp, got %s", + self.data_parallel_backend) + data_parallel_address = ParallelConfig.data_parallel_master_ip + else: + data_parallel_address = self.data_parallel_address # This port is only used when there are remote data parallel engines, # otherwise the local IPC transport is used. 
@@ -1069,6 +1087,8 @@ class EngineArgs: self.data_parallel_rpc_port is not None) else ParallelConfig.data_parallel_rpc_port + data_parallel_backend = self.data_parallel_backend + parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, @@ -1076,6 +1096,7 @@ class EngineArgs: data_parallel_size_local=data_parallel_size_local, data_parallel_master_ip=data_parallel_address, data_parallel_rpc_port=data_parallel_rpc_port, + data_parallel_backend=data_parallel_backend, enable_expert_parallel=self.enable_expert_parallel, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index e65c97073..040ae166a 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -27,7 +27,8 @@ from vllm.v1.engine.core_client import CoreEngineProcManager from vllm.v1.executor.abstract import Executor from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus from vllm.v1.utils import (APIServerProcessManager, CoreEngine, - EngineZmqAddresses, get_engine_client_zmq_addr, + CoreEngineActorManager, EngineZmqAddresses, + get_engine_client_zmq_addr, wait_for_completion_or_failure, wait_for_engine_startup) @@ -229,6 +230,31 @@ def run_multi_api_server(args: argparse.Namespace): logger.info("Started DP Coordinator process (PID: %d)", coordinator.proc.pid) + if parallel_config.data_parallel_backend == "ray": + logger.info("Starting ray-based data parallel backend") + + engine_actor_manager = CoreEngineActorManager( + vllm_config=vllm_config, + addresses=addresses, + executor_class=Executor.get_class(vllm_config), + log_stats=not engine_args.disable_log_stats, + ) + # Start API servers using the manager + api_server_manager = APIServerProcessManager( + target_server_fn=run_api_server_worker_proc, + listen_address=listen_address, + sock=sock, + args=args, 
+ num_servers=num_api_servers, + input_addresses=input_addresses, + output_addresses=output_addresses, + stats_update_address=stats_update_address) + + wait_for_completion_or_failure(api_server_manager=api_server_manager, + engine_manager=engine_actor_manager, + coordinator=coordinator) + return + handshake_address = get_engine_client_zmq_addr( local_only, host, parallel_config.data_parallel_rpc_port) @@ -277,10 +303,9 @@ def run_multi_api_server(args: argparse.Namespace): ) # Wait for API servers - wait_for_completion_or_failure( - api_server_manager=api_server_manager, - local_engine_manager=local_engine_manager, - coordinator=coordinator) + wait_for_completion_or_failure(api_server_manager=api_server_manager, + engine_manager=local_engine_manager, + coordinator=coordinator) def run_api_server_worker_proc(listen_address, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 86781e752..4b235c596 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -27,7 +27,8 @@ from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext from vllm.utils import Device, cdiv from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.core_client import AsyncMPClient, DPAsyncMPClient +from vllm.v1.engine.core_client import (AsyncMPClient, DPAsyncMPClient, + RayDPClient) from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError from vllm.v1.engine.output_processor import (OutputProcessor, RequestOutputCollector) @@ -119,9 +120,13 @@ class AsyncLLM(EngineClient): log_stats=self.log_stats) # EngineCore (starts the engine in background process). 
- core_client_class = AsyncMPClient if ( - vllm_config.parallel_config.data_parallel_size - == 1) else DPAsyncMPClient + core_client_class: type[AsyncMPClient] + if vllm_config.parallel_config.data_parallel_size == 1: + core_client_class = AsyncMPClient + elif vllm_config.parallel_config.data_parallel_backend == "ray": + core_client_class = RayDPClient + else: + core_client_class = DPAsyncMPClient self.engine_core = core_client_class( vllm_config=vllm_config, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a02abb62b..7253d1dc6 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -6,8 +6,9 @@ import sys import threading import time from collections import deque +from collections.abc import Generator from concurrent.futures import Future -from contextlib import ExitStack +from contextlib import ExitStack, contextmanager from inspect import isclass, signature from logging import DEBUG from typing import Any, Callable, Optional, TypeVar, Union @@ -367,60 +368,35 @@ class EngineCoreProc(EngineCore): log_stats: bool, engine_index: int = 0, ): - input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() - - executor_fail_callback = lambda: input_queue.put_nowait( + self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() + self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], + bytes]]() + executor_fail_callback = lambda: self.input_queue.put_nowait( (EngineCoreRequestType.EXECUTOR_FAILED, b'')) - # Create input socket. - input_ctx = zmq.Context() - identity = engine_index.to_bytes(length=2, byteorder="little") - with make_zmq_socket(input_ctx, - handshake_address, - zmq.DEALER, - identity=identity, - linger=5000, - bind=False) as handshake_socket: + self.engine_index = engine_index + identity = self.engine_index.to_bytes(length=2, byteorder="little") + self.engines_running = False - # Register engine with front-end. 
- addresses = self.startup_handshake(handshake_socket, on_head_node, - vllm_config.parallel_config) + with self._perform_handshake(handshake_address, identity, on_head_node, + vllm_config) as addresses: self.client_count = len(addresses.outputs) - # Update config which may have changed from the handshake. - vllm_config.__post_init__() - # Set up data parallel environment. self.has_coordinator = addresses.coordinator_output is not None self._init_data_parallel(vllm_config) - # Initialize engine core and model. super().__init__(vllm_config, executor_class, log_stats, executor_fail_callback) - self.engine_index = engine_index - self.step_fn = (self.step if self.batch_queue is None else - self.step_with_batch_queue) - self.engines_running = False - self.last_counts = (0, 0) - - # Send ready message. - num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks - handshake_socket.send( - msgspec.msgpack.encode({ - "status": "READY", - "local": on_head_node, - "num_gpu_blocks": num_gpu_blocks, - })) + self.step_fn = (self.step if self.batch_queue is None else + self.step_with_batch_queue) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, # and to overlap some serialization/deserialization with the # model forward pass. # Threads handle Socket <-> Queues and core_busy_loop uses Queue. 
- self.input_queue = input_queue - self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs], - bytes]]() threading.Thread(target=self.process_input_sockets, args=(addresses.inputs, addresses.coordinator_input, identity), @@ -428,10 +404,40 @@ class EngineCoreProc(EngineCore): self.output_thread = threading.Thread( target=self.process_output_sockets, args=(addresses.outputs, addresses.coordinator_output, - engine_index), + self.engine_index), daemon=True) self.output_thread.start() + @contextmanager + def _perform_handshake( + self, handshake_address: str, identity: bytes, on_head_node: bool, + vllm_config: VllmConfig + ) -> Generator[EngineZmqAddresses, None, None]: + input_ctx = zmq.Context() + with make_zmq_socket(input_ctx, + handshake_address, + zmq.DEALER, + identity=identity, + linger=5000, + bind=False) as handshake_socket: + # Register engine with front-end. + addresses = self.startup_handshake(handshake_socket, on_head_node, + vllm_config.parallel_config) + + # Update config which may have changed from the handshake + vllm_config.__post_init__() + + yield addresses + + # Send ready message. + num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks + handshake_socket.send( + msgspec.msgpack.encode({ + "status": "READY", + "local": on_head_node, + "num_gpu_blocks": num_gpu_blocks, + })) + @staticmethod def startup_handshake( handshake_socket: zmq.Socket, on_head_node: bool, @@ -743,24 +749,29 @@ class DPEngineCoreProc(EngineCoreProc): executor_class: type[Executor], log_stats: bool, ): - # Add process-specific prefix to stdout and stderr before - # we initialize the engine. - from multiprocessing import current_process - process_name = current_process().name - pid = os.getpid() - _add_prefix(sys.stdout, process_name, pid) - _add_prefix(sys.stderr, process_name, pid) + + self._decorate_logs() # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. 
self.counter = 0 self.current_wave = 0 + self.last_counts = (0, 0) # Initialize the engine. dp_rank = vllm_config.parallel_config.data_parallel_rank super().__init__(vllm_config, on_head_node, handshake_address, executor_class, log_stats, dp_rank) + def _decorate_logs(self): + # Add process-specific prefix to stdout and stderr before + # we initialize the engine. + from multiprocessing import current_process + process_name = current_process().name + pid = os.getpid() + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + def _init_data_parallel(self, vllm_config: VllmConfig): # Configure GPUs and stateless process group for data parallel. @@ -880,3 +891,70 @@ class DPEngineCoreProc(EngineCoreProc): return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) + + +class DPEngineCoreActor(DPEngineCoreProc): + """ + Ray actor for running EngineCore in a data parallel context + """ + + def __init__( + self, + vllm_config: VllmConfig, + on_head_node: bool, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + dp_rank: int = 0, + local_dp_rank: int = 0, + ): + self.addresses = addresses + vllm_config.parallel_config.data_parallel_rank = dp_rank + vllm_config.parallel_config.data_parallel_rank_local = \ + local_dp_rank + + # Ray sets CUDA_VISIBLE_DEVICES to empty string, + # we clean this up to be able to properly initialize + # data parallel groups. + del os.environ['CUDA_VISIBLE_DEVICES'] + + super().__init__(vllm_config, on_head_node, "", executor_class, + log_stats) + + def _decorate_logs(self): + pass + + @contextmanager + def _perform_handshake(self, handshake_address: str, identity: bytes, + on_head_node: bool, vllm_config: VllmConfig): + """ + For Ray, we don't need to actually perform handshake. + All addresses information is known before the actor creation. + Therefore, we simply yield these addresses. 
+ """ + yield self.addresses + + def wait_for_init(self): + """ + Wait until the engine core is initialized. + + This is just an empty method. When ray.get() on this method + (or any other method of the actor) returns, it is guaranteed + that actor creation (i.e., __init__) is complete. + """ + pass + + def run(self): + """ + Run the engine core busy loop. + """ + try: + self.run_busy_loop() + except SystemExit: + logger.debug("EngineCore exiting.") + raise + except Exception: + logger.exception("EngineCore encountered a fatal error.") + raise + finally: + self.shutdown() diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 232d6742b..fa01998aa 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -29,9 +29,9 @@ from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr -from vllm.v1.utils import (CoreEngine, CoreEngineProcManager, - EngineZmqAddresses, get_engine_client_zmq_addr, - wait_for_engine_startup) +from vllm.v1.utils import (CoreEngine, CoreEngineActorManager, + CoreEngineProcManager, EngineZmqAddresses, + get_engine_client_zmq_addr, wait_for_engine_startup) logger = init_logger(__name__) @@ -68,6 +68,8 @@ class EngineCoreClient(ABC): if multiprocess_mode and asyncio_mode: if vllm_config.parallel_config.data_parallel_size > 1: + if vllm_config.parallel_config.data_parallel_backend == "ray": + return RayDPClient(vllm_config, executor_class, log_stats) return DPAsyncMPClient(vllm_config, executor_class, log_stats) return AsyncMPClient(vllm_config, executor_class, log_stats) @@ -273,7 +275,10 @@ class BackgroundResources: circular reference back to the client object.""" ctx: Union[zmq.Context] - local_engine_manager: Optional[CoreEngineProcManager] = None + # If CoreEngineProcManager, it manages local engines; + # if 
CoreEngineActorManager, it manages all engines. + engine_manager: Optional[Union[CoreEngineProcManager, + CoreEngineActorManager]] = None coordinator: Optional[DPCoordinator] = None output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None @@ -290,8 +295,8 @@ class BackgroundResources: """Clean up background resources.""" self.engine_dead = True - if self.local_engine_manager is not None: - self.local_engine_manager.close() + if self.engine_manager is not None: + self.engine_manager.close() if self.coordinator is not None: self.coordinator.close() @@ -457,7 +462,7 @@ class MPClient(EngineCoreClient): if local_engine_count: # In server mode, start_index and local_start_index will # both be 0. - self.resources.local_engine_manager = CoreEngineProcManager( + self.resources.engine_manager = CoreEngineProcManager( EngineCoreProc.run_engine_core, vllm_config=vllm_config, executor_class=executor_class, @@ -484,13 +489,18 @@ class MPClient(EngineCoreClient): addresses.coordinator_input, addresses.coordinator_output = ( coordinator.get_engine_socket_addresses()) + proc_manager = self.resources.engine_manager + assert isinstance(proc_manager, (type(None), CoreEngineProcManager)), ( + "_wait_for_engine_startup should only be " + "called with CoreEngineProcManager") + wait_for_engine_startup( handshake_socket, addresses, self.core_engines, self.vllm_config.parallel_config, self.vllm_config.cache_config, - self.resources.local_engine_manager, + proc_manager, coordinator.proc if coordinator else None, ) @@ -887,7 +897,6 @@ class DPAsyncMPClient(AsyncMPClient): log_stats: bool, client_addresses: Optional[dict[str, str]] = None, client_index: int = 0): - self.current_wave = 0 self.engines_running = False # To route aborts to the correct engine. 
@@ -1050,3 +1059,50 @@ class DPAsyncMPClient(AsyncMPClient): if not self.resources.engine_dead: await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) + + +class RayDPClient(DPAsyncMPClient): + """ + Ray-based client for multi-proc, multi-engine (data parallel) + EngineCore. + """ + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + client_addresses: Optional[dict[str, str]] = None, + client_index: int = 0, + ): + super().__init__(vllm_config, executor_class, log_stats, + client_addresses, client_index) + + def _init_engines_direct(self, vllm_config: VllmConfig, local_only: bool, + local_start_index: int, input_address: str, + output_address: str, + executor_class: type[Executor], log_stats: bool): + """Self-contained client mode, launch engine and coordinator process + as needed.""" + + parallel_config = vllm_config.parallel_config + assert parallel_config.data_parallel_rank == 0 + assert local_start_index == 0 + + addresses = EngineZmqAddresses( + inputs=[input_address], + outputs=[output_address], + ) + + if len(self.core_engines) > 1: + coordinator = DPCoordinator(parallel_config) + self.resources.coordinator = coordinator + addresses.coordinator_input, addresses.coordinator_output = ( + coordinator.get_engine_socket_addresses()) + + # Start all engines. 
+ self.resources.engine_manager = CoreEngineActorManager( + vllm_config=vllm_config, + addresses=addresses, + executor_class=executor_class, + log_stats=log_stats) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index a26794561..d347efc42 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -27,6 +27,8 @@ from vllm.utils import (get_mp_context, get_open_port, get_open_zmq_ipc_path, from vllm.v1.executor.abstract import Executor if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + from vllm.attention.layer import Attention from vllm.v1.engine.coordinator import DPCoordinator @@ -112,6 +114,45 @@ def get_engine_client_zmq_addr(local_only: bool, host, port or get_open_port())) +class CoreEngineState(Enum): + NEW = auto() + CONNECTED = auto() + READY = auto() + + +class CoreEngine: + """One per data parallel rank.""" + + def __init__(self, index: int = 0, local: bool = True): + self.local = local + self.index = index + self.identity = index.to_bytes(2, "little") + + self.state = CoreEngineState.NEW + + +@dataclass +class EngineZmqAddresses: + # ZMQ input socket addresses for each front-end client (requests) + inputs: list[str] + # ZMQ output socket addresses for each front-end client (responses) + outputs: list[str] + # ZMQ input socket address of DP coordinator if applicable + coordinator_input: Optional[str] = None + # ZMQ output socket address of DP coordinator if applicable + coordinator_output: Optional[str] = None + + +@dataclass +class EngineHandshakeMetadata: + """Metadata sent to each engine process during startup handshake, + including addresses of the front-end ZMQ queues that they should + connect to. + """ + addresses: EngineZmqAddresses + parallel_config: dict[str, Union[int, str]] + + class APIServerProcessManager: """Manages a group of API server processes. 
@@ -245,43 +286,168 @@ class CoreEngineProcManager: } -class CoreEngineState(Enum): - NEW = auto() - CONNECTED = auto() - READY = auto() - +class CoreEngineActorManager: + """ + Utility class to handle creation, readiness, and shutdown + of core engine Ray actors used by the AsyncLLM and LLMEngine. -class CoreEngine: - """One per data parallel rank.""" + Different from CoreEngineProcManager, this class manages + core engines for both local and remote nodes. + """ - def __init__(self, index: int = 0, local: bool = True): - self.local = local - self.index = index - self.identity = index.to_bytes(2, "little") + def __init__( + self, + vllm_config: VllmConfig, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + placement_groups: Optional[list["PlacementGroup"]] = None, + local_dp_ranks: Optional[list[int]] = None, + ): + import copy - self.state = CoreEngineState.NEW + import ray + from ray.util.scheduling_strategies import ( + PlacementGroupSchedulingStrategy) + from vllm.v1.engine.core import DPEngineCoreActor -@dataclass -class EngineZmqAddresses: - # ZMQ input socket addresses for each front-end client (requests) - inputs: list[str] - # ZMQ output socket addresses for each front-end client (responses) - outputs: list[str] - # ZMQ input socket address of DP coordinator if applicable - coordinator_input: Optional[str] = None - # ZMQ output socket address of DP coordinator if applicable - coordinator_output: Optional[str] = None + self.local_engine_actors: list[ray.ActorHandle] = [] + self.remote_engine_actors: list[ray.ActorHandle] = [] + dp_size = vllm_config.parallel_config.data_parallel_size + local_engine_count = \ + vllm_config.parallel_config.data_parallel_size_local + world_size = vllm_config.parallel_config.world_size + if ray.is_initialized(): + logger.info( + "Ray is already initialized. 
Skipping Ray initialization.") + else: + ray.init() + + if placement_groups is not None: + assert local_dp_ranks is not None, ( + "local_dp_ranks must be provided if " + "placement_groups is provided") + assert len(placement_groups) == len(local_dp_ranks), ( + "placement_groups and local_dp_ranks must " + "have the same length") + logger.info("Using provided placement groups") + # TODO(rui): validate passed-in placement groups + self.created_placement_groups = [] + else: + placement_groups, local_dp_ranks = \ + CoreEngineActorManager.create_dp_placement_groups(vllm_config) + self.created_placement_groups = placement_groups + assert len(placement_groups) == dp_size, ( + "Number of placement groups must match data parallel size") + + refs = [] + for index in range(dp_size): + local_index = local_dp_ranks[index] + dp_vllm_config = copy.deepcopy(vllm_config) + pg = placement_groups[index] + dp_vllm_config.parallel_config.placement_group = pg + on_head_node = index < local_engine_count + actor = ray.remote(DPEngineCoreActor).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_bundle_index=world_size, + )).remote(vllm_config=dp_vllm_config, + executor_class=executor_class, + log_stats=log_stats, + on_head_node=on_head_node, + addresses=addresses, + dp_rank=index, + local_dp_rank=local_index) + if on_head_node: + self.local_engine_actors.append(actor) + else: + self.remote_engine_actors.append(actor) + refs.append(actor.wait_for_init.remote()) + + ray.get(refs) + self.run_refs = [] + for actor in self.local_engine_actors + self.remote_engine_actors: + self.run_refs.append(actor.run.remote()) + + @staticmethod + def create_dp_placement_groups( + vllm_config: VllmConfig + ) -> tuple[list["PlacementGroup"], list[int]]: + + import ray + from ray._private.state import available_resources_per_node + from ray.util.state import list_nodes + + logger.info("Creating placement groups for data parallel") + dp_master_ip = \ + 
vllm_config.parallel_config.data_parallel_master_ip + dp_size = vllm_config.parallel_config.data_parallel_size + local_engine_count = \ + vllm_config.parallel_config.data_parallel_size_local + + nodes = list_nodes() + nodes = sorted(list_nodes(), + key=lambda node: node.node_ip != dp_master_ip) + assert nodes[0].node_ip == dp_master_ip, ( + "The first node must be the head node") + assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( + "There can only be one head node") + + available_resources = available_resources_per_node() + world_size = vllm_config.parallel_config.world_size + placement_groups: list[PlacementGroup] = [] + local_dp_ranks: list[int] = [] + + for node in nodes: + node_ip = node.node_ip + node_resources = available_resources[node.node_id] + # For now, each DP rank can only be assigned to one node + # TODO(rui): support allocating a single DP rank + # to multiple nodes + available_engine_count = node_resources["GPU"] // world_size + if node_ip == dp_master_ip: + assert available_engine_count >= local_engine_count, ( + "Not enough resources to allocate DP ranks " + f"on DP master node {node_ip}") + for i in range(local_engine_count): + bundles = [{ + "GPU": 1.0, + "node:" + dp_master_ip: 0.001 + }] * world_size + [{ + "CPU": 1.0 + }] + pg = ray.util.placement_group( + name=f"dp_rank_{len(placement_groups)}", + strategy="STRICT_PACK", + bundles=bundles, + ) + placement_groups.append(pg) + local_dp_ranks.append(i) + else: + for i in range(available_engine_count): + if len(placement_groups) == dp_size: + break + bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] + pg = ray.util.placement_group( + name=f"dp_rank_{len(placement_groups)}", + strategy="STRICT_PACK", + bundles=bundles, + ) + placement_groups.append(pg) + local_dp_ranks.append(i) + return placement_groups, local_dp_ranks + + def get_run_refs(self): + return self.run_refs -@dataclass -class EngineHandshakeMetadata: - """Metadata sent to each engine process during startup 
handshake, - including addresses of the front-end ZMQ queues that they should - connect to. - """ - addresses: EngineZmqAddresses - parallel_config: dict[str, Union[int, str]] + def close(self): + import ray + for actor in self.local_engine_actors + self.remote_engine_actors: + ray.kill(actor) + for pg in self.created_placement_groups: + ray.util.remove_placement_group(pg) def wait_for_engine_startup( @@ -383,11 +549,19 @@ def wait_for_engine_startup( def wait_for_completion_or_failure( api_server_manager: APIServerProcessManager, - local_engine_manager: Optional[CoreEngineProcManager] = None, + engine_manager: Optional[Union[CoreEngineProcManager, + CoreEngineActorManager]] = None, coordinator: Optional["DPCoordinator"] = None) -> None: """Wait for all processes to complete or detect if any fail. Raises an exception if any process exits with a non-zero status. + + Args: + api_server_manager: The manager for API servers. + engine_manager: The manager for engine processes. + If CoreEngineProcManager, it manages local engines; + if CoreEngineActorManager, it manages all engines. + coordinator: The coordinator for data parallel. 
""" try: @@ -402,14 +576,18 @@ def wait_for_completion_or_failure( if coordinator: sentinel_to_proc[coordinator.proc.sentinel] = coordinator.proc - if local_engine_manager: - for proc in local_engine_manager.processes: + actor_run_refs = [] + if isinstance(engine_manager, CoreEngineProcManager): + for proc in engine_manager.processes: sentinel_to_proc[proc.sentinel] = proc + elif isinstance(engine_manager, CoreEngineActorManager): + actor_run_refs = engine_manager.get_run_refs() # Check if any process terminates - while sentinel_to_proc: + while sentinel_to_proc or actor_run_refs: # Wait for any process to terminate - ready_sentinels: list[Any] = connection.wait(sentinel_to_proc) + ready_sentinels: list[Any] = connection.wait(sentinel_to_proc, + timeout=5) # Process any terminated processes for sentinel in ready_sentinels: @@ -420,6 +598,11 @@ def wait_for_completion_or_failure( raise RuntimeError( f"Process {proc.name} (PID: {proc.pid}) " f"died with exit code {proc.exitcode}") + + if actor_run_refs: + import ray + _, actor_run_refs = ray.wait(actor_run_refs, timeout=5) + except KeyboardInterrupt: logger.info("Received KeyboardInterrupt, shutting down API servers...") except Exception as e: @@ -431,8 +614,8 @@ def wait_for_completion_or_failure( api_server_manager.close() if coordinator: coordinator.close() - if local_engine_manager: - local_engine_manager.close() + if engine_manager: + engine_manager.close() # Note(rob): shutdown function cannot be a bound method, -- GitLab From 1282bd812ea4e1511378bad5b918d609280d2b89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 3 Jun 2025 13:13:13 +0800 Subject: [PATCH 124/274] Add tarsier model support (#18985) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 20 + .../vision_language_multi_image.py | 21 + 
.../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/tarsier.py | 643 ++++++++++++++++++ 7 files changed, 689 insertions(+) create mode 100644 vllm/model_executor/models/tarsier.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index b60fefdda..f2090fe39 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -550,6 +550,7 @@ Specified using `--task generate`. | `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni | T + IE+ + VE+ + A+ | `Qwen/Qwen2.5-Omni-7B` | | ✅︎ | ✅︎\* | | `SkyworkR1VChatModel` | Skywork-R1V-38B | T + I | `Skywork/Skywork-R1V-38B` | | ✅︎ | ✅︎ | | `SmolVLMForConditionalGeneration` | SmolVLM2 | T + I | `SmolVLM2-2.2B-Instruct` | ✅︎ | | ✅︎ | +| `TarsierForConditionalGeneration` | Tarsier | T + IE+ | `omni-research/Tarsier-7b`,`omni-research/Tarsier-34b` | | ✅︎ | ✅︎ | ^ You need to set the architecture name via `--hf-overrides` to match the one in vLLM.     
• For example, to use DeepSeek-VL2 series models: diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index f05045016..2ef87f4f4 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -333,6 +333,25 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData: ) +# omni-research/Tarsier-7b +def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: + assert modality == "image" + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={modality: 1}, + ) + prompts = [(f"USER: <image>\n{question} ASSISTANT:") for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # InternVL def run_internvl(questions: list[str], modality: str) -> ModelRequestData: model_name = "OpenGVLab/InternVL3-2B" @@ -1091,6 +1110,7 @@ model_example_map = { "qwen2_5_omni": run_qwen2_5_omni, "skywork_chat": run_skyworkr1v, "smolvlm": run_smolvlm, + "tarsier": run_tarsier, } diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index e776ff7fe..7ce28c5a4 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -691,6 +691,26 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: ) +def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: + model_name = "omni-research/Tarsier-7b" + + engine_args = EngineArgs( + model=model_name, + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + prompt = f"USER: {'<image>' * len(image_urls)}\n{question}\n ASSISTANT:" + image_data = [fetch_image(url) for url in image_urls] + + return ModelRequestData( + engine_args=engine_args, +
prompt=prompt, + image_data=image_data, + ) + + model_example_map = { "aria": load_aria, "aya_vision": load_aya_vision, @@ -712,6 +732,7 @@ model_example_map = { "qwen2_vl": load_qwen2_vl, "qwen2_5_vl": load_qwen2_5_vl, "smolvlm": load_smolvlm, + "tarsier": load_tarsier, } diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index d7f950c23..2377fef82 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -282,6 +282,7 @@ def _test_processing_correctness_one( "Skywork/Skywork-R1V-38B", "fixie-ai/ultravox-v0_5-llama-3_2-1b", "openai/whisper-large-v3", + "omni-research/Tarsier-7b", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) diff --git a/tests/models/registry.py b/tests/models/registry.py index fe49d2427..182a9668e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -406,6 +406,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 trust_remote_code=True), + "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b", # noqa: E501 + hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}), # noqa: E501 # [Encoder-decoder] # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer # Therefore, we borrow the BartTokenizer from the original Bart model diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8efd4825b..fcef457a7 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -211,6 +211,7 @@ _MULTIMODAL_MODELS = { "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"), # noqa: E501 "UltravoxModel": 
("ultravox", "UltravoxModel"), "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"), + "TarsierForConditionalGeneration": ("tarsier", "TarsierForConditionalGeneration"), # noqa: E501 # [Encoder-decoder] "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py new file mode 100644 index 000000000..5aa3ddabc --- /dev/null +++ b/vllm/model_executor/models/tarsier.py @@ -0,0 +1,643 @@ +# SPDX-License-Identifier: Apache-2.0 + +import math +from collections.abc import Iterable, Mapping, Sequence +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union, cast) + +import torch +import torch.nn as nn +from transformers import BatchFeature, CLIPVisionConfig +from transformers import LlavaConfig as HfLlavaConfig +from transformers import PretrainedConfig, SiglipVisionConfig +from transformers.image_utils import ImageInput, get_image_size, to_numpy_array +from transformers.models.llava import LlavaProcessor +from transformers.processing_utils import (ProcessingKwargs, Unpack, + _validate_images_text_input_order) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext +from vllm.jsontree import json_map_leaves +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.llava import LlavaDummyInputsBuilder +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs +from vllm.multimodal.parse 
import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize, MultiModalDataItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, ProcessingCache, + PromptReplacement, PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.sequence import IntermediateTensors + +from .clip import CLIPVisionModel +from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .siglip import SiglipVisionModel +from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) +from .vision import VisionEncoderInfo, get_vision_encoder_info + + +class TarsierImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + + +class TarsierImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + + +TarsierImageInputs = Union[TarsierImagePixelInputs, + TarsierImageEmbeddingInputs] + + +class TarsierHfConfig(Protocol): # Based on the Tarsier's LlavaConfig + vision_config: Final[PretrainedConfig] + text_config: Final[PretrainedConfig] # Added from Tarsier's LlavaConfig + image_token_index: Final[int] + vision_feature_select_strategy: Final[str] + vision_feature_layer: Final[Union[int, list[int]]] + projector_hidden_act: Final[str] + image_newline_idx: Final[int] + image_new_idx: Final[int] + multimodal_projector_bias: bool = True + + +class TarsierProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": {}, + } + + +class TarsierProcessor(LlavaProcessor): + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, list[TextInput], + list[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[TarsierProcessorKwargs], + ) -> BatchFeature: + if images is None and text is None: + raise ValueError( + "You have to specify at least one of `images` or `text`.") + + # 
check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + TarsierProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if images is not None: + image_inputs = self.image_processor( + images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string," + " or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + pixel_values = image_inputs["pixel_values"] + height, width = get_image_size(to_numpy_array(pixel_values[0])) + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + + 1) + self.num_additional_image_tokens + 1 + if self.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + prompt_strings = [] + for sample in text: + sample = sample.replace(self.image_token, + self.image_token * num_image_tokens) + prompt_strings.append(sample) + + return_tensors = output_kwargs["text_kwargs"].pop( + "return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, + **output_kwargs["text_kwargs"]) + return BatchFeature(data={ + **text_inputs, + **image_inputs + }, + tensor_type=return_tensors) + + +class TarsierMultiModalProjector(nn.Module): + + def __init__(self, + vision_hidden_size: int, + text_hidden_size: int, + projector_hidden_act: str, + multimodal_projector_bias: bool, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): + super().__init__() + + self.linear_1 = ColumnParallelLinear(vision_hidden_size, + text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + 
prefix=f"{prefix}.linear_1") + self.act = get_act_fn(projector_hidden_act) + self.linear_2 = RowParallelLinear(text_hidden_size, + text_hidden_size, + bias=multimodal_projector_bias, + quant_config=quant_config, + prefix=f"{prefix}.linear_2") + + def forward(self, image_features: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.linear_1(image_features) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) + return hidden_states + + +class TarsierProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self) -> TarsierHfConfig: + return self.ctx.get_hf_config(HfLlavaConfig) + + def get_vision_encoder_info(self) -> VisionEncoderInfo: + return get_vision_encoder_info(self.get_hf_config()) + + def get_hf_processor(self, **kwargs: object) -> TarsierProcessor: + hf_processor = self.ctx.get_hf_processor(TarsierProcessor, **kwargs) + # Patch for patch_size if needed (copied from vLLM LLaVA) + if hasattr(hf_processor, + 'patch_size') and hf_processor.patch_size is None: + patch_size = self.get_vision_encoder_info().get_patch_size() + hf_processor.patch_size = patch_size + return hf_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def _apply_feature_select_strategy( + self, + strategy: str, + encoder_num_image_tokens: int, + ) -> int: + if strategy == "default": + return encoder_num_image_tokens - 1 + if strategy == "full": + return encoder_num_image_tokens + msg = f"Unexpected feature select strategy: {strategy!r}" + raise NotImplementedError(msg) + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self.get_hf_config() + vision_encoder_info = self.get_vision_encoder_info() + num_projected_patches = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) + if num_projected_patches 
<= 0: + default_size = self.get_image_size_with_most_features() + num_projected_patches_default = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=default_size.width, + image_height=default_size.height, + ), + ) + if num_projected_patches_default <= 0: + raise ValueError( + "Could not determine a valid number of image patches.") + num_projected_patches = num_projected_patches_default + num_height_patches = int(math.sqrt(num_projected_patches)) + total_image_tokens_for_llm = num_projected_patches \ + + num_height_patches + 1 + return total_image_tokens_for_llm + + def get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self.get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + return self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + def get_image_newline_idx(self) -> int: + return self.get_hf_config().image_newline_idx + + def get_image_new_idx(self) -> int: + return self.get_hf_config().image_new_idx + + +_I_Tarsier = TypeVar("_I_Tarsier", bound=TarsierProcessingInfo) + + +class TarsierDummyInputsBuilder(LlavaDummyInputsBuilder[_I_Tarsier]): + + pass + + +class TarsierMultiModalProcessor(BaseMultiModalProcessor[_I_Tarsier]): + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + hf_config = 
self.info.get_hf_config() + image_token_id = hf_config.image_token_index # The token ID + + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_projected_patches = images.get_feature_size(item_idx) + # This assumes num_projected_patches is a perfect square + num_height_patches = int(math.sqrt(num_projected_patches)) + num_final_image_tokens = num_projected_patches \ + + num_height_patches + 1 + else: + image_size = images.get_image_size(item_idx) + num_final_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_final_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], # Replace each single token + replacement=get_replacement, + ), + ] + + +def _build_tarsier_hf_info( + ctx: InputProcessingContext) -> TarsierProcessingInfo: + return TarsierProcessingInfo(ctx) + + +def _build_tarsier_hf_processor( + info: _I_Tarsier, + dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier], + *, + cache: Optional[ProcessingCache] = None, +) -> BaseMultiModalProcessor: + if isinstance(info, TarsierProcessingInfo): + return TarsierMultiModalProcessor( + info, + dummy_inputs, + cache=cache, + ) + raise NotImplementedError(type(info)) + + +def init_vision_tower_for_tarsier( + hf_config: TarsierHfConfig, # Use the Tarsier specific config protocol + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, + prefix: str = "", +) -> Union[CLIPVisionModel, SiglipVisionModel]: + vision_config = hf_config.vision_config + + feature_layers = hf_config.vision_feature_layer + base_num_hidden_layers = vision_config.num_hidden_layers + + def _get_layer_index(feature_layer_index: int, + num_hidden_layers_total: int) -> int: + if feature_layer_index < 0: + return num_hidden_layers_total + feature_layer_index + 1 + 
return feature_layer_index + + if isinstance(feature_layers, int): + num_hidden_layers_to_init = _get_layer_index(feature_layers, + base_num_hidden_layers) + elif isinstance(feature_layers, (list, tuple)): + num_hidden_layers_to_init = max( + _get_layer_index(idx, base_num_hidden_layers) + for idx in feature_layers) + else: + raise TypeError(f"vision_layer_feature type: {type(feature_layers)}" + " is not supported") + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_to_init, + require_post_norm=require_post_norm, + prefix=prefix, + ) + elif isinstance(vision_config, SiglipVisionConfig): + return SiglipVisionModel( + vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_to_init, + require_post_norm=require_post_norm, + prefix=prefix, + ) + + msg = f"Unsupported vision config for Tarsier: {type(vision_config)}" + raise NotImplementedError(msg) + + +@MULTIMODAL_REGISTRY.register_processor(_build_tarsier_hf_processor, + info=_build_tarsier_hf_info, + dummy_inputs=TarsierDummyInputsBuilder) +class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsPP): + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config: TarsierHfConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config # Storing the Tarsier-specific HF config + self.vision_tower = init_vision_tower_for_tarsier( + config, + quant_config, + require_post_norm=False, + prefix=maybe_prefix(prefix, "vision_tower")) + projector_bias = getattr(config, "multimodal_projector_bias", True) + + self.multi_modal_projector = TarsierMultiModalProjector( + vision_hidden_size=config.vision_config.hidden_size, + 
text_hidden_size=config.text_config.hidden_size, + projector_hidden_act=config.projector_hidden_act, + multimodal_projector_bias=projector_bias, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config. + text_config, # Use text_config from Tarsier's main config + prefix=maybe_prefix(prefix, "language_model"), + ) + self.register_buffer('image_newline_idx_tensor', + torch.tensor([config.image_newline_idx], + dtype=torch.long), + persistent=False) + self.register_buffer('image_new_idx_tensor', + torch.tensor([config.image_new_idx], + dtype=torch.long), + persistent=False) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) # Assuming 3 channels + actual_dims = tuple(data.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[TarsierImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return TarsierImagePixelInputs( + type="pixel_values", + pixel_values=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + return TarsierImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + raise AssertionError("This line should be unreachable.") + + def _select_image_features(self, image_features: torch.Tensor, *, + strategy: str) -> torch.Tensor: + if strategy == "default": + return image_features[:, 1:] + elif strategy == "full": + return image_features + raise ValueError(f"Unexpected select feature strategy: {strategy}") + + def _image_pixels_to_features( + self, + vision_tower: Union[CLIPVisionModel, SiglipVisionModel], + pixel_values: Union[torch.Tensor, list[torch.Tensor]], + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + # From vLLM LLaVA, vision tower output handling + image_hidden_states = vision_tower(pixel_values) + if not isinstance(image_hidden_states, torch.Tensor): + raise TypeError( + f"image_hidden_states type: {type(image_hidden_states)}" + " is not supported") + + def select_features_fn(leaf: torch.Tensor): + return self._select_image_features( + leaf, + strategy=self.config.vision_feature_select_strategy, + ) + + selected_features = cast( + Union[torch.Tensor, tuple[torch.Tensor, ...]], + json_map_leaves(select_features_fn, image_hidden_states), + ) + return selected_features + + def _add_tarsier_split_tokens( + self, projected_image_features: torch.Tensor) -> torch.Tensor: + """ + Implements Tarsier's `add_split_tokens` logic. 
+ """ + num_images, num_projected_patches, embed_dim = \ + projected_image_features.shape + num_height_patches = int(math.sqrt(num_projected_patches)) + num_width_patches = num_projected_patches // num_height_patches + device = projected_image_features.device + embedding_layer = self.language_model.model.embed_tokens + image_newline_emb = embedding_layer( + self.image_newline_idx_tensor.to(device)).squeeze(0) + image_new_emb = embedding_layer( + self.image_new_idx_tensor.to(device)).squeeze(0) + try: + current_image_features_grid = projected_image_features.view( + num_images, num_height_patches, num_width_patches, embed_dim) + except RuntimeError as e: + raise RuntimeError( + "Cannot reshape projected_image_features" + f" with shape {projected_image_features.shape} " + f"to ({num_images}, {num_height_patches}," + f" {num_width_patches}, {embed_dim}). " + "Ensure num_projected_patches is compatible" + " with a grid structure. " + f"num_projected_patches={num_projected_patches}, " + f"derived num_height_patches={num_height_patches}. 
") from e + + image_newline_expanded = image_newline_emb.expand( + (num_images, num_height_patches, 1, embed_dim)) + features_with_newlines = torch.cat( + [current_image_features_grid, image_newline_expanded], + dim=2 # Concatenate along width dim + ) + new_num_patches_after_newline = num_projected_patches \ + + num_height_patches + features_with_newlines_flat = features_with_newlines.view( + num_images, new_num_patches_after_newline, embed_dim) + image_new_expanded = image_new_emb.expand((num_images, 1, embed_dim)) + final_image_features = torch.cat( + [features_with_newlines_flat, image_new_expanded], + dim=1 # Concatenate along patch sequence dim + ) + return final_image_features + + def _process_image_pixels( + self, + inputs: TarsierImagePixelInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + assert self.vision_tower is not None + pixel_values = inputs["pixel_values"] + image_features_selected = self._image_pixels_to_features( + self.vision_tower, pixel_values) # type: ignore + if isinstance(image_features_selected, torch.Tensor): + projected_features = self.multi_modal_projector( + image_features_selected) + final_features = self._add_tarsier_split_tokens(projected_features) + return final_features + else: + raise TypeError( + f"_image_pixels_to_features type:" + f" {type(image_features_selected)} is not supported") + + def _process_image_input( + self, + image_input: TarsierImageInputs, + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: + if image_input["type"] == "image_embeds": + projected_features = image_input["data"] + if isinstance(projected_features, torch.Tensor): + return self._add_tarsier_split_tokens(projected_features) + else: + raise ValueError("Incorrect type of image_embeds. " + f"Got type: {type(projected_features)}. 
") + assert self.vision_tower is not None + return self._process_image_pixels(image_input) + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + return self._process_image_input(image_input) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_index, + ) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + inputs_embeds = None + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) -- GitLab From 17430e36531aeade52518b13961706d4227310f9 Mon Sep 17 00:00:00 2001 From: 
Reid <61492567+reidliu41@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:35:12 +0800 Subject: [PATCH 125/274] [bugfix] small fix logic issue (#18999) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a5b155024..e3b8a18cc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -455,7 +455,7 @@ class EngineArgs: title="ModelConfig", description=ModelConfig.__doc__, ) - if 'serve' not in sys.argv[1:] and '--help' not in sys.argv[1:]: + if not ('serve' in sys.argv[1:] and '--help' in sys.argv[1:]): model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--task", **model_kwargs["task"]) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) -- GitLab From cc977286e7a4350183aeef873858fe0dc6774740 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 02:00:45 -0400 Subject: [PATCH 126/274] Reduce logs in CLI scripts and plugin loader (#18970) Signed-off-by: mgoin --- vllm/benchmarks/datasets.py | 6 +++--- vllm/benchmarks/latency.py | 2 -- vllm/benchmarks/throughput.py | 1 - vllm/compilation/backends.py | 6 +++--- vllm/plugins/__init__.py | 19 +++++++++++++------ 5 files changed, 19 insertions(+), 15 deletions(-) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 35cc303f6..21fe3eb62 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -328,9 +328,9 @@ class RandomDataset(BenchmarkDataset): output_high = int(output_len * (1 + range_ratio)) # Add logging for debugging - logger.info("Sampling input_len from [%s, %s]", input_low, input_high) - logger.info("Sampling output_len from [%s, %s]", output_low, - output_high) + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, input_high, output_low, output_high) input_lens = np.random.randint(input_low, input_high 
+ 1, diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index c9e03cc3b..dc1c42879 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -78,7 +78,6 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - print(args) if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: raise OSError( "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " @@ -101,7 +100,6 @@ def main(args: argparse.Namespace): max_tokens=args.output_len, detokenize=not args.disable_detokenize, ) - print(sampling_params) dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 13110a8b4..3ea6c194b 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -527,7 +527,6 @@ def main(args: argparse.Namespace): validate_args(args) if args.seed is None: args.seed = 0 - print(args) random.seed(args.seed) # Sample the requests. 
tokenizer = AutoTokenizer.from_pretrained( diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index b724479a9..c4bfffe92 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,13 +31,13 @@ def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( "2.8.0"): - logger.info("Using InductorStandaloneAdaptor") + logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: - logger.info("Using InductorAdaptor") + logger.debug("Using InductorAdaptor") return InductorAdaptor() else: - logger.info("Using EagerAdaptor") + logger.debug("Using EagerAdaptor") return EagerAdaptor() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 2884cb46f..4cd3552f8 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -10,6 +10,8 @@ import vllm.envs as envs logger = logging.getLogger(__name__) +DEFAULT_PLUGINS_GROUP = 'vllm.general_plugins' + # make sure one process only loads plugins once plugins_loaded = False @@ -28,19 +30,24 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]: logger.debug("No plugins for group %s found.", group) return {} - logger.info("Available plugins for group %s:", group) + # Check whether the requested group is the default plugin group + is_default_group = (group == DEFAULT_PLUGINS_GROUP) + # Use INFO for non-default groups and DEBUG for the default group + log_level = logger.debug if is_default_group else logger.info + + log_level("Available plugins for group %s:", group) for plugin in discovered_plugins: - logger.info("- %s -> %s", plugin.name, plugin.value) + log_level("- %s -> %s", plugin.name, plugin.value) if allowed_plugins is None: - logger.info("All plugins in this group will be loaded. " - "Set `VLLM_PLUGINS` to control which plugins to load.") + log_level("All plugins in this group will be loaded. 
" + "Set `VLLM_PLUGINS` to control which plugins to load.") plugins = dict[str, Callable[[], Any]]() for plugin in discovered_plugins: if allowed_plugins is None or plugin.name in allowed_plugins: if allowed_plugins is not None: - logger.info("Loading plugin %s", plugin.name) + log_level("Loading plugin %s", plugin.name) try: func = plugin.load() @@ -80,7 +87,7 @@ def load_general_plugins(): # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' - plugins = load_plugins_by_group(group='vllm.general_plugins') + plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP) # general plugins, we only need to execute the loaded functions for func in plugins.values(): func() -- GitLab From d32aa2e67002afe936b8d2cadffd8adc7aaf48e7 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:16:17 +0800 Subject: [PATCH 127/274] [Bugfix] Use cmake 3.26.1 instead of 3.26 to avoid build failure (#19019) Signed-off-by: Lu Fang --- docker/Dockerfile.neuron | 2 +- docs/getting_started/installation/cpu/build.inc.md | 2 +- pyproject.toml | 2 +- requirements/build.txt | 2 +- requirements/rocm-build.txt | 2 +- requirements/tpu.txt | 2 +- requirements/xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron index 259dc5a23..8bc235547 100644 --- a/docker/Dockerfile.neuron +++ b/docker/Dockerfile.neuron @@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements/neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/docs/getting_started/installation/cpu/build.inc.md 
b/docs/getting_started/installation/cpu/build.inc.md index 7d6472afa..7ddadccb1 100644 --- a/docs/getting_started/installation/cpu/build.inc.md +++ b/docs/getting_started/installation/cpu/build.inc.md @@ -17,7 +17,7 @@ Third, install Python packages for vLLM CPU backend building: ```console pip install --upgrade pip -pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy +pip install "cmake>=3.26.1" wheel packaging ninja "setuptools-scm>=8" numpy pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` diff --git a/pyproject.toml b/pyproject.toml index 10f5dbeae..307878f7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements/build.txt requires = [ - "cmake>=3.26", + "cmake>=3.26.1", "ninja", "packaging>=24.2", "setuptools>=77.0.3,<80.0.0", diff --git a/requirements/build.txt b/requirements/build.txt index 320e5b892..528cd3b53 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26 +cmake>=3.26.1 ninja packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/requirements/rocm-build.txt b/requirements/rocm-build.txt index 981b90632..94201543c 100644 --- a/requirements/rocm-build.txt +++ b/requirements/rocm-build.txt @@ -7,7 +7,7 @@ torchvision==0.22.0 torchaudio==2.7.0 triton==3.2 -cmake>=3.26,<4 +cmake>=3.26.1,<4 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 diff --git a/requirements/tpu.txt b/requirements/tpu.txt index edc8b2a45..47e638463 100644 --- a/requirements/tpu.txt +++ b/requirements/tpu.txt @@ -2,7 +2,7 @@ -r common.txt # Dependencies for TPU -cmake>=3.26 +cmake>=3.26.1 packaging>=24.2 setuptools-scm>=8 wheel diff --git a/requirements/xpu.txt b/requirements/xpu.txt index 04c4d4ff8..3cb6a4a8a 100644 --- a/requirements/xpu.txt +++ b/requirements/xpu.txt @@ -2,7 +2,7 @@ -r common.txt ray>=2.9 -cmake>=3.26 +cmake>=3.26.1 packaging>=24.2 
setuptools-scm>=8 setuptools>=77.0.3,<80.0.0 -- GitLab From f32fcd944430603ebcbbf04454b4e15754168ef4 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 3 Jun 2025 16:01:48 +0800 Subject: [PATCH 128/274] [v1][KVCacheManager] Rename BlockHashType to BlockHash (#19015) Signed-off-by: Chen Zhang --- docs/design/v1/prefix_caching.md | 2 +- tests/v1/core/test_kv_cache_utils.py | 12 +++++------- tests/v1/core/test_prefix_caching.py | 4 ++-- tests/v1/core/test_specialized_manager.py | 4 ++-- vllm/v1/core/block_pool.py | 8 ++++---- vllm/v1/core/kv_cache_manager.py | 4 ++-- vllm/v1/core/kv_cache_utils.py | 14 +++++++------- vllm/v1/core/single_type_kv_cache_manager.py | 10 +++++----- 8 files changed, 28 insertions(+), 30 deletions(-) diff --git a/docs/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md index ad041b005..bbdfb2552 100644 --- a/docs/design/v1/prefix_caching.md +++ b/docs/design/v1/prefix_caching.md @@ -104,7 +104,7 @@ class KVCacheBlock: block_id: int # The block hash (will be assigned when the block is full, # and will be reset when the block is evicted). - block_hash: BlockHashType + block_hash: BlockHash # The number of requests using this block now. 
ref_cnt: int diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index d3d62cf09..61aee8752 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -100,8 +100,8 @@ def test_kv_cache_block(): assert block.ref_cnt == 0 # Test block hash setting and resetting - block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123, - token_ids=(1, 2, 3)) + block_hash = vllm.v1.core.kv_cache_utils.BlockHash(hash_value=123, + token_ids=(1, 2, 3)) block.block_hash = block_hash assert block.block_hash == block_hash @@ -282,7 +282,7 @@ def test_hash_block_tokens(hash_fn): block_hash = hash_block_tokens(hash_fn, parent_block_hash, curr_block_token_ids, extra_keys) - assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHash) assert block_hash.hash_value == hash_fn( (parent_block_hash, curr_block_token_ids, extra_keys)) assert block_hash.token_ids == curr_block_token_ids @@ -306,10 +306,8 @@ def test_hash_request_tokens(hash_fn): block_hashes = hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 - assert isinstance(block_hashes[0], - vllm.v1.core.kv_cache_utils.BlockHashType) - assert isinstance(block_hashes[1], - vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hashes[0], vllm.v1.core.kv_cache_utils.BlockHash) + assert isinstance(block_hashes[1], vllm.v1.core.kv_cache_utils.BlockHash) # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index ba3c0b3cf..1a7a31d98 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -12,7 +12,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import sha256 from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_manager import KVCacheManager, Request -from 
vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, hash_block_tokens) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, SlidingWindowSpec) @@ -547,7 +547,7 @@ def test_cache_blocks(hash_fn): # Test that blocks are cached correctly for 2 full blocks from the start. blocks = [KVCacheBlock(block_id=i) for i in range(2)] - block_hashes: list[BlockHashType] = [] + block_hashes: list[BlockHash] = [] block_pool.cache_full_blocks( request=req, diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index 101a2379b..4217dc37e 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -3,7 +3,7 @@ import torch from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock +from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager from vllm.v1.kv_cache_interface import SlidingWindowSpec @@ -32,7 +32,7 @@ def test_sliding_window_possible_cached_prefix(): def run_one_case(block_is_cached, expect_length): block_hash_list = [ - BlockHashType(i, ()) for i in range(len(block_is_cached)) + BlockHash(i, ()) for i in range(len(block_is_cached)) ] block_pool.cached_block_hash_to_block.clear() diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index f2ed183b6..a0a065df9 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -6,7 +6,7 @@ from typing import Callable, Optional from vllm.distributed.kv_events import (AllBlocksCleared, BlockRemoved, BlockStored, KVCacheEvent) from vllm.logger import init_logger -from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, +from vllm.v1.core.kv_cache_utils import (BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, generate_block_hash_extra_keys, 
hash_block_tokens) @@ -55,7 +55,7 @@ class BlockPool: # if there is already an identical block in the cache. This is because # we want to make sure the allocated block IDs won't change so that # block tables are append-only. - self.cached_block_hash_to_block: dict[BlockHashType, dict[ + self.cached_block_hash_to_block: dict[BlockHash, dict[ int, KVCacheBlock]] = defaultdict(dict) # To represent a placeholder block with block_id=0. @@ -67,7 +67,7 @@ class BlockPool: self.kv_event_queue: list[KVCacheEvent] = [] def get_cached_block(self, - block_hash: BlockHashType) -> Optional[KVCacheBlock]: + block_hash: BlockHash) -> Optional[KVCacheBlock]: """Get a cached block by the block hash, or None if cache miss. If there are duplicated blocks, we return the first block in the cache. @@ -87,7 +87,7 @@ class BlockPool: self, request: Request, blocks: list[KVCacheBlock], - block_hashes: list[BlockHashType], + block_hashes: list[BlockHash], num_cached_blocks: int, num_full_blocks: int, block_size: int, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 0f6098d2b..59e07382b 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -8,7 +8,7 @@ from vllm.distributed.kv_events import KVCacheEvent from vllm.logger import init_logger from vllm.utils import sha256 from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (BlockHash, KVCacheBlock, hash_request_tokens) from vllm.v1.core.single_type_kv_cache_manager import ( get_manager_for_kv_cache_spec) @@ -92,7 +92,7 @@ class KVCacheManager: # This is to avoid recomputing the block hashes for each call of # `get_computed_blocks` or `allocate_slots`. 
self.req_to_block_hashes: defaultdict[ - str, list[BlockHashType]] = defaultdict(list) + str, list[BlockHash]] = defaultdict(list) @property def usage(self) -> float: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index a41fe4881..3ccad97e9 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -18,7 +18,7 @@ from vllm.v1.request import Request logger = init_logger(__name__) -class BlockHashType(NamedTuple): +class BlockHash(NamedTuple): """Hash value of a block (int), the token IDs in the block, and extra keys. We keep a tuple of token IDs and extra keys to reduce the likelihood of hash collisions when the hash value is the same. By using SHA256 however, @@ -117,7 +117,7 @@ class KVCacheBlock: ref_cnt: int = 0 # The hash of the block composed of (block hash, tuple of token IDs). # It is only available when the block is full. - _block_hash: Optional[BlockHashType] = None + _block_hash: Optional[BlockHash] = None # Used to construct a doubly linked list for free blocks. # These two attributes should only be manipulated by FreeKVCacheBlockQueue. @@ -131,11 +131,11 @@ class KVCacheBlock: self.ref_cnt -= 1 @property - def block_hash(self) -> Optional[BlockHashType]: + def block_hash(self) -> Optional[BlockHash]: return self._block_hash @block_hash.setter - def block_hash(self, block_hash: BlockHashType): + def block_hash(self, block_hash: BlockHash): assert self.block_hash is None, ( "The block already has a hash. This should not happen.") self._block_hash = block_hash @@ -398,7 +398,7 @@ def hash_block_tokens( hash_function: Callable, parent_block_hash: Optional[int], curr_block_token_ids: Sequence[int], - extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHashType: + extra_keys: Optional[tuple[Any, ...]] = None) -> BlockHash: """Computes a hash value corresponding to the contents of a block and the contents of the preceding block(s). The hash value is used for prefix caching. 
We use LRU cache for this function to avoid recomputing @@ -419,14 +419,14 @@ def hash_block_tokens( parent_block_hash = NONE_HASH curr_block_token_ids_tuple = tuple(curr_block_token_ids) - return BlockHashType( + return BlockHash( hash_function( (parent_block_hash, curr_block_token_ids_tuple, extra_keys)), curr_block_token_ids_tuple, extra_keys) def hash_request_tokens(hash_function: Any, block_size: int, - request: Request) -> list[BlockHashType]: + request: Request) -> list[BlockHash]: """Computes hash values of a chain of blocks given a sequence of token IDs. The hash value is used for prefix caching. diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 0223c9cee..e69e9ac9f 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -5,7 +5,7 @@ from typing import Callable from vllm.utils import cdiv from vllm.v1.core.block_pool import BlockPool -from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock +from vllm.v1.core.kv_cache_utils import BlockHash, KVCacheBlock from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheSpec, SlidingWindowSpec) from vllm.v1.request import Request @@ -133,7 +133,7 @@ class SingleTypeKVCacheManager(ABC): req_blocks.extend(new_blocks) return new_blocks - def cache_blocks(self, request: Request, block_hashes: list[BlockHashType], + def cache_blocks(self, request: Request, block_hashes: list[BlockHash], num_tokens: int) -> None: """ Cache the blocks for the request. 
@@ -187,7 +187,7 @@ class SingleTypeKVCacheManager(ABC): raise NotImplementedError @abstractmethod - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: """ Get the longest cache hit prefix of the blocks that is not longer than @@ -228,7 +228,7 @@ class SingleTypeKVCacheManager(ABC): class FullAttentionManager(SingleTypeKVCacheManager): - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: computed_blocks: list[KVCacheBlock] = [] max_num_blocks = max_length // self.block_size @@ -280,7 +280,7 @@ class SlidingWindowManager(SingleTypeKVCacheManager): self.sliding_window_contiguous_blocks += 1 self._null_block = block_pool.null_block - def find_longest_cache_hit(self, block_hashes: list[BlockHashType], + def find_longest_cache_hit(self, block_hashes: list[BlockHash], max_length: int) -> list[KVCacheBlock]: # TODO: reduce i by sliding_window_contiguous_blocks when cache miss, to # optimize the time complexity from O(max_num_blocks) to -- GitLab From 6d18ed2a2e858a8061dfe8c2e140c2c498d6a99a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 04:21:53 -0400 Subject: [PATCH 129/274] Update docker docs with ARM CUDA cross-compile (#19037) Signed-off-by: mgoin --- docs/deployment/docker.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 516640f6f..9e506d3d7 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \ -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0+PTX" \ + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ --build-arg vllm_fa_cmake_gpu_arches="90-real" ``` +!!! 
note + If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution. + + Run the following command on your host machine to register QEMU user static handlers: + + ```console + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + ``` + + After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command. + ## Use the custom-built vLLM Docker image To run vLLM with the custom-built Docker image: -- GitLab From 42243fbda04d908aa16f17bf3d5f9cf35e4ef26f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Jun 2025 17:08:03 +0800 Subject: [PATCH 130/274] [Doc] Add InternVL LoRA support (#19055) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f2090fe39..71414d2aa 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -524,7 +524,7 @@ Specified using `--task generate`. | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ | -| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. 
| | ✅︎ | ✅︎ | +| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | | `LlavaForConditionalGeneration` | LLaVA-1.5 | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ | -- GitLab From ec2dcd80bc173c06a4c48377d4a6b6ca2c78a2f5 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 3 Jun 2025 17:08:20 +0800 Subject: [PATCH 131/274] [Misc] Update `WeightsMapper` for qwen2-vl/qwen2.5-vl (#19054) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_5_vl.py | 13 +++++++++---- vllm/model_executor/models/qwen2_vl.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index e3fa9f67c..f62c7e1d2 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -823,10 +823,15 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): # To ensure correct weight loading and mapping. 
- hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.language_model.": "language_model.model.", + "model.visual.": "visual.", + # mapping for original checkpoint + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 873baa56f..5c30e36c7 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1071,10 +1071,15 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): # To ensure correct weight loading and mapping. - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ - "lm_head.": "language_model.lm_head.", - "model.": "language_model.model.", - }) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + # mapping for new names in checkpoint saved after transformers v4.52 + "model.language_model.": "language_model.model.", + "model.visual.": "visual.", + # mapping for original checkpoint + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() -- GitLab From 118ff921118cc81061a2af865a1e13840ceb6792 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 3 Jun 2025 17:29:41 +0800 Subject: [PATCH 132/274] [Doc] Update V1 user guide for embedding and enc-dec models (#19060) Signed-off-by: DarkLight1337 --- docs/usage/v1_guide.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 3d5d7ce45..a2321bf98 100644 --- a/docs/usage/v1_guide.md +++ 
b/docs/usage/v1_guide.md @@ -1,5 +1,7 @@ # vLLM V1 +**We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.** + V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). To disable V1, please set the environment variable as: `VLLM_USE_V1=0`, and send us a GitHub issue sharing the reason! @@ -51,9 +53,9 @@ This living user guide outlines a few known **important changes and limitations* | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| | **Structured Output Alternative Backends** | 🟡 Planned | -| **Embedding Models** | 🟡 Planned ([RFC #12249](https://github.com/vllm-project/vllm/issues/12249)) | +| **Embedding Models** | 🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015)) | | **Mamba Models** | 🟡 Planned | -| **Encoder-Decoder Models** | 🟡 Planned | +| **Encoder-Decoder Models** | 🟠 Delayed | | **Request-level Structured Output Backend** | 🔴 Deprecated | | **best_of** | 🔴 Deprecated ([RFC #13361](https://github.com/vllm-project/vllm/issues/13361))| | **Per-Request Logits Processors** | 🔴 Deprecated ([RFC #13360](https://github.com/vllm-project/vllm/pull/13360)) | @@ -63,10 +65,11 @@ This living user guide outlines a few known **important changes and limitations* - **🟢 Functional**: Fully operational, with ongoing optimizations. - **🚧 WIP**: Under active development. - **🟡 Planned**: Scheduled for future implementation (some may have open PRs/RFCs). -- **🔴 Deprecated**: Not planned for v1 unless there is strong demand. +- **🟠 Delayed**: Temporarily dropped in V1 but planned to be re-introduced later. 
+- **🔴 Deprecated**: Not planned for V1 unless there is strong demand. **Note**: vLLM V1’s unified scheduler treats both prompt and output tokens the same -way by using a simple dictionary (e.g., {request_id: num_tokens}) to dynamically +way by using a simple dictionary (e.g., `{request_id: num_tokens}`) to dynamically allocate a fixed token budget per request, enabling features like chunked prefills, prefix caching, and speculative decoding without a strict separation between prefill and decode phases. @@ -140,7 +143,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco and the majority fall into the following categories. V1 support for these models will be added eventually. **Embedding Models** -Instead of having a separate model runner, hidden states processor [RFC #12249](https://github.com/vllm-project/vllm/issues/12249), which is based on global logits processor [RFC #13360](https://github.com/vllm-project/vllm/pull/13360), has been proposed to enable simultaneous generation and embedding using the same engine instance in V1. It is still in the planning stage. +Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work. + +Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this. 
**Mamba Models** Models using selective state-space mechanisms (instead of standard transformer attention) -- GitLab From 4e88723f32f1115130566b31dba0d3c31ab1b13f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 3 Jun 2025 21:42:17 +0800 Subject: [PATCH 133/274] [doc] clarify windows support (#19088) Signed-off-by: youkaichao --- docs/getting_started/installation/gpu.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index 3c983f600..f8a3acef7 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -19,6 +19,9 @@ vLLM is a Python library that supports the following GPU variants. Select your G - OS: Linux - Python: 3.9 -- 3.12 +!!! note + vLLM does not support Windows natively. To run vLLM on Windows, you can use the Windows Subsystem for Linux (WSL) with a compatible Linux distribution, or use some community-maintained forks, e.g. [https://github.com/SystemPanic/vllm-windows](https://github.com/SystemPanic/vllm-windows). 
+ === "NVIDIA CUDA" --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements" -- GitLab From 4e68ae5e59b24fad3865eb34421b36bef4751888 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Jun 2025 22:30:18 +0800 Subject: [PATCH 134/274] [CI/Build] Remove V0 LoRA test (#19066) Signed-off-by: Jee Jee Li --- tests/lora/test_add_lora.py | 21 ++----------------- tests/lora/test_chatglm3_tp.py | 10 --------- tests/lora/test_llama_tp.py | 8 -------- tests/lora/test_lora_functions.py | 34 ++++++++----------------------- tests/lora/test_mixtral.py | 8 -------- tests/lora/test_quant_model.py | 8 -------- tests/lora/test_qwen2vl.py | 8 -------- tests/lora/test_worker.py | 10 --------- 8 files changed, 10 insertions(+), 97 deletions(-) diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index c8b7a5cbf..17347300b 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -6,6 +6,8 @@ import pytest import vllm.envs as env from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.inputs import TextPrompt from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -16,14 +18,6 @@ LORA_RANK = 64 DEFAULT_MAX_LORAS = 4 * 3 -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def get_lora_requests(lora_path) -> list[LoRARequest]: lora_requests: list[LoRARequest] = [ LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path) @@ -88,17 +82,6 @@ async def test_add_lora(chatglm3_lora_files): trust_remote_code=True, enforce_eager=True) - # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` - # environment variable. 
reload vllm.enging.async_llm_engine as - # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the - # env var. - import importlib - - import vllm.engine.async_llm_engine - importlib.reload(vllm.engine.async_llm_engine) - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - # split lora_requests into 3 parts part_size = len(lora_requests) // 3 dummy_run_requests = lora_requests[:part_size] diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index 2c18a115b..cd9526c8b 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,7 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -import pytest - import vllm from vllm.lora.request import LoRARequest @@ -18,14 +16,6 @@ EXPECTED_LORA_OUTPUT = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: prompts = [ PROMPT_TEMPLATE.format(query="How many singers do we have?"), diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 580992dea..54daea5b9 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -33,14 +33,6 @@ EXPECTED_LORA_OUTPUT = [ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index 7ae33a848..fd80f61a5 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -2,26 +2,24 @@ """ Script to test add_lora, remove_lora, pin_lora, list_loras functions. 
""" - -import os - import pytest from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.llm_engine import LLMEngine +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.lora.request import LoRARequest MODEL_PATH = "meta-llama/Llama-2-7b-hf" LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" LORA_RANK = 8 - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass +# @pytest.fixture(autouse=True) +# def v1(run_with_both_engines_lora): +# # Simple autouse wrapper to run both engines for each test +# # This can be promoted up to conftest.py to run for every +# # test in a package +# pass def make_lora_request(lora_id: int): @@ -79,22 +77,6 @@ def test_lora_functions_sync(): @pytest.mark.asyncio async def test_lora_functions_async(): - if os.getenv("VLLM_USE_V1") == "0": - pytest.skip( - reason= - "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions") - - # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1` - # environment variable. reload vllm.enging.async_llm_engine as - # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the - # env var. 
- import importlib - - import vllm.engine.async_llm_engine - importlib.reload(vllm.engine.async_llm_engine) - from vllm.entrypoints.openai.api_server import ( - build_async_engine_client_from_engine_args) - max_loras = 4 engine_args = AsyncEngineArgs(model=MODEL_PATH, enable_lora=True, diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index aea769193..4e77c5559 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -10,14 +10,6 @@ from vllm.platforms import current_platform MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]) -> list[str]: diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 7a76ffb74..43e2975cd 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -37,14 +37,6 @@ else: ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 162714df2..20a1ae67d 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -13,14 +13,6 @@ from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams -@pytest.fixture(autouse=not current_platform.is_cpu()) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @dataclass class TestConfig: model_path: str diff --git a/tests/lora/test_worker.py 
b/tests/lora/test_worker.py index e5ae660af..1a5d52716 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -6,8 +6,6 @@ import tempfile from typing import Union from unittest.mock import patch -import pytest - import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, @@ -18,14 +16,6 @@ from vllm.v1.worker.gpu_worker import Worker as V1Worker from vllm.worker.worker import Worker -@pytest.fixture(autouse=True) -def v1(run_with_both_engines_lora): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): -- GitLab From 476844d44cbf315c6c1e8431946bdecfe9823834 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 3 Jun 2025 15:39:24 +0100 Subject: [PATCH 135/274] Fix underscores in dict keys passed via CLI (#19030) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/test_utils.py | 11 +++++++++++ vllm/utils.py | 13 ++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index dd8777f06..42e0df1ff 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -259,11 +259,18 @@ def test_dict_args(parser): "--model-name=something.something", "--hf-overrides.key1", "val1", + # Test nesting "--hf-overrides.key2.key3", "val2", "--hf-overrides.key2.key4", "val3", + # Test = sign "--hf-overrides.key5=val4", + # Test underscore to dash conversion + "--hf_overrides.key_6", + "val5", + "--hf_overrides.key-7.key_8", + "val6", ] parsed_args = parser.parse_args(args) assert parsed_args.model_name == "something.something" @@ -274,6 +281,10 @@ def test_dict_args(parser): "key4": "val3", }, "key5": "val4", + "key_6": "val5", + "key-7": { + "key_8": "val6", + }, } 
diff --git a/vllm/utils.py b/vllm/utils.py index c879b38d0..b4152e6b2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1456,17 +1456,24 @@ class FlexibleArgumentParser(ArgumentParser): if '--config' in args: args = self._pull_args_from_config(args) + def repl(match: re.Match) -> str: + """Replaces underscores with dashes in the matched string.""" + return match.group(0).replace("_", "-") + + # Everything between the first -- and the first . + pattern = re.compile(r"(?<=--)[^\.]*") + # Convert underscores to dashes and vice versa in argument names processed_args = [] for arg in args: if arg.startswith('--'): if '=' in arg: key, value = arg.split('=', 1) - key = '--' + key[len('--'):].replace('_', '-') + key = pattern.sub(repl, key, count=1) processed_args.append(f'{key}={value}') else: - processed_args.append('--' + - arg[len('--'):].replace('_', '-')) + key = pattern.sub(repl, arg, count=1) + processed_args.append(key) elif arg.startswith('-O') and arg != '-O' and len(arg) == 2: # allow -O flag to be used without space, e.g. 
-O3 processed_args.append('-O') -- GitLab From d81edded69a5534a80785b68cde26c547cfcd4c6 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 3 Jun 2025 17:06:04 +0200 Subject: [PATCH 136/274] [Bugfix] disable processor cache (#19068) Signed-off-by: raushan --- vllm/v1/engine/mm_input_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index fcb90bebd..45fb5cd23 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -34,8 +34,8 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = mm_config is not None and \ - not mm_config.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache = ( + mm_config is not None and mm_config.disable_mm_preprocessor_cache) self.use_cache = not disable_mm_preprocessor_cache self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs) -- GitLab From d00dd65cd4dbc1ebbdbe2cd070ff694e9e9321a2 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Tue, 3 Jun 2025 23:44:34 +0800 Subject: [PATCH 137/274] [Doc] Improve the Pull Request template with key components (#19086) Signed-off-by: Lu Fang --- .github/PULL_REQUEST_TEMPLATE.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 65be771b9..c1d1e07bf 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,6 +1,15 @@ -FILL IN THE PR DESCRIPTION HERE +## Essential Elements of an Effective PR Description Checklist +- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". +- [ ] The test plan, such as providing test command. 
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results -FIX #xxxx (*link existing issues this PR will resolve*) +PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. + +## Purpose + +## Test Plan + +## Test Result **BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) -- GitLab From 4b7817c119e27ad9b1e1930a34006eff9680a457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Tue, 3 Jun 2025 18:15:16 +0200 Subject: [PATCH 138/274] [Misc] Add missing `_Backend` enums (#19081) Signed-off-by: nicklucche --- vllm/platforms/interface.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 5c4f7a2f7..c7a627262 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -47,6 +47,8 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() FLASHINFER = enum.auto() TRITON_MLA = enum.auto() # Supported by V1 + TRITON_MLA_VLLM_V1 = enum.auto() + FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 HPU_ATTN = enum.auto() PALLAS = enum.auto() -- GitLab From d054da1992175787f936d18aead51bef663a0399 Mon Sep 17 00:00:00 2001 From: CYJiang <86391540+googs1025@users.noreply.github.com> Date: Wed, 4 Jun 2025 02:02:07 +0800 Subject: [PATCH 139/274] [Misc] fix: add miss best_of param validation (#18555) Signed-off-by: googs1025 --- vllm/sampling_params.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index dc38daa38..4294465f6 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -389,6 +389,17 @@ class SamplingParams( f"type {type(self.n)}") if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") + if self.best_of is not None: + if not isinstance(self.best_of, int): + raise ValueError( + f"best_of must be an integer, got 
{type(self.best_of)}") + if self.best_of < 1: + raise ValueError( + f"best_of must be at least 1, got {self.best_of}") + if self.best_of < self.n: + raise ValueError( + f"best_of must be greater than or equal to n, " + f"got n={self.n} and best_of={self.best_of}.") if not -2.0 <= self.presence_penalty <= 2.0: raise ValueError("presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}.") -- GitLab From 02f0c7b220422792f5e53de2a7d51d2d3ff2df28 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Tue, 3 Jun 2025 11:20:17 -0700 Subject: [PATCH 140/274] [Misc] Add SPDX-FileCopyrightText (#19100) Signed-off-by: simon-mo --- .buildkite/check-wheel-size.py | 1 + .buildkite/generate_index.py | 1 + .buildkite/lm-eval-harness/conftest.py | 1 + .buildkite/lm-eval-harness/test_lm_eval_correctness.py | 1 + .../scripts/convert-results-json-to-markdown.py | 1 + .buildkite/nightly-benchmarks/scripts/download-tokenizer.py | 1 + .../nightly-benchmarks/scripts/generate-nightly-markdown.py | 1 + .../nightly-benchmarks/scripts/get-lmdeploy-modelname.py | 1 + .../nightly-benchmarks/scripts/summary-nightly-results.py | 1 + benchmarks/backend_request_func.py | 1 + benchmarks/benchmark_dataset.py | 1 + benchmarks/benchmark_latency.py | 1 + benchmarks/benchmark_long_document_qa_throughput.py | 1 + benchmarks/benchmark_prefix_caching.py | 1 + benchmarks/benchmark_prioritization.py | 1 + benchmarks/benchmark_serving.py | 1 + benchmarks/benchmark_serving_structured_output.py | 1 + benchmarks/benchmark_throughput.py | 1 + benchmarks/benchmark_utils.py | 1 + benchmarks/cutlass_benchmarks/sparse_benchmarks.py | 1 + benchmarks/cutlass_benchmarks/utils.py | 1 + benchmarks/cutlass_benchmarks/w8a8_benchmarks.py | 1 + benchmarks/cutlass_benchmarks/weight_shapes.py | 1 + benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py | 1 + benchmarks/disagg_benchmarks/round_robin_proxy.py | 1 + benchmarks/disagg_benchmarks/visualize_benchmark_results.py | 1 + 
benchmarks/fused_kernels/layernorm_rms_benchmarks.py | 1 + benchmarks/kernels/bench_fp8_gemm.py | 1 + benchmarks/kernels/benchmark_aqlm.py | 1 + benchmarks/kernels/benchmark_bitblas.py | 1 + benchmarks/kernels/benchmark_cutlass_fp4_moe.py | 1 + benchmarks/kernels/benchmark_grouped_gemm_cutlass.py | 1 + benchmarks/kernels/benchmark_layernorm.py | 1 + benchmarks/kernels/benchmark_lora.py | 1 + benchmarks/kernels/benchmark_machete.py | 1 + benchmarks/kernels/benchmark_marlin.py | 1 + benchmarks/kernels/benchmark_moe.py | 1 + benchmarks/kernels/benchmark_moe_permute_unpermute.py | 1 + benchmarks/kernels/benchmark_paged_attention.py | 1 + benchmarks/kernels/benchmark_quant.py | 1 + benchmarks/kernels/benchmark_rmsnorm.py | 1 + benchmarks/kernels/benchmark_rope.py | 1 + benchmarks/kernels/benchmark_shapes.py | 1 + benchmarks/kernels/benchmark_w8a8_block_fp8.py | 1 + .../kernels/deepgemm/benchmark_fp8_block_dense_gemm.py | 1 + benchmarks/kernels/graph_machete_bench.py | 1 + benchmarks/kernels/utils.py | 1 + benchmarks/kernels/weight_shapes.py | 1 + benchmarks/overheads/benchmark_hashing.py | 1 + cmake/hipify.py | 1 + csrc/cutlass_extensions/vllm_cutlass_library_extension.py | 1 + csrc/moe/marlin_moe_wna16/generate_kernels.py | 1 + csrc/quantization/gptq_marlin/generate_kernels.py | 1 + csrc/quantization/machete/generate.py | 1 + docs/mkdocs/hooks/generate_examples.py | 1 + docs/mkdocs/hooks/remove_announcement.py | 1 + docs/mkdocs/hooks/url_schemes.py | 1 + examples/offline_inference/audio_language.py | 1 + examples/offline_inference/automatic_prefix_caching.py | 1 + examples/offline_inference/basic/basic.py | 1 + examples/offline_inference/basic/chat.py | 1 + examples/offline_inference/basic/classify.py | 1 + examples/offline_inference/basic/embed.py | 1 + examples/offline_inference/basic/generate.py | 1 + examples/offline_inference/basic/score.py | 1 + examples/offline_inference/batch_llm_inference.py | 1 + examples/offline_inference/chat_with_tools.py | 1 + 
examples/offline_inference/context_extension.py | 1 + examples/offline_inference/data_parallel.py | 1 + .../disaggregated-prefill-v1/decode_example.py | 1 + .../disaggregated-prefill-v1/prefill_example.py | 1 + examples/offline_inference/disaggregated_prefill.py | 1 + examples/offline_inference/eagle.py | 1 + examples/offline_inference/embed_jina_embeddings_v3.py | 1 + examples/offline_inference/embed_matryoshka_fy.py | 1 + examples/offline_inference/encoder_decoder.py | 1 + examples/offline_inference/encoder_decoder_multimodal.py | 1 + examples/offline_inference/llm_engine_example.py | 1 + examples/offline_inference/load_sharded_state.py | 1 + .../offline_inference/lora_with_quantization_inference.py | 1 + examples/offline_inference/metrics.py | 1 + examples/offline_inference/mistral-small.py | 1 + examples/offline_inference/mlpspeculator.py | 1 + examples/offline_inference/multilora_inference.py | 1 + examples/offline_inference/neuron.py | 1 + examples/offline_inference/neuron_eagle.py | 1 + examples/offline_inference/neuron_int8_quantization.py | 1 + examples/offline_inference/neuron_multimodal.py | 1 + examples/offline_inference/neuron_speculation.py | 1 + examples/offline_inference/prefix_caching.py | 1 + examples/offline_inference/prithvi_geospatial_mae.py | 1 + examples/offline_inference/profiling.py | 1 + examples/offline_inference/profiling_tpu/profiling.py | 1 + examples/offline_inference/prompt_embed_inference.py | 1 + examples/offline_inference/qwen2_5_omni/only_thinker.py | 1 + examples/offline_inference/qwen_1m.py | 1 + examples/offline_inference/reproducibility.py | 1 + examples/offline_inference/rlhf.py | 1 + examples/offline_inference/rlhf_colocate.py | 1 + examples/offline_inference/rlhf_utils.py | 1 + examples/offline_inference/save_sharded_state.py | 1 + examples/offline_inference/simple_profiling.py | 1 + examples/offline_inference/structured_outputs.py | 1 + examples/offline_inference/torchrun_example.py | 1 + examples/offline_inference/tpu.py 
| 1 + examples/offline_inference/vision_language.py | 1 + examples/offline_inference/vision_language_embedding.py | 1 + examples/offline_inference/vision_language_multi_image.py | 1 + examples/online_serving/api_client.py | 1 + examples/online_serving/cohere_rerank_client.py | 1 + .../disaggregated_serving/disagg_proxy_demo.py | 1 + examples/online_serving/gradio_openai_chatbot_webserver.py | 1 + examples/online_serving/gradio_webserver.py | 1 + examples/online_serving/jinaai_rerank_client.py | 1 + examples/online_serving/kv_events_subscriber.py | 1 + examples/online_serving/openai_chat_completion_client.py | 1 + .../openai_chat_completion_client_for_multimodal.py | 1 + .../openai_chat_completion_client_with_tools.py | 1 + .../openai_chat_completion_client_with_tools_required.py | 1 + .../openai_chat_completion_structured_outputs.py | 1 + ...nai_chat_completion_structured_outputs_structural_tag.py | 1 + ...nai_chat_completion_structured_outputs_with_reasoning.py | 1 + .../openai_chat_completion_tool_calls_with_reasoning.py | 1 + .../online_serving/openai_chat_completion_with_reasoning.py | 1 + .../openai_chat_completion_with_reasoning_streaming.py | 1 + .../openai_chat_embedding_client_for_multimodal.py | 1 + examples/online_serving/openai_classification_client.py | 1 + examples/online_serving/openai_completion_client.py | 1 + examples/online_serving/openai_cross_encoder_score.py | 1 + examples/online_serving/openai_embedding_client.py | 1 + examples/online_serving/openai_embedding_matryoshka_fy.py | 1 + examples/online_serving/openai_pooling_client.py | 1 + examples/online_serving/openai_transcription_client.py | 1 + examples/online_serving/opentelemetry/dummy_client.py | 1 + .../prompt_embed_inference_with_openai_client.py | 1 + examples/online_serving/ray_serve_deepseek.py | 1 + .../retrieval_augmented_generation_with_langchain.py | 1 + .../retrieval_augmented_generation_with_llamaindex.py | 1 + .../online_serving/streamlit_openai_chatbot_webserver.py | 1 + 
examples/online_serving/utils.py | 1 + examples/others/lmcache/cpu_offload_lmcache.py | 1 + examples/others/lmcache/disagg_prefill_lmcache_v0.py | 1 + .../disagg_prefill_lmcache_v1/disagg_proxy_server.py | 1 + examples/others/lmcache/kv_cache_sharing_lmcache_v1.py | 1 + examples/others/tensorize_vllm_model.py | 1 + find_cuda_init.py | 1 + setup.py | 1 + tests/async_engine/api_server_async_engine.py | 1 + tests/async_engine/conftest.py | 1 + tests/async_engine/test_api_server.py | 1 + tests/async_engine/test_async_llm_engine.py | 1 + tests/async_engine/test_request_tracker.py | 1 + tests/basic_correctness/test_basic_correctness.py | 1 + tests/basic_correctness/test_chunked_prefill.py | 1 + tests/basic_correctness/test_cpu_offload.py | 1 + tests/basic_correctness/test_cumem.py | 1 + tests/basic_correctness/test_preemption.py | 1 + tests/benchmarks/test_latency_cli.py | 1 + tests/benchmarks/test_serve_cli.py | 1 + tests/benchmarks/test_throughput_cli.py | 1 + tests/build_cython.py | 1 + tests/compile/backend.py | 1 + tests/compile/conftest.py | 1 + tests/compile/piecewise/test_full_cudagraph.py | 1 + tests/compile/piecewise/test_simple.py | 1 + tests/compile/piecewise/test_toy_llama.py | 1 + tests/compile/test_async_tp.py | 1 + tests/compile/test_basic_correctness.py | 1 + tests/compile/test_full_graph.py | 1 + tests/compile/test_functionalization.py | 1 + tests/compile/test_fusion.py | 1 + tests/compile/test_pass_manager.py | 1 + tests/compile/test_sequence_parallelism.py | 1 + tests/compile/test_silu_mul_quant_fusion.py | 1 + tests/compile/test_wrapper.py | 1 + tests/conftest.py | 1 + tests/core/block/conftest.py | 1 + tests/core/block/e2e/conftest.py | 1 + tests/core/block/e2e/test_correctness.py | 1 + tests/core/block/e2e/test_correctness_sliding_window.py | 1 + tests/core/block/test_block_manager.py | 1 + tests/core/block/test_block_table.py | 1 + tests/core/block/test_common.py | 1 + tests/core/block/test_cpu_gpu_block_allocator.py | 1 + 
tests/core/block/test_naive_block.py | 1 + tests/core/block/test_prefix_caching_block.py | 1 + tests/core/conftest.py | 1 + tests/core/test_chunked_prefill_scheduler.py | 1 + tests/core/test_num_computed_tokens_update.py | 1 + tests/core/test_scheduler.py | 1 + tests/core/test_scheduler_encoder_decoder.py | 1 + tests/core/test_serialization.py | 1 + tests/core/utils.py | 1 + tests/detokenizer/conftest.py | 1 + tests/detokenizer/test_disable_detokenization.py | 1 + tests/detokenizer/test_stop_checker.py | 1 + tests/detokenizer/test_stop_reason.py | 1 + tests/detokenizer/test_stop_strings.py | 1 + tests/distributed/conftest.py | 1 + tests/distributed/test_ca_buffer_sharing.py | 1 + tests/distributed/test_comm_ops.py | 1 + tests/distributed/test_custom_all_reduce.py | 1 + tests/distributed/test_distributed_oot.py | 1 + tests/distributed/test_events.py | 1 + tests/distributed/test_expert_parallel.py | 1 + tests/distributed/test_multi_node_assignment.py | 1 + tests/distributed/test_pipeline_parallel.py | 1 + tests/distributed/test_pipeline_partition.py | 1 + tests/distributed/test_pp_cudagraph.py | 1 + tests/distributed/test_pynccl.py | 1 + tests/distributed/test_same_node.py | 1 + tests/distributed/test_sequence_parallel.py | 1 + tests/distributed/test_shm_broadcast.py | 1 + tests/distributed/test_torchrun_example.py | 1 + tests/distributed/test_utils.py | 1 + tests/encoder_decoder/test_e2e_correctness.py | 1 + tests/engine/conftest.py | 1 + tests/engine/test_arg_utils.py | 1 + tests/engine/test_computed_prefix_blocks.py | 1 + tests/engine/test_executor.py | 1 + tests/engine/test_multi_step_output_processor.py | 1 + tests/engine/test_multiproc_workers.py | 1 + tests/engine/test_options.py | 1 + tests/engine/test_short_mm_context.py | 1 + tests/entrypoints/conftest.py | 1 + tests/entrypoints/llm/test_accuracy.py | 1 + tests/entrypoints/llm/test_chat.py | 1 + tests/entrypoints/llm/test_collective_rpc.py | 1 + tests/entrypoints/llm/test_encode.py | 1 + 
tests/entrypoints/llm/test_generate.py | 1 + tests/entrypoints/llm/test_generate_multiple_loras.py | 1 + tests/entrypoints/llm/test_gpu_utilization.py | 1 + tests/entrypoints/llm/test_guided_generate.py | 1 + tests/entrypoints/llm/test_lazy_outlines.py | 1 + tests/entrypoints/llm/test_prompt_validation.py | 1 + tests/entrypoints/offline_mode/test_offline_mode.py | 1 + tests/entrypoints/openai/correctness/test_lmeval.py | 1 + tests/entrypoints/openai/correctness/test_mteb.py | 1 + .../correctness/test_transcription_api_correctness.py | 1 + tests/entrypoints/openai/test_async_tokenization.py | 1 + tests/entrypoints/openai/test_audio.py | 1 + tests/entrypoints/openai/test_basic.py | 1 + tests/entrypoints/openai/test_chat.py | 1 + tests/entrypoints/openai/test_chat_echo.py | 1 + tests/entrypoints/openai/test_chat_logit_bias_validation.py | 1 + tests/entrypoints/openai/test_chat_template.py | 1 + tests/entrypoints/openai/test_chat_with_tool_reasoning.py | 1 + tests/entrypoints/openai/test_chunked_prompt.py | 1 + tests/entrypoints/openai/test_classification.py | 1 + tests/entrypoints/openai/test_cli_args.py | 1 + tests/entrypoints/openai/test_completion.py | 1 + .../openai/test_completion_with_function_calling.py | 1 + .../openai/test_completion_with_prompt_embeds.py | 1 + tests/entrypoints/openai/test_embedding.py | 1 + tests/entrypoints/openai/test_embedding_dimensions.py | 1 + tests/entrypoints/openai/test_encoder_decoder.py | 1 + tests/entrypoints/openai/test_lora_adapters.py | 1 + tests/entrypoints/openai/test_lora_resolvers.py | 1 + tests/entrypoints/openai/test_metrics.py | 1 + tests/entrypoints/openai/test_models.py | 1 + tests/entrypoints/openai/test_oot_registration.py | 1 + tests/entrypoints/openai/test_openai_schema.py | 1 + tests/entrypoints/openai/test_pooling.py | 1 + tests/entrypoints/openai/test_prompt_validation.py | 1 + tests/entrypoints/openai/test_rerank.py | 1 + tests/entrypoints/openai/test_return_tokens_as_ids.py | 1 + 
tests/entrypoints/openai/test_root_path.py | 1 + tests/entrypoints/openai/test_run_batch.py | 1 + tests/entrypoints/openai/test_score.py | 1 + tests/entrypoints/openai/test_serving_chat.py | 1 + tests/entrypoints/openai/test_serving_models.py | 1 + tests/entrypoints/openai/test_shutdown.py | 1 + tests/entrypoints/openai/test_sleep.py | 1 + tests/entrypoints/openai/test_tensorizer_entrypoint.py | 1 + tests/entrypoints/openai/test_tokenization.py | 1 + tests/entrypoints/openai/test_transcription_validation.py | 1 + tests/entrypoints/openai/test_truncation.py | 1 + tests/entrypoints/openai/test_video.py | 1 + tests/entrypoints/openai/test_vision.py | 1 + tests/entrypoints/openai/test_vision_embedding.py | 1 + .../openai/tool_parsers/test_llama4_pythonic_tool_parser.py | 1 + .../openai/tool_parsers/test_pythonic_tool_parser.py | 1 + tests/entrypoints/openai/tool_parsers/utils.py | 1 + tests/entrypoints/test_api_server_process_manager.py | 1 + tests/entrypoints/test_chat_utils.py | 1 + tests/entrypoints/test_ssl_cert_refresher.py | 1 + tests/fastsafetensors_loader/test_fastsafetensors_loader.py | 1 + tests/fastsafetensors_loader/test_weight_utils.py | 1 + tests/kernels/allclose_default.py | 1 + tests/kernels/attention/conftest.py | 1 + tests/kernels/attention/test_attention.py | 1 + tests/kernels/attention/test_attention_selector.py | 1 + tests/kernels/attention/test_blocksparse_attention.py | 1 + tests/kernels/attention/test_cache.py | 1 + tests/kernels/attention/test_cascade_flash_attn.py | 1 + tests/kernels/attention/test_encoder_decoder_attn.py | 1 + tests/kernels/attention/test_flash_attn.py | 1 + tests/kernels/attention/test_flashinfer.py | 1 + tests/kernels/attention/test_flashmla.py | 1 + tests/kernels/attention/test_lightning_attn.py | 1 + tests/kernels/attention/test_merge_attn_states.py | 1 + tests/kernels/attention/test_mha_attn.py | 1 + tests/kernels/attention/test_mla_decode_cpu.py | 1 + tests/kernels/attention/test_prefix_prefill.py | 1 + 
tests/kernels/attention/test_rocm_attention_selector.py | 1 + tests/kernels/attention/test_triton_decode_attention.py | 1 + tests/kernels/attention/test_triton_unified_attention.py | 1 + tests/kernels/core/test_activation.py | 1 + tests/kernels/core/test_fused_quant_layernorm.py | 1 + tests/kernels/core/test_layernorm.py | 1 + tests/kernels/core/test_opcheck.py | 1 + tests/kernels/core/test_permute_cols.py | 1 + tests/kernels/core/test_pos_encoding.py | 1 + tests/kernels/core/test_rotary_embedding.py | 1 + tests/kernels/core/test_uva.py | 1 + tests/kernels/mamba/test_causal_conv1d.py | 1 + tests/kernels/mamba/test_mamba_mixer2.py | 1 + tests/kernels/mamba/test_mamba_ssm.py | 1 + tests/kernels/mamba/test_mamba_ssm_ssd.py | 1 + tests/kernels/moe/test_batched_moe.py | 1 + tests/kernels/moe/test_cutlass_moe.py | 1 + tests/kernels/moe/test_moe.py | 1 + tests/kernels/moe/test_moe_permute_unpermute.py | 1 + tests/kernels/moe/test_nvfp4_moe.py | 1 + tests/kernels/moe/test_pplx_moe.py | 1 + tests/kernels/moe/test_rocm_aiter_topk.py | 1 + tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 1 + tests/kernels/quant_utils.py | 1 + tests/kernels/quantization/nvfp4_utils.py | 1 + tests/kernels/quantization/test_allspark_gemm.py | 1 + tests/kernels/quantization/test_aqlm.py | 1 + tests/kernels/quantization/test_awq.py | 1 + tests/kernels/quantization/test_awq_triton.py | 1 + tests/kernels/quantization/test_block_fp8.py | 1 + tests/kernels/quantization/test_block_int8.py | 1 + tests/kernels/quantization/test_cutlass_2of4_sparse.py | 1 + tests/kernels/quantization/test_cutlass_scaled_mm.py | 1 + tests/kernels/quantization/test_fp8_quant.py | 1 + tests/kernels/quantization/test_ggml.py | 1 + tests/kernels/quantization/test_gguf.py | 1 + tests/kernels/quantization/test_gptq.py | 1 + tests/kernels/quantization/test_int8_kernel.py | 1 + tests/kernels/quantization/test_int8_quant.py | 1 + tests/kernels/quantization/test_machete_mm.py | 1 + tests/kernels/quantization/test_marlin_gemm.py | 1 + 
tests/kernels/quantization/test_nvfp4_quant.py | 1 + tests/kernels/quantization/test_nvfp4_scaled_mm.py | 1 + tests/kernels/quantization/test_rocm_skinny_gemms.py | 1 + tests/kernels/quantization/test_triton_scaled_mm.py | 1 + tests/kernels/test_cutlass_mla_decode.py | 1 + tests/kernels/test_fused_quant_activation.py | 1 + tests/kernels/test_triton_flash_attention.py | 1 + tests/kernels/utils.py | 1 + tests/kv_transfer/test_disagg.py | 1 + tests/kv_transfer/test_lookup_buffer.py | 1 + tests/kv_transfer/test_module.py | 1 + tests/kv_transfer/test_send_recv.py | 1 + tests/lora/conftest.py | 1 + tests/lora/test_add_lora.py | 1 + tests/lora/test_baichuan.py | 1 + tests/lora/test_chatglm3_tp.py | 1 + tests/lora/test_layers.py | 1 + tests/lora/test_llama_tp.py | 1 + tests/lora/test_lora_allowed_token_ids.py | 1 + tests/lora/test_lora_checkpoints.py | 1 + tests/lora/test_lora_functions.py | 1 + tests/lora/test_lora_huggingface.py | 1 + tests/lora/test_lora_manager.py | 1 + tests/lora/test_minicpmv_tp.py | 1 + tests/lora/test_mixtral.py | 1 + tests/lora/test_peft_helper.py | 1 + tests/lora/test_phi.py | 1 + tests/lora/test_punica_ops.py | 1 + tests/lora/test_quant_model.py | 1 + tests/lora/test_qwen2vl.py | 1 + tests/lora/test_resolver.py | 1 + tests/lora/test_tokenizer_group.py | 1 + tests/lora/test_transfomers_model.py | 1 + tests/lora/test_utils.py | 1 + tests/lora/test_worker.py | 1 + tests/lora/utils.py | 1 + tests/metrics/test_metrics.py | 1 + tests/mistral_tool_use/conftest.py | 1 + tests/mistral_tool_use/test_mistral_tool_calls.py | 1 + tests/mistral_tool_use/utils.py | 1 + tests/model_executor/conftest.py | 1 + tests/model_executor/test_enabled_custom_ops.py | 1 + tests/model_executor/test_guided_processors.py | 1 + tests/model_executor/test_logits_processor.py | 1 + tests/model_executor/test_model_load_with_params.py | 1 + tests/model_executor/test_weight_utils.py | 1 + tests/models/language/generation/test_bart.py | 1 + 
tests/models/language/generation/test_common.py | 1 + tests/models/language/generation/test_granite.py | 1 + tests/models/language/generation/test_granitemoehybrid.py | 1 + tests/models/language/generation/test_hybrid.py | 1 + tests/models/language/generation/test_mistral.py | 1 + tests/models/language/generation/test_phimoe.py | 1 + tests/models/language/pooling/embed_utils.py | 1 + tests/models/language/pooling/mteb_utils.py | 1 + tests/models/language/pooling/test_baai.py | 1 + tests/models/language/pooling/test_classification.py | 1 + tests/models/language/pooling/test_embedding.py | 1 + tests/models/language/pooling/test_gritlm.py | 1 + tests/models/language/pooling/test_gte.py | 1 + tests/models/language/pooling/test_jina.py | 1 + tests/models/language/pooling/test_nomic.py | 1 + tests/models/language/pooling/test_nomic_max_model_len.py | 1 + tests/models/language/pooling/test_scoring.py | 1 + .../models/language/pooling/test_snowflake_arctic_embed.py | 1 + tests/models/language/pooling/test_truncation_control.py | 1 + tests/models/multimodal/generation/test_common.py | 1 + tests/models/multimodal/generation/test_florence2.py | 1 + tests/models/multimodal/generation/test_granite_speech.py | 1 + tests/models/multimodal/generation/test_interleaved.py | 1 + tests/models/multimodal/generation/test_mllama.py | 1 + tests/models/multimodal/generation/test_phi4mm.py | 1 + tests/models/multimodal/generation/test_pixtral.py | 1 + tests/models/multimodal/generation/test_qwen2_vl.py | 1 + tests/models/multimodal/generation/test_ultravox.py | 1 + tests/models/multimodal/generation/test_whisper.py | 1 + tests/models/multimodal/generation/vlm_utils/builders.py | 1 + .../multimodal/generation/vlm_utils/case_filtering.py | 1 + tests/models/multimodal/generation/vlm_utils/core.py | 1 + .../models/multimodal/generation/vlm_utils/custom_inputs.py | 1 + tests/models/multimodal/generation/vlm_utils/model_utils.py | 1 + tests/models/multimodal/generation/vlm_utils/runners.py | 1 + 
tests/models/multimodal/generation/vlm_utils/types.py | 1 + tests/models/multimodal/pooling/test_dse_qwen2_vl.py | 1 + tests/models/multimodal/pooling/test_intern_vit.py | 1 + tests/models/multimodal/pooling/test_llava_next.py | 1 + tests/models/multimodal/pooling/test_phi3v.py | 1 + tests/models/multimodal/processing/test_common.py | 1 + tests/models/multimodal/processing/test_h2ovl.py | 1 + tests/models/multimodal/processing/test_idefics3.py | 1 + tests/models/multimodal/processing/test_internvl.py | 1 + tests/models/multimodal/processing/test_llama4.py | 1 + tests/models/multimodal/processing/test_llava_next.py | 1 + tests/models/multimodal/processing/test_llava_onevision.py | 1 + tests/models/multimodal/processing/test_minimax_vl_01.py | 1 + tests/models/multimodal/processing/test_mllama.py | 1 + tests/models/multimodal/processing/test_phi3v.py | 1 + tests/models/multimodal/processing/test_phi4mm.py | 1 + tests/models/multimodal/processing/test_qwen2_vl.py | 1 + tests/models/multimodal/processing/test_smolvlm.py | 1 + tests/models/quantization/test_aqlm.py | 1 + tests/models/quantization/test_awq.py | 1 + tests/models/quantization/test_bitblas.py | 1 + tests/models/quantization/test_fp8.py | 1 + tests/models/quantization/test_gguf.py | 1 + tests/models/quantization/test_gptq_bitblas.py | 1 + tests/models/quantization/test_gptq_marlin.py | 1 + tests/models/quantization/test_gptq_marlin_24.py | 1 + tests/models/quantization/test_modelopt.py | 1 + tests/models/quantization/test_mxfp4.py | 1 + tests/models/quantization/test_nvfp4.py | 1 + tests/models/registry.py | 1 + tests/models/test_initialization.py | 1 + tests/models/test_oot_registration.py | 1 + tests/models/test_registry.py | 1 + tests/models/test_transformers.py | 1 + tests/models/test_utils.py | 1 + tests/models/test_vision.py | 1 + tests/models/utils.py | 1 + tests/mq_llm_engine/conftest.py | 1 + tests/mq_llm_engine/test_abort.py | 1 + tests/mq_llm_engine/test_error_handling.py | 1 + 
tests/mq_llm_engine/test_load.py | 1 + tests/mq_llm_engine/utils.py | 1 + tests/multi_step/test_correctness_async_llm.py | 1 + tests/multi_step/test_correctness_llm.py | 1 + tests/multimodal/test_hasher.py | 1 + tests/multimodal/test_image.py | 1 + tests/multimodal/test_inputs.py | 1 + tests/multimodal/test_processing.py | 1 + tests/multimodal/test_utils.py | 1 + tests/multimodal/test_video.py | 1 + tests/multimodal/utils.py | 1 + tests/neuron/1_core/test_activation.py | 1 + tests/neuron/1_core/test_block_table.py | 1 + tests/neuron/1_core/test_cache.py | 1 + tests/neuron/1_core/test_layernorm.py | 1 + tests/neuron/1_core/test_logits_processor.py | 1 + tests/neuron/1_core/test_neuron_model_runner.py | 1 + tests/neuron/1_core/test_neuron_quant.py | 1 + tests/neuron/1_core/test_prefix_prefill.py | 1 + tests/neuron/1_core/test_rotary_embedding.py | 1 + tests/neuron/2_core/test_comm_ops.py | 1 + tests/neuron/2_core/test_eagle.py | 1 + tests/neuron/2_core/test_mistral.py | 1 + tests/neuron/2_core/test_multi_lora.py | 1 + tests/plugins/lora_resolvers/test_filesystem_resolver.py | 1 + tests/plugins/vllm_add_dummy_model/setup.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/__init__.py | 1 + .../vllm_add_dummy_model/my_gemma_embedding.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py | 1 + .../vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py | 1 + tests/plugins/vllm_add_dummy_platform/setup.py | 1 + .../vllm_add_dummy_platform/__init__.py | 1 + .../vllm_add_dummy_platform/dummy_attention_backend.py | 1 + .../vllm_add_dummy_platform/dummy_platform.py | 1 + tests/plugins_tests/conftest.py | 1 + tests/plugins_tests/test_platform_plugins.py | 1 + tests/plugins_tests/test_scheduler_plugins.py | 1 + tests/prefix_caching/test_disable_sliding_window.py | 1 + tests/prefix_caching/test_prefix_caching.py | 1 + tests/prompt_adapter/test_bloom.py | 1 + tests/prompt_adapter/test_multi_adapter_inference.py | 1 + tests/prompt_adapter/test_pa_lora.py | 1 + 
tests/quantization/test_auto_round.py | 1 + tests/quantization/test_bitsandbytes.py | 1 + tests/quantization/test_compressed_tensors.py | 1 + tests/quantization/test_configs.py | 1 + tests/quantization/test_cpu_offload.py | 3 ++- tests/quantization/test_experts_int8.py | 1 + tests/quantization/test_fp8.py | 1 + tests/quantization/test_gptq_dynamic.py | 1 + tests/quantization/test_ipex_quant.py | 1 + tests/quantization/test_lm_head.py | 1 + tests/quantization/test_ptpc_fp8.py | 1 + tests/quantization/test_quark.py | 1 + tests/quantization/test_register_quantization_config.py | 1 + tests/quantization/test_torchao.py | 1 + tests/quantization/utils.py | 1 + tests/reasoning/test_deepseekr1_reasoning_parser.py | 1 + tests/reasoning/test_granite_reasoning_parser.py | 1 + tests/reasoning/test_qwen3_reasoning_parser.py | 1 + tests/reasoning/utils.py | 1 + .../test_runai_model_streamer_loader.py | 1 + tests/runai_model_streamer_test/test_weight_utils.py | 1 + tests/samplers/test_beam_search.py | 1 + tests/samplers/test_ignore_eos.py | 1 + tests/samplers/test_logits_processor.py | 1 + tests/samplers/test_logprobs.py | 1 + tests/samplers/test_no_bad_words.py | 1 + tests/samplers/test_ranks.py | 1 + tests/samplers/test_rejection_sampler.py | 1 + tests/samplers/test_sampler.py | 1 + tests/samplers/test_seeded_generate.py | 1 + tests/samplers/test_typical_acceptance_sampler.py | 1 + tests/spec_decode/conftest.py | 1 + tests/spec_decode/e2e/conftest.py | 1 + tests/spec_decode/e2e/test_compatibility.py | 1 + tests/spec_decode/e2e/test_eagle_correctness.py | 1 + tests/spec_decode/e2e/test_integration.py | 1 + tests/spec_decode/e2e/test_integration_dist_tp2.py | 1 + tests/spec_decode/e2e/test_integration_dist_tp4.py | 1 + tests/spec_decode/e2e/test_logprobs.py | 1 + tests/spec_decode/e2e/test_medusa_correctness.py | 1 + tests/spec_decode/e2e/test_mlp_correctness.py | 1 + tests/spec_decode/e2e/test_mtp_correctness.py | 1 + tests/spec_decode/e2e/test_multistep_correctness.py | 1 + 
tests/spec_decode/e2e/test_ngram_correctness.py | 1 + tests/spec_decode/e2e/test_seed.py | 1 + tests/spec_decode/test_batch_expansion.py | 1 + tests/spec_decode/test_dynamic_spec_decode.py | 1 + tests/spec_decode/test_memory_usage.py | 1 + tests/spec_decode/test_metrics.py | 1 + tests/spec_decode/test_multi_step_worker.py | 1 + tests/spec_decode/test_ngram_worker.py | 1 + tests/spec_decode/test_scorer.py | 1 + tests/spec_decode/test_spec_decode_worker.py | 1 + tests/spec_decode/test_utils.py | 1 + tests/spec_decode/utils.py | 1 + tests/standalone_tests/lazy_imports.py | 1 + tests/tensorizer_loader/conftest.py | 1 + tests/tensorizer_loader/test_tensorizer.py | 1 + tests/test_cache_block_hashing.py | 1 + tests/test_config.py | 1 + tests/test_embedded_commit.py | 1 + tests/test_inputs.py | 1 + tests/test_logger.py | 1 + tests/test_outputs.py | 1 + tests/test_regression.py | 1 + tests/test_sampling_params.py | 1 + tests/test_scalartype.py | 1 + tests/test_seed_behavior.py | 3 ++- tests/test_sequence.py | 1 + tests/test_sharded_state_loader.py | 1 + tests/test_triton_utils.py | 1 + tests/test_utils.py | 1 + tests/test_version.py | 1 + tests/test_vllm_port.py | 1 + tests/tokenization/test_cached_tokenizer.py | 1 + tests/tokenization/test_detokenize.py | 1 + tests/tokenization/test_get_eos.py | 1 + tests/tokenization/test_mistral_tokenizer.py | 1 + tests/tokenization/test_tokenizer.py | 1 + tests/tokenization/test_tokenizer_group.py | 1 + tests/tokenization/test_tokenizer_registry.py | 1 + tests/tool_use/conftest.py | 1 + tests/tool_use/test_chat_completion_request_validations.py | 1 + tests/tool_use/test_chat_completions.py | 1 + tests/tool_use/test_jamba_tool_parser.py | 1 + tests/tool_use/test_parallel_tool_calls.py | 1 + tests/tool_use/test_tool_calls.py | 1 + tests/tool_use/test_tool_choice_required.py | 1 + tests/tool_use/utils.py | 1 + tests/tpu/lora/test_lora.py | 1 + tests/tpu/test_compilation.py | 1 + tests/tpu/test_custom_dispatcher.py | 1 + 
tests/tpu/test_moe_pallas.py | 1 + tests/tpu/test_quantization_accuracy.py | 1 + tests/tracing/test_tracing.py | 1 + tests/utils.py | 1 + tests/v1/core/test_kv_cache_utils.py | 1 + tests/v1/core/test_prefix_caching.py | 1 + tests/v1/core/test_scheduler.py | 1 + tests/v1/core/test_scheduler_e2e.py | 1 + tests/v1/core/test_specialized_manager.py | 1 + tests/v1/e2e/test_cascade_attention.py | 1 + tests/v1/e2e/test_correctness_sliding_window.py | 1 + tests/v1/e2e/test_spec_decode.py | 1 + tests/v1/engine/conftest.py | 1 + tests/v1/engine/test_async_llm.py | 1 + tests/v1/engine/test_engine_args.py | 1 + tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + tests/v1/engine/test_llm_engine.py | 1 + tests/v1/engine/test_output_processor.py | 1 + tests/v1/engine/utils.py | 1 + tests/v1/entrypoints/conftest.py | 1 + tests/v1/entrypoints/llm/test_struct_output_generate.py | 1 + tests/v1/entrypoints/openai/test_chat_completion.py | 1 + tests/v1/entrypoints/openai/test_completion.py | 1 + tests/v1/entrypoints/openai/test_multi_api_servers.py | 1 + tests/v1/kv_connector/nixl_integration/test_accuracy.py | 1 + tests/v1/kv_connector/nixl_integration/test_edge_cases.py | 1 + tests/v1/kv_connector/nixl_integration/toy_proxy_server.py | 1 + tests/v1/kv_connector/unit/test_multi_connector.py | 1 + tests/v1/kv_connector/unit/test_nixl_connector.py | 1 + tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py | 1 + tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py | 1 + tests/v1/kv_connector/unit/utils.py | 1 + tests/v1/metrics/test_ray_metrics.py | 1 + tests/v1/sample/test_logprobs.py | 1 + tests/v1/sample/test_logprobs_e2e.py | 1 + tests/v1/sample/test_rejection_sampler.py | 1 + tests/v1/sample/test_sampler.py | 1 + tests/v1/sample/test_sampling_params_e2e.py | 1 + tests/v1/sample/test_topk_topp_sampler.py | 1 + tests/v1/sample/utils.py | 1 + tests/v1/shutdown/test_delete.py | 1 + tests/v1/shutdown/test_forward_error.py | 1 + 
tests/v1/shutdown/test_processor_error.py | 1 + tests/v1/shutdown/test_startup_error.py | 1 + tests/v1/shutdown/utils.py | 1 + tests/v1/spec_decode/test_eagle.py | 1 + tests/v1/spec_decode/test_max_len.py | 1 + tests/v1/spec_decode/test_ngram.py | 1 + tests/v1/structured_output/test_utils.py | 1 + tests/v1/test_async_llm_dp.py | 1 + tests/v1/test_metrics_reader.py | 1 + tests/v1/test_oracle.py | 1 + tests/v1/test_serial_utils.py | 1 + tests/v1/test_utils.py | 1 + tests/v1/tpu/test_basic.py | 1 + tests/v1/tpu/test_mha_attn.py | 1 + tests/v1/tpu/test_multimodal.py | 1 + tests/v1/tpu/test_pallas.py | 1 + tests/v1/tpu/test_perf.py | 1 + tests/v1/tpu/test_sampler.py | 1 + tests/v1/tpu/test_topk_topp_sampler.py | 1 + tests/v1/tpu/worker/test_tpu_model_runner.py | 1 + tests/v1/worker/test_gpu_input_batch.py | 1 + tests/v1/worker/test_gpu_model_runner.py | 1 + tests/vllm_test_utils/setup.py | 1 + tests/vllm_test_utils/vllm_test_utils/__init__.py | 1 + tests/vllm_test_utils/vllm_test_utils/blame.py | 1 + tests/vllm_test_utils/vllm_test_utils/monitor.py | 1 + tests/weight_loading/test_weight_loading.py | 1 + tests/worker/conftest.py | 1 + tests/worker/test_encoder_decoder_model_runner.py | 1 + tests/worker/test_model_input.py | 1 + tests/worker/test_model_runner.py | 1 + tests/worker/test_profile.py | 1 + tests/worker/test_swap.py | 1 + tools/check_spdx_header.py | 5 ++++- tools/check_triton_import.py | 1 + tools/enforce_regex_import.py | 1 + tools/profiler/print_layerwise_table.py | 1 + tools/profiler/visualize_layerwise_profile.py | 1 + tools/report_build_time_ninja.py | 1 + use_existing_torch.py | 1 + vllm/__init__.py | 1 + vllm/_custom_ops.py | 1 + vllm/_ipex_ops.py | 1 + vllm/adapter_commons/layers.py | 1 + vllm/adapter_commons/models.py | 1 + vllm/adapter_commons/request.py | 1 + vllm/adapter_commons/utils.py | 1 + vllm/adapter_commons/worker_manager.py | 1 + vllm/assets/audio.py | 1 + vllm/assets/base.py | 1 + vllm/assets/image.py | 1 + vllm/assets/video.py | 1 + 
vllm/attention/__init__.py | 1 + vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 1 + vllm/attention/backends/cpu_mla.py | 1 + vllm/attention/backends/dual_chunk_flash_attn.py | 1 + vllm/attention/backends/flash_attn.py | 1 + vllm/attention/backends/flashinfer.py | 1 + vllm/attention/backends/flashmla.py | 1 + vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/backends/ipex_attn.py | 1 + vllm/attention/backends/mla/common.py | 1 + vllm/attention/backends/pallas.py | 1 + vllm/attention/backends/placeholder_attn.py | 1 + vllm/attention/backends/rocm_aiter_mla.py | 1 + vllm/attention/backends/rocm_flash_attn.py | 1 + vllm/attention/backends/torch_sdpa.py | 1 + vllm/attention/backends/triton_mla.py | 1 + vllm/attention/backends/utils.py | 1 + vllm/attention/backends/xformers.py | 1 + vllm/attention/layer.py | 1 + .../blocksparse_attention/blocksparse_attention_kernel.py | 1 + vllm/attention/ops/blocksparse_attention/interface.py | 1 + vllm/attention/ops/blocksparse_attention/utils.py | 1 + vllm/attention/ops/chunked_prefill_paged_decode.py | 1 + vllm/attention/ops/flashmla.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/attention/ops/ipex_attn.py | 1 + vllm/attention/ops/merge_attn_states.py | 1 + vllm/attention/ops/nki_flash_attn.py | 1 + vllm/attention/ops/paged_attn.py | 1 + vllm/attention/ops/prefix_prefill.py | 1 + vllm/attention/ops/rocm_aiter_mla.py | 1 + vllm/attention/ops/rocm_aiter_paged_attn.py | 1 + vllm/attention/ops/triton_decode_attention.py | 1 + vllm/attention/ops/triton_flash_attention.py | 1 + vllm/attention/ops/triton_merge_attn_states.py | 1 + vllm/attention/ops/triton_unified_attention.py | 1 + vllm/attention/selector.py | 1 + vllm/attention/utils/fa_utils.py | 1 + vllm/beam_search.py | 1 + vllm/benchmarks/datasets.py | 1 + vllm/benchmarks/endpoint_request_func.py | 1 + vllm/benchmarks/latency.py | 1 + vllm/benchmarks/serve.py | 1 + vllm/benchmarks/throughput.py | 1 + vllm/benchmarks/utils.py | 
1 + vllm/collect_env.py | 6 ++++-- vllm/compilation/activation_quant_fusion.py | 1 + vllm/compilation/backends.py | 1 + vllm/compilation/base_piecewise_backend.py | 1 + vllm/compilation/collective_fusion.py | 1 + vllm/compilation/compiler_interface.py | 1 + vllm/compilation/counter.py | 1 + vllm/compilation/cuda_piecewise_backend.py | 1 + vllm/compilation/decorators.py | 1 + vllm/compilation/fix_functionalization.py | 1 + vllm/compilation/fusion.py | 1 + vllm/compilation/fx_utils.py | 1 + vllm/compilation/inductor_pass.py | 1 + vllm/compilation/monitor.py | 1 + vllm/compilation/multi_output_match.py | 1 + vllm/compilation/noop_elimination.py | 1 + vllm/compilation/pass_manager.py | 1 + vllm/compilation/sequence_parallelism.py | 1 + vllm/compilation/torch25_custom_graph_pass.py | 1 + vllm/compilation/vllm_inductor_pass.py | 1 + vllm/compilation/wrapper.py | 1 + vllm/config.py | 1 + vllm/connections.py | 1 + vllm/core/block/block_table.py | 1 + vllm/core/block/common.py | 1 + vllm/core/block/cpu_gpu_block_allocator.py | 1 + vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block/utils.py | 1 + vllm/core/block_manager.py | 1 + vllm/core/evictor.py | 1 + vllm/core/interfaces.py | 1 + vllm/core/placeholder_block_space_manager.py | 1 + vllm/core/scheduler.py | 1 + vllm/device_allocator/cumem.py | 1 + vllm/distributed/__init__.py | 1 + vllm/distributed/communication_op.py | 1 + vllm/distributed/device_communicators/all2all.py | 1 + .../device_communicators/base_device_communicator.py | 1 + vllm/distributed/device_communicators/cpu_communicator.py | 1 + vllm/distributed/device_communicators/cuda_communicator.py | 1 + vllm/distributed/device_communicators/cuda_wrapper.py | 1 + vllm/distributed/device_communicators/custom_all_reduce.py | 1 + .../device_communicators/custom_all_reduce_utils.py | 1 + vllm/distributed/device_communicators/hpu_communicator.py | 1 + 
.../distributed/device_communicators/neuron_communicator.py | 1 + vllm/distributed/device_communicators/pynccl.py | 1 + vllm/distributed/device_communicators/pynccl_wrapper.py | 1 + vllm/distributed/device_communicators/shm_broadcast.py | 1 + vllm/distributed/device_communicators/tpu_communicator.py | 1 + vllm/distributed/device_communicators/xpu_communicator.py | 1 + vllm/distributed/kv_events.py | 1 + vllm/distributed/kv_transfer/__init__.py | 1 + vllm/distributed/kv_transfer/kv_connector/base.py | 1 + vllm/distributed/kv_transfer/kv_connector/factory.py | 1 + .../kv_transfer/kv_connector/lmcache_connector.py | 1 + .../kv_transfer/kv_connector/mooncake_store_connector.py | 1 + .../kv_transfer/kv_connector/simple_connector.py | 1 + vllm/distributed/kv_transfer/kv_connector/utils.py | 1 + vllm/distributed/kv_transfer/kv_connector/v1/__init__.py | 1 + vllm/distributed/kv_transfer/kv_connector/v1/base.py | 1 + .../kv_transfer/kv_connector/v1/lmcache_connector.py | 1 + .../kv_transfer/kv_connector/v1/multi_connector.py | 1 + .../kv_transfer/kv_connector/v1/nixl_connector.py | 1 + .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 1 + vllm/distributed/kv_transfer/kv_connector_agent.py | 1 + vllm/distributed/kv_transfer/kv_lookup_buffer/base.py | 1 + .../kv_transfer/kv_lookup_buffer/mooncake_store.py | 1 + .../kv_transfer/kv_lookup_buffer/simple_buffer.py | 1 + vllm/distributed/kv_transfer/kv_pipe/base.py | 1 + vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py | 1 + vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py | 1 + vllm/distributed/kv_transfer/kv_transfer_state.py | 1 + vllm/distributed/parallel_state.py | 1 + vllm/distributed/utils.py | 1 + vllm/engine/arg_utils.py | 1 + vllm/engine/async_llm_engine.py | 1 + vllm/engine/async_timeout.py | 1 + vllm/engine/llm_engine.py | 1 + vllm/engine/metrics.py | 1 + vllm/engine/metrics_types.py | 1 + vllm/engine/multiprocessing/__init__.py | 1 + vllm/engine/multiprocessing/client.py | 1 + 
vllm/engine/multiprocessing/engine.py | 1 + vllm/engine/output_processor/interfaces.py | 1 + vllm/engine/output_processor/multi_step.py | 1 + vllm/engine/output_processor/single_step.py | 1 + vllm/engine/output_processor/stop_checker.py | 1 + vllm/engine/output_processor/util.py | 1 + vllm/engine/protocol.py | 1 + vllm/entrypoints/api_server.py | 1 + vllm/entrypoints/chat_utils.py | 1 + vllm/entrypoints/cli/benchmark/base.py | 1 + vllm/entrypoints/cli/benchmark/latency.py | 1 + vllm/entrypoints/cli/benchmark/main.py | 1 + vllm/entrypoints/cli/benchmark/serve.py | 1 + vllm/entrypoints/cli/benchmark/throughput.py | 1 + vllm/entrypoints/cli/collect_env.py | 1 + vllm/entrypoints/cli/main.py | 1 + vllm/entrypoints/cli/openai.py | 1 + vllm/entrypoints/cli/run_batch.py | 1 + vllm/entrypoints/cli/serve.py | 1 + vllm/entrypoints/cli/types.py | 1 + vllm/entrypoints/launcher.py | 1 + vllm/entrypoints/llm.py | 1 + vllm/entrypoints/logger.py | 1 + vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 1 + vllm/entrypoints/openai/logits_processors.py | 1 + vllm/entrypoints/openai/protocol.py | 1 + vllm/entrypoints/openai/run_batch.py | 1 + vllm/entrypoints/openai/serving_chat.py | 1 + vllm/entrypoints/openai/serving_classification.py | 1 + vllm/entrypoints/openai/serving_completion.py | 1 + vllm/entrypoints/openai/serving_embedding.py | 1 + vllm/entrypoints/openai/serving_engine.py | 1 + vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/serving_pooling.py | 1 + vllm/entrypoints/openai/serving_score.py | 1 + vllm/entrypoints/openai/serving_tokenization.py | 1 + vllm/entrypoints/openai/serving_transcription.py | 1 + vllm/entrypoints/openai/tool_parsers/__init__.py | 1 + .../entrypoints/openai/tool_parsers/abstract_tool_parser.py | 1 + .../openai/tool_parsers/deepseekv3_tool_parser.py | 1 + .../openai/tool_parsers/granite_20b_fc_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py | 1 + 
vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py | 1 + .../openai/tool_parsers/internlm2_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py | 1 + .../openai/tool_parsers/llama4_pythonic_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py | 1 + .../entrypoints/openai/tool_parsers/phi4mini_tool_parser.py | 1 + .../entrypoints/openai/tool_parsers/pythonic_tool_parser.py | 1 + vllm/entrypoints/openai/tool_parsers/utils.py | 1 + vllm/entrypoints/score_utils.py | 1 + vllm/entrypoints/ssl.py | 1 + vllm/entrypoints/utils.py | 1 + vllm/env_override.py | 1 + vllm/envs.py | 1 + vllm/executor/executor_base.py | 1 + vllm/executor/mp_distributed_executor.py | 1 + vllm/executor/msgspec_utils.py | 1 + vllm/executor/multiproc_worker_utils.py | 1 + vllm/executor/ray_distributed_executor.py | 1 + vllm/executor/ray_utils.py | 1 + vllm/executor/uniproc_executor.py | 1 + vllm/forward_context.py | 1 + vllm/inputs/__init__.py | 1 + vllm/inputs/data.py | 1 + vllm/inputs/parse.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 1 + vllm/jsontree.py | 1 + vllm/logger.py | 1 + vllm/logging_utils/__init__.py | 1 + vllm/logging_utils/dump_input.py | 1 + vllm/logging_utils/formatter.py | 1 + vllm/logits_process.py | 1 + vllm/lora/fully_sharded_layers.py | 1 + vllm/lora/layers.py | 1 + vllm/lora/lora.py | 1 + vllm/lora/models.py | 1 + vllm/lora/ops/torch_ops/__init__.py | 1 + vllm/lora/ops/torch_ops/lora_ops.py | 1 + vllm/lora/ops/triton_ops/__init__.py | 1 + vllm/lora/ops/triton_ops/kernel_utils.py | 1 + vllm/lora/ops/triton_ops/lora_expand_op.py | 1 + vllm/lora/ops/triton_ops/lora_kernel_metadata.py | 1 + vllm/lora/ops/triton_ops/lora_shrink_op.py | 1 + vllm/lora/ops/triton_ops/utils.py | 1 + vllm/lora/ops/xla_ops/__init__.py | 1 + vllm/lora/ops/xla_ops/lora_ops.py | 1 + vllm/lora/peft_helper.py | 1 + vllm/lora/punica_wrapper/__init__.py | 1 + 
vllm/lora/punica_wrapper/punica_base.py | 1 + vllm/lora/punica_wrapper/punica_cpu.py | 1 + vllm/lora/punica_wrapper/punica_gpu.py | 1 + vllm/lora/punica_wrapper/punica_hpu.py | 1 + vllm/lora/punica_wrapper/punica_selector.py | 1 + vllm/lora/punica_wrapper/punica_tpu.py | 1 + vllm/lora/punica_wrapper/utils.py | 1 + vllm/lora/request.py | 1 + vllm/lora/resolver.py | 1 + vllm/lora/utils.py | 1 + vllm/lora/worker_manager.py | 1 + vllm/model_executor/__init__.py | 1 + vllm/model_executor/custom_op.py | 1 + vllm/model_executor/guided_decoding/__init__.py | 1 + vllm/model_executor/guided_decoding/guidance_decoding.py | 1 + .../guided_decoding/guidance_logits_processors.py | 1 + vllm/model_executor/guided_decoding/guided_fields.py | 1 + .../guided_decoding/lm_format_enforcer_decoding.py | 1 + vllm/model_executor/guided_decoding/outlines_decoding.py | 1 + .../guided_decoding/outlines_logits_processors.py | 1 + vllm/model_executor/guided_decoding/utils.py | 1 + vllm/model_executor/guided_decoding/xgrammar_decoding.py | 1 + vllm/model_executor/layers/activation.py | 1 + vllm/model_executor/layers/fused_moe/__init__.py | 1 + vllm/model_executor/layers/fused_moe/cutlass_moe.py | 1 + vllm/model_executor/layers/fused_moe/deep_gemm_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_marlin_moe.py | 1 + vllm/model_executor/layers/fused_moe/fused_moe.py | 1 + vllm/model_executor/layers/fused_moe/layer.py | 1 + vllm/model_executor/layers/fused_moe/modular_kernel.py | 1 + .../model_executor/layers/fused_moe/moe_align_block_size.py | 1 + vllm/model_executor/layers/fused_moe/moe_pallas.py | 1 + .../layers/fused_moe/moe_permute_unpermute.py | 1 + vllm/model_executor/layers/fused_moe/moe_torch_iterative.py | 1 + .../layers/fused_moe/pplx_prepare_finalize.py | 1 + vllm/model_executor/layers/fused_moe/prepare_finalize.py | 1 + .../model_executor/layers/fused_moe/rocm_aiter_fused_moe.py | 1 + 
.../model_executor/layers/fused_moe/triton_deep_gemm_moe.py | 1 + vllm/model_executor/layers/fused_moe/utils.py | 1 + vllm/model_executor/layers/layernorm.py | 1 + vllm/model_executor/layers/lightning_attn.py | 1 + vllm/model_executor/layers/linear.py | 1 + vllm/model_executor/layers/logits_processor.py | 1 + vllm/model_executor/layers/mamba/mamba2_metadata.py | 1 + vllm/model_executor/layers/mamba/mamba_mixer.py | 1 + vllm/model_executor/layers/mamba/mamba_mixer2.py | 1 + vllm/model_executor/layers/mamba/ops/causal_conv1d.py | 1 + vllm/model_executor/layers/mamba/ops/mamba_ssm.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_bmm.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_combined.py | 1 + vllm/model_executor/layers/mamba/ops/ssd_state_passing.py | 1 + vllm/model_executor/layers/pooler.py | 1 + vllm/model_executor/layers/quantization/__init__.py | 1 + vllm/model_executor/layers/quantization/aqlm.py | 1 + vllm/model_executor/layers/quantization/auto_round.py | 1 + vllm/model_executor/layers/quantization/awq.py | 1 + vllm/model_executor/layers/quantization/awq_marlin.py | 1 + vllm/model_executor/layers/quantization/awq_triton.py | 1 + vllm/model_executor/layers/quantization/base_config.py | 1 + vllm/model_executor/layers/quantization/bitblas.py | 1 + vllm/model_executor/layers/quantization/bitsandbytes.py | 1 + .../quantization/compressed_tensors/compressed_tensors.py | 1 + .../compressed_tensors/compressed_tensors_moe.py | 1 + .../quantization/compressed_tensors/schemes/__init__.py | 1 + .../compressed_tensors/schemes/compressed_tensors_24.py | 1 + .../compressed_tensors/schemes/compressed_tensors_scheme.py | 1 + .../schemes/compressed_tensors_w4a16_24.py | 1 + .../schemes/compressed_tensors_w4a16_nvfp4.py | 1 + .../schemes/compressed_tensors_w8a16_fp8.py | 1 + .../schemes/compressed_tensors_w8a8_fp8.py | 1 + 
.../schemes/compressed_tensors_w8a8_int8.py | 1 + .../compressed_tensors/schemes/compressed_tensors_wNa16.py | 1 + .../quantization/compressed_tensors/triton_scaled_mm.py | 1 + .../layers/quantization/compressed_tensors/utils.py | 1 + vllm/model_executor/layers/quantization/deepspeedfp.py | 1 + vllm/model_executor/layers/quantization/experts_int8.py | 1 + vllm/model_executor/layers/quantization/fbgemm_fp8.py | 1 + vllm/model_executor/layers/quantization/fp8.py | 1 + vllm/model_executor/layers/quantization/gguf.py | 1 + vllm/model_executor/layers/quantization/gptq.py | 1 + vllm/model_executor/layers/quantization/gptq_bitblas.py | 1 + vllm/model_executor/layers/quantization/gptq_marlin.py | 1 + vllm/model_executor/layers/quantization/gptq_marlin_24.py | 1 + vllm/model_executor/layers/quantization/hqq_marlin.py | 1 + vllm/model_executor/layers/quantization/ipex_quant.py | 1 + .../quantization/kernels/mixed_precision/MPLinearKernel.py | 1 + .../layers/quantization/kernels/mixed_precision/__init__.py | 1 + .../layers/quantization/kernels/mixed_precision/allspark.py | 1 + .../layers/quantization/kernels/mixed_precision/bitblas.py | 1 + .../layers/quantization/kernels/mixed_precision/exllama.py | 1 + .../layers/quantization/kernels/mixed_precision/machete.py | 1 + .../layers/quantization/kernels/mixed_precision/marlin.py | 1 + .../quantization/kernels/scaled_mm/ScaledMMLinearKernel.py | 1 + .../layers/quantization/kernels/scaled_mm/__init__.py | 1 + .../layers/quantization/kernels/scaled_mm/aiter.py | 1 + .../layers/quantization/kernels/scaled_mm/cutlass.py | 1 + .../layers/quantization/kernels/scaled_mm/triton.py | 1 + .../layers/quantization/kernels/scaled_mm/xla.py | 1 + vllm/model_executor/layers/quantization/kv_cache.py | 1 + vllm/model_executor/layers/quantization/marlin.py | 1 + vllm/model_executor/layers/quantization/modelopt.py | 1 + vllm/model_executor/layers/quantization/moe_wna16.py | 1 + vllm/model_executor/layers/quantization/neuron_quant.py | 1 + 
vllm/model_executor/layers/quantization/ptpc_fp8.py | 1 + vllm/model_executor/layers/quantization/qqq.py | 1 + vllm/model_executor/layers/quantization/quark/quark.py | 1 + vllm/model_executor/layers/quantization/quark/quark_moe.py | 1 + .../layers/quantization/quark/schemes/__init__.py | 1 + .../layers/quantization/quark/schemes/quark_scheme.py | 1 + .../layers/quantization/quark/schemes/quark_w4a4_mxfp4.py | 1 + .../layers/quantization/quark/schemes/quark_w8a8_fp8.py | 1 + .../layers/quantization/quark/schemes/quark_w8a8_int8.py | 1 + vllm/model_executor/layers/quantization/quark/utils.py | 1 + vllm/model_executor/layers/quantization/schema.py | 1 + vllm/model_executor/layers/quantization/torchao.py | 1 + vllm/model_executor/layers/quantization/tpu_int8.py | 1 + vllm/model_executor/layers/quantization/utils/__init__.py | 1 + .../layers/quantization/utils/allspark_utils.py | 1 + .../layers/quantization/utils/bitblas_utils.py | 1 + vllm/model_executor/layers/quantization/utils/fp8_utils.py | 1 + vllm/model_executor/layers/quantization/utils/gptq_utils.py | 1 + vllm/model_executor/layers/quantization/utils/int8_utils.py | 1 + .../model_executor/layers/quantization/utils/layer_utils.py | 1 + .../layers/quantization/utils/machete_utils.py | 1 + .../layers/quantization/utils/marlin_utils.py | 1 + .../layers/quantization/utils/marlin_utils_fp4.py | 1 + .../layers/quantization/utils/marlin_utils_fp8.py | 1 + .../layers/quantization/utils/marlin_utils_test.py | 1 + .../layers/quantization/utils/marlin_utils_test_24.py | 1 + .../layers/quantization/utils/marlin_utils_test_qqq.py | 1 + .../model_executor/layers/quantization/utils/mxfp4_utils.py | 1 + .../layers/quantization/utils/nvfp4_emulation_utils.py | 1 + .../model_executor/layers/quantization/utils/quant_utils.py | 1 + vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 1 + vllm/model_executor/layers/rejection_sampler.py | 1 + vllm/model_executor/layers/resampler.py | 1 + 
vllm/model_executor/layers/rotary_embedding.py | 1 + vllm/model_executor/layers/sampler.py | 1 + vllm/model_executor/layers/spec_decode_base_sampler.py | 1 + vllm/model_executor/layers/typical_acceptance_sampler.py | 1 + vllm/model_executor/layers/utils.py | 1 + vllm/model_executor/layers/vocab_parallel_embedding.py | 1 + vllm/model_executor/model_loader/__init__.py | 1 + vllm/model_executor/model_loader/base_loader.py | 1 + vllm/model_executor/model_loader/bitsandbytes_loader.py | 1 + vllm/model_executor/model_loader/default_loader.py | 1 + vllm/model_executor/model_loader/dummy_loader.py | 1 + vllm/model_executor/model_loader/gguf_loader.py | 1 + vllm/model_executor/model_loader/neuron.py | 1 + vllm/model_executor/model_loader/neuronx_distributed.py | 1 + vllm/model_executor/model_loader/runai_streamer_loader.py | 1 + vllm/model_executor/model_loader/sharded_state_loader.py | 1 + vllm/model_executor/model_loader/tensorizer.py | 1 + vllm/model_executor/model_loader/tensorizer_loader.py | 1 + vllm/model_executor/model_loader/utils.py | 1 + vllm/model_executor/model_loader/weight_utils.py | 1 + vllm/model_executor/models/__init__.py | 1 + vllm/model_executor/models/adapters.py | 1 + vllm/model_executor/models/aimv2.py | 1 + vllm/model_executor/models/arctic.py | 1 + vllm/model_executor/models/aria.py | 1 + vllm/model_executor/models/aya_vision.py | 3 ++- vllm/model_executor/models/baichuan.py | 1 + vllm/model_executor/models/bamba.py | 1 + vllm/model_executor/models/bart.py | 1 + vllm/model_executor/models/bert.py | 1 + vllm/model_executor/models/bert_with_rope.py | 1 + vllm/model_executor/models/blip.py | 1 + vllm/model_executor/models/blip2.py | 1 + vllm/model_executor/models/bloom.py | 1 + vllm/model_executor/models/chameleon.py | 1 + vllm/model_executor/models/chatglm.py | 1 + vllm/model_executor/models/clip.py | 1 + vllm/model_executor/models/commandr.py | 1 + vllm/model_executor/models/constant_size_cache.py | 1 + vllm/model_executor/models/dbrx.py | 1 + 
vllm/model_executor/models/deepseek.py | 1 + vllm/model_executor/models/deepseek_mtp.py | 1 + vllm/model_executor/models/deepseek_v2.py | 1 + vllm/model_executor/models/deepseek_vl2.py | 1 + vllm/model_executor/models/eagle.py | 1 + vllm/model_executor/models/exaone.py | 1 + vllm/model_executor/models/fairseq2_llama.py | 1 + vllm/model_executor/models/falcon.py | 1 + vllm/model_executor/models/falcon_h1.py | 1 + vllm/model_executor/models/florence2.py | 1 + vllm/model_executor/models/fuyu.py | 1 + vllm/model_executor/models/gemma.py | 1 + vllm/model_executor/models/gemma2.py | 1 + vllm/model_executor/models/gemma3.py | 1 + vllm/model_executor/models/gemma3_mm.py | 1 + vllm/model_executor/models/glm.py | 1 + vllm/model_executor/models/glm4.py | 1 + vllm/model_executor/models/glm4v.py | 1 + vllm/model_executor/models/gpt2.py | 1 + vllm/model_executor/models/gpt_bigcode.py | 1 + vllm/model_executor/models/gpt_j.py | 1 + vllm/model_executor/models/gpt_neox.py | 1 + vllm/model_executor/models/granite.py | 1 + vllm/model_executor/models/granite_speech.py | 1 + vllm/model_executor/models/granitemoe.py | 1 + vllm/model_executor/models/granitemoehybrid.py | 1 + vllm/model_executor/models/granitemoeshared.py | 1 + vllm/model_executor/models/gritlm.py | 1 + vllm/model_executor/models/grok1.py | 1 + vllm/model_executor/models/h2ovl.py | 1 + vllm/model_executor/models/idefics2_vision_model.py | 1 + vllm/model_executor/models/idefics3.py | 1 + vllm/model_executor/models/interfaces.py | 1 + vllm/model_executor/models/interfaces_base.py | 1 + vllm/model_executor/models/intern_vit.py | 1 + vllm/model_executor/models/internlm2.py | 1 + vllm/model_executor/models/internlm2_ve.py | 1 + vllm/model_executor/models/internvl.py | 1 + vllm/model_executor/models/jais.py | 1 + vllm/model_executor/models/jamba.py | 1 + vllm/model_executor/models/kimi_vl.py | 1 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llama4.py | 1 + vllm/model_executor/models/llama_eagle.py | 1 + 
vllm/model_executor/models/llama_eagle3.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/llava_next.py | 1 + vllm/model_executor/models/llava_next_video.py | 1 + vllm/model_executor/models/llava_onevision.py | 1 + vllm/model_executor/models/mamba.py | 1 + vllm/model_executor/models/mamba2.py | 1 + vllm/model_executor/models/mamba_cache.py | 1 + vllm/model_executor/models/medusa.py | 1 + vllm/model_executor/models/mimo.py | 1 + vllm/model_executor/models/mimo_mtp.py | 1 + vllm/model_executor/models/minicpm.py | 1 + vllm/model_executor/models/minicpm3.py | 1 + vllm/model_executor/models/minicpm_eagle.py | 1 + vllm/model_executor/models/minicpmo.py | 1 + vllm/model_executor/models/minicpmv.py | 1 + vllm/model_executor/models/minimax_cache.py | 1 + vllm/model_executor/models/minimax_text_01.py | 1 + vllm/model_executor/models/minimax_vl_01.py | 1 + vllm/model_executor/models/mistral3.py | 1 + vllm/model_executor/models/mixtral.py | 1 + vllm/model_executor/models/mixtral_quant.py | 1 + vllm/model_executor/models/mllama.py | 1 + vllm/model_executor/models/mllama4.py | 1 + vllm/model_executor/models/mlp_speculator.py | 1 + vllm/model_executor/models/modernbert.py | 1 + vllm/model_executor/models/module_mapping.py | 1 + vllm/model_executor/models/molmo.py | 1 + vllm/model_executor/models/moonvit.py | 1 + vllm/model_executor/models/mpt.py | 1 + vllm/model_executor/models/nemotron.py | 1 + vllm/model_executor/models/nemotron_nas.py | 1 + vllm/model_executor/models/nvlm_d.py | 1 + vllm/model_executor/models/olmo.py | 1 + vllm/model_executor/models/olmo2.py | 1 + vllm/model_executor/models/olmoe.py | 1 + vllm/model_executor/models/opt.py | 1 + vllm/model_executor/models/orion.py | 1 + vllm/model_executor/models/ovis.py | 1 + vllm/model_executor/models/paligemma.py | 1 + vllm/model_executor/models/persimmon.py | 1 + vllm/model_executor/models/phi.py | 1 + vllm/model_executor/models/phi3.py | 1 + vllm/model_executor/models/phi3_small.py | 1 + 
vllm/model_executor/models/phi3v.py | 1 + vllm/model_executor/models/phi4mm.py | 1 + vllm/model_executor/models/phi4mm_audio.py | 1 + vllm/model_executor/models/phi4mm_utils.py | 1 + vllm/model_executor/models/phimoe.py | 1 + vllm/model_executor/models/pixtral.py | 1 + vllm/model_executor/models/plamo2.py | 1 + vllm/model_executor/models/prithvi_geospatial_mae.py | 1 + vllm/model_executor/models/qwen.py | 1 + vllm/model_executor/models/qwen2.py | 1 + vllm/model_executor/models/qwen2_5_omni_thinker.py | 1 + vllm/model_executor/models/qwen2_5_vl.py | 1 + vllm/model_executor/models/qwen2_audio.py | 1 + vllm/model_executor/models/qwen2_moe.py | 1 + vllm/model_executor/models/qwen2_rm.py | 1 + vllm/model_executor/models/qwen2_vl.py | 1 + vllm/model_executor/models/qwen3.py | 1 + vllm/model_executor/models/qwen3_moe.py | 1 + vllm/model_executor/models/qwen_vl.py | 1 + vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/roberta.py | 1 + vllm/model_executor/models/siglip.py | 1 + vllm/model_executor/models/skyworkr1v.py | 1 + vllm/model_executor/models/smolvlm.py | 1 + vllm/model_executor/models/solar.py | 1 + vllm/model_executor/models/stablelm.py | 1 + vllm/model_executor/models/starcoder2.py | 1 + vllm/model_executor/models/telechat2.py | 1 + vllm/model_executor/models/teleflm.py | 1 + vllm/model_executor/models/transformers.py | 1 + vllm/model_executor/models/ultravox.py | 1 + vllm/model_executor/models/utils.py | 1 + vllm/model_executor/models/vision.py | 1 + vllm/model_executor/models/whisper.py | 1 + vllm/model_executor/models/zamba2.py | 1 + vllm/model_executor/parameter.py | 1 + vllm/model_executor/pooling_metadata.py | 1 + vllm/model_executor/sampling_metadata.py | 1 + vllm/model_executor/utils.py | 1 + vllm/multimodal/__init__.py | 1 + vllm/multimodal/audio.py | 1 + vllm/multimodal/base.py | 1 + vllm/multimodal/hasher.py | 1 + vllm/multimodal/image.py | 1 + vllm/multimodal/inputs.py | 1 + vllm/multimodal/parse.py | 1 + 
vllm/multimodal/processing.py | 1 + vllm/multimodal/profiling.py | 1 + vllm/multimodal/registry.py | 1 + vllm/multimodal/utils.py | 1 + vllm/multimodal/video.py | 1 + vllm/outputs.py | 1 + vllm/platforms/__init__.py | 1 + vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 1 + vllm/platforms/neuron.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/plugins/__init__.py | 1 + vllm/plugins/lora_resolvers/filesystem_resolver.py | 1 + vllm/pooling_params.py | 1 + vllm/profiler/layerwise_profile.py | 1 + vllm/profiler/utils.py | 1 + vllm/prompt_adapter/layers.py | 1 + vllm/prompt_adapter/models.py | 1 + vllm/prompt_adapter/request.py | 1 + vllm/prompt_adapter/utils.py | 1 + vllm/prompt_adapter/worker_manager.py | 1 + vllm/reasoning/__init__.py | 1 + vllm/reasoning/abs_reasoning_parsers.py | 1 + vllm/reasoning/deepseek_r1_reasoning_parser.py | 1 + vllm/reasoning/granite_reasoning_parser.py | 1 + vllm/reasoning/qwen3_reasoning_parser.py | 1 + vllm/sampling_params.py | 1 + vllm/scalar_type.py | 1 + vllm/scripts.py | 1 + vllm/sequence.py | 1 + vllm/spec_decode/batch_expansion.py | 1 + vllm/spec_decode/draft_model_runner.py | 1 + vllm/spec_decode/interfaces.py | 1 + vllm/spec_decode/medusa_worker.py | 1 + vllm/spec_decode/metrics.py | 1 + vllm/spec_decode/mlp_speculator_worker.py | 1 + vllm/spec_decode/mqa_scorer.py | 1 + vllm/spec_decode/multi_step_worker.py | 1 + vllm/spec_decode/ngram_worker.py | 1 + vllm/spec_decode/proposer_worker_base.py | 1 + vllm/spec_decode/smaller_tp_proposer_worker.py | 1 + vllm/spec_decode/spec_decode_worker.py | 1 + vllm/spec_decode/target_model_runner.py | 1 + vllm/spec_decode/top1_proposer.py | 1 + vllm/spec_decode/util.py | 1 + vllm/test_utils.py | 1 + vllm/third_party/pynvml.py | 1 + vllm/tracing.py | 1 + vllm/transformers_utils/__init__.py | 1 + vllm/transformers_utils/chat_templates/__init__.py | 1 + 
vllm/transformers_utils/chat_templates/registry.py | 1 + vllm/transformers_utils/config.py | 1 + vllm/transformers_utils/configs/__init__.py | 1 + vllm/transformers_utils/configs/arctic.py | 1 + vllm/transformers_utils/configs/chatglm.py | 1 + vllm/transformers_utils/configs/cohere2.py | 1 + vllm/transformers_utils/configs/dbrx.py | 1 + vllm/transformers_utils/configs/deepseek_vl2.py | 1 + vllm/transformers_utils/configs/eagle.py | 1 + vllm/transformers_utils/configs/exaone.py | 1 + vllm/transformers_utils/configs/falcon.py | 1 + vllm/transformers_utils/configs/h2ovl.py | 1 + vllm/transformers_utils/configs/internvl.py | 1 + vllm/transformers_utils/configs/jais.py | 1 + vllm/transformers_utils/configs/kimi_vl.py | 1 + vllm/transformers_utils/configs/medusa.py | 1 + vllm/transformers_utils/configs/minimax_text_01.py | 1 + vllm/transformers_utils/configs/minimax_vl_01.py | 1 + vllm/transformers_utils/configs/mllama.py | 1 + vllm/transformers_utils/configs/mlp_speculator.py | 1 + vllm/transformers_utils/configs/moonvit.py | 1 + vllm/transformers_utils/configs/mpt.py | 1 + vllm/transformers_utils/configs/nemotron.py | 1 + vllm/transformers_utils/configs/nvlm_d.py | 1 + vllm/transformers_utils/configs/ovis.py | 1 + vllm/transformers_utils/configs/skyworkr1v.py | 1 + vllm/transformers_utils/configs/solar.py | 1 + vllm/transformers_utils/configs/telechat2.py | 1 + vllm/transformers_utils/configs/ultravox.py | 1 + vllm/transformers_utils/detokenizer.py | 1 + vllm/transformers_utils/detokenizer_utils.py | 1 + vllm/transformers_utils/processor.py | 1 + vllm/transformers_utils/processors/__init__.py | 1 + vllm/transformers_utils/processors/deepseek_vl2.py | 1 + vllm/transformers_utils/processors/ovis.py | 1 + vllm/transformers_utils/s3_utils.py | 1 + vllm/transformers_utils/tokenizer.py | 1 + vllm/transformers_utils/tokenizer_base.py | 1 + vllm/transformers_utils/tokenizer_group.py | 1 + vllm/transformers_utils/tokenizers/__init__.py | 1 + 
vllm/transformers_utils/tokenizers/mistral.py | 1 + vllm/transformers_utils/utils.py | 1 + vllm/triton_utils/__init__.py | 1 + vllm/triton_utils/importing.py | 1 + vllm/usage/usage_lib.py | 1 + vllm/utils.py | 1 + vllm/v1/attention/backends/flash_attn.py | 1 + vllm/v1/attention/backends/flashinfer.py | 1 + vllm/v1/attention/backends/mla/common.py | 1 + vllm/v1/attention/backends/mla/flashmla.py | 1 + vllm/v1/attention/backends/mla/rocm_aiter_mla.py | 1 + vllm/v1/attention/backends/mla/triton_mla.py | 1 + vllm/v1/attention/backends/pallas.py | 1 + vllm/v1/attention/backends/triton_attn.py | 1 + vllm/v1/attention/backends/utils.py | 1 + vllm/v1/core/block_pool.py | 1 + vllm/v1/core/encoder_cache_manager.py | 1 + vllm/v1/core/kv_cache_manager.py | 1 + vllm/v1/core/kv_cache_utils.py | 1 + vllm/v1/core/sched/interface.py | 1 + vllm/v1/core/sched/output.py | 1 + vllm/v1/core/sched/scheduler.py | 1 + vllm/v1/core/sched/utils.py | 1 + vllm/v1/core/single_type_kv_cache_manager.py | 1 + vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 1 + vllm/v1/engine/coordinator.py | 1 + vllm/v1/engine/core.py | 1 + vllm/v1/engine/core_client.py | 1 + vllm/v1/engine/detokenizer.py | 1 + vllm/v1/engine/exceptions.py | 1 + vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/logprobs.py | 1 + vllm/v1/engine/mm_input_cache.py | 1 + vllm/v1/engine/output_processor.py | 1 + vllm/v1/engine/parallel_sampling.py | 1 + vllm/v1/engine/processor.py | 1 + vllm/v1/executor/abstract.py | 1 + vllm/v1/executor/multiproc_executor.py | 1 + vllm/v1/executor/ray_distributed_executor.py | 1 + vllm/v1/kv_cache_interface.py | 1 + vllm/v1/metrics/loggers.py | 1 + vllm/v1/metrics/prometheus.py | 1 + vllm/v1/metrics/ray_wrappers.py | 1 + vllm/v1/metrics/reader.py | 1 + vllm/v1/metrics/stats.py | 1 + vllm/v1/outputs.py | 1 + vllm/v1/request.py | 1 + vllm/v1/sample/metadata.py | 1 + vllm/v1/sample/ops/bad_words.py | 1 + vllm/v1/sample/ops/penalties.py | 1 + vllm/v1/sample/ops/topk_topp_sampler.py | 1 + 
vllm/v1/sample/rejection_sampler.py | 1 + vllm/v1/sample/sampler.py | 1 + vllm/v1/sample/tpu/metadata.py | 1 + vllm/v1/sample/tpu/sampler.py | 1 + vllm/v1/serial_utils.py | 1 + vllm/v1/spec_decode/eagle.py | 1 + vllm/v1/spec_decode/medusa.py | 1 + vllm/v1/spec_decode/metadata.py | 1 + vllm/v1/spec_decode/metrics.py | 1 + vllm/v1/spec_decode/ngram_proposer.py | 1 + vllm/v1/spec_decode/utils.py | 1 + vllm/v1/structured_output/__init__.py | 1 + vllm/v1/structured_output/backend_guidance.py | 1 + vllm/v1/structured_output/backend_types.py | 1 + vllm/v1/structured_output/backend_xgrammar.py | 1 + vllm/v1/structured_output/request.py | 1 + vllm/v1/structured_output/utils.py | 1 + vllm/v1/utils.py | 1 + vllm/v1/worker/block_table.py | 1 + vllm/v1/worker/gpu_input_batch.py | 1 + vllm/v1/worker/gpu_model_runner.py | 1 + vllm/v1/worker/gpu_worker.py | 1 + vllm/v1/worker/lora_model_runner_mixin.py | 1 + vllm/v1/worker/tpu_model_runner.py | 1 + vllm/v1/worker/tpu_worker.py | 1 + vllm/v1/worker/utils.py | 1 + vllm/v1/worker/worker_base.py | 1 + vllm/version.py | 1 + vllm/worker/cache_engine.py | 1 + vllm/worker/cpu_enc_dec_model_runner.py | 1 + vllm/worker/cpu_model_runner.py | 1 + vllm/worker/cpu_pooling_model_runner.py | 1 + vllm/worker/cpu_worker.py | 1 + vllm/worker/enc_dec_model_runner.py | 1 + vllm/worker/hpu_model_runner.py | 1 + vllm/worker/hpu_worker.py | 1 + vllm/worker/model_runner.py | 1 + vllm/worker/model_runner_base.py | 1 + vllm/worker/multi_step_hpu_worker.py | 1 + vllm/worker/multi_step_model_runner.py | 1 + vllm/worker/multi_step_neuron_model_runner.py | 1 + vllm/worker/multi_step_neuronx_distributed_model_runner.py | 1 + vllm/worker/multi_step_tpu_worker.py | 1 + vllm/worker/multi_step_worker.py | 1 + vllm/worker/neuron_model_runner.py | 1 + vllm/worker/neuron_worker.py | 1 + vllm/worker/neuronx_distributed_model_runner.py | 1 + vllm/worker/pooling_model_runner.py | 1 + vllm/worker/tpu_model_runner.py | 1 + vllm/worker/tpu_worker.py | 1 + 
vllm/worker/utils.py | 1 + vllm/worker/worker.py | 1 + vllm/worker/worker_base.py | 1 + vllm/worker/xpu_model_runner.py | 1 + vllm/worker/xpu_worker.py | 1 + 1432 files changed, 1441 insertions(+), 6 deletions(-) diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index e29881fcb..68aff793a 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py index 270663c41..7045d8810 100644 --- a/.buildkite/generate_index.py +++ b/.buildkite/generate_index.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py index 769d2efda..c0d60dd53 100644 --- a/.buildkite/lm-eval-harness/conftest.py +++ b/.buildkite/lm-eval-harness/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import pytest diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index 409a6ca82..930adfaf3 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LM eval harness on model to compare vs HF baseline computed offline. 
Configs are found in configs/$MODEL.yaml diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 7f2a2d8dc..a4f1638c1 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 778a3a8d8..8532ff7ef 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 10a7a2f5a..053fd52c3 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index e5f179a0f..ddea1d2b1 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from lmdeploy.serve.openai.api_client import APIClient diff --git 
a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 2a7b37991..fb3b9d5e3 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 85e6eda7f..ddb38e304 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import io import json diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index d86bf045e..80a9246aa 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. 
Each dataset subclass of BenchmarkDataset must implement sample diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index de62bf5c6..c06857247 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py index 109624c87..00869fa94 100644 --- a/benchmarks/benchmark_long_document_qa_throughput.py +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Offline benchmark to test the long document QA throughput. diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index ffaa80357..3e4704f0b 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the efficiency of prefix caching. 
diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index a05dd24de..5496703f2 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline prioritization.""" import argparse diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 6bd9f1b49..81428fb7d 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index 6a50f47d3..3848ebda9 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput with structured outputs. 
On the server side, run one of the following commands: diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 7a13babda..d19753d40 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index b0c4fca92..272b7979c 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py index da258f98e..9ec270bbd 100644 --- a/benchmarks/cutlass_benchmarks/sparse_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/sparse_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/cutlass_benchmarks/utils.py b/benchmarks/cutlass_benchmarks/utils.py index 7e9f5a7fc..b4f3c6bf9 100644 --- a/benchmarks/cutlass_benchmarks/utils.py +++ b/benchmarks/cutlass_benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Cutlass bench utils from collections.abc import Iterable diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 08e93837f..cec422e8d 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import 
copy diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index d31b623a1..25b96ef56 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index fce156e1c..f62d8102e 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index fd19b40bf..b1df2f255 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import itertools diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 484d0cb3c..74fa56d07 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index 37a9173a1..901524214 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py 
@@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle as pkl import time diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/bench_fp8_gemm.py index 36d03e40e..640a33419 100644 --- a/benchmarks/kernels/bench_fp8_gemm.py +++ b/benchmarks/kernels/bench_fp8_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy import itertools diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index e9934aa47..42de062b0 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/benchmarks/kernels/benchmark_bitblas.py b/benchmarks/kernels/benchmark_bitblas.py index d40ab70ec..97ee06034 100644 --- a/benchmarks/kernels/benchmark_bitblas.py +++ b/benchmarks/kernels/benchmark_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py index d39d8a6e3..3383fb788 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_fp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe kernel. 
The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 2197bceab..1be83b84e 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index f21ca97ee..69978ec6b 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py index 6c1284930..3d38d4b35 100644 --- a/benchmarks/kernels/benchmark_lora.py +++ b/benchmarks/kernels/benchmark_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index f8f1db047..0f896f187 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b17baff2e..9ea1fddae 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import 
torch.utils.benchmark as benchmark diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2f766085..6cb55b359 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index 333986fdf..dba1f3943 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from typing import Any, TypedDict diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 54f05e723..7e0376c18 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import time diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 2463dfebe..6ab26f5f1 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/benchmarks/kernels/benchmark_rmsnorm.py b/benchmarks/kernels/benchmark_rmsnorm.py index d720083b6..4cf633a81 100644 --- a/benchmarks/kernels/benchmark_rmsnorm.py +++ b/benchmarks/kernels/benchmark_rmsnorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from typing import Optional, Union diff 
--git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 944024ca3..b81baf17a 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate from typing import Optional diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py index 70190ba24..18c459c31 100644 --- a/benchmarks/kernels/benchmark_shapes.py +++ b/benchmarks/kernels/benchmark_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project WEIGHT_SHAPES = { "ideal": [[4 * 256 * 32, 256 * 32]], diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index 6315c1ee6..4fcdbadd6 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from sglang quantization/tuning_block_wise_kernel.py import argparse diff --git a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py index e37764825..e67ce0545 100644 --- a/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py +++ b/benchmarks/kernels/deepgemm/benchmark_fp8_block_dense_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # fmt: off # ruff: noqa: E501 import time diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index 0c86e4072..9a4da0ef5 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pickle diff --git a/benchmarks/kernels/utils.py b/benchmarks/kernels/utils.py index 877a29fee..4bbb36bb4 100644 --- a/benchmarks/kernels/utils.py +++ b/benchmarks/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from collections.abc import Iterable diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index afe159ddd..a27f02394 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Weight Shapes are in the format # ([K, N], TP_SPLIT_DIM) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index d5701a8fb..0957a9c65 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import cProfile import pstats diff --git a/cmake/hipify.py b/cmake/hipify.py index a15577125..55d378f5b 100755 --- a/cmake/hipify.py +++ b/cmake/hipify.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # A command line tool for running pytorch's hipify preprocessor on CUDA diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index d64f0d0a5..1dd7101ac 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from 
typing import Union diff --git a/csrc/moe/marlin_moe_wna16/generate_kernels.py b/csrc/moe/marlin_moe_wna16/generate_kernels.py index 15f008d4f..49f33718a 100644 --- a/csrc/moe/marlin_moe_wna16/generate_kernels.py +++ b/csrc/moe/marlin_moe_wna16/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/quantization/gptq_marlin/generate_kernels.py b/csrc/quantization/gptq_marlin/generate_kernels.py index 4ac7121ab..18fb6c1a8 100644 --- a/csrc/quantization/gptq_marlin/generate_kernels.py +++ b/csrc/quantization/gptq_marlin/generate_kernels.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import itertools import os diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 3114e14ba..9af7833d0 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import math diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py index 6f290efe4..7cfc89605 100644 --- a/docs/mkdocs/hooks/generate_examples.py +++ b/docs/mkdocs/hooks/generate_examples.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from dataclasses import dataclass, field from pathlib import Path diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py index e5f8549d8..f67941d2a 100644 --- a/docs/mkdocs/hooks/remove_announcement.py +++ b/docs/mkdocs/hooks/remove_announcement.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import 
os from typing import Literal diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py index c73882808..6484581ed 100644 --- a/docs/mkdocs/hooks/url_schemes.py +++ b/docs/mkdocs/hooks/url_schemes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import regex as re from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import Files diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 56cdd6861..8e5cac78a 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on audio language models. diff --git a/examples/offline_inference/automatic_prefix_caching.py b/examples/offline_inference/automatic_prefix_caching.py index 0d8c73304..a01a9565a 100644 --- a/examples/offline_inference/automatic_prefix_caching.py +++ b/examples/offline_inference/automatic_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstration script for Automatic Prefix Caching (APC) in vLLM. 
diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py index ae5ae7cb4..78bfda9bc 100644 --- a/examples/offline_inference/basic/basic.py +++ b/examples/offline_inference/basic/basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py index b0bb5aa71..d078c517d 100644 --- a/examples/offline_inference/basic/chat.py +++ b/examples/offline_inference/basic/chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 40ccb1294..219064e97 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 38a73ccca..fc5ca2378 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/basic/generate.py b/examples/offline_inference/basic/generate.py index 72f4a8208..6a41ef4d8 100644 --- a/examples/offline_inference/basic/generate.py +++ b/examples/offline_inference/basic/generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, EngineArgs 
from vllm.utils import FlexibleArgumentParser diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 3da73c6c4..6a08de2d2 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py index c1edfb52f..b1c1ef620 100644 --- a/examples/offline_inference/batch_llm_inference.py +++ b/examples/offline_inference/batch_llm_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use Ray Data for data parallel batch inference. diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 61230d895..6e56e24f2 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import json diff --git a/examples/offline_inference/context_extension.py b/examples/offline_inference/context_extension.py index 1a70446c3..8d7666418 100644 --- a/examples/offline_inference/context_extension.py +++ b/examples/offline_inference/context_extension.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 15906e1a2..3eccb4e11 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Usage: Single node: diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 4ae5d3310..8f3d1a5c0 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 5757a8a84..0bfe7ec0e 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.config import KVTransferConfig diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py index 3ccab0dcd..05a361fee 100644 --- a/examples/offline_inference/disaggregated_prefill.py +++ b/examples/offline_inference/disaggregated_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of disaggregated prefilling We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode), diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 606ce7799..ce977ee99 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json import os diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index 23f60c431..e68128399 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 59c0592ae..7f5d74d9a 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import Namespace diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py index 83dd1f667..0da6fa5c4 100644 --- a/examples/offline_inference/encoder_decoder.py +++ b/examples/offline_inference/encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrate prompting of text-to-text encoder/decoder models, specifically BART diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index ae3737e37..d27a902ed 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the explicit/implicit prompt format on enc-dec LMMs for text generation. 
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py index 5d5e55a83..d7f2a1633 100644 --- a/examples/offline_inference/llm_engine_example.py +++ b/examples/offline_inference/llm_engine_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates using the `LLMEngine` for processing prompts with various sampling parameters. diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py index 5bb2327a3..cc78c0cbb 100644 --- a/examples/offline_inference/load_sharded_state.py +++ b/examples/offline_inference/load_sharded_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Validates the loading of a model saved with the sharded_state format. This script demonstrates how to load a model that was previously saved diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py index 33c660015..00d4cb9eb 100644 --- a/examples/offline_inference/lora_with_quantization_inference.py +++ b/examples/offline_inference/lora_with_quantization_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use LoRA with different quantization techniques for offline inference. 
diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py index 7927f758c..00fb3f5bc 100644 --- a/examples/offline_inference/metrics.py +++ b/examples/offline_inference/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py index 98fef2648..330103d58 100644 --- a/examples/offline_inference/mistral-small.py +++ b/examples/offline_inference/mistral-small.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import argparse diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py index b750397f4..d5b1b4ad2 100644 --- a/examples/offline_inference/mlpspeculator.py +++ b/examples/offline_inference/mlpspeculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the usage of text generation with an LLM model, comparing the performance with and without speculative decoding. diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py index 1fa2f16f8..f0c00bcaa 100644 --- a/examples/offline_inference/multilora_inference.py +++ b/examples/offline_inference/multilora_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use the multi-LoRA functionality for offline inference. 
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py index f2d7698f2..7826629a3 100644 --- a/examples/offline_inference/neuron.py +++ b/examples/offline_inference/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index 5d7fb819d..0b2070c8e 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to run offline inference with an EAGLE speculative decoding model on neuron. To use EAGLE speculative decoding, you must use diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py index ec38525b9..c0ecfac50 100644 --- a/examples/offline_inference/neuron_int8_quantization.py +++ b/examples/offline_inference/neuron_int8_quantization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/examples/offline_inference/neuron_multimodal.py b/examples/offline_inference/neuron_multimodal.py index a9478650b..6ff8faabd 100644 --- a/examples/offline_inference/neuron_multimodal.py +++ b/examples/offline_inference/neuron_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests import torch from neuronx_distributed_inference.models.mllama.utils import add_instruct diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index ecacbab77..2ef69f298 100644 --- a/examples/offline_inference/neuron_speculation.py +++ 
b/examples/offline_inference/neuron_speculation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to run offline inference with a speculative decoding model on neuron. diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py index d3dad2495..699891382 100644 --- a/examples/offline_inference/prefix_caching.py +++ b/examples/offline_inference/prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 21f7668ad..567c448a8 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This is a demo script showing how to use the PrithviGeospatialMAE model with vLLM diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py index 244a64b89..392fba8fc 100644 --- a/examples/offline_inference/profiling.py +++ b/examples/offline_inference/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect import json diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py index 82737d538..5200be826 100644 --- a/examples/offline_inference/profiling_tpu/profiling.py +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import 
argparse import dataclasses diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py index 9f6a60223..5d79222a1 100644 --- a/examples/offline_inference/prompt_embed_inference.py +++ b/examples/offline_inference/prompt_embed_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrates how to generate prompt embeddings using Hugging Face Transformers and use them as input to vLLM diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index 6482490d1..62effd5c8 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on Qwen2.5-Omni (thinker only). diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py index 856a35b0e..d8d61667f 100644 --- a/examples/offline_inference/qwen_1m.py +++ b/examples/offline_inference/qwen_1m.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from urllib.request import urlopen diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py index 6d048986e..d909438b4 100644 --- a/examples/offline_inference/reproducibility.py +++ b/examples/offline_inference/reproducibility.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Demonstrates how to achieve reproducibility in vLLM. 
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index a8f6977e2..c6e63531a 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ a simple demonstration of RLHF with vLLM, inspired by the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF . diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py index 76eafdca1..096363e68 100644 --- a/examples/offline_inference/rlhf_colocate.py +++ b/examples/offline_inference/rlhf_colocate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ a simple demonstration to show how to co-locate vLLM worker with training actors on the same GPUs, diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index 3461af707..c445224d7 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index 860fe2b5f..9b154e370 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Saves each worker's model state dict directly to a checkpoint, which enables a fast load path for large tensor-parallel models where each worker only needs to diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index d583110c8..46858fffa 100644 --- 
a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py index 9ed729960..8ef121ebe 100644 --- a/examples/offline_inference/structured_outputs.py +++ b/examples/offline_inference/structured_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of guided decoding to generate structured outputs using vLLM. It shows how to apply diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index 2fa49c083..3d3d7946c 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ experimental support for tensor-parallel inference with torchrun, see https://github.com/vllm-project/vllm/issues/11400 for diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py index f3c2859d4..9776f4fe3 100644 --- a/examples/offline_inference/tpu.py +++ b/examples/offline_inference/tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 2ef87f4f4..15dbd9f44 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows 
how to use vLLM for running offline inference with the correct prompt format on vision language models for text generation. diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index cee02d06c..1f5bd4ad7 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models for multimodal embedding. diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 7ce28c5a4..de6365c0d 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with multi-image input on vision language models for text generation, diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py index cc190e91c..84854911b 100644 --- a/examples/online_serving/api_client.py +++ b/examples/online_serving/api_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for `vllm.entrypoints.api_server` Start the demo server: python -m vllm.entrypoints.api_server --model diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py index e57b94e88..63c9ff9e9 100644 --- a/examples/online_serving/cohere_rerank_client.py +++ b/examples/online_serving/cohere_rerank_client.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example of using the OpenAI entrypoint's rerank API which is compatible with the Cohere SDK: https://github.com/cohere-ai/cohere-python diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index 2ffba4a7e..16c32dcaa 100644 --- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file provides a disaggregated prefilling proxy demo to demonstrate an example usage of XpYd disaggregated prefilling. diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py index 3f2a3d01b..d5d0a07a2 100644 --- a/examples/online_serving/gradio_openai_chatbot_webserver.py +++ b/examples/online_serving/gradio_openai_chatbot_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example for starting a Gradio OpenAI Chatbot Webserver Start vLLM API server: vllm serve meta-llama/Llama-2-7b-chat-hf diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py index fd341ff49..86d9ceb48 100644 --- a/examples/online_serving/gradio_webserver.py +++ b/examples/online_serving/gradio_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example for starting a Gradio Webserver Start vLLM API server: python -m vllm.entrypoints.api_server \ diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py index 7eb3d2193..908d6a924 100644 --- 
a/examples/online_serving/jinaai_rerank_client.py +++ b/examples/online_serving/jinaai_rerank_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example of using the OpenAI entrypoint's rerank API which is compatible with Jina and Cohere https://jina.ai/reranker diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py index 65d74dcca..584db53db 100644 --- a/examples/online_serving/kv_events_subscriber.py +++ b/examples/online_serving/kv_events_subscriber.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional, Union import msgspec diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py index 2856e3be3..def95deb0 100644 --- a/examples/online_serving/openai_chat_completion_client.py +++ b/examples/online_serving/openai_chat_completion_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for OpenAI Chat Completion using vLLM API server NOTE: start a supported chat completion model server with `vllm serve`, e.g. vllm serve meta-llama/Llama-2-7b-chat-hf diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 8c3c6ecdd..c99b5148d 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """An example showing how to use vLLM to serve multimodal models and run online serving with OpenAI client. 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py index a0d7841f6..41dbb3236 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Set up this example by starting a vLLM OpenAI-compatible server with tool call options enabled. For example: diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py index 45c4232fe..7eb866821 100644 --- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py +++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ To run this example, you can start the vLLM server without any specific flags: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index a4134ea43..64379083d 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ To run this example, you need to start the vLLM server: diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py index c73208abe..ec7d8b954 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py +++ 
b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import OpenAI # This example demonstrates the `structural_tag` response format. diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py index 1ca61a8d5..bfbee7513 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate structured outputs from reasoning models like DeepSeekR1. The thinking process will not be guided by the JSON diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py index a5febad45..4006d07f7 100644 --- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example demonstrates how to use tool calling with reasoning models like QwQ-32B. 
The reasoning_content will not be parsed by the tool diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py index f6b808211..932dbeb2e 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py index f984fbabf..5a9192977 100644 --- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ An example shows how to generate chat completions from reasoning models like DeepSeekR1. 
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py index ee519e555..70f3c2f19 100644 --- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import base64 diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py index 649cfa5d6..b10e7acbd 100644 --- a/examples/online_serving/openai_classification_client.py +++ b/examples/online_serving/openai_classification_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import pprint diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py index b1d21b5e4..df6e4e942 100644 --- a/examples/online_serving/openai_completion_client.py +++ b/examples/online_serving/openai_completion_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py index 7891e14cb..2e0d168d6 100644 --- a/examples/online_serving/openai_cross_encoder_score.py +++ b/examples/online_serving/openai_cross_encoder_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example online usage of Score API. 
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py index a055654e9..6bc390861 100644 --- a/examples/online_serving/openai_embedding_client.py +++ b/examples/online_serving/openai_embedding_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import OpenAI diff --git a/examples/online_serving/openai_embedding_matryoshka_fy.py b/examples/online_serving/openai_embedding_matryoshka_fy.py index 4544dcfb5..653da8d18 100644 --- a/examples/online_serving/openai_embedding_matryoshka_fy.py +++ b/examples/online_serving/openai_embedding_matryoshka_fy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Example Python client for embedding API dimensions using vLLM API server NOTE: start a supported Matryoshka Embeddings model server with `vllm serve`, e.g. diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py index 2620a1232..8252b3670 100644 --- a/examples/online_serving/openai_pooling_client.py +++ b/examples/online_serving/openai_pooling_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example online usage of Pooling API. 
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py index eb501ae72..12d45de3c 100644 --- a/examples/online_serving/openai_transcription_client.py +++ b/examples/online_serving/openai_transcription_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py index 33d365f0c..018d986ad 100644 --- a/examples/online_serving/opentelemetry/dummy_client.py +++ b/examples/online_serving/opentelemetry/dummy_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py index 85ea23407..3a9042138 100644 --- a/examples/online_serving/prompt_embed_inference_with_openai_client.py +++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vLLM OpenAI-Compatible Client with Prompt Embeddings diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py index a76020130..9471563dd 100644 --- a/examples/online_serving/ray_serve_deepseek.py +++ b/examples/online_serving/ray_serve_deepseek.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Example to deploy DeepSeek R1 or V3 with Ray Serve LLM. 
See more details at: diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py index 37af3b388..d9a4cadb0 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py +++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Retrieval Augmented Generation (RAG) Implementation with Langchain ================================================================== diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py index 08796b1b3..be4796acd 100644 --- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py +++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ RAG (Retrieval Augmented Generation) Implementation with LlamaIndex ================================================================ diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py index 0722aa671..dab56172e 100644 --- a/examples/online_serving/streamlit_openai_chatbot_webserver.py +++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vLLM Chat Assistant - A Streamlit Web Interface diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py index 0781a27f1..a512d8a31 100644 --- a/examples/online_serving/utils.py +++ b/examples/online_serving/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from openai import APIConnectionError, OpenAI from openai.pagination import SyncPage from openai.types.model import Model diff --git a/examples/others/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py index 98eafb31e..354e4cc8c 100644 --- a/examples/others/lmcache/cpu_offload_lmcache.py +++ b/examples/others/lmcache/cpu_offload_lmcache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of cpu offloading with LMCache in vLLM v1 or v0. diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py index b2b7b3b2c..6669eb3fb 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v0.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of disaggregated prefilling with LMCache. 
diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 20155c203..5d8e38c73 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py index 89945d67a..508cf4a5a 100644 --- a/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py +++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file demonstrates the example usage of remote KV cache sharing with LMCache. diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 175777630..9e1003a5c 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import dataclasses diff --git a/find_cuda_init.py b/find_cuda_init.py index 0d13b2f86..308fc6fc2 100644 --- a/find_cuda_init.py +++ b/find_cuda_init.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib import traceback diff --git a/setup.py b/setup.py index c190864dd..b07cdea30 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ctypes import importlib.util diff --git a/tests/async_engine/api_server_async_engine.py 
b/tests/async_engine/api_server_async_engine.py index 1e3c2d1a4..163185b90 100644 --- a/tests/async_engine/api_server_async_engine.py +++ b/tests/async_engine/api_server_async_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """vllm.entrypoints.api_server with some extra logging for testing.""" from collections.abc import Iterable from typing import Any diff --git a/tests/async_engine/conftest.py b/tests/async_engine/conftest.py index 1a20e2c13..375b248eb 100644 --- a/tests/async_engine/conftest.py +++ b/tests/async_engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 410cece79..38ecaf223 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import subprocess diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index b6f448714..1a31bdbfc 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/async_engine/test_request_tracker.py b/tests/async_engine/test_request_tracker.py index fd6d89d4e..1851eeeda 100644 --- a/tests/async_engine/test_request_tracker.py +++ b/tests/async_engine/test_request_tracker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/basic_correctness/test_basic_correctness.py 
b/tests/basic_correctness/test_basic_correctness.py index 11c8e7a4b..46be4a3c3 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the short outputs of HF and vLLM when using greedy sampling. Run `pytest tests/basic_correctness/test_basic_correctness.py`. diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 06c9e25ed..eb5b09ff7 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of HF and vLLM when using greedy sampling. It tests chunked prefill. Chunked prefill can be enabled by diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index be3ad1239..28bfe9e7c 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ..utils import compare_two_settings diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index 76b266aad..34f9389c8 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 63dc0f8c8..341a39a42 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the short outputs of HF and vLLM when using greedy sampling. VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 has to be set before running this test. diff --git a/tests/benchmarks/test_latency_cli.py b/tests/benchmarks/test_latency_cli.py index 8537459b9..2279c846e 100644 --- a/tests/benchmarks/test_latency_cli.py +++ b/tests/benchmarks/test_latency_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py index b746d6b78..a31819526 100644 --- a/tests/benchmarks/test_serve_cli.py +++ b/tests/benchmarks/test_serve_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/benchmarks/test_throughput_cli.py b/tests/benchmarks/test_throughput_cli.py index 2045b3629..b61e51db4 100644 --- a/tests/benchmarks/test_throughput_cli.py +++ b/tests/benchmarks/test_throughput_cli.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import pytest diff --git a/tests/build_cython.py b/tests/build_cython.py index 9dea6bcd6..f4a334aa3 100644 --- a/tests/build_cython.py +++ b/tests/build_cython.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import Cython.Compiler.Options from Cython.Build import cythonize from setuptools import setup diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 5a02c4e2b..60334f5e4 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from copy import deepcopy from typing import Callable, Union diff --git a/tests/compile/conftest.py b/tests/compile/conftest.py index 7118810a5..d86ca3710 100644 --- a/tests/compile/conftest.py +++ b/tests/compile/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index a71a40cda..3188ea40f 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import os diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 5ce520a44..852aa44d4 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 22560befc..2464d7889 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. 
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py index 8e4e0ba83..1e4ee571f 100644 --- a/tests/compile/test_async_tp.py +++ b/tests/compile/test_async_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6b45d1cb..dc6cfe9da 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import dataclasses diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 397517b86..1d000fe00 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 5d38ff914..aade29b99 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 509593e73..0c25aae52 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index b630d0e85..251cc46e9 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import pytest diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py index 2cd7ebaac..c689befdf 100644 --- a/tests/compile/test_sequence_parallelism.py +++ b/tests/compile/test_sequence_parallelism.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index 9eae48d60..df36b86ab 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 0934c6113..5e39f6821 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/conftest.py b/tests/conftest.py index 6336c6c2c..5ec3926bd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os import tempfile diff --git a/tests/core/block/conftest.py b/tests/core/block/conftest.py index b7a9863f4..6afe98d78 100644 --- a/tests/core/block/conftest.py +++ b/tests/core/block/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index 83259b690..e2c6c66b2 100644 --- 
a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Callable, Optional diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 9e8e315d8..f296c81e1 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import cycle diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 039b5e739..3429a858d 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py index 68d9618ae..9eed264fd 100644 --- a/tests/core/block/test_block_manager.py +++ b/tests/core/block/test_block_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py index 250c9a749..ba0850011 100644 --- a/tests/core/block/test_block_table.py +++ b/tests/core/block/test_block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_common.py b/tests/core/block/test_common.py index 202608730..65400899b 100644 --- a/tests/core/block/test_common.py +++ b/tests/core/block/test_common.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py index a1414edd9..795eef674 100644 --- a/tests/core/block/test_cpu_gpu_block_allocator.py +++ b/tests/core/block/test_cpu_gpu_block_allocator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/block/test_naive_block.py b/tests/core/block/test_naive_block.py index 4b9454c84..a31d1c46b 100644 --- a/tests/core/block/test_naive_block.py +++ b/tests/core/block/test_naive_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 50233624f..46e224c6f 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 1a20e2c13..375b248eb 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 161b32f01..d4dacc4f1 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git 
a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index a4a901444..1b958e34d 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index a5ba16898..db78a9d55 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import deque diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index c6049b26a..20cc083ec 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest # noqa diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 64b3e148e..8281298d6 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import msgspec diff --git a/tests/core/utils.py b/tests/core/utils.py index 84b0426b4..b746c1786 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import defaultdict diff --git a/tests/detokenizer/conftest.py b/tests/detokenizer/conftest.py index 59394b035..f2c125355 100644 --- a/tests/detokenizer/conftest.py +++ b/tests/detokenizer/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/detokenizer/test_disable_detokenization.py b/tests/detokenizer/test_disable_detokenization.py index 14f9babb8..ae06a985c 100644 --- a/tests/detokenizer/test_disable_detokenization.py +++ b/tests/detokenizer/test_disable_detokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/detokenizer/test_stop_checker.py b/tests/detokenizer/test_stop_checker.py index e9ad8d161..bd2219772 100644 --- a/tests/detokenizer/test_stop_checker.py +++ b/tests/detokenizer/test_stop_checker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 4b1e4f5cf..9716f7d72 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the different finish_reason="stop" situations during generation: 1. One of the provided stop strings 2. 
One of the provided stop tokens diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index 0607dd01a..efe938a20 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index ee8f20979..95f085788 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional, Union diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py index 72e7ebdb7..e2de46261 100644 --- a/tests/distributed/test_ca_buffer_sharing.py +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # can only run on machines with p2p access across GPUs # can only run with torchrun: diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index 8f4c3537e..e2cb579e2 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the communication operators. Run `pytest tests/distributed/test_comm_ops.py`. 
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index a7ba45c9e..fae49c41d 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/distributed/test_distributed_oot.py b/tests/distributed/test_distributed_oot.py index 4b0c65d1d..b93696e4b 100644 --- a/tests/distributed/test_distributed_oot.py +++ b/tests/distributed/test_distributed_oot.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ..entrypoints.openai.test_oot_registration import ( run_and_test_dummy_opt_api_server) diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index 8de1aa20e..ec1e5a2d6 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import threading import time diff --git a/tests/distributed/test_expert_parallel.py b/tests/distributed/test_expert_parallel.py index db8281617..f641bf160 100644 --- a/tests/distributed/test_expert_parallel.py +++ b/tests/distributed/test_expert_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Literal, NamedTuple, Optional diff --git a/tests/distributed/test_multi_node_assignment.py b/tests/distributed/test_multi_node_assignment.py index c86d2d8a0..ef17a51ff 100644 --- a/tests/distributed/test_multi_node_assignment.py +++ b/tests/distributed/test_multi_node_assignment.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Make sure ray 
assigns GPU workers to the correct node. Run: diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e6410ab06..7d569fd83 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py index 7bf93f270..69ceedd34 100644 --- a/tests/distributed/test_pipeline_partition.py +++ b/tests/distributed/test_pipeline_partition.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py index 3ca6e7b33..a027a9e37 100644 --- a/tests/distributed/test_pp_cudagraph.py +++ b/tests/distributed/test_pp_cudagraph.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations from typing import TYPE_CHECKING diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 2c323edfa..5b32b90f3 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import os diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py index 9b1bbd6e5..94ad8f4f1 100644 --- a/tests/distributed/test_same_node.py +++ b/tests/distributed/test_same_node.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import os diff --git a/tests/distributed/test_sequence_parallel.py b/tests/distributed/test_sequence_parallel.py index c9eba2b43..91a594eac 100644 --- a/tests/distributed/test_sequence_parallel.py +++ b/tests/distributed/test_sequence_parallel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ WARNING: This test runs in both single-node (4 GPUs) and multi-node (2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py index f9eacc11d..e1357b4a3 100644 --- a/tests/distributed/test_shm_broadcast.py +++ b/tests/distributed/test_shm_broadcast.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import random diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index bb38e908b..9f2c3eaec 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # unit test for `examples/offline_inference/torchrun_example.py` import os diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 4432950f2..0287ad94e 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import socket diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 0f46fba3a..8b99d9d6e 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project """E2E tests to verify the correctness of the encoder-decoder framework Run `pytest tests/encoder_decoder/test_e2e_correctness.py`. diff --git a/tests/engine/conftest.py b/tests/engine/conftest.py index 1a20e2c13..375b248eb 100644 --- a/tests/engine/conftest.py +++ b/tests/engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 05d9cfc7a..ab78aa7da 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from argparse import ArgumentError, ArgumentTypeError diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index 049fa2c8b..ac5a1f957 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/engine/test_executor.py b/tests/engine/test_executor.py index 91c9ba4a7..15c7a97b5 100644 --- a/tests/engine/test_executor.py +++ b/tests/engine/test_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/engine/test_multi_step_output_processor.py b/tests/engine/test_multi_step_output_processor.py index b67dd86bf..458f4deb7 100644 --- a/tests/engine/test_multi_step_output_processor.py +++ b/tests/engine/test_multi_step_output_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import MagicMock diff --git 
a/tests/engine/test_multiproc_workers.py b/tests/engine/test_multiproc_workers.py index 9b2f45def..b5381b61a 100644 --- a/tests/engine/test_multiproc_workers.py +++ b/tests/engine/test_multiproc_workers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from concurrent.futures import ThreadPoolExecutor diff --git a/tests/engine/test_options.py b/tests/engine/test_options.py index 0cf4f69d5..fc6a78a51 100644 --- a/tests/engine/test_options.py +++ b/tests/engine/test_options.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext import pytest diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py index b29d6362f..9c62761d7 100644 --- a/tests/engine/test_short_mm_context.py +++ b/tests/engine/test_short_mm_context.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index 3b596ea3e..a7c533ec2 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 95657455b..a2d35486a 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file test accuracy of the vLLM server via LMEval. 
It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index 742a66683..97cf3b5ce 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref import pytest diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 6470249dd..3a13f8c97 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index d10257761..f0fa54aa3 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 9a895c922..4676dc992 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 099af0f36..b7d53e31f 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref diff --git a/tests/entrypoints/llm/test_gpu_utilization.py 
b/tests/entrypoints/llm/test_gpu_utilization.py index c2b4a9358..533da9e6d 100644 --- a/tests/entrypoints/llm/test_gpu_utilization.py +++ b/tests/entrypoints/llm/test_gpu_utilization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index dd5d17885..d41b0a436 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import weakref diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index f065f6564..61b6b4fbf 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys from contextlib import nullcontext diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 665c6ea1e..1b7be15d5 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 23fd72f4e..a606eeab5 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for HF_HUB_OFFLINE mode""" import importlib import sys 
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py index d3948e2ed..41b70f80e 100644 --- a/tests/entrypoints/openai/correctness/test_lmeval.py +++ b/tests/entrypoints/openai/correctness/test_lmeval.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file test accuracy of the vLLM server via LMEval. It uses local-completions, which interacts with vLLM diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py index 44d7ac193..437c48511 100644 --- a/tests/entrypoints/openai/correctness/test_mteb.py +++ b/tests/entrypoints/openai/correctness/test_mteb.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py index 642c204b9..58195f98b 100644 --- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py +++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Evaluate Transcription API correctness by computing Word Error Rate (WER) on a given ASR dataset. 
When provided, it will also compare the WER against diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py index 1f7ba0da4..ab3c80905 100644 --- a/tests/entrypoints/openai/test_async_tokenization.py +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import contextlib diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 7f959f312..d67c05ab3 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index a4ac80070..a55941976 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from http import HTTPStatus diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 2509ef0d2..dab947b21 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import json diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py index 86ee17c6f..de63f4ed2 100644 --- a/tests/entrypoints/openai/test_chat_echo.py +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import NamedTuple diff --git 
a/tests/entrypoints/openai/test_chat_logit_bias_validation.py b/tests/entrypoints/openai/test_chat_logit_bias_validation.py index 9dab524ea..e9d1a8552 100644 --- a/tests/entrypoints/openai/test_chat_logit_bias_validation.py +++ b/tests/entrypoints/openai/test_chat_logit_bias_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index f18fbb0a9..daa4a78c9 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py index e00f001ef..03730b672 100644 --- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py +++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index 0419395f1..3c8ed955a 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_classification.py b/tests/entrypoints/openai/test_classification.py index 97124c85e..6d5f92515 100644 --- a/tests/entrypoints/openai/test_classification.py +++ 
b/tests/entrypoints/openai/test_classification.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import requests diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 8d1abe28a..504fd72aa 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index 9d12f27a2..7e54143f6 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import json import shutil diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index dad76b54c..dbea2dc0b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index b7ee3e33c..00d3ffb61 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io diff --git 
a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 81ca65b65..80640a2e1 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 diff --git a/tests/entrypoints/openai/test_embedding_dimensions.py b/tests/entrypoints/openai/test_embedding_dimensions.py index 341defae0..08b797dc5 100644 --- a/tests/entrypoints/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/openai/test_embedding_dimensions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Run `pytest tests/entrypoints/openai/test_embedding_dimensions.py`. """ diff --git a/tests/entrypoints/openai/test_encoder_decoder.py b/tests/entrypoints/openai/test_encoder_decoder.py index 52b4df9ce..9c2aef23e 100644 --- a/tests/entrypoints/openai/test_encoder_decoder.py +++ b/tests/entrypoints/openai/test_encoder_decoder.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py index cd07ca46c..bcdeaaace 100644 --- a/tests/entrypoints/openai/test_lora_adapters.py +++ b/tests/entrypoints/openai/test_lora_adapters.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index c96151349..d4afdf775 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from contextlib import suppress from dataclasses import dataclass, field diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index b21c0173c..2d7b84573 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py index 3d4f1cde2..1980daa80 100644 --- a/tests/entrypoints/openai/test_models.py +++ b/tests/entrypoints/openai/test_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index a1b7a205a..f0ce50deb 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from ...utils import VLLM_PATH, RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py index cae2a3b59..4ded37595 100644 --- a/tests/entrypoints/openai/test_openai_schema.py +++ b/tests/entrypoints/openai/test_openai_schema.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Final import pytest diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index 72ab12c56..cf16ace65 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index e38491589..ff0730c77 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # imports for guided decoding tests import openai diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index ba11cd3a2..19eba320c 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import requests diff --git a/tests/entrypoints/openai/test_return_tokens_as_ids.py b/tests/entrypoints/openai/test_return_tokens_as_ids.py index 647485864..099062e55 100644 --- a/tests/entrypoints/openai/test_return_tokens_as_ids.py +++ b/tests/entrypoints/openai/test_return_tokens_as_ids.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Separate these tests out from test_completion and test_chat, because they # require launching a second server with a different flag. 
Running both servers diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py index 106d6b2c1..7b4966848 100644 --- a/tests/entrypoints/openai/test_root_path.py +++ b/tests/entrypoints/openai/test_root_path.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import os diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 99639ce51..e23f41e98 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import subprocess diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index b373f2912..af51a0a3e 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import pytest diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 5e11af8cf..94740fefc 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from contextlib import suppress diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index e8f3c2f8b..28af6489a 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from http import HTTPStatus from 
unittest.mock import MagicMock diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 0f12ac9b2..29a94c852 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/entrypoints/openai/test_sleep.py b/tests/entrypoints/openai/test_sleep.py index 3ca8a9a41..0dd6af17e 100644 --- a/tests/entrypoints/openai/test_sleep.py +++ b/tests/entrypoints/openai/test_sleep.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import requests diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py index f1ab72230..e14315035 100644 --- a/tests/entrypoints/openai/test_tensorizer_entrypoint.py +++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc import json import tempfile diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 7d823542e..57dd25fe1 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py index 5c48df3ce..1cb0a39df 100644 --- a/tests/entrypoints/openai/test_transcription_validation.py +++ b/tests/entrypoints/openai/test_transcription_validation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project # imports for guided decoding tests import io diff --git a/tests/entrypoints/openai/test_truncation.py b/tests/entrypoints/openai/test_truncation.py index 137ed9db8..b33a26af6 100644 --- a/tests/entrypoints/openai/test_truncation.py +++ b/tests/entrypoints/openai/test_truncation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import openai diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py index 53f057a29..990ea3579 100644 --- a/tests/entrypoints/openai/test_video.py +++ b/tests/entrypoints/openai/test_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 1ab50b41c..4513d8b34 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 26c68e06c..fe982e286 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py index f5f327ea0..8c86b4889 100644 --- a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py index 71f41ea7d..d83137472 100644 --- a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py index ab8f4bd67..e1b41f45f 100644 --- a/tests/entrypoints/openai/tool_parsers/utils.py +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Union diff --git a/tests/entrypoints/test_api_server_process_manager.py b/tests/entrypoints/test_api_server_process_manager.py index 0dd1fdd99..e4af60a78 100644 --- a/tests/entrypoints/test_api_server_process_manager.py +++ b/tests/entrypoints/test_api_server_process_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import socket diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 9f1f2321d..492946642 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from typing import Optional diff --git a/tests/entrypoints/test_ssl_cert_refresher.py b/tests/entrypoints/test_ssl_cert_refresher.py index 23ce7a679..33ad2cfd3 
100644 --- a/tests/entrypoints/test_ssl_cert_refresher.py +++ b/tests/entrypoints/test_ssl_cert_refresher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import tempfile from pathlib import Path diff --git a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py index 184bee2a7..1b95bf59f 100644 --- a/tests/fastsafetensors_loader/test_fastsafetensors_loader.py +++ b/tests/fastsafetensors_loader/test_fastsafetensors_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import SamplingParams from vllm.config import LoadFormat diff --git a/tests/fastsafetensors_loader/test_weight_utils.py b/tests/fastsafetensors_loader/test_weight_utils.py index 8772035af..78d23acfe 100644 --- a/tests/fastsafetensors_loader/test_weight_utils.py +++ b/tests/fastsafetensors_loader/test_weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import tempfile diff --git a/tests/kernels/allclose_default.py b/tests/kernels/allclose_default.py index 97ceffab4..9d65159bf 100644 --- a/tests/kernels/allclose_default.py +++ b/tests/kernels/allclose_default.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/attention/conftest.py b/tests/kernels/attention/conftest.py index 4f04ec947..88a2fb62b 100644 --- a/tests/kernels/attention/conftest.py +++ b/tests/kernels/attention/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index d9f956fbc..2d381a99b 
100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 58da01f0e..435fe6225 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py index 82d038257..9aee818c9 100644 --- a/tests/kernels/attention/test_blocksparse_attention.py +++ b/tests/kernels/attention/test_blocksparse_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 2f2212dd2..e508505c2 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index d6570e633..1e7e7e0a7 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_encoder_decoder_attn.py 
b/tests/kernels/attention/test_encoder_decoder_attn.py index c8ee46bc6..c6ce7b0cc 100644 --- a/tests/kernels/attention/test_encoder_decoder_attn.py +++ b/tests/kernels/attention/test_encoder_decoder_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests: diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index 88516b75c..bd3190d09 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ b/tests/kernels/attention/test_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 5ad1137aa..3ad6e1d32 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 0d51a8e7f..21b08e45f 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -1,5 +1,6 @@ # Adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/tests/test_flash_mla.py # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py index fbad52987..de45ee1ed 100644 --- a/tests/kernels/attention/test_lightning_attn.py +++ b/tests/kernels/attention/test_lightning_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git 
a/tests/kernels/attention/test_merge_attn_states.py b/tests/kernels/attention/test_merge_attn_states.py index 7038fbea5..9d1a301eb 100644 --- a/tests/kernels/attention/test_merge_attn_states.py +++ b/tests/kernels/attention/test_merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 5a18b7916..53c37554b 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test: diff --git a/tests/kernels/attention/test_mla_decode_cpu.py b/tests/kernels/attention/test_mla_decode_cpu.py index 8cebe32c4..5a7480a6b 100644 --- a/tests/kernels/attention/test_mla_decode_cpu.py +++ b/tests/kernels/attention/test_mla_decode_cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 9333777d3..b09e1bbc4 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import random diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index 6ffe27abf..ed58880cc 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project import pytest import torch diff --git a/tests/kernels/attention/test_triton_decode_attention.py b/tests/kernels/attention/test_triton_decode_attention.py index fd3c9fa41..358b374ea 100644 --- a/tests/kernels/attention/test_triton_decode_attention.py +++ b/tests/kernels/attention/test_triton_decode_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index be3d1879d..0cb7f5963 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 79f838a95..29c5e70a8 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index 7a591f536..19703b8a2 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index fa4bbe458..3eac06273 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/core/test_opcheck.py b/tests/kernels/core/test_opcheck.py index c9a9679c5..40ced08b9 100644 --- a/tests/kernels/core/test_opcheck.py +++ b/tests/kernels/core/test_opcheck.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/core/test_permute_cols.py b/tests/kernels/core/test_permute_cols.py index 35d62079f..e18f6230d 100644 --- a/tests/kernels/core/test_permute_cols.py +++ b/tests/kernels/core/test_permute_cols.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 8cb56314c..ab6f1ccf8 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import accumulate, product from typing import Callable, Optional diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 8383f943b..db0fdcbf5 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for miscellaneous utilities """ diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py index f641ae7b6..c71215e4c 100644 --- a/tests/kernels/core/test_uva.py +++ b/tests/kernels/core/test_uva.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git 
a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index 93064e23d..addb8bfcd 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index abcf3888f..f5c6a1861 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import unittest diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 84d4c347e..8dece26dd 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index f5e751bea..abed1252a 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 7d369edfc..b0e0feab4 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 7db4fe0f4..558288ba4 100644 --- 
a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Optional diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 299279390..7238813a2 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE layers. Run `pytest tests/kernels/test_moe.py`. diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 10e6ac64d..7cc83b512 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE permute/unpermute kernel Run `pytest tests/kernels/test_moe_permute_unpermute.py`. diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index ae63b379f..be33200cc 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 8c4a2c3fa..95c10037b 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the MOE layers. Run `pytest tests/kernels/test_pplx_moe.py`. 
diff --git a/tests/kernels/moe/test_rocm_aiter_topk.py b/tests/kernels/moe/test_rocm_aiter_topk.py index 922fd66db..1c51c530c 100644 --- a/tests/kernels/moe/test_rocm_aiter_topk.py +++ b/tests/kernels/moe/test_rocm_aiter_topk.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # This is a test for the AITER ops. # It tests if the AITER ops are # 1. correctly registered as custom ops diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 3b5838a99..dfd0f35c8 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_triton_moe_channel_fp8_kernel.py import itertools diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 892309a01..0840cc7b5 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/tests/kernels/quantization/nvfp4_utils.py b/tests/kernels/quantization/nvfp4_utils.py index 58eaeee1c..1095975ab 100644 --- a/tests/kernels/quantization/nvfp4_utils.py +++ b/tests/kernels/quantization/nvfp4_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from vllm.scalar_type import scalar_types diff --git a/tests/kernels/quantization/test_allspark_gemm.py b/tests/kernels/quantization/test_allspark_gemm.py index 896e02657..3de9cb364 100644 --- a/tests/kernels/quantization/test_allspark_gemm.py +++ b/tests/kernels/quantization/test_allspark_gemm.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_aqlm.py b/tests/kernels/quantization/test_aqlm.py index 7d3617281..427db3e60 100644 --- a/tests/kernels/quantization/test_aqlm.py +++ b/tests/kernels/quantization/test_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/quantization/test_awq.py b/tests/kernels/quantization/test_awq.py index 248b294e5..bc0868123 100644 --- a/tests/kernels/quantization/test_awq.py +++ b/tests/kernels/quantization/test_awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 3fc3feaf4..96797e85b 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the AWQ Triton kernel. Run `pytest tests/kernels/test_awq_triton.py`. 
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index ae05d6117..8c5ee9874 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/pull/2575 import itertools diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index a4e9f83f0..fa2c9f890 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py import itertools diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index d67d2dbb8..878f66647 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for sparse cutlass kernels Run `pytest tests/kernels/test_semi_structured.py`. diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index 633addd42..51bb29df0 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for cutlass kernels Run `pytest tests/kernels/test_cutlass.py`. 
diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 876cf03fd..0a3edd4dd 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_ggml.py b/tests/kernels/quantization/test_ggml.py index 73697a6d1..07651fef3 100644 --- a/tests/kernels/quantization/test_ggml.py +++ b/tests/kernels/quantization/test_ggml.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gguf import pytest diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index ad755fe7f..436d5cb64 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path diff --git a/tests/kernels/quantization/test_gptq.py b/tests/kernels/quantization/test_gptq.py index fea013d9e..7fb57a157 100644 --- a/tests/kernels/quantization/test_gptq.py +++ b/tests/kernels/quantization/test_gptq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/kernels/quantization/test_int8_kernel.py b/tests/kernels/quantization/test_int8_kernel.py index 4c7543527..dc5fecbf4 100644 --- a/tests/kernels/quantization/test_int8_kernel.py +++ b/tests/kernels/quantization/test_int8_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py import itertools diff --git 
a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 25dcb587e..63ccf4a91 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 5aeaaa654..998171baa 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the machete kernel. Run `pytest tests/kernels/test_machete_mm.py`. diff --git a/tests/kernels/quantization/test_marlin_gemm.py b/tests/kernels/quantization/test_marlin_gemm.py index 52507b375..92914bd5c 100644 --- a/tests/kernels/quantization/test_marlin_gemm.py +++ b/tests/kernels/quantization/test_marlin_gemm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the marlin kernel. Run `pytest tests/kernels/marlin/test_marlin_gemm.py`. 
diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py index b8aa16721..3a8f4c175 100644 --- a/tests/kernels/quantization/test_nvfp4_quant.py +++ b/tests/kernels/quantization/test_nvfp4_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 1f49900b2..0b45c2298 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index c7eee8998..533a4fe59 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 30e6eeb8d..8a2cc3bac 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_scaled_mm kernel Run `pytest tests/kernels/test_triton_scaled_mm.py`. 
diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index 87e4bd4b0..c56024b75 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index faa8d49ce..803453a20 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/kernels/test_triton_flash_attention.py b/tests/kernels/test_triton_flash_attention.py index cf2bdc908..1c31cfb25 100644 --- a/tests/kernels/test_triton_flash_attention.py +++ b/tests/kernels/test_triton_flash_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the triton_flash_attention kernel Run `pytest tests/kernels/test_triton_flash_attention.py`. 
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 22b3d7c2b..d1db6a8eb 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Kernel test utils""" import itertools diff --git a/tests/kv_transfer/test_disagg.py b/tests/kv_transfer/test_disagg.py index dc948a48b..9f2229cc4 100644 --- a/tests/kv_transfer/test_disagg.py +++ b/tests/kv_transfer/test_disagg.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import subprocess diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index c5b34660d..352ab6355 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import random diff --git a/tests/kv_transfer/test_module.py b/tests/kv_transfer/test_module.py index 8a6490b5c..7a0417487 100644 --- a/tests/kv_transfer/test_module.py +++ b/tests/kv_transfer/test_module.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 3dd923d24..32116608a 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 399311ce6..0737bb886 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import tempfile from collections import OrderedDict diff --git a/tests/lora/test_add_lora.py b/tests/lora/test_add_lora.py index 17347300b..cc8160b28 100644 --- a/tests/lora/test_add_lora.py +++ b/tests/lora/test_add_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 007be7aa5..774ebb9db 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_chatglm3_tp.py b/tests/lora/test_chatglm3_tp.py index cd9526c8b..5481b413b 100644 --- a/tests/lora/test_chatglm3_tp.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import vllm from vllm.lora.request import LoRARequest diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 0a8b38fa7..92db023ba 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from copy import deepcopy diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index 54daea5b9..23819f03d 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys from typing import Union diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py index 094541aef..01bc102bd 100644 --- a/tests/lora/test_lora_allowed_token_ids.py +++ b/tests/lora/test_lora_allowed_token_ids.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py index 02f2339be..ebc0f2637 100644 --- a/tests/lora/test_lora_checkpoints.py +++ b/tests/lora/test_lora_checkpoints.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py index fd80f61a5..e9a52e1b6 100644 --- a/tests/lora/test_lora_functions.py +++ b/tests/lora/test_lora_functions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Script to test add_lora, remove_lora, pin_lora, list_loras functions. """ diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 90498c47f..b46d81f16 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 52b0834ca..8f8a27006 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/lora/test_minicpmv_tp.py b/tests/lora/test_minicpmv_tp.py index 24242b8a1..99fe951bb 100644 --- a/tests/lora/test_minicpmv_tp.py +++ b/tests/lora/test_minicpmv_tp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 4e77c5559..0ea077933 100644 --- a/tests/lora/test_mixtral.py +++ 
b/tests/lora/test_mixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index 9935472ad..f16589e06 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import math diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py index 7375cabbc..a21de0705 100644 --- a/tests/lora/test_phi.py +++ b/tests/lora/test_phi.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index add313c94..14fa79ae5 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from threading import Lock import pytest diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 43e2975cd..caa31fdb0 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index 20a1ae67d..604bb307b 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/tests/lora/test_resolver.py b/tests/lora/test_resolver.py 
index 8ebc2ae98..6c93e5776 100644 --- a/tests/lora/test_resolver.py +++ b/tests/lora/test_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 8845eb33d..6cfdaf50d 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/lora/test_transfomers_model.py b/tests/lora/test_transfomers_model.py index 63907f2c1..5065a2fb7 100644 --- a/tests/lora/test_transfomers_model.py +++ b/tests/lora/test_transfomers_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index 0d4e0bf68..b343bef0a 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import OrderedDict from typing import NamedTuple, Optional diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 1a5d52716..6f13e663a 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import random diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 59a0e7420..cc1b0d819 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import 
Optional, Union diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e71c87ff3..7bb5d8980 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/tests/mistral_tool_use/conftest.py b/tests/mistral_tool_use/conftest.py index 39ab01c9b..e89e60c5a 100644 --- a/tests/mistral_tool_use/conftest.py +++ b/tests/mistral_tool_use/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/mistral_tool_use/test_mistral_tool_calls.py b/tests/mistral_tool_use/test_mistral_tool_calls.py index bbb3a0789..9bf6863f3 100644 --- a/tests/mistral_tool_use/test_mistral_tool_calls.py +++ b/tests/mistral_tool_use/test_mistral_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/mistral_tool_use/utils.py b/tests/mistral_tool_use/utils.py index 1d809a05e..7a026cd9b 100644 --- a/tests/mistral_tool_use/utils.py +++ b/tests/mistral_tool_use/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/model_executor/conftest.py b/tests/model_executor/conftest.py index b588a1a96..c6d89d849 100644 --- a/tests/model_executor/conftest.py +++ b/tests/model_executor/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index e957db5b3..a94215ee3 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ 
b/tests/model_executor/test_enabled_custom_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 6cd966f84..ac31064d9 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import pickle diff --git a/tests/model_executor/test_logits_processor.py b/tests/model_executor/test_logits_processor.py index 8301c645b..532ebba03 100644 --- a/tests/model_executor/test_logits_processor.py +++ b/tests/model_executor/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import patch diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 7fda1f0e8..94a14bd24 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/model_executor/test_weight_utils.py b/tests/model_executor/test_weight_utils.py index bdaba22c3..df625b8d6 100644 --- a/tests/model_executor/test_weight_utils.py +++ b/tests/model_executor/test_weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import tempfile diff --git a/tests/models/language/generation/test_bart.py b/tests/models/language/generation/test_bart.py index 8ab0167dc..7d8acab5e 100644 --- a/tests/models/language/generation/test_bart.py +++ 
b/tests/models/language/generation/test_bart.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index 05dd18fbd..ed9e54722 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/tests/models/language/generation/test_granite.py b/tests/models/language/generation/test_granite.py index f381c34f4..2a39f78a7 100644 --- a/tests/models/language/generation/test_granite.py +++ b/tests/models/language/generation/test_granite.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from ...utils import check_logprobs_close diff --git a/tests/models/language/generation/test_granitemoehybrid.py b/tests/models/language/generation/test_granitemoehybrid.py index da3f5e110..952449f28 100644 --- a/tests/models/language/generation/test_granitemoehybrid.py +++ b/tests/models/language/generation/test_granitemoehybrid.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index 604cb854b..3eaadcb45 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/generation/test_mistral.py 
b/tests/models/language/generation/test_mistral.py index c1b612ae2..bdd857ff5 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import json diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py index 603ca1cb1..6c9cc2821 100644 --- a/tests/models/language/generation/test_phimoe.py +++ b/tests/models/language/generation/test_phimoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 0c8ac2ab1..07bc9f447 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index f45168bc0..2705be25e 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence import mteb diff --git a/tests/models/language/pooling/test_baai.py b/tests/models/language/pooling/test_baai.py index fc0e82079..1af3c05d3 100644 --- a/tests/models/language/pooling/test_baai.py +++ b/tests/models/language/pooling/test_baai.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from .embed_utils import EmbedModelInfo, 
correctness_test_embed_models diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 57b3cb58d..4a6d781ce 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch from transformers import AutoModelForSequenceClassification diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 8f82c8091..9516a0142 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from vllm.config import PoolerConfig diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index f450edd82..c2f70bb64 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import importlib.util diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 725e3d168..2178a815b 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any import pytest diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 0403a20a4..2adf34b29 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -1,4 +1,5 
@@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial import pytest diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 92cd7cc56..59dbd74fb 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 68603e628..250b3a528 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import pytest diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index 6b10aeffc..c75ff1445 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn.functional as F diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py index c6c2d1e7a..d6b5dbd08 100644 --- a/tests/models/language/pooling/test_snowflake_arctic_embed.py +++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 
1b8ac395e..33aff1c87 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index e4e48f995..a5bbcfc22 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Common tests for testing .generate() functionality for single / multiple image, embedding, and video support for different VLMs in vLLM. """ diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index b8225f5f1..b048cec5e 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 96c444441..14552010d 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py index 972db40e8..949c0a80d 100644 --- 
a/tests/models/multimodal/generation/test_interleaved.py +++ b/tests/models/multimodal/generation/test_interleaved.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/multimodal/generation/test_mllama.py b/tests/models/multimodal/generation/test_mllama.py index 99aa3c2d3..2bb01e494 100644 --- a/tests/models/multimodal/generation/test_mllama.py +++ b/tests/models/multimodal/generation/test_mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, overload diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e51dbee47..e4cd476a9 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Sequence diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 506b71472..1def825ab 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from dataclasses import asdict from typing import TYPE_CHECKING, Any, Optional diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index 6be401b77..a2793b8c8 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing 
import Any, Optional, TypedDict, Union diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py index 2c8a06688..e7e7bd315 100644 --- a/tests/models/multimodal/generation/test_ultravox.py +++ b/tests/models/multimodal/generation/test_ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Any diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index d0b85842a..363d55153 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pytest diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 32117c8d8..7d20dd660 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helpers for building inputs that can be leveraged for different test types. 
""" from collections.abc import Iterable diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index a5077a090..336e2dd2b 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for determining which subset of model tests belong to a specific modality, getting all combinations (similar to pytest's parametrization), handling multimodal placeholder substitution, and so on. diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index ccd2799ab..8c83d8f8a 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Core test implementation to be shared across modalities.""" from typing import Any, Callable, Optional diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index cc1045561..aa5835243 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom input builders for edge-cases in different models.""" from io import BytesIO from typing import Callable diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index dc1ea5208..1b087191f 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ 
b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Common utility functions relating to different models that are useful for manipulating the input / output of HF & vLLM test runners, which are typically specific to a small subset of models. diff --git a/tests/models/multimodal/generation/vlm_utils/runners.py b/tests/models/multimodal/generation/vlm_utils/runners.py index 9e8a1262e..562f89df1 100644 --- a/tests/models/multimodal/generation/vlm_utils/runners.py +++ b/tests/models/multimodal/generation/vlm_utils/runners.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Entrypoints for wrapping the core run_test implementation for specific test types / modalities. """ diff --git a/tests/models/multimodal/generation/vlm_utils/types.py b/tests/models/multimodal/generation/vlm_utils/types.py index 1c2bb4d62..0ec7909e7 100644 --- a/tests/models/multimodal/generation/vlm_utils/types.py +++ b/tests/models/multimodal/generation/vlm_utils/types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Types for writing multimodal model tests.""" from collections.abc import Iterable from enum import Enum diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index ea1caec0e..3734d87b7 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable diff --git a/tests/models/multimodal/pooling/test_intern_vit.py b/tests/models/multimodal/pooling/test_intern_vit.py index 76f9fbe02..3e2be34a5 100644 --- 
a/tests/models/multimodal/pooling/test_intern_vit.py +++ b/tests/models/multimodal/pooling/test_intern_vit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch import torch.nn as nn diff --git a/tests/models/multimodal/pooling/test_llava_next.py b/tests/models/multimodal/pooling/test_llava_next.py index 77508738c..b6d90d2b0 100644 --- a/tests/models/multimodal/pooling/test_llava_next.py +++ b/tests/models/multimodal/pooling/test_llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch.nn.functional as F diff --git a/tests/models/multimodal/pooling/test_phi3v.py b/tests/models/multimodal/pooling/test_phi3v.py index cd58a5cb4..b42ac6fb2 100644 --- a/tests/models/multimodal/pooling/test_phi3v.py +++ b/tests/models/multimodal/pooling/test_phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch.nn.functional as F diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2377fef82..be574435e 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial from typing import Optional, Union diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 37142b6dd..76e4acc67 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for H2OVL's multimodal 
preprocessing kwargs.""" from collections.abc import Mapping from typing import Optional diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index c35ce2f6a..d3a55993e 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for Idefics3's multimodal preprocessing kwargs.""" import pytest from transformers import Idefics3Config diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index 7ec81197a..c3e2841a8 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for InternVL's multimodal preprocessing kwargs.""" from collections.abc import Mapping from typing import Optional diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index 614f17dbb..9ef7af556 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for Llama4's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index b82bfe483..ca34d1d75 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from 
functools import partial diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index dcc8dc8da..e6344c4e7 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from functools import partial diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 9bd2b9887..9387212e3 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from PIL import Image diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py index d4794396f..a6b20a1e3 100644 --- a/tests/models/multimodal/processing/test_mllama.py +++ b/tests/models/multimodal/processing/test_mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for mllama's multimodal preprocessing and profiling.""" import pytest from transformers import MllamaConfig diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index b53351544..1f3646f79 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for phi3v's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_phi4mm.py 
b/tests/models/multimodal/processing/test_phi4mm.py index c6e272650..f16d261c2 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for phi4mm's multimodal preprocessing kwargs.""" import pytest diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 02abe1ca8..9d1cd1833 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 224d1bced..af8f98338 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for smolvlm's multimodal preprocessing kwargs.""" import pytest from transformers import SmolVLMConfig diff --git a/tests/models/quantization/test_aqlm.py b/tests/models/quantization/test_aqlm.py index 1272a6297..de6851e2f 100644 --- a/tests/models/quantization/test_aqlm.py +++ b/tests/models/quantization/test_aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from tests.quantization.utils import is_quant_method_supported diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index 597c8e48f..bd6961989 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/models/quantization/test_bitblas.py b/tests/models/quantization/test_bitblas.py index f0781394d..754ac9a29 100644 --- a/tests/models/quantization/test_bitblas.py +++ b/tests/models/quantization/test_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a bitblas model. Note: GPTQ and bitblas do not have bitwise correctness. diff --git a/tests/models/quantization/test_fp8.py b/tests/models/quantization/test_fp8.py index e01ee2026..10914abf9 100644 --- a/tests/models/quantization/test_fp8.py +++ b/tests/models/quantization/test_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests fp8 models against ground truth generation diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py index 5f17d1228..eafdfd1b0 100644 --- a/tests/models/quantization/test_gguf.py +++ b/tests/models/quantization/test_gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests gguf models against unquantized models generations Note: To pass the test, quantization higher than Q4 should be used diff --git a/tests/models/quantization/test_gptq_bitblas.py b/tests/models/quantization/test_gptq_bitblas.py index c8e96455f..c3aed7752 100644 --- a/tests/models/quantization/test_gptq_bitblas.py +++ b/tests/models/quantization/test_gptq_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a bitblas model. Note: GPTQ and bitblas do not have bitwise correctness. 
diff --git a/tests/models/quantization/test_gptq_marlin.py b/tests/models/quantization/test_gptq_marlin.py index 397bdb981..db70a3bd2 100644 --- a/tests/models/quantization/test_gptq_marlin.py +++ b/tests/models/quantization/test_gptq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compares the outputs of gptq vs gptq_marlin. Note: GPTQ and Marlin do not have bitwise correctness. diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py index 6fb24b1f4..9b86ae95b 100644 --- a/tests/models/quantization/test_gptq_marlin_24.py +++ b/tests/models/quantization/test_gptq_marlin_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of a GPTQ model to a Marlin_24 model. Note: GPTQ and Marlin_24 do not have bitwise correctness. diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 1d9aa4fa8..6ad526cc8 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Model Optimizer fp8 models against ground truth generation diff --git a/tests/models/quantization/test_mxfp4.py b/tests/models/quantization/test_mxfp4.py index 9a0608295..7b8a334bb 100644 --- a/tests/models/quantization/test_mxfp4.py +++ b/tests/models/quantization/test_mxfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Quark mxfp4 models against ground truth generation """ diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index 510858c2d..b95dad9a4 100644 --- 
a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests Model Optimizer nvfp4 models against ground truth generation Note: these tests will only pass on B200 diff --git a/tests/models/registry.py b/tests/models/registry.py index 182a9668e..ed49676a9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping, Set from dataclasses import dataclass, field diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index d403cb392..af023d903 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index b62720caa..ef0ad613d 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 3282284b6..b7527ca27 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 1a51b4aea..b7b99ce41 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test the functionality of the Transformers backend.""" from typing import Any, Optional, Union diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py index a16384efe..b52327a18 100644 --- a/tests/models/test_utils.py +++ b/tests/models/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index d64c0e6d4..310d3a371 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/models/utils.py b/tests/models/utils.py index ffc904bd1..943b4f570 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from collections.abc import Sequence diff --git a/tests/mq_llm_engine/conftest.py b/tests/mq_llm_engine/conftest.py index 1a20e2c13..375b248eb 100644 --- a/tests/mq_llm_engine/conftest.py +++ b/tests/mq_llm_engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/mq_llm_engine/test_abort.py b/tests/mq_llm_engine/test_abort.py index 808346b5e..5ff08cbb3 100644 --- a/tests/mq_llm_engine/test_abort.py +++ b/tests/mq_llm_engine/test_abort.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that aborting is handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index e617bd057..49b02279d 100644 --- 
a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that various errors are handled properly.""" import asyncio diff --git a/tests/mq_llm_engine/test_load.py b/tests/mq_llm_engine/test_load.py index 2069ff987..e9fd5b814 100644 --- a/tests/mq_llm_engine/test_load.py +++ b/tests/mq_llm_engine/test_load.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that the MQLLMEngine is able to handle 10k concurrent requests.""" import asyncio diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index 64559609a..7976d5031 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import multiprocessing diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index ce716e647..56e339d48 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Test the AsyncLLMEngine with multi-step-decoding from typing import Optional diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index a823e484b..9f1b3bbe8 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Test the LLMEngine with multi-step-decoding diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py index 17b36b368..b5048c8cc 100644 --- 
a/tests/multimodal/test_hasher.py +++ b/tests/multimodal/test_hasher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import numpy as np diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py index 56b5475c9..cfd44351a 100644 --- a/tests/multimodal/test_image.py +++ b/tests/multimodal/test_image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import numpy as np diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py index f5d3e282f..ffb3a6fe8 100644 --- a/tests/multimodal/test_inputs.py +++ b/tests/multimodal/test_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 59f7bf8fa..8b52911c6 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import nullcontext from types import MethodType diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index f1e45da30..e4debb47c 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import mimetypes diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index e67624ece..9a700808d 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import numpy.typing 
as npt import pytest diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 40fcfeeea..23346509a 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np from PIL import Image diff --git a/tests/neuron/1_core/test_activation.py b/tests/neuron/1_core/test_activation.py index ec2b1238e..2d6e5f523 100644 --- a/tests/neuron/1_core/test_activation.py +++ b/tests/neuron/1_core/test_activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py index 033a36b41..efec56360 100644 --- a/tests/neuron/1_core/test_block_table.py +++ b/tests/neuron/1_core/test_block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import neuronxcc.nki.language as nl import pytest diff --git a/tests/neuron/1_core/test_cache.py b/tests/neuron/1_core/test_cache.py index 3d869cd2f..670889ad6 100644 --- a/tests/neuron/1_core/test_cache.py +++ b/tests/neuron/1_core/test_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_layernorm.py b/tests/neuron/1_core/test_layernorm.py index e96df8db6..c6fce1d1a 100644 --- a/tests/neuron/1_core/test_layernorm.py +++ b/tests/neuron/1_core/test_layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/neuron/1_core/test_logits_processor.py b/tests/neuron/1_core/test_logits_processor.py index 6d1514088..ce9eadf5a 100644 --- 
a/tests/neuron/1_core/test_logits_processor.py +++ b/tests/neuron/1_core/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import patch diff --git a/tests/neuron/1_core/test_neuron_model_runner.py b/tests/neuron/1_core/test_neuron_model_runner.py index 92417fb64..5f3268810 100644 --- a/tests/neuron/1_core/test_neuron_model_runner.py +++ b/tests/neuron/1_core/test_neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from unittest.mock import MagicMock diff --git a/tests/neuron/1_core/test_neuron_quant.py b/tests/neuron/1_core/test_neuron_quant.py index 68f0cb805..086300269 100644 --- a/tests/neuron/1_core/test_neuron_quant.py +++ b/tests/neuron/1_core/test_neuron_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.layers.quantization.neuron_quant import ( NeuronQuantConfig) diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py index 8f7e711b5..8b9a5f6e4 100644 --- a/tests/neuron/1_core/test_prefix_prefill.py +++ b/tests/neuron/1_core/test_prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/neuron/1_core/test_rotary_embedding.py b/tests/neuron/1_core/test_rotary_embedding.py index da57631fc..a7ac79729 100644 --- a/tests/neuron/1_core/test_rotary_embedding.py +++ b/tests/neuron/1_core/test_rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Tests for miscellaneous utilities """ diff --git a/tests/neuron/2_core/test_comm_ops.py 
b/tests/neuron/2_core/test_comm_ops.py index 3cad160b2..85a48dae5 100644 --- a/tests/neuron/2_core/test_comm_ops.py +++ b/tests/neuron/2_core/test_comm_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from typing import Callable from unittest.mock import patch diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py index d71c88689..cac642af0 100644 --- a/tests/neuron/2_core/test_eagle.py +++ b/tests/neuron/2_core/test_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py index 3e651502d..d02fff943 100644 --- a/tests/neuron/2_core/test_mistral.py +++ b/tests/neuron/2_core/test_mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import LLM, SamplingParams diff --git a/tests/neuron/2_core/test_multi_lora.py b/tests/neuron/2_core/test_multi_lora.py index 6fa8f9128..6b97f47d4 100644 --- a/tests/neuron/2_core/test_multi_lora.py +++ b/tests/neuron/2_core/test_multi_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from huggingface_hub import snapshot_download diff --git a/tests/plugins/lora_resolvers/test_filesystem_resolver.py b/tests/plugins/lora_resolvers/test_filesystem_resolver.py index cb0f0c3c5..3e2c2577d 100644 --- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py +++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import shutil diff --git a/tests/plugins/vllm_add_dummy_model/setup.py b/tests/plugins/vllm_add_dummy_model/setup.py 
index e3fb6efb2..6307bb638 100644 --- a/tests/plugins/vllm_add_dummy_model/setup.py +++ b/tests/plugins/vllm_add_dummy_model/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py index 0c431cb39..b2085b01c 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import ModelRegistry diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index bc4a41cdf..aff349856 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional, Union diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index c23ab6430..da97cf7e2 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py index 
bbd11ed4a..8c34407e3 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py index 10df0b5e0..e40f62f77 100644 --- a/tests/plugins/vllm_add_dummy_platform/setup.py +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py index 0d1b062ac..1b28342eb 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py index 33425bbc1..f30a36f35 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.attention.backends.flash_attn import FlashAttentionBackend diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py 
b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py index 5cefafc7e..67cd5ed3b 100644 --- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.platforms.cuda import CudaPlatform diff --git a/tests/plugins_tests/conftest.py b/tests/plugins_tests/conftest.py index 8561f2ddf..c8c1b81ca 100644 --- a/tests/plugins_tests/conftest.py +++ b/tests/plugins_tests/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py index 207de53ab..685a8cd2c 100644 --- a/tests/plugins_tests/test_platform_plugins.py +++ b/tests/plugins_tests/test_platform_plugins.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py index 4c95a52a9..8c2121610 100644 --- a/tests/plugins_tests/test_scheduler_plugins.py +++ b/tests/plugins_tests/test_scheduler_plugins.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 4cc399175..f00a8f699 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching. 
Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 607b6c43e..a65fc934b 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching. Run `pytest tests/prefix_caching/test_prefix_caching.py`. diff --git a/tests/prompt_adapter/test_bloom.py b/tests/prompt_adapter/test_bloom.py index a31d8e873..2b603fe8f 100644 --- a/tests/prompt_adapter/test_bloom.py +++ b/tests/prompt_adapter/test_bloom.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/prompt_adapter/test_multi_adapter_inference.py b/tests/prompt_adapter/test_multi_adapter_inference.py index e249a6e64..4f273afb4 100644 --- a/tests/prompt_adapter/test_multi_adapter_inference.py +++ b/tests/prompt_adapter/test_multi_adapter_inference.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import EngineArgs, LLMEngine, SamplingParams from vllm.prompt_adapter.request import PromptAdapterRequest diff --git a/tests/prompt_adapter/test_pa_lora.py b/tests/prompt_adapter/test_pa_lora.py index fb4c3e149..ba2e15b81 100644 --- a/tests/prompt_adapter/test_pa_lora.py +++ b/tests/prompt_adapter/test_pa_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from huggingface_hub import snapshot_download diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py index 81ceecdb4..1c41d904b 100644 --- a/tests/quantization/test_auto_round.py +++ b/tests/quantization/test_auto_round.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and inference for quantized HF models supported on the AutoRound. diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index e8ddfd7fc..325a902b3 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project '''Tests whether bitsandbytes computation is enabled correctly. Run `pytest tests/quantization/test_bitsandbytes.py`. diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index c968a68f1..807b24d4e 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and weight loading for llmcompressor-quantized models. Run `pytest tests/quantization/test_compressed_tensors.py`. diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index e30166842..8b0ffc0fe 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether Marlin models can be loaded from the autogptq config. Run `pytest tests/quantization/test_configs.py --forked`. 
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py index a05eb494c..08d9573ec 100644 --- a/tests/quantization/test_cpu_offload.py +++ b/tests/quantization/test_cpu_offload.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Expanded quantized model tests for CPU offloading # Base tests: tests/basic_correctness/test_cpu_offload.py diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index b6db6d5f2..50179b9a9 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # flake8: noqa """Tests experts_int8 quantization startup and generation, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index e74e14a0d..e5ab7b3dd 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether FP8 computation is enabled correctly. Run `pytest tests/quantization/test_fp8.py --forked`. diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 22055c49a..23b999e7c 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether gptq models with dynamic quantized can be loaded. Run `pytest tests/quantization/test_gptq_dynamic.py --forked`. 
diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index 0e3913676..34b1b6c2e 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and inference for quantized HF models supported on the CPU/GPU backend using IPEX (including AWQ/GPTQ). diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index 1c6bd1852..11f78a23b 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether gptq models with quantized lm_head can be loaded. Run `pytest tests/quantization/test_quant_lm_head_true.py --forked`. diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 9bbb5e327..5f78bc305 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests whether PTPC w8a8 FP8 computation is enabled correctly. Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index ae09ac58e..3571f773f 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test model set-up and weight loading for quark-quantized models. Run `pytest tests/quantization/test_quark.py`. 
diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 0ea71aaf8..42081a8c6 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests register custom quantization config. See https://github.com/vllm-project/vllm/issues/11926 for more details. diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 6571fc9e4..c966dc9b8 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.metadata import importlib.util diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 7a339c162..20a425b72 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.layers.quantization import get_quantization_config from vllm.platforms import current_platform diff --git a/tests/reasoning/test_deepseekr1_reasoning_parser.py b/tests/reasoning/test_deepseekr1_reasoning_parser.py index 1b669c8fd..987f3c48d 100644 --- a/tests/reasoning/test_deepseekr1_reasoning_parser.py +++ b/tests/reasoning/test_deepseekr1_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/test_granite_reasoning_parser.py b/tests/reasoning/test_granite_reasoning_parser.py index 48fb8c2f8..38cab73a4 100644 --- a/tests/reasoning/test_granite_reasoning_parser.py +++ 
b/tests/reasoning/test_granite_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py index 95b7460d3..2d5557d5c 100644 --- a/tests/reasoning/test_qwen3_reasoning_parser.py +++ b/tests/reasoning/test_qwen3_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer diff --git a/tests/reasoning/utils.py b/tests/reasoning/utils.py index 0f894ed80..ddcf89796 100644 --- a/tests/reasoning/utils.py +++ b/tests/reasoning/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py index 8b96184f5..e27d9958f 100644 --- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py +++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import SamplingParams from vllm.config import LoadConfig, LoadFormat diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py index 06e506c35..ee448c2cc 100644 --- a/tests/runai_model_streamer_test/test_weight_utils.py +++ b/tests/runai_model_streamer_test/test_weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob import tempfile diff --git a/tests/samplers/test_beam_search.py 
b/tests/samplers/test_beam_search.py index 5de1137ea..bdf48c768 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the outputs of HF and vLLM when using beam search. Run `pytest tests/samplers/test_beam_search.py`. diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 2a124aa0c..7eb9c0b5f 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Make sure ignore_eos works. Run `pytest tests/samplers/test_ignore_eos.py`. diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 74f1eb4a9..901c87591 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 5cc646e76..86c8a03ee 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index f9688b4b9..42b529ae1 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Make sure bad_words works. Run `pytest tests/samplers/test_no_bad_words.py`. 
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index ebe9b3021..86fc14dc8 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 6ef61f2ff..3b93c6411 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for rejection sampling.""" import pytest diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 7b19d5750..520b88d03 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools import random diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index efa2642db..b339b4b2d 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Verify that seeded random sampling is deterministic. Run `pytest tests/samplers/test_seeded_generate.py`. 
diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py index 279e5ed10..418471b8e 100644 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ b/tests/samplers/test_typical_acceptance_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for rejection sampling.""" import pytest diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py index 1a20e2c13..375b248eb 100644 --- a/tests/spec_decode/conftest.py +++ b/tests/spec_decode/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 921081f3c..f3fe9db3f 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from itertools import cycle diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 4fd52cf7e..6c453879a 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index eee535a14..989394614 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py index 9dfc1b2fd..760861850 100644 --- a/tests/spec_decode/e2e/test_integration.py +++ b/tests/spec_decode/e2e/test_integration.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with other features, e.g. cuda graphs. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index b11297475..a18be80c5 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with tensor parallelism. """ diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py index a1b7c8b40..039eec8fd 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests which cover integration of the speculative decoding framework with tensor parallelism. 
""" diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index cb2dae541..1629c69f8 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import cycle diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index 5c60100e6..064a6e10a 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 7bf29349d..9f778ca8d 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py index 371e6834b..d4d4d519b 100644 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ b/tests/spec_decode/e2e/test_mtp_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index e187b6bc1..6d385184d 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """The tests in this file verify end-to-end speculative decoding correctness. This docstring details important information on the testing methodology. diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index eca433ffa..c10329a9b 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
Most of the tests rely on "greedy equality", where we expect the output of diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py index 3dc371722..4cf373809 100644 --- a/tests/spec_decode/e2e/test_seed.py +++ b/tests/spec_decode/e2e/test_seed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 9edd8bd4c..d20c549b0 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py index 0bff0ea1d..407786ad3 100644 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ b/tests/spec_decode/test_dynamic_spec_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock, patch diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py index 16dffe6d7..5d9dd3f72 100644 --- a/tests/spec_decode/test_memory_usage.py +++ b/tests/spec_decode/test_memory_usage.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This docstring details important information on the testing methodology. 
This test verifies that memory usage remains constant (or never grows) when diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 1a6693e16..e8de410f8 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index ca37c9a68..f2d93203b 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from unittest.mock import MagicMock diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 7de54b3ed..8a7c11485 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index f73cf4b34..55fcf0055 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index f7ef9786a..8aceaadff 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from collections import defaultdict diff --git 
a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py index 24573e224..9cfc618b9 100644 --- a/tests/spec_decode/test_utils.py +++ b/tests/spec_decode/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import MagicMock diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index d303b7f12..1733f66fe 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence as GenericSequence from itertools import count diff --git a/tests/standalone_tests/lazy_imports.py b/tests/standalone_tests/lazy_imports.py index 61e3b3879..21bcb6b82 100644 --- a/tests/standalone_tests/lazy_imports.py +++ b/tests/standalone_tests/lazy_imports.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py index ce8689f5b..cd59d579e 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from vllm.distributed import cleanup_dist_env_and_memory diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 747ec56ad..c97f5968d 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import gc import os diff --git a/tests/test_cache_block_hashing.py 
b/tests/test_cache_block_hashing.py index 05d2c624d..edc0849df 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test hashing of cache blocks. Run `pytest tests/test_cache_block_hashing.py`. diff --git a/tests/test_config.py b/tests/test_config.py index 7db95e3f6..dffea9138 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import MISSING, Field, asdict, dataclass, field from typing import Literal, Union diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py index a9b4f5cbf..b9593e2a3 100644 --- a/tests/test_embedded_commit.py +++ b/tests/test_embedded_commit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import vllm diff --git a/tests/test_inputs.py b/tests/test_inputs.py index d361808ed..e549834fa 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/test_logger.py b/tests/test_logger.py index 046f70504..8f235f147 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import json import logging diff --git a/tests/test_outputs.py b/tests/test_outputs.py index c41bd6723..4bb1c20f7 100644 --- a/tests/test_outputs.py +++ b/tests/test_outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.outputs import RequestOutput diff --git a/tests/test_regression.py 
b/tests/test_regression.py index e09294542..f5f1ed8e8 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Containing tests that check for regressions in vLLM's behavior. It should include tests that are reported by users and making sure they diff --git a/tests/test_sampling_params.py b/tests/test_sampling_params.py index 9af810c4c..39e3808d8 100644 --- a/tests/test_sampling_params.py +++ b/tests/test_sampling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the SamplingParams class. """ diff --git a/tests/test_scalartype.py b/tests/test_scalartype.py index eecfa1db3..ef4aef3af 100644 --- a/tests/test_scalartype.py +++ b/tests/test_scalartype.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/test_seed_behavior.py b/tests/test_seed_behavior.py index c45ed6926..e9138b9e8 100644 --- a/tests/test_seed_behavior.py +++ b/tests/test_seed_behavior.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import numpy as np diff --git a/tests/test_sequence.py b/tests/test_sequence.py index 902de1099..a782a3bf7 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index 77fec0968..64706defb 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import multiprocessing as mp import os diff --git a/tests/test_triton_utils.py b/tests/test_triton_utils.py index eb8ad48fd..64f72668f 100644 --- a/tests/test_triton_utils.py +++ b/tests/test_triton_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys import types diff --git a/tests/test_utils.py b/tests/test_utils.py index 42e0df1ff..a2fd845ea 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa import asyncio diff --git a/tests/test_version.py b/tests/test_version.py index 56842b6d4..fd07abb59 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import patch diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py index ccbb36bf4..88e1efd8f 100644 --- a/tests/test_vllm_port.py +++ b/tests/test_vllm_port.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from unittest.mock import patch diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py index c740fde42..e218678c4 100644 --- a/tests/tokenization/test_cached_tokenizer.py +++ b/tests/tokenization/test_cached_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle from copy import deepcopy diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 079100e78..b289dc972 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Generator from typing import Any, Optional diff --git a/tests/tokenization/test_get_eos.py b/tests/tokenization/test_get_eos.py index 8942f8891..d82884293 100644 --- a/tests/tokenization/test_get_eos.py +++ b/tests/tokenization/test_get_eos.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This test file includes some cases where it is inappropriate to only get the `eos_token_id` from the tokenizer as defined by diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py index b16d9af35..69b3c6294 100644 --- a/tests/tokenization/test_mistral_tokenizer.py +++ b/tests/tokenization/test_mistral_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from mistral_common.protocol.instruct.messages import (AssistantMessage, diff --git a/tests/tokenization/test_tokenizer.py b/tests/tokenization/test_tokenizer.py index eddc63098..09a3638fd 100644 --- a/tests/tokenization/test_tokenizer.py +++ b/tests/tokenization/test_tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py index bcfa78ed4..0570c1525 100644 --- a/tests/tokenization/test_tokenizer_group.py +++ b/tests/tokenization/test_tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest from transformers import AutoTokenizer, PreTrainedTokenizerBase diff --git a/tests/tokenization/test_tokenizer_registry.py b/tests/tokenization/test_tokenizer_registry.py index 
772eeb345..5abb10164 100644 --- a/tests/tokenization/test_tokenizer_registry.py +++ b/tests/tokenization/test_tokenizer_registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index 4bf9b45fe..510b54790 100644 --- a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import pytest_asyncio diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py index ba0ad78f6..a30c58b09 100644 --- a/tests/tool_use/test_chat_completion_request_validations.py +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 448347be6..8c01c86e2 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py index a40675744..351531393 100644 --- a/tests/tool_use/test_jamba_tool_parser.py +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Generator diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index 910e0b2d5..fff20c68d 100644 --- 
a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Optional diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index b320b335e..53ba03a0a 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from typing import Optional diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py index 291769848..3b43b723d 100644 --- a/tests/tool_use/test_tool_choice_required.py +++ b/tests/tool_use/test_tool_choice_required.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from copy import deepcopy from unittest.mock import MagicMock diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index efa6455c4..a17fab9ae 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy from typing import Any, Optional diff --git a/tests/tpu/lora/test_lora.py b/tests/tpu/lora/test_lora.py index 21d7fce69..b26bdd34d 100644 --- a/tests/tpu/lora/test_lora.py +++ b/tests/tpu/lora/test_lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import vllm diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 06e00187c..3a180c679 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import glob import os diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index acb6b90f5..9c90df1b7 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/tpu/test_moe_pallas.py b/tests/tpu/test_moe_pallas.py index 19df22f78..ab6cd3069 100644 --- a/tests/tpu/test_moe_pallas.py +++ b/tests/tpu/test_moe_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Tests for the Pallas MOE implementation. Run `pytest tests/kernels/moe/test_moe_pallas.py`. diff --git a/tests/tpu/test_quantization_accuracy.py b/tests/tpu/test_quantization_accuracy.py index 20f9dd77d..a13cf7064 100644 --- a/tests/tpu/test_quantization_accuracy.py +++ b/tests/tpu/test_quantization_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index caa233ec3..4dbae7c15 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa # type: ignore from __future__ import annotations diff --git a/tests/utils.py b/tests/utils.py index d21b18470..ade28a481 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 61aee8752..ad34becb1 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ 
b/tests/v1/core/test_kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib import pytest diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 1a7a31d98..897d181ec 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Compare the with and without prefix caching.""" from typing import Optional diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index f38454b1b..aa074f1bb 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional from unittest.mock import Mock diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 511d57d40..85415f6ad 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index 4217dc37e..c6f7481dd 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py index 48c265560..161bcd4d3 100644 --- a/tests/v1/e2e/test_cascade_attention.py +++ b/tests/v1/e2e/test_cascade_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project import pytest diff --git a/tests/v1/e2e/test_correctness_sliding_window.py b/tests/v1/e2e/test_correctness_sliding_window.py index a125d3fb7..3eedc535d 100644 --- a/tests/v1/e2e/test_correctness_sliding_window.py +++ b/tests/v1/e2e/test_correctness_sliding_window.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import pytest diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 2fad37d68..93e7c12f3 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import random diff --git a/tests/v1/engine/conftest.py b/tests/v1/engine/conftest.py index d04679c12..d7722142b 100644 --- a/tests/v1/engine/conftest.py +++ b/tests/v1/engine/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 5d52ad5f5..957d50d0d 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from contextlib import ExitStack diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 9b2f1a919..f70a3ce14 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from argparse import ArgumentError diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 
e78c7480a..3d7632a60 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import time diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 8bea032f6..a01b205df 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index e77916f95..6284dcfb9 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from typing import Optional diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index fac701c4c..a83454ee6 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import time diff --git a/tests/v1/engine/utils.py b/tests/v1/engine/utils.py index 4a23e0c1b..b58bc75fc 100644 --- a/tests/v1/engine/utils.py +++ b/tests/v1/engine/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random from dataclasses import dataclass diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py index 8c03f0433..ffe061212 100644 --- a/tests/v1/entrypoints/conftest.py +++ b/tests/v1/entrypoints/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import pytest diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 5f1fff200..a39ab47b8 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -1,5 +1,6 @@ # ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/v1/entrypoints/openai/test_chat_completion.py index c650ccd0c..dffb32846 100644 --- a/tests/v1/entrypoints/openai/test_chat_completion.py +++ b/tests/v1/entrypoints/openai/test_chat_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai # use the official client for correctness check import pytest diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py index 333ad2379..a7c31c064 100644 --- a/tests/v1/entrypoints/openai/test_completion.py +++ b/tests/v1/entrypoints/openai/test_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py index 7b4583bc3..ed4ecbe84 100644 --- a/tests/v1/entrypoints/openai/test_multi_api_servers.py +++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/v1/kv_connector/nixl_integration/test_accuracy.py b/tests/v1/kv_connector/nixl_integration/test_accuracy.py index be2d84f3b..2b2b147ce 100644 --- 
a/tests/v1/kv_connector/nixl_integration/test_accuracy.py +++ b/tests/v1/kv_connector/nixl_integration/test_accuracy.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import lm_eval diff --git a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py index 5363fbde0..95465a25f 100644 --- a/tests/v1/kv_connector/nixl_integration/test_edge_cases.py +++ b/tests/v1/kv_connector/nixl_integration/test_edge_cases.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import openai diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 13071f581..3d720fe0c 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import itertools diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index a21d92c52..ddf2836d0 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import filecmp import shutil import tempfile diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 9b2a720c1..9b257143d 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from 
vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import ( NixlConnectorMetadata) diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py index dc963251c..52dc21a2c 100644 --- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py index 86eacb693..2312e2135 100644 --- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py +++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 3c3190b32..e190e9561 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index 02475f7c1..ea54038a2 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import ray diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 3800cb392..612eca116 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ 
-1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Generator diff --git a/tests/v1/sample/test_logprobs_e2e.py b/tests/v1/sample/test_logprobs_e2e.py index f62770060..085b2ee09 100644 --- a/tests/v1/sample/test_logprobs_e2e.py +++ b/tests/v1/sample/test_logprobs_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import lm_eval diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index cbdb0b910..f35c3e194 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import pytest diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 24b759bc1..a2beb5ad7 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index 0512a1e02..ac0f3eb58 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index 220f05c7f..63fdeb5a6 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import pytest import torch from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py index 932b652ae..8c111f846 100644 --- a/tests/v1/sample/utils.py +++ b/tests/v1/sample/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import Enum from typing import Optional diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index ed368fe82..682d84dc2 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 9fedbe4f9..523b7ee23 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle an Error in model forward and shutdown.""" import asyncio diff --git a/tests/v1/shutdown/test_processor_error.py b/tests/v1/shutdown/test_processor_error.py index 0fe48da47..a077d48fe 100644 --- a/tests/v1/shutdown/test_processor_error.py +++ b/tests/v1/shutdown/test_processor_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test error handling in Processor. 
Should not impact other reqs.""" import asyncio diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 1bba19102..88fc5297a 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test that we handle a startup Error and shutdown.""" import pytest diff --git a/tests/v1/shutdown/utils.py b/tests/v1/shutdown/utils.py index 8f7c0380d..124254a41 100644 --- a/tests/v1/shutdown/utils.py +++ b/tests/v1/shutdown/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Shutdown test utils""" SHUTDOWN_TEST_TIMEOUT_SEC = 120 diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index b49ac45f3..eff8eff43 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest import mock diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index f577fb4ab..9070d2b10 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Test whether spec decoding handles the max model length properly.""" import pytest diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 50548219f..ffea86d0d 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np diff --git a/tests/v1/structured_output/test_utils.py 
b/tests/v1/structured_output/test_utils.py index ffc0bceee..4e7c4b33e 100644 --- a/tests/v1/structured_output/test_utils.py +++ b/tests/v1/structured_output/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 366fa3b25..53242180b 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py index 68539c80b..c05de5e4c 100644 --- a/tests/v1/test_metrics_reader.py +++ b/tests/v1/test_metrics_reader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import prometheus_client import pytest diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 1b77417a1..e5eadfd4e 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import pytest diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index ee490071f..0ab4e0bf5 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import UserDict from dataclasses import dataclass from typing import Optional diff --git a/tests/v1/test_utils.py b/tests/v1/test_utils.py index b68f08385..a3df882a9 100644 --- a/tests/v1/test_utils.py +++ b/tests/v1/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git 
a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py index 1c0210b6a..7117a66c2 100644 --- a/tests/v1/tpu/test_basic.py +++ b/tests/v1/tpu/test_basic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A basic correctness check for TPUs Run `pytest tests/v1/tpu/test_basic.py`. diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 01664598c..55fee4ee1 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Test: diff --git a/tests/v1/tpu/test_multimodal.py b/tests/v1/tpu/test_multimodal.py index 8c87fc836..a61773a4f 100644 --- a/tests/v1/tpu/test_multimodal.py +++ b/tests/v1/tpu/test_multimodal.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import openai import pytest diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py index 8faa5270b..3a9d80847 100644 --- a/tests/v1/tpu/test_pallas.py +++ b/tests/v1/tpu/test_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from unittest.mock import ANY, patch import torch diff --git a/tests/v1/tpu/test_perf.py b/tests/v1/tpu/test_perf.py index 811833f73..f4a2d5ac8 100644 --- a/tests/v1/tpu/test_perf.py +++ b/tests/v1/tpu/test_perf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A basic performance regression test for TPUs Run `pytest tests/v1/tpu/test_perf.py`. 
diff --git a/tests/v1/tpu/test_sampler.py b/tests/v1/tpu/test_sampler.py index 2bbeb3dda..198bb1e16 100644 --- a/tests/v1/tpu/test_sampler.py +++ b/tests/v1/tpu/test_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random import pytest diff --git a/tests/v1/tpu/test_topk_topp_sampler.py b/tests/v1/tpu/test_topk_topp_sampler.py index ff9217f8f..ca5c067b3 100644 --- a/tests/v1/tpu/test_topk_topp_sampler.py +++ b/tests/v1/tpu/test_topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import pytest diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index 348f12887..230c97e78 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import unittest.mock as mock import pytest diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 27741bd15..e932e4b32 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from typing import Optional diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 6ba6d1f6f..ceb9d4df2 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import random diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py index c03943149..83be8bdce 100644 --- 
a/tests/vllm_test_utils/setup.py +++ b/tests/vllm_test_utils/setup.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from setuptools import setup diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py index 1d1219fbe..2818428de 100644 --- a/tests/vllm_test_utils/vllm_test_utils/__init__.py +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ vllm_utils is a package for vLLM testing utilities. It does not import any vLLM modules. diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index 3b25980cb..49fd083ef 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import dataclasses diff --git a/tests/vllm_test_utils/vllm_test_utils/monitor.py b/tests/vllm_test_utils/vllm_test_utils/monitor.py index 27077f13d..9454221b2 100644 --- a/tests/vllm_test_utils/vllm_test_utils/monitor.py +++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import dataclasses diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 9f99b3725..3aabae099 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os diff --git a/tests/worker/conftest.py b/tests/worker/conftest.py index 372d71a78..3f202d4db 100644 
--- a/tests/worker/conftest.py +++ b/tests/worker/conftest.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 3e237aacc..35ac90b38 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index a41fc5217..a5e61128d 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index ae4b53652..0be25aa2f 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest import torch diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 22466105b..d8767f700 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 3ab807099..6d9f404ac 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/tools/check_spdx_header.py b/tools/check_spdx_header.py index 709befc53..92914186b 
100644 --- a/tools/check_spdx_header.py +++ b/tools/check_spdx_header.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys -SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0" +SPDX_HEADER = ( + "# SPDX-License-Identifier: Apache-2.0\n" + "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project") SPDX_HEADER_PREFIX = "# SPDX-License-Identifier:" diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py index 18c9726a1..77b2dfc39 100644 --- a/tools/check_triton_import.py +++ b/tools/check_triton_import.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import subprocess import sys diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py index 6c201dd25..63ceee582 100644 --- a/tools/enforce_regex_import.py +++ b/tools/enforce_regex_import.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import subprocess diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index 9601b578e..209c3a576 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 8ec3dfc97..038d3c44f 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import copy diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 
011af2522..7368ae953 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2018 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be diff --git a/use_existing_torch.py b/use_existing_torch.py index 7d352c6ca..a9f79e169 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import glob diff --git a/vllm/__init__.py b/vllm/__init__.py index 52022fb8f..6232b657e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" # The version.py should be independent library, and we always import the # version library first. Such assumption is critical for some customization. 
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 3c8e6b95c..008a7aa94 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import importlib diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index a9a624b85..ae63e0603 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/adapter_commons/layers.py b/vllm/adapter_commons/layers.py index 9cc2b181f..9753a0880 100644 --- a/vllm/adapter_commons/layers.py +++ b/vllm/adapter_commons/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/vllm/adapter_commons/models.py b/vllm/adapter_commons/models.py index a84fbea2e..7b685880a 100644 --- a/vllm/adapter_commons/models.py +++ b/vllm/adapter_commons/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Callable, Optional, TypeVar diff --git a/vllm/adapter_commons/request.py b/vllm/adapter_commons/request.py index 2b604b91b..8135b54ba 100644 --- a/vllm/adapter_commons/request.py +++ b/vllm/adapter_commons/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod diff --git a/vllm/adapter_commons/utils.py b/vllm/adapter_commons/utils.py index 46e9629e1..a1a56b6bb 100644 --- a/vllm/adapter_commons/utils.py +++ b/vllm/adapter_commons/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import 
Any, Callable, Optional diff --git a/vllm/adapter_commons/worker_manager.py b/vllm/adapter_commons/worker_manager.py index 3c1d26404..07e85d138 100644 --- a/vllm/adapter_commons/worker_manager.py +++ b/vllm/adapter_commons/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Optional diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index a21eb7f59..1c1623084 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from pathlib import Path diff --git a/vllm/assets/base.py b/vllm/assets/base.py index 03f3b9dab..31cde431b 100644 --- a/vllm/assets/base.py +++ b/vllm/assets/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import lru_cache from pathlib import Path diff --git a/vllm/assets/image.py b/vllm/assets/image.py index d8cca9b74..c977242a3 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Literal diff --git a/vllm/assets/video.py b/vllm/assets/video.py index bf06746a9..01834aeeb 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from functools import lru_cache diff --git a/vllm/attention/__init__.py b/vllm/attention/__init__.py index 85c5715fa..344040586 100644 --- a/vllm/attention/__init__.py +++ b/vllm/attention/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index f3d6ffaeb..deb3951d6 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from contextlib import contextmanager diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index ea4f84072..a2fd557f8 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index 4567893a9..39e667bca 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index eceab1f1a..3548df88d 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with Dual chunk flash attention and sparse attention. 
""" import math diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 7f8f720ee..26be2c04f 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 37b20d073..7ae7ea37f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import os diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 0e62748dd..9a6b8a40e 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from dataclasses import dataclass diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index d701c59a2..5128e4975 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index f322c7b3d..30441b3ad 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 1007140ef..50842abd3 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ # MLA Common Components diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 19642a939..a6823ac05 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index f1def25c8..820ddcab7 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index c974f2a15..855036071 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project from contextlib import contextmanager from dataclasses import dataclass diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 7134472da..755e0da06 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer ROCm GPUs.""" import itertools from dataclasses import dataclass diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index c1bd638f2..760634004 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Attention layer with torch scaled_dot_product_attention and PagedAttention.""" from dataclasses import dataclass diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index 6945c2c6e..d9fff8fac 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Dict, List, Optional, Type diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index a281c9771..e3f02a193 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention backend utils""" from collections import defaultdict from contextlib import contextmanager diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index a9d4a70b5..8355e0397 100644 --- a/vllm/attention/backends/xformers.py +++ 
b/vllm/attention/backends/xformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 9e4fbe0b4..6c5b05a5c 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" from typing import Any, Dict, List, Optional diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py index bc87ce33a..05fa9d11f 100644 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index 6ab69ea5b..c6f6cc297 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py index e64fc1139..445720c70 100644 --- a/vllm/attention/ops/blocksparse_attention/utils.py +++ b/vllm/attention/ops/blocksparse_attention/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Helper functions for 3D sparse pattern # These 
function are not optimized and very inefficient. diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 6ca2a6414..4f839348e 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Authors: # - Burkhard Ringlein diff --git a/vllm/attention/ops/flashmla.py b/vllm/attention/ops/flashmla.py index 18b69a6b3..b85f27ac4 100644 --- a/vllm/attention/ops/flashmla.py +++ b/vllm/attention/ops/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/flash_mla/flash_mla_interface.py from typing import Optional, Tuple diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index a97c36338..412dd20ec 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 1702203b1..b7e4ba4d7 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Dict, List, Optional, Tuple diff --git a/vllm/attention/ops/merge_attn_states.py b/vllm/attention/ops/merge_attn_states.py index f9fcfe6a6..5cb1a4739 100644 --- a/vllm/attention/ops/merge_attn_states.py +++ b/vllm/attention/ops/merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py index 8c9145bb9..e28ff7e8b 100644 --- a/vllm/attention/ops/nki_flash_attn.py +++ b/vllm/attention/ops/nki_flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import neuronxcc.nki.isa as nisa import neuronxcc.nki.language as nl diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 827c3041a..c6d1501e2 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import List, Optional, Tuple diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 729b61b02..13bef9672 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # The kernels in this file are adapted from LightLLM's context_attention_fwd: # 
https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py index 421891ab6..cce6b4639 100644 --- a/vllm/attention/ops/rocm_aiter_mla.py +++ b/vllm/attention/ops/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/attention/ops/rocm_aiter_paged_attn.py b/vllm/attention/ops/rocm_aiter_paged_attn.py index 0f3cf1842..ad97152e2 100644 --- a/vllm/attention/ops/rocm_aiter_paged_attn.py +++ b/vllm/attention/ops/rocm_aiter_paged_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import aiter as rocm_aiter diff --git a/vllm/attention/ops/triton_decode_attention.py b/vllm/attention/ops/triton_decode_attention.py index fb983907e..c27b377ae 100644 --- a/vllm/attention/ops/triton_decode_attention.py +++ b/vllm/attention/ops/triton_decode_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/sgl-project/sglang/blob/9f635ea50de920aa507f486daafba26a5b837574/python/sglang/srt/layers/attention/triton_ops/decode_attention.py diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index 62cfb813d..a26e713b1 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Fused Attention =============== diff --git a/vllm/attention/ops/triton_merge_attn_states.py b/vllm/attention/ops/triton_merge_attn_states.py index 30e61b6d8..56d78ed5e 100644 --- 
a/vllm/attention/ops/triton_merge_attn_states.py +++ b/vllm/attention/ops/triton_merge_attn_states.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 87cf333f7..92c09e6dd 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Authors: # - Burkhard Ringlein diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index ebbdea27f..cb577fa67 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from contextlib import contextmanager diff --git a/vllm/attention/utils/fa_utils.py b/vllm/attention/utils/fa_utils.py index ca88549f3..69cde06fd 100644 --- a/vllm/attention/utils/fa_utils.py +++ b/vllm/attention/utils/fa_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional from vllm import envs diff --git a/vllm/beam_search.py b/vllm/beam_search.py index ddacc6695..f3bc42183 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 21fe3eb62..0ef3e0254 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project """ This module defines a framework for sampling benchmark requests from various datasets. Each dataset subclass of BenchmarkDataset must implement sample diff --git a/vllm/benchmarks/endpoint_request_func.py b/vllm/benchmarks/endpoint_request_func.py index a28630d50..aba60edc5 100644 --- a/vllm/benchmarks/endpoint_request_func.py +++ b/vllm/benchmarks/endpoint_request_func.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """The request function for API endpoints.""" import io diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index dc1c42879..5c6124db8 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark the latency of processing a single batch of requests.""" import argparse diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 858a0c6a0..019ebcf8d 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project r"""Benchmark online serving throughput. 
On the server side, run one of the following commands diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 3ea6c194b..be9ea39f0 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Benchmark offline inference throughput.""" import argparse import dataclasses diff --git a/vllm/benchmarks/utils.py b/vllm/benchmarks/utils.py index 45a0ddbd5..f0bb99326 100644 --- a/vllm/benchmarks/utils.py +++ b/vllm/benchmarks/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import json diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 86eb465b8..64172a9bf 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py @@ -6,7 +7,6 @@ import datetime import locale import os -import re import subprocess import sys # Unlike the rest of the PyTorch this file must be python2 compliant. 
@@ -14,6 +14,8 @@ import sys # Run it with `python collect_env.py` or `python -m torch.utils.collect_env` from collections import namedtuple +import regex as re + from vllm.envs import environment_variables try: @@ -815,4 +817,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/vllm/compilation/activation_quant_fusion.py b/vllm/compilation/activation_quant_fusion.py index dc3e1482e..ce4e50a2b 100644 --- a/vllm/compilation/activation_quant_fusion.py +++ b/vllm/compilation/activation_quant_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from torch._higher_order_ops.auto_functionalize import auto_functionalized diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index c4bfffe92..5af3b7efe 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import dataclasses diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py index 84d1e1f77..4d7aeeb4d 100644 --- a/vllm/compilation/base_piecewise_backend.py +++ b/vllm/compilation/base_piecewise_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Protocol diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index f651ee691..f754fc238 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py index 9293610cc..36c810ec2 
100644 --- a/vllm/compilation/compiler_interface.py +++ b/vllm/compilation/compiler_interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import copy import hashlib diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 2200671b8..c584c103f 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import dataclasses diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py index 0ad480e28..8bf957368 100644 --- a/vllm/compilation/cuda_piecewise_backend.py +++ b/vllm/compilation/cuda_piecewise_backend.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from contextlib import ExitStack diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index f02994c55..05e4ca9f0 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from typing import Callable, Optional, TypeVar, Union, overload diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 70f3b8b6d..286221d32 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import operator from collections.abc import Iterable diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 618b2fe94..7e2c5b4fe 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, NamedTuple, Optional diff --git a/vllm/compilation/fx_utils.py b/vllm/compilation/fx_utils.py index b9eeb0c8d..9ef388932 100644 --- a/vllm/compilation/fx_utils.py +++ b/vllm/compilation/fx_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import operator from collections.abc import Iterable diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index a9359fe1e..810d0801e 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import inspect diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py index 786c7c1e1..1e059b59f 100644 --- a/vllm/compilation/monitor.py +++ b/vllm/compilation/monitor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/vllm/compilation/multi_output_match.py b/vllm/compilation/multi_output_match.py index cef19f925..6d1893777 100644 --- a/vllm/compilation/multi_output_match.py +++ b/vllm/compilation/multi_output_match.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import abc import operator diff --git a/vllm/compilation/noop_elimination.py b/vllm/compilation/noop_elimination.py index 13e4cd73f..46f70dcdc 100644 --- a/vllm/compilation/noop_elimination.py +++ b/vllm/compilation/noop_elimination.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Union diff --git a/vllm/compilation/pass_manager.py 
b/vllm/compilation/pass_manager.py index 07ebd3e1b..621c89a14 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from torch import fx as fx diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py index 17dded87f..d41093903 100644 --- a/vllm/compilation/sequence_parallelism.py +++ b/vllm/compilation/sequence_parallelism.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/compilation/torch25_custom_graph_pass.py b/vllm/compilation/torch25_custom_graph_pass.py index 4b881d0b6..cd3970657 100644 --- a/vllm/compilation/torch25_custom_graph_pass.py +++ b/vllm/compilation/torch25_custom_graph_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any, Optional diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py index 0fe73b72b..3ccbf52d9 100644 --- a/vllm/compilation/vllm_inductor_pass.py +++ b/vllm/compilation/vllm_inductor_pass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 1a8211f0a..8c8d0b5cb 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/vllm/config.py b/vllm/config.py index 8aa1b5610..d99e501ca 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project import ast import copy diff --git a/vllm/connections.py b/vllm/connections.py index 84e32a4d5..103505eb3 100644 --- a/vllm/connections.py +++ b/vllm/connections.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping, MutableMapping from pathlib import Path diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index d4d31c58d..444bb25f2 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from typing import List, Optional diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 1966eac1c..a337007a9 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque from dataclasses import dataclass diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index d64142e77..ea490c327 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Dict, FrozenSet, List, Optional, Tuple diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 301656996..1a05881f7 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple diff --git a/vllm/core/block/naive_block.py 
b/vllm/core/block/naive_block.py index c388366b8..dae6ead04 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import deque from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1ca9e49da..2913a01bf 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Token blocks.""" import sys from bisect import bisect_left diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 910afdd9f..e933c6ee7 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Block manager utils.""" from vllm.sequence import SequenceGroup from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index c6bf6d163..a33399204 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A block manager that manages token blocks.""" from typing import Dict, List, Optional from typing import Sequence as GenericSequence diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0e363eddc..7ec4768e9 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import heapq diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 
4c1182deb..ba290eeda 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from abc import ABC, abstractmethod diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index 0f5d8ca6d..71b22942a 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Tuple diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 06d4ed470..44be855b1 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py index 6fcbca628..942e866ed 100644 --- a/vllm/device_allocator/cumem.py +++ b/vllm/device_allocator/cumem.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # cumem-based pytorch pluggable allocator to implement sleep mode. 
# other approaches tried but failed: diff --git a/vllm/distributed/__init__.py b/vllm/distributed/__init__.py index 39955ddac..e911b2a1a 100644 --- a/vllm/distributed/__init__.py +++ b/vllm/distributed/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .communication_op import * from .parallel_state import * diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index d85a41dda..0a5a95176 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional, Union diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 7177754a3..ae7590299 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util from typing import TYPE_CHECKING diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 52b970949..38370d4dc 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import threading from typing import Optional from weakref import WeakValueDictionary diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index c04218cb9..94effa0b2 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ 
b/vllm/distributed/device_communicators/cpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index a05a13f51..0eebdf873 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py index 6c15ef644..2c38e8ed2 100644 --- a/vllm/distributed/device_communicators/cuda_wrapper.py +++ b/vllm/distributed/device_communicators/cuda_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is a pure Python wrapper for the cudart library. It avoids the need to compile a separate shared library, and is convenient for use when we just need to call a few functions. 
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 5c2dbcc27..7dd104a4f 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Optional, Union diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 11b8b57fe..7c6001e87 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ctypes import json diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py index 9536a7f88..f00f6b62b 100644 --- a/vllm/distributed/device_communicators/hpu_communicator.py +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.distributed as dist diff --git a/vllm/distributed/device_communicators/neuron_communicator.py b/vllm/distributed/device_communicators/neuron_communicator.py index dfa4b5194..5b61a1687 100644 --- a/vllm/distributed/device_communicators/neuron_communicator.py +++ b/vllm/distributed/device_communicators/neuron_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from vllm.distributed.device_communicators.base_device_communicator import ( diff --git 
a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 0ccd42312..294862929 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 6f69089b6..04a4d0147 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # This file is a pure Python wrapper for the NCCL library. # The main purpose is to use NCCL combined with CUDA graph. diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 40e57e662..0f66f0aeb 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle import time diff --git a/vllm/distributed/device_communicators/tpu_communicator.py b/vllm/distributed/device_communicators/tpu_communicator.py index a17752796..c60a7a7eb 100644 --- a/vllm/distributed/device_communicators/tpu_communicator.py +++ b/vllm/distributed/device_communicators/tpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index 256e7965e..216ff85c8 100644 --- 
a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 29c6a70c4..9bf1c058a 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import queue import threading diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py index 8b6abf5a8..fa9b7e4f1 100644 --- a/vllm/distributed/kv_transfer/__init__.py +++ b/vllm/distributed/kv_transfer/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_transfer_state import ( KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group, diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py index e9b70610e..181c33925 100644 --- a/vllm/distributed/kv_transfer/kv_connector/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ KVConnectorBase Class for Distributed KV Cache & Hidden State communication diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index dce0b545c..58dfa251c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib from typing import 
TYPE_CHECKING, Callable diff --git a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py index d121cb701..78bf30956 100644 --- a/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/lmcache_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LMCache KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py index 58eabd0a3..94a7ce91a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ MooncakeStore Connector for Distributed Machine Learning Inference The MooncakeStoreConnector transfers KV caches between prefill vLLM workers diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index ed8fe3816..e7c079e1f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Simple KV Cache Connector for Distributed Machine Learning Inference diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index b1c9c9af6..c62444e75 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project """ KV cache helper for store. """ diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py index e66aaa7f8..f00f31dde 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorRole) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index bc9258e9d..8f9d70eec 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ KVConnectorBase_V1 Class for Distributed KV Cache & Hidden State communication in vLLM v1 diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 2cb68dc1f..cc1f4ba35 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING import torch diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 0aabb260f..5aab10b2b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project import copy from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 4d228dbc9..3f0b0e295 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import math import threading diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0421a65a2..f86b92692 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import os from dataclasses import dataclass diff --git a/vllm/distributed/kv_transfer/kv_connector_agent.py b/vllm/distributed/kv_transfer/kv_connector_agent.py index 819c06805..8633fdaf5 100644 --- a/vllm/distributed/kv_transfer/kv_connector_agent.py +++ b/vllm/distributed/kv_transfer/kv_connector_agent.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A centralized entrypoint to perform distributed KV cache transfer. 
This implementation is a shim wrapper on two APIs exposed by `kv_connector`: diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index d1ffb8092..eef14269f 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains a new class `KVLookupBufferBase` that allows developers to think of KV cache operations as inserting new KV cache entries (`insert`) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py index 5bb711021..4381aad1e 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/mooncake_store.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains a new class `MooncakeStore` that allows developers to think of KV cache transfer operations as putting new KV cache entries diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index e3b2274bd..a0ff7c320 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Implements a distributed key-value (KV) cache transfer mechanism. 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 40589fb3e..1423fd032 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file defines an interface `KVPipeBase` that provides an abstraction for sending and receiving tensors, or None, via diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index aa4b1ba71..9f3494b81 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py index 761c56f7e..09de0b682 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This module implements a PyNccl pipe for sending and receiving Optional[torch.Tensor] between distributed ranks with advanced diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 25d2f2cf5..60f1d5d8b 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional from vllm import envs diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 32c9301bf..10f87c49b 
100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Adapted from diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 96d08dc1a..67f71643d 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Adapted from diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e3b8a18cc..587a23134 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable import argparse diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 19b219b67..6d8d97cf5 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/vllm/engine/async_timeout.py b/vllm/engine/async_timeout.py index 94674262b..28a023a71 100644 --- a/vllm/engine/async_timeout.py +++ b/vllm/engine/async_timeout.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Workaround for https://github.com/python/cpython/issues/86296 # diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a9600a2c8..dbcf78f02 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import time diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 
916afe0c8..8d51f0472 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import TYPE_CHECKING diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index acc83011d..9375dc4c4 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ These types are defined in this file to avoid importing vllm.engine.metrics and therefore importing prometheus_client. diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index af72c8e6b..bf9f66903 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import uuid from dataclasses import dataclass, field diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 18b7c187b..f2f442485 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import copy diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 434cb4985..ef088bd39 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle import signal diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py index 4c8e295c1..19c5963d3 100644 --- a/vllm/engine/output_processor/interfaces.py +++ 
b/vllm/engine/output_processor/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Callable, List diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 110f84a65..e0fa6a00e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools from typing import Callable, List, cast diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index e88f119c8..dbf6a371d 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index 6cad9ec8f..7925d91f6 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, List, Optional, Tuple diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py index 0d2b58c10..1e127eb98 100644 --- a/vllm/engine/output_processor/util.py +++ b/vllm/engine/output_processor/util.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List from typing import Sequence as GenericSequence diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 28341c2c6..727d59283 100644 --- a/vllm/engine/protocol.py +++ 
b/vllm/engine/protocol.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from abc import ABC, abstractmethod diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1c0271811..56f8754c2 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. It is not intended for production use. diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index b051cd333..95c806c22 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/entrypoints/cli/benchmark/base.py b/vllm/entrypoints/cli/benchmark/base.py index 94fb415f5..30a884410 100644 --- a/vllm/entrypoints/cli/benchmark/base.py +++ b/vllm/entrypoints/cli/benchmark/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.entrypoints.cli.types import CLISubcommand diff --git a/vllm/entrypoints/cli/benchmark/latency.py b/vllm/entrypoints/cli/benchmark/latency.py index 5aca16e0b..e0358a262 100644 --- a/vllm/entrypoints/cli/benchmark/latency.py +++ b/vllm/entrypoints/cli/benchmark/latency.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.benchmarks.latency import add_cli_args, main diff --git a/vllm/entrypoints/cli/benchmark/main.py b/vllm/entrypoints/cli/benchmark/main.py index 9e857af7d..717da630a 100644 --- 
a/vllm/entrypoints/cli/benchmark/main.py +++ b/vllm/entrypoints/cli/benchmark/main.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import vllm.entrypoints.cli.benchmark.latency diff --git a/vllm/entrypoints/cli/benchmark/serve.py b/vllm/entrypoints/cli/benchmark/serve.py index d5a858920..304370157 100644 --- a/vllm/entrypoints/cli/benchmark/serve.py +++ b/vllm/entrypoints/cli/benchmark/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.benchmarks.serve import add_cli_args, main diff --git a/vllm/entrypoints/cli/benchmark/throughput.py b/vllm/entrypoints/cli/benchmark/throughput.py index 88ee6aa03..20431cd3d 100644 --- a/vllm/entrypoints/cli/benchmark/throughput.py +++ b/vllm/entrypoints/cli/benchmark/throughput.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse from vllm.benchmarks.throughput import add_cli_args, main diff --git a/vllm/entrypoints/cli/collect_env.py b/vllm/entrypoints/cli/collect_env.py index 810ecfdf7..141aafdb1 100644 --- a/vllm/entrypoints/cli/collect_env.py +++ b/vllm/entrypoints/cli/collect_env.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py index 5eba72fec..3e834b3b2 100644 --- a/vllm/entrypoints/cli/main.py +++ b/vllm/entrypoints/cli/main.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # The CLI entrypoint to vLLM. 
import signal diff --git a/vllm/entrypoints/cli/openai.py b/vllm/entrypoints/cli/openai.py index 215fcf3c3..58dcdfe21 100644 --- a/vllm/entrypoints/cli/openai.py +++ b/vllm/entrypoints/cli/openai.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Commands that act as an interactive OpenAI API client import argparse diff --git a/vllm/entrypoints/cli/run_batch.py b/vllm/entrypoints/cli/run_batch.py index f74c8da9b..353034f88 100644 --- a/vllm/entrypoints/cli/run_batch.py +++ b/vllm/entrypoints/cli/run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import asyncio diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 040ae166a..f9c56e655 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os diff --git a/vllm/entrypoints/cli/types.py b/vllm/entrypoints/cli/types.py index f739a68c5..0a7244312 100644 --- a/vllm/entrypoints/cli/types.py +++ b/vllm/entrypoints/cli/types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse diff --git a/vllm/entrypoints/launcher.py b/vllm/entrypoints/launcher.py index a4f70a51e..9f4dc19fb 100644 --- a/vllm/entrypoints/launcher.py +++ b/vllm/entrypoints/launcher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import signal diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e05189ef4..fd28bf39e 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the 
vLLM project import itertools import warnings diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index d4655dd5e..f3aee188d 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5a4295ff7..2f8819bca 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import atexit diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index f196ff6ed..ca70e78df 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains the command line arguments for the vLLM's OpenAI-compatible server. 
It is kept in a separate file for documentation diff --git a/vllm/entrypoints/openai/logits_processors.py b/vllm/entrypoints/openai/logits_processors.py index 04d5091a9..29d72256c 100644 --- a/vllm/entrypoints/openai/logits_processors.py +++ b/vllm/entrypoints/openai/logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from functools import lru_cache, partial diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e72c23993..ecfcc0068 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index ac250b3cb..9994b3cae 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import tempfile diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ea8e187dc..7e514d660 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 90cdd389d..3ac4f01ea 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from http import HTTPStatus from typing import Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1c06070cb..ce5eca855 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 3785d2642..e87decfe6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from typing import Final, Literal, Optional, Union, cast diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f96a4ac8b..ac3883bde 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 import io import json diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py index 74433a1a3..764b0e736 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import pathlib diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 7c401d4f5..b896cc46b 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -1,4 +1,5 @@ 
# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import base64 diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 9bdacb551..f58611c49 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time from collections.abc import AsyncGenerator, Mapping diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 0d739bbf9..3db0a71fa 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Final, Optional, Union diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 9fc5b562e..f667c7e9b 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import io import time diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 054c0b006..3e4f4e149 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .abstract_tool_parser import ToolParser, ToolParserManager from .deepseekv3_tool_parser import DeepSeekV3ToolParser diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 931d5aab9..02aeab613 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index 14e743e13..60025af2a 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Union diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 383e0d44d..5508ba6a3 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index b8bf14253..fcc5b7edd 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py 
b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index 2b9f9852b..c7030d34d 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 3f2799f80..e5dcdf9a0 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 2714a545f..66b483d8b 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index 323fb1441..6bf44a434 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 
4eda7044c..5698bc70a 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index fecad7e65..ef5b14f3c 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 00690ad79..5501028cf 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from collections.abc import Sequence diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index bc5d15dcb..73329cdf7 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import ast import json diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py index acbff3258..aa41cd6dc 100644 --- a/vllm/entrypoints/openai/tool_parsers/utils.py +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from json import JSONDecodeError, JSONDecoder diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 80b6c07c6..c4e044f3a 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Union from torch.nn import CosineSimilarity diff --git a/vllm/entrypoints/ssl.py b/vllm/entrypoints/ssl.py index dba916b8b..e3646a60a 100644 --- a/vllm/entrypoints/ssl.py +++ b/vllm/entrypoints/ssl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from ssl import SSLContext diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index 1b0ea6909..6fb32ff18 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import functools diff --git a/vllm/env_override.py b/vllm/env_override.py index 71f031d1e..b0a061d2c 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import torch diff --git a/vllm/envs.py b/vllm/envs.py index 3dd0d9045..2e3d6eeb5 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import hashlib import os diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 40ca1d299..99e12201c 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import time diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index d1f8c36fb..4e8c6d790 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/vllm/executor/msgspec_utils.py b/vllm/executor/msgspec_utils.py index e680d53cb..852c8f5cf 100644 --- a/vllm/executor/msgspec_utils.py +++ b/vllm/executor/msgspec_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from typing import Any, Type diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index 380b672c3..a6c172bef 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import os diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index 8e67c7a41..bdc2b1f4c 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import json diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7bc98a16f..c222f1609 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import time diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 
1d3a6e443..7ebeb4a22 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Any, Callable, Dict, List, Optional, Tuple, Union diff --git a/vllm/forward_context.py b/vllm/forward_context.py index f192be1c4..f3b0518a4 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections import defaultdict diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index df4f844cd..37bf2b7a4 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .data import (DecoderOnlyInputs, EmbedsInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 843c45bd6..23cb5e502 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Generic, Literal, Optional, Union, cast diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 4c64a41ac..8c3700799 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Literal, Optional, TypedDict, Union, cast, overload diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b9acabeab..a13e563f3 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import Mapping diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f424a8f61..73d19aecd 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union diff --git a/vllm/jsontree.py b/vllm/jsontree.py index 91cd7cb21..4cbe0f76e 100644 --- a/vllm/jsontree.py +++ b/vllm/jsontree.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Helper functions to work with nested JSON structures.""" from collections.abc import Iterable from functools import reduce diff --git a/vllm/logger.py b/vllm/logger.py index fd16dd95b..0ddb83cb8 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Logging configuration for vLLM.""" import datetime import json diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py index 7ab463258..cf690a89a 100644 --- a/vllm/logging_utils/__init__.py +++ b/vllm/logging_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logging_utils.formatter import NewLineFormatter diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py index 47ce0ab18..d14515f56 100644 --- a/vllm/logging_utils/dump_input.py +++ b/vllm/logging_utils/dump_input.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import enum diff --git 
a/vllm/logging_utils/formatter.py b/vllm/logging_utils/formatter.py index 010b0a124..0affef100 100644 --- a/vllm/logging_utils/formatter.py +++ b/vllm/logging_utils/formatter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging diff --git a/vllm/logits_process.py b/vllm/logits_process.py index 29a73656b..5967d0836 100644 --- a/vllm/logits_process.py +++ b/vllm/logits_process.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Union diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index b6b138a44..7fc4cfe02 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # pylint: disable=unused-argument from typing import TYPE_CHECKING, Optional, Union, cast diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 023c8e9c9..66e037a97 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # pylint: disable=unused-argument import math diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 294b49e0a..958364fca 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence as GenericSequence from typing import Optional diff --git a/vllm/lora/models.py b/vllm/lora/models.py index dfdc908d7..262e67995 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math import os diff --git 
a/vllm/lora/ops/torch_ops/__init__.py b/vllm/lora/ops/torch_ops/__init__.py index 85601d58c..22aa3c63d 100644 --- a/vllm/lora/ops/torch_ops/__init__.py +++ b/vllm/lora/ops/torch_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.torch_ops.lora_ops import bgmv_expand # noqa: F401 from vllm.lora.ops.torch_ops.lora_ops import (bgmv_expand_slice, bgmv_shrink, diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index ab65faceb..cba5baad8 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/lora/ops/triton_ops/__init__.py b/vllm/lora/ops/triton_ops/__init__.py index 5a39705e8..805de4b6f 100644 --- a/vllm/lora/ops/triton_ops/__init__.py +++ b/vllm/lora/ops/triton_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.triton_ops.lora_expand_op import lora_expand from vllm.lora.ops.triton_ops.lora_kernel_metadata import LoRAKernelMeta diff --git a/vllm/lora/ops/triton_ops/kernel_utils.py b/vllm/lora/ops/triton_ops/kernel_utils.py index 0f971c035..e93064d0c 100644 --- a/vllm/lora/ops/triton_ops/kernel_utils.py +++ b/vllm/lora/ops/triton_ops/kernel_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Utilities for Punica kernel construction. 
""" diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 9feb9e462..9e1f90e75 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py index ac459a832..39e647b9b 100644 --- a/vllm/lora/ops/triton_ops/lora_kernel_metadata.py +++ b/vllm/lora/ops/triton_ops/lora_kernel_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ LoRA kernels metadata preparation utilities. """ diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index c3871bd58..3f9edfc6d 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/ops/triton_ops/utils.py b/vllm/lora/ops/triton_ops/utils.py index 6225635c2..5857f7fec 100644 --- a/vllm/lora/ops/triton_ops/utils.py +++ b/vllm/lora/ops/triton_ops/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/lora/ops/xla_ops/__init__.py b/vllm/lora/ops/xla_ops/__init__.py index 94062b05d..7e7c3c892 100644 --- a/vllm/lora/ops/xla_ops/__init__.py +++ b/vllm/lora/ops/xla_ops/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.ops.xla_ops.lora_ops import (bgmv_expand, bgmv_expand_slice, bgmv_shrink) diff --git a/vllm/lora/ops/xla_ops/lora_ops.py b/vllm/lora/ops/xla_ops/lora_ops.py index dff4d5181..9118f3351 100644 --- a/vllm/lora/ops/xla_ops/lora_ops.py +++ b/vllm/lora/ops/xla_ops/lora_ops.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import jax import jax.numpy as jnp diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 7d335e5f7..a20d73f0f 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from: https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/config.py diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py index 915fc6623..e664ffa1d 100644 --- a/vllm/lora/punica_wrapper/__init__.py +++ b/vllm/lora/punica_wrapper/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper diff --git 
a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index e03f73290..5b4902dcb 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). diff --git a/vllm/lora/punica_wrapper/punica_cpu.py b/vllm/lora/punica_wrapper/punica_cpu.py index 8118a72d6..59049cccc 100644 --- a/vllm/lora/punica_wrapper/punica_cpu.py +++ b/vllm/lora/punica_wrapper/punica_cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional, Union diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 224640ec7..6b038309d 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Based on: Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). 
diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index 416c23e73..b20c9785a 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union, final diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py index 922d6c060..c684ac77c 100644 --- a/vllm/lora/punica_wrapper/punica_selector.py +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.logger import init_logger from vllm.platforms import current_platform diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 0556e583f..6b48268c5 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 1adb40b4c..0b0a7989f 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 616e94f8d..5bbba7830 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from typing import Optional diff --git a/vllm/lora/resolver.py b/vllm/lora/resolver.py index 
33f35322f..5808ae105 100644 --- a/vllm/lora/resolver.py +++ b/vllm/lora/resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Set diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 619dd3bdc..ee196e3f6 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index f1ae03097..7da44569f 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Any, Literal, Optional, Union diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index 763615217..55dfe8088 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.parameter import (BasevLLMParameter, PackedvLLMParameter) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index acf722467..7e6cdd987 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch.nn as nn diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index a2b61a1b1..3c2998bec 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py index 58adcc3ca..05b6a1c32 100644 --- a/vllm/model_executor/guided_decoding/guidance_decoding.py +++ b/vllm/model_executor/guided_decoding/guidance_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import llguidance diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py index e17df68b4..379b5eaa3 100644 --- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py +++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import os from typing import Any diff --git a/vllm/model_executor/guided_decoding/guided_fields.py b/vllm/model_executor/guided_decoding/guided_fields.py index 316860718..fa97b6dbf 100644 --- a/vllm/model_executor/guided_decoding/guided_fields.py +++ b/vllm/model_executor/guided_decoding/guided_fields.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional, TypedDict, Union diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index 7eaf9e38e..f9b51f4c1 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools 
import lru_cache from json import loads as json_loads diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index e41af4b36..26c2d958e 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import concurrent.futures diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 6986b6554..4ef4db7c4 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024- the Outlines developers # This file is adapted from diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 3f77cf394..8fdfa983e 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import regex as re diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index d2e568609..bdd3a1a9c 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # noqa: UP007 from __future__ import annotations diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index a32c26317..cc9c8d445 
100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom activation functions.""" import math from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 5c262287f..2bdc96e29 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager from typing import Any, Optional diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 26a433da2..d827869d0 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ CUTLASS based Fused MoE kernels.""" from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 46a814e6e..331544d64 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import importlib.util from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index c2db79365..205a95e7f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused batched MoE kernel.""" from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 4c84dd538..40b76994f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE utilities for GPTQ.""" import functools from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 78f8eb926..883a48c98 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Fused MoE kernel.""" import functools import json diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 1e193c909..3ce4cbc28 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib from abc import abstractmethod diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 7d3ddf8f1..5e321c9b4 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git 
a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py index d025f1257..98e175b12 100644 --- a/vllm/model_executor/layers/fused_moe/moe_align_block_size.py +++ b/vllm/model_executor/layers/fused_moe/moe_align_block_size.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py index 9d8bd62c6..d35bd0098 100644 --- a/vllm/model_executor/layers/fused_moe/moe_pallas.py +++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index cb396f26c..da7871434 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py index da27633f2..6160da732 100644 --- a/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py +++ b/vllm/model_executor/layers/fused_moe/moe_torch_iterative.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn.functional as F diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 
783ebebbf..8405603cf 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import pplx_kernels as pplx diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 98f98b3bd..77a9686c9 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index 824062491..d44989cce 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import IntEnum from functools import cache from typing import Optional diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 2cfe37314..373e8ab39 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index d9d2520e1..c3a584782 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ 
b/vllm/model_executor/layers/fused_moe/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from math import prod from typing import Optional diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index e8abd32ff..b3c65e341 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Custom normalization layers.""" from typing import Optional, Union diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 96659af40..978086d19 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from einops import rearrange diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 269ac043d..588aa8deb 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from abc import abstractmethod diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 6b69a2608..3d0125344 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that compute logits from hidden_stats.""" import inspect from concurrent.futures import ThreadPoolExecutor diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py 
b/vllm/model_executor/layers/mamba/mamba2_metadata.py index 019f634a9..88053faf9 100644 --- a/vllm/model_executor/layers/mamba/mamba2_metadata.py +++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from dataclasses import dataclass diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 156e8752e..118bd8d55 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch from torch import nn diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index f94ab75f9..6d9ea5387 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py index 21e27160f..a10c5ab69 100644 --- a/vllm/model_executor/layers/mamba/ops/causal_conv1d.py +++ b/vllm/model_executor/layers/mamba/ops/causal_conv1d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao. 
# Adapted from https://github.com/Dao-AILab/causal-conv1d/blob/main/causal_conv1d/causal_conv1d_interface.py diff --git a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py index 689c940d1..ccfb278cd 100644 --- a/vllm/model_executor/layers/mamba/ops/mamba_ssm.py +++ b/vllm/model_executor/layers/mamba/ops/mamba_ssm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/selective_state_update.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py index 0fdb055aa..11ca1255e 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_bmm.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_bmm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_bmm.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py index 1652c5181..365e1c54b 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_scan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_scan.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py index ee6335690..58bfb661d 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_chunk_state.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_chunk_state.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_combined.py b/vllm/model_executor/layers/mamba/ops/ssd_combined.py index 79a1663b8..b121275e9 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_combined.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_combined.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. # Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_combined.py diff --git a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py index 6f69ca743..a28fc9ffa 100644 --- a/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py +++ b/vllm/model_executor/layers/mamba/ops/ssd_state_passing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) 2024, Tri Dao, Albert Gu. 
# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/ops/triton/ssd_state_passing.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index d2c42191b..258038bed 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from enum import IntEnum from typing import Optional, Union diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 407b9c72f..1cb23e7a1 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Literal, get_args diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index 8bf0ca5c0..2ea8c5dc5 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Supports AQLM compression, see https://github.com/Vahe1994/AQLM # and https://arxiv.org/pdf/2401.06118.pdf diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index eb8ffa378..ea17cd56c 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from fractions import Fraction from typing import Any, Optional, Union diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py 
index 87afdb623..f8bc3ab5e 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 0c8d082bb..56d803c6b 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index 5e5491578..ebc526d6d 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index c9533da9d..78c5c75c0 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect from abc import ABC, abstractmethod diff --git a/vllm/model_executor/layers/quantization/bitblas.py b/vllm/model_executor/layers/quantization/bitblas.py index 1cd12bb76..9e5ce39ec 100644 --- a/vllm/model_executor/layers/quantization/bitblas.py +++ b/vllm/model_executor/layers/quantization/bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 049ce7a71..38935bc96 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 27547f315..dff62af86 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import suppress from typing import Any, Literal, Optional, cast diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 9241ceeb4..ebb029572 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from enum import Enum diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index 79bf5c108..25924c733 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .compressed_tensors_scheme import CompressedTensorsScheme from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py index f010bc034..30ed55aee 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py index daa25d23a..a5d48f235 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 6ea31e50c..3f3e7668f 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ 
b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py index cf60b34ba..8202ce951 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional import torch diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py index 61e4918ca..01a87a088 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 99bb73b71..1e61e058c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 7792ce865..6189f0609 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index a33c58acb..74787603e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py index 2380d3570..9bcf1aa2b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py 
b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py index 75e81c4dd..402646498 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 0c1eaff93..8030be525 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 3601d219d..01b0064f0 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 223682ee9..3e465ee2c 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ac9b74945..cea4d26a4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ 
b/vllm/model_executor/layers/quantization/fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import importlib.util diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 1fcb6d7af..2171f729a 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 436f1e3cc..d3ab1be3b 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from enum import Enum diff --git a/vllm/model_executor/layers/quantization/gptq_bitblas.py b/vllm/model_executor/layers/quantization/gptq_bitblas.py index be9510abd..78e0f59fa 100644 --- a/vllm/model_executor/layers/quantization/gptq_bitblas.py +++ b/vllm/model_executor/layers/quantization/gptq_bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index cf012e145..f92ebdea9 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional, Union diff --git 
a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index e90416f37..eba917d85 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py index a8faf9772..ee8a0e34b 100644 --- a/vllm/model_executor/layers/quantization/hqq_marlin.py +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 8108c7976..31ad96ecc 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index 55ad00b1c..07ecc0962 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py 
b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py index bb1dc40ad..0bf0d530d 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py index e07177dd6..785e559df 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py index 29e206991..649d07b4d 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py index 50d293cf4..fef333e86 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git 
a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py index 855867fa4..c7c458618 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import partial from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py index 899011f00..1597492a5 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 2d92af74b..9ebf5f303 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 5d58c0489..18f5ce04f 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py index 6c2c464e6..165548a06 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/aiter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py index 98a0b30be..6ddd4a9ec 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py index c09ca83d0..817565cf2 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py index a97b53b9d..3de28af40 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/xla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from typing import Optional diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 67723c7c9..e5604670f 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 2437030c8..62667db26 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 2abe16a08..3f79b203a 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional, Union diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index 74bd6dc13..3aa23f068 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index b2d6bf5db..804023666 100644 --- 
a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from importlib.util import find_spec diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py index 9e4fb3363..32ba1055f 100644 --- a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ b/vllm/model_executor/layers/quantization/ptpc_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 6028b8a2a..25978cb13 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index df4bfbbbc..6ae5f5c9a 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch from typing import Any, Optional, cast diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index aa7d72543..4c2da4c8b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py index d7dac1757..ec09d9b2a 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .quark_scheme import QuarkScheme from .quark_w4a4_mxfp4 import QuarkW4A4MXFP4 diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py index 40c8ea86d..c167e949a 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_scheme.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py index 34c077b29..3c56251b7 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py index 149c90937..47e0a492b 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_fp8.py @@ -1,4 +1,5 
@@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py index 94f9fcd56..ae68d5bbc 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_w8a8_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py index 5e56bcb75..99f5ec159 100644 --- a/vllm/model_executor/layers/quantization/quark/utils.py +++ b/vllm/model_executor/layers/quantization/quark/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/schema.py b/vllm/model_executor/layers/quantization/schema.py index c0be40c16..a10815292 100644 --- a/vllm/model_executor/layers/quantization/schema.py +++ b/vllm/model_executor/layers/quantization/schema.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This file contains the Pydantic schemas for various quantization-related parameters. 
When a relevant quantization technique is specified, these diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 7f9f3e643..af362f7a7 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional import torch diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index 7941ec973..83c8a98ea 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/model_executor/layers/quantization/utils/__init__.py b/vllm/model_executor/layers/quantization/utils/__init__.py index f7ee47288..6ad56bae3 100644 --- a/vllm/model_executor/layers/quantization/utils/__init__.py +++ b/vllm/model_executor/layers/quantization/utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .layer_utils import replace_parameter, update_tensor_inplace diff --git a/vllm/model_executor/layers/quantization/utils/allspark_utils.py b/vllm/model_executor/layers/quantization/utils/allspark_utils.py index 97860765a..1992b4d20 100644 --- a/vllm/model_executor/layers/quantization/utils/allspark_utils.py +++ b/vllm/model_executor/layers/quantization/utils/allspark_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py 
index 70d24cc89..82ee3edfd 100644 --- a/vllm/model_executor/layers/quantization/utils/bitblas_utils.py +++ b/vllm/model_executor/layers/quantization/utils/bitblas_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 4c213f2c8..1ebd2a898 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/pull/2575 import functools diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py index 36161d13b..db82b0def 100644 --- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py +++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import deepcopy from typing import Optional, Union diff --git a/vllm/model_executor/layers/quantization/utils/int8_utils.py b/vllm/model_executor/layers/quantization/utils/int8_utils.py index 72fff3fa1..a694a1917 100644 --- a/vllm/model_executor/layers/quantization/utils/int8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/int8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/sgl-project/sglang/blob/4cb53ecd0cffceb6dee5c011a58f65997a86f151/python/sglang/srt/layers/quantization/int8_kernel.py import functools diff --git a/vllm/model_executor/layers/quantization/utils/layer_utils.py 
b/vllm/model_executor/layers/quantization/utils/layer_utils.py index 5acae7ca3..fbc0f23ac 100644 --- a/vllm/model_executor/layers/quantization/utils/layer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/layer_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Union diff --git a/vllm/model_executor/layers/quantization/utils/machete_utils.py b/vllm/model_executor/layers/quantization/utils/machete_utils.py index 6d840b568..580c36a0e 100644 --- a/vllm/model_executor/layers/quantization/utils/machete_utils.py +++ b/vllm/model_executor/layers/quantization/utils/machete_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index e059a7ac3..7540a1516 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index 13dcdc00a..ca10db69d 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index 1f6e74244..5372c49d9 100644 --- 
a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py index 81112b27f..b2c228c24 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility functions used for tests and benchmarks""" from typing import Optional diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py index 73feb4264..1c93c3646 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility functions used for tests and benchmarks""" import random diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py index 0123540fc..8a64bebae 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_qqq.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy import torch diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 
e7c95e38e..9d4a188f5 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py index f29220831..6e8e98d54 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_emulation_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch __all__ = [ diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 6ba327f3d..d6b96774b 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is used for /tests and /benchmarks""" from collections.abc import Mapping from types import MappingProxyType diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index eed8998fe..adc67aa64 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Callable, Optional, Union diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3db734958..a6e58a77d 100644 --- 
a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import cached_property from importlib.util import find_spec diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index 839688e31..3f2d57177 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index afc059719..9de233896 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 32375db0c..08840fc40 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" import itertools from collections.abc import Iterator diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 969cd59b5..0a36fe9be 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ 
b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from typing import Optional, Union diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py index a14c86148..5dabaa537 100644 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ b/vllm/model_executor/layers/typical_acceptance_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.jit diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 001e6aaf0..d97d84238 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utility methods for model layers.""" from typing import Callable, Optional diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 46d2075af..0f636d83a 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from dataclasses import dataclass diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index a443a652d..f36437103 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git 
a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py index d619d9f25..5018c7d9a 100644 --- a/vllm/model_executor/model_loader/base_loader.py +++ b/vllm/model_executor/model_loader/base_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod import torch diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 3df835a93..ebbb021ca 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import fnmatch import glob diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 6946627a5..4624ff01d 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import glob import os diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py index 64fa2be76..f4a7da574 100644 --- a/vllm/model_executor/model_loader/dummy_loader.py +++ b/vllm/model_executor/model_loader/dummy_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch.nn as nn from vllm.config import LoadConfig, ModelConfig diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 1eac50422..203c80760 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ 
b/vllm/model_executor/model_loader/gguf_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Generator diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index e65d16cae..fad97aba8 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading Neuron models in transformers-neuronx framework.""" import ast diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index 72ad4da29..f450961c6 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading Neuron models in neuronx-distributed-inference framework.""" # Disabling yapf because yapf and isort have conflicts for the below imports diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index a39e26c6d..83e0f386c 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import glob import os diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index b5a5031bb..2fd9cfba3 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ 
b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import collections import glob diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 90c0bdf08..24d1e1365 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import contextlib diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 1923e040a..b9982f312 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: SIM117 import copy from collections.abc import Generator diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 9c8d647a2..e6eaade09 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for selecting and loading models.""" import contextlib import inspect diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 7a9a68be8..857f4bca6 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utilities for downloading and initializing model weights.""" import fnmatch import 
glob diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 3580c4fa5..27c169d2d 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsV0Only, has_inner_state, diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 6ab03c40a..1651e3e42 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import TYPE_CHECKING, Any, Optional, TypeVar diff --git a/vllm/model_executor/models/aimv2.py b/vllm/model_executor/models/aimv2.py index 2e2a18abd..b13d863eb 100644 --- a/vllm/model_executor/models/aimv2.py +++ b/vllm/model_executor/models/aimv2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # A modified implementation of the AIMv2 Transformer # inserted here also the image tokenizer used by Ovis2 diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 94a432856..4693c9487 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Snowflake Arctic model.""" from collections.abc import Iterable from typing import Optional, Union diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index f74e13888..bb4177dfc 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Optional, TypedDict, Union diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 08d49d71e..22efb707a 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,4 +1,5 @@ -# SPDX-License-Identifier: Apache-2.0 Adapted from +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union, cast diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index bcff6eb3f..0de5de5e8 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index d6a705fb1..29e0e2a2e 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Bamba model.""" # Added by the IBM Team, 2024 from collections.abc import Iterable diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 92bbe1bb6..a0ec12674 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Derived from BART implementation posted on HuggingFace; license below: # diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 0b1d0f103..389393987 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 8a387d71f..0f22393c7 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from copy import deepcopy from typing import Optional diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index acbc5d04d..2b457fd8a 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" from collections.abc import Iterable diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index db0dd2051..279541bed 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 10424e218..6e4a399f3 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a4528ca26..aea44261d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from functools import cached_property diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 4e95afe1a..129f0942f 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/clip.py 
b/vllm/model_executor/models/clip.py index 9fd528fd7..dcab00822 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" from collections.abc import Iterable diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 546b5f932..ee67cc640 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py index f1cc7e0f9..f03c58a12 100644 --- a/vllm/model_executor/models/constant_size_cache.py +++ b/vllm/model_executor/models/constant_size_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Any diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index f21887f71..7a4dd6944 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional, Union diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 88d1ca9f7..2f0202f1e 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index 03ef7bed0..6e6e74b0d 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index b78c193c1..0f996d04e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 5c8793f59..765718e57 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index fb1675d29..221932145 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from 
typing import Optional diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 838560692..aaf105ec2 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 00dbbebb1..d78ee100b 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 376793594..62a93dabd 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 1c0e3911f..28f257eab 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only FalconH1 model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index f8acc5670..47760aabb 100644 
--- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections import OrderedDict diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index fbad7f56d..cb141dbc5 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 0f6d94e75..99ed51f8e 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index b46716213..ce405041b 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 3a88adcce..e19e0026b 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The vLLM team. # Copyright 2025 Google Inc. HuggingFace Inc. team. 
All rights reserved. # diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 182cc86d3..23e251707 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py index 6269ebcee..defa77b84 100644 --- a/vllm/model_executor/models/glm.py +++ b/vllm/model_executor/models/glm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only HF format GLM-4 model compatible with THUDM weights.""" from vllm.config import VllmConfig from vllm.model_executor.models.llama import LlamaForCausalLM diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index f351ce5a0..5e2908a82 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The Zhipu AI team. # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 4e1371671..034c7654f 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/CogAgent diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index c2c310fca..fd3decbae 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index c4ae4fc3c..661a67bdc 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 69fdd90cf..bd162a5e5 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 401fa9f5c..d418d8bb8 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 3524d036d..bd4d5d0b6 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index fd8fb48c5..831164ba8 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index f342dfff8..5a70f3a61 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 443b102c9..f434b7a74 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project """Inference-only GraniteMoeHybrid model.""" # Added by the IBM Team, 2025 from collections.abc import Iterable diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py index 817e6091d..bb160dbce 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only GraniteMoeShared model. The architecture is the same as granitemoe but with the addition of shared diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 6a444e8d1..4273afbf4 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from typing import Optional diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index bc9e9a3c0..2d930527b 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/ROCm/vllm/blob/cea7419f151cc50293a05b7fac8547f8f887c9f6/vllm/model_executor/models/grok1.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 904f5330c..8f7f359b7 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index b8bdc7aa3..9e27200fb 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index fdb128ef5..4bc5e2a0c 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 8be8841c1..cb2a4062b 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index d325a6b67..4a1ea74a2 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 538e9de4f..58e8163e0 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 3f3e3966e..e8549b4e0 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from functools import partial diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 6893d0239..4bbb49da0 100644 --- 
a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional, Union diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index c37d3afb4..0c61369c5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index d6a1e0bb4..bed4a5dff 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6f9fa60c9..8294f846b 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only Jamba model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index b575f4476..351d1fbdc 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 # Adapted from 
https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py # Copyright 2025 The Moonshot AI Team, DeepSeek-AI, and HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d36b6466c..5d5080479 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 40fdd84d8..a852be66b 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team. # All rights reserved. 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 172dc8b5e..f73b863fe 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 1e40017fc..d31a321b8 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ced71b6dc..725e1b2c1 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 2fb79f57a..6f5f23187 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 9303ea121..a3406d090 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 7ea759fd5..d90d3d4a0 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ce76a76b6..8162ac3f7 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch MAMBA model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 65c6467bc..cf9e1bd03 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch MAMBA2 model.""" from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 47d0ef9cc..49ba974c6 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 95ef1134b..709a5a993 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index 49ea64e02..9b83f848e 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index cbca6a4c8..6066ec76c 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index f471a86ff..d398a5d12 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 2a6867d12..92c13e81b 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff 
--git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 039c3d22d..06c2eb4e8 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index ae5df0f92..ff5959ed1 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 04cc7e35e..4100fee0e 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/minimax_cache.py b/vllm/model_executor/models/minimax_cache.py index c95cbb419..9164ac06a 100644 --- a/vllm/model_executor/models/minimax_cache.py +++ b/vllm/model_executor/models/minimax_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import torch diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index ac0fe7b10..02800449b 100644 --- 
a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only MiniMaxText01 model.""" import copy import math diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 14c1250ca..b2ededcaf 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping from typing import Literal, Optional, TypedDict, Union, cast diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 051a73120..9147240b2 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 9bc7a1615..dec365119 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 8220200d2..3183c762d 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 713c9e8d2..e9f91feb3 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 58549b10e..54fae279d 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # # Copyright 2025 the LLAMA4, Meta Inc., vLLM, and HuggingFace Inc. team. # All rights reserved. diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index a7d7aa7d4..c6a97388d 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 18eab6051..35f416a6e 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable from typing import Optional diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 25e6f5940..11a2a384c 100644 --- a/vllm/model_executor/models/module_mapping.py +++ 
b/vllm/model_executor/models/module_mapping.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 640a2049a..1fa76b9ac 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 9f11d4a42..d0fdab13e 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/modeling_kimi_vl.py # This file is meant to be used in kimi_vl.py only diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 6c396d778..0878ada34 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index d0999e30e..eabf47b1a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # 
https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 9808fe055..a766ed947 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 172434e66..2f7f8e437 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index fcb7c619a..1dc4df85c 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 33adacdae..499e6d30e 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py diff --git 
a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index af2894555..ebfdb690f 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 8376d6241..9eaac1e28 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index da2a194e6..d121188ba 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 232a63c50..5c11d54c6 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/ovis/modeling_ovis.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 427005e9b..a0e291257 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable, Mapping, Sequence from typing import Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index d46b95fea..f8db99eb9 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 330ad5c59..21d517b3a 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 8f84e0726..f4e870c53 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index d00d7d886..533655fd5 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py 
@@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index b757e661d..376c53d2c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 418ff900f..924e64368 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence from typing import Any, Literal, Optional, TypedDict, Union diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 98cef7506..ae7a8a732 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com) diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index f468fdbd5..c4890d842 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright (c) Microsoft Corporation. 
# Licensed under the MIT license. # Code copied from Microsoft/MoE by Jacob Platin (jacobplatin@microsoft.com) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index d9917c26d..dddd19c74 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 9f28d4cef..705586b6a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 55a65f807..670576c68 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Inference-only PLaMo2 model.""" import math from collections.abc import Iterable diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 40ac5e30a..4fdcae5de 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2025 The vLLM team. # Copyright 2025 IBM. 
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 2fda87a4f..e804f03e0 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index a664864ff..23f65b99c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d89b822dd..7172394e4 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index f62c7e1d2..7770ec711 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 3182a7532..6951630c6 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 143b9f98b..a2c65f4b5 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 81dc38988..76d7ecdd1 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5c30e36c7..a4f8a361e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ 
b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index dbe2be8a7..393ce41a9 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 8a4c2850d..823197fc9 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index f5d242fdf..e828ce9c9 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Qwen/Qwen-VL/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fcef457a7..57d1b7c53 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ Whenever you add an architecture to this page, please also update `tests/models/registry.py` with example HuggingFace models for it. diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 76008b729..8fa8b8979 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 4803da295..3630f59f5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Implementation of SiglipVisionModel intended to be only used within a vision language model.""" diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index eefadda91..08c47faca 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py # -------------------------------------------------------- diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 31dec5502..0f22ba5b4 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index fcd17cc1c..8dd52f1d2 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 86ce813dd..d6ec743ce 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index f4ba5a803..9d9a2bff0 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 7d713d23c..f0b31b133 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/teleflm.py b/vllm/model_executor/models/teleflm.py index e05f23f99..3666f7011 100644 --- a/vllm/model_executor/models/teleflm.py +++ b/vllm/model_executor/models/teleflm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index b87a2ebf2..2f78d9d4c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 The vLLM team. 
# diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index c1a4dc1b3..43836f295 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 3d821d3dc..aa88f4210 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable, Mapping diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 901d83ec5..ac6a659bb 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Final, Generic, Optional, Protocol, TypeVar, Union diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index c6e303d60..3ee5f7dba 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import math from collections.abc import Iterable, Mapping, Sequence diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index 48e254bdd..a4f97c774 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """PyTorch Zamba2 model implementation for vLLM. This module implements the Zamba2 architecture from diff --git a/vllm/model_executor/parameter.py b/vllm/model_executor/parameter.py index 34a0b527b..750ee7850 100644 --- a/vllm/model_executor/parameter.py +++ b/vllm/model_executor/parameter.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from fractions import Fraction from typing import Callable, Optional, Union diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 4c5db7396..4dd443bc2 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 6b83a59b5..56f0f0984 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from dataclasses import dataclass diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 27cea6521..cbaa34bfc 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Utils for model executor.""" import copy from typing import Any, Optional diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 815e34d5a..2ef9f1ccc 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .base import MultiModalPlaceholderMap from .hasher import MultiModalHashDict, MultiModalHasher from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 1fd2ab7f8..fbb29276f 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from io import BytesIO from pathlib import Path diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 184c801e6..7188ed14c 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Sequence diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py index b4cd6a908..b79883597 100644 --- a/vllm/multimodal/hasher.py +++ b/vllm/multimodal/hasher.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pickle from collections.abc import Iterable, Mapping diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index a63ec0bd8..e673632d4 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from io import BytesIO diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 600a34d39..35d2a6e8c 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import UserDict, defaultdict diff 
--git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 63af84274..cae62b223 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import UserDict diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index aa7914e40..5cfca57bf 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import sys from abc import ABC, abstractmethod diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 53f5b243d..1faecb7bd 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass, field diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b9f5cee92..27aaa661c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1d838f66f..2b34cdf40 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from itertools import groupby from pathlib import Path diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 261d56aba..bedb9536e 
100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import base64 from abc import abstractmethod diff --git a/vllm/outputs.py b/vllm/outputs.py index 3960388bf..891305eb7 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections.abc import MutableSequence diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 00d00d05f..13453d2c4 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import traceback diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index eaffaac78..2739f5c8c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import sys diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9f833cbb5..e2d9424de 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Code inside this file can safely assume cuda platform, e.g. importing pynvml. However, it should not initialize cuda context. 
""" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index a8dd7df9f..3cf289501 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import TYPE_CHECKING, Optional diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c7a627262..1ec9c78a3 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os import platform diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 56f204e71..04e918d7a 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import os from functools import lru_cache diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ef1c632a5..a929366db 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from datetime import timedelta diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 0173b1569..07e52017f 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional, Union, cast diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index b2a6ad5d7..73f6f3d41 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Optional diff --git 
a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4cd3552f8..2cb177b9b 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import os diff --git a/vllm/plugins/lora_resolvers/filesystem_resolver.py b/vllm/plugins/lora_resolvers/filesystem_resolver.py index 219231f77..b999d07a6 100644 --- a/vllm/plugins/lora_resolvers/filesystem_resolver.py +++ b/vllm/plugins/lora_resolvers/filesystem_resolver.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import os from typing import Optional diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 9a3b254f9..322f9ed3e 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 6934d328a..2f9ebe531 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from collections import defaultdict diff --git a/vllm/profiler/utils.py b/vllm/profiler/utils.py index b26fd4dd8..9f0f56a15 100644 --- a/vllm/profiler/utils.py +++ b/vllm/profiler/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Callable, Union diff --git a/vllm/prompt_adapter/layers.py b/vllm/prompt_adapter/layers.py index c2f9f1691..b5b925d04 100644 --- a/vllm/prompt_adapter/layers.py +++ b/vllm/prompt_adapter/layers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 795591606..864b50c86 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import math diff --git a/vllm/prompt_adapter/request.py b/vllm/prompt_adapter/request.py index dfb8e61d7..3ce50d0a2 100644 --- a/vllm/prompt_adapter/request.py +++ b/vllm/prompt_adapter/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import msgspec diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index dd179ab93..ddd007868 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420 diff --git a/vllm/prompt_adapter/worker_manager.py b/vllm/prompt_adapter/worker_manager.py index 28dcc1687..56265de80 100644 --- a/vllm/prompt_adapter/worker_manager.py +++ b/vllm/prompt_adapter/worker_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging from typing import Any, Optional, Set, Type diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py index 65606ce55..e8cd56551 100644 --- a/vllm/reasoning/__init__.py +++ b/vllm/reasoning/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from 
.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 9dd5191da..e827d381c 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/reasoning/deepseek_r1_reasoning_parser.py b/vllm/reasoning/deepseek_r1_reasoning_parser.py index 1c283c092..1a5ca46a6 100644 --- a/vllm/reasoning/deepseek_r1_reasoning_parser.py +++ b/vllm/reasoning/deepseek_r1_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py index 07a63e294..5820001b9 100644 --- a/vllm/reasoning/granite_reasoning_parser.py +++ b/vllm/reasoning/granite_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py index 7095034b1..61bafc724 100644 --- a/vllm/reasoning/qwen3_reasoning_parser.py +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional, Union diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4294465f6..7abdcecca 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors 
to the vLLM project """Sampling parameters for text generation.""" import copy from dataclasses import dataclass diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index fc1761c84..9060b55c7 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import struct diff --git a/vllm/scripts.py b/vllm/scripts.py index 7e569d2d2..7a7fdccf0 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.entrypoints.cli.main import main as vllm_main from vllm.logger import init_logger diff --git a/vllm/sequence.py b/vllm/sequence.py index d359f897d..ffe890eb2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sequence and its related classes.""" import copy import enum diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index e08ed742a..f9b882469 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from array import array from itertools import chain, count diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 991d2040a..8ccfefea1 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index dd085ad77..70ec1590e 100644 --- a/vllm/spec_decode/interfaces.py +++ 
b/vllm/spec_decode/interfaces.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from dataclasses import dataclass diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 0b62a988e..82b5a79fa 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 4430da26c..a4784cad9 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import Callable, Optional, Union diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py index bdaf31895..8e8c05d26 100644 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ b/vllm/spec_decode/mlp_speculator_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index 6275c460e..18e7b055a 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.sequence import (ExecuteModelRequest, SequenceData, SequenceGroupMetadata, get_all_seq_ids) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index de57403d1..4a9bbe44d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -1,4 +1,5 
@@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import weakref diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 57ae173af..7a1a0e56d 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import weakref from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py index 2829d631b..fb44275aa 100644 --- a/vllm/spec_decode/proposer_worker_base.py +++ b/vllm/spec_decode/proposer_worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index ea3d91d78..91256cab6 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 252c80957..7dda1cbfe 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from collections import defaultdict diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 08e773c56..ca89eb60a 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index b538923c0..afd91b42b 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set, Tuple diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 466269b21..22d2a4833 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from contextlib import contextmanager diff --git a/vllm/test_utils.py b/vllm/test_utils.py index f8cec380f..c6b126d00 100644 --- a/vllm/test_utils.py +++ b/vllm/test_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project MODELS_ON_S3 = [ "adept/fuyu-8b", "ai21labs/AI21-Jamba-1.5-Mini", diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py index 7ed9ced0e..d215e5d8b 100644 --- a/vllm/third_party/pynvml.py +++ b/vllm/third_party/pynvml.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # copied from https://pypi.org/project/nvidia-ml-py # version 12.570.86 diff --git a/vllm/tracing.py b/vllm/tracing.py index 557ae40b8..6a287d82b 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Mapping diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index 84bd7a747..6d4231bac 100644 --- a/vllm/transformers_utils/__init__.py +++ 
b/vllm/transformers_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm import envs diff --git a/vllm/transformers_utils/chat_templates/__init__.py b/vllm/transformers_utils/chat_templates/__init__.py index fe2bd3ca4..2783d12a2 100644 --- a/vllm/transformers_utils/chat_templates/__init__.py +++ b/vllm/transformers_utils/chat_templates/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .registry import get_chat_template_fallback_path __all__ = ["get_chat_template_fallback_path"] diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index 853fed5d4..e0ef7f099 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path from typing import Callable, Optional, Union diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 8774f95a2..9bc3b8e09 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import json diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index ed10c22c8..7edff455f 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.cohere2 import Cohere2Config diff --git 
a/vllm/transformers_utils/configs/arctic.py b/vllm/transformers_utils/configs/arctic.py index 2261f0a9e..a789b93b5 100644 --- a/vllm/transformers_utils/configs/arctic.py +++ b/vllm/transformers_utils/configs/arctic.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 43e9503ff..7c5de3e94 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://github.com/THUDM/ChatGLM2-6B diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py index 21328d767..e547a9c28 100644 --- a/vllm/transformers_utils/configs/cohere2.py +++ b/vllm/transformers_utils/configs/cohere2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa diff --git a/vllm/transformers_utils/configs/dbrx.py b/vllm/transformers_utils/configs/dbrx.py index bffa127fe..7dbda99f8 100644 --- a/vllm/transformers_utils/configs/dbrx.py +++ b/vllm/transformers_utils/configs/dbrx.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index a54486fa4..957d63831 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from 
https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index a43e4746c..fb2e8a1df 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 25bafbb85..7450904a1 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py diff --git a/vllm/transformers_utils/configs/falcon.py b/vllm/transformers_utils/configs/falcon.py index f161a06f3..2f5400463 100644 --- a/vllm/transformers_utils/configs/falcon.py +++ b/vllm/transformers_utils/configs/falcon.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py index 48b5d79ff..b36a6dd59 100644 --- a/vllm/transformers_utils/configs/h2ovl.py +++ b/vllm/transformers_utils/configs/h2ovl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py diff --git a/vllm/transformers_utils/configs/internvl.py 
b/vllm/transformers_utils/configs/internvl.py index 8ea62546e..4494ebfef 100644 --- a/vllm/transformers_utils/configs/internvl.py +++ b/vllm/transformers_utils/configs/internvl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/configuration_internvl_chat.py diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b947c6a9e..767c4ddae 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. diff --git a/vllm/transformers_utils/configs/kimi_vl.py b/vllm/transformers_utils/configs/kimi_vl.py index 97ff44bb9..ae8dac0f3 100644 --- a/vllm/transformers_utils/configs/kimi_vl.py +++ b/vllm/transformers_utils/configs/kimi_vl.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/medusa.py b/vllm/transformers_utils/configs/medusa.py index 885713c5d..9ba52956a 100644 --- a/vllm/transformers_utils/configs/medusa.py +++ b/vllm/transformers_utils/configs/medusa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import Optional, Union diff --git a/vllm/transformers_utils/configs/minimax_text_01.py b/vllm/transformers_utils/configs/minimax_text_01.py index 660e870ac..e3b63dfa0 100644 --- a/vllm/transformers_utils/configs/minimax_text_01.py +++ 
b/vllm/transformers_utils/configs/minimax_text_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ MiniMaxText01 model configuration""" from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/minimax_vl_01.py b/vllm/transformers_utils/configs/minimax_vl_01.py index 99e0d249d..c62497192 100644 --- a/vllm/transformers_utils/configs/minimax_vl_01.py +++ b/vllm/transformers_utils/configs/minimax_vl_01.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """MiniMaxVL01 model configuration""" from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mllama.py b/vllm/transformers_utils/configs/mllama.py index eb77e09ad..f0cd2d52a 100644 --- a/vllm/transformers_utils/configs/mllama.py +++ b/vllm/transformers_utils/configs/mllama.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from transformers.models.mllama import configuration_mllama as mllama_hf_config diff --git a/vllm/transformers_utils/configs/mlp_speculator.py b/vllm/transformers_utils/configs/mlp_speculator.py index 70f607529..2fa284e5c 100644 --- a/vllm/transformers_utils/configs/mlp_speculator.py +++ b/vllm/transformers_utils/configs/mlp_speculator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/configs/moonvit.py b/vllm/transformers_utils/configs/moonvit.py index a2b4059a6..a6f712f3d 100644 --- a/vllm/transformers_utils/configs/moonvit.py +++ b/vllm/transformers_utils/configs/moonvit.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from 
https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py from transformers.configuration_utils import PretrainedConfig diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 2d52658d3..91316408d 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index fdf4fa2a5..d65b572dc 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py index 300f6e211..a533720af 100644 --- a/vllm/transformers_utils/configs/nvlm_d.py +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py diff --git a/vllm/transformers_utils/configs/ovis.py b/vllm/transformers_utils/configs/ovis.py index 0ec224214..c2728f0ed 100644 --- a/vllm/transformers_utils/configs/ovis.py +++ b/vllm/transformers_utils/configs/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/configs/skyworkr1v.py b/vllm/transformers_utils/configs/skyworkr1v.py index ef5f9ba85..33a45220e 100644 --- a/vllm/transformers_utils/configs/skyworkr1v.py +++ b/vllm/transformers_utils/configs/skyworkr1v.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/configuration_skywork_chat.py diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index 6eaf699d1..a83dfa40b 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py index 5da6c5b44..050a7851d 100644 --- a/vllm/transformers_utils/configs/telechat2.py +++ b/vllm/transformers_utils/configs/telechat2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py """ Telechat configuration compatible with LlamaConfig. """ diff --git a/vllm/transformers_utils/configs/ultravox.py b/vllm/transformers_utils/configs/ultravox.py index 4c5072427..62f63b02d 100644 --- a/vllm/transformers_utils/configs/ultravox.py +++ b/vllm/transformers_utils/configs/ultravox.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_config.py from typing import Any, Optional diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 3adf2e32c..380c62a14 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py index 7373fa0ed..342632989 100644 --- a/vllm/transformers_utils/detokenizer_utils.py +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index ce6427de4..70cd08263 100644 --- a/vllm/transformers_utils/processor.py +++ 
b/vllm/transformers_utils/processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from functools import lru_cache from typing import TYPE_CHECKING, Any, Optional, Union, cast diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py index 2bd9ab1f0..14d15f2bc 100644 --- a/vllm/transformers_utils/processors/__init__.py +++ b/vllm/transformers_utils/processors/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.transformers_utils.processors.deepseek_vl2 import ( DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py index df960e9c7..b4669d12f 100644 --- a/vllm/transformers_utils/processors/deepseek_vl2.py +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py index f1c6407e1..4fe76d0df 100644 --- a/vllm/transformers_utils/processors/ovis.py +++ b/vllm/transformers_utils/processors/ovis.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # yapf: disable # ruff: noqa: E501 diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 1c3520bcf..f95aae781 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import fnmatch import os diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index fa7a208c4..ae96ebe4e 
100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import contextlib import copy diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index d69e5a6b4..20e5fea71 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib from abc import ABC, abstractmethod diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py index 8b9e4881e..eb53cceaa 100644 --- a/vllm/transformers_utils/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 7aac29a6b..941156c4b 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from .mistral import (MistralTokenizer, maybe_serialize_tool_calls, truncate_tool_call_ids, validate_request_params) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 23b6f67f0..fcc0f538f 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from dataclasses import dataclass diff --git a/vllm/transformers_utils/utils.py 
b/vllm/transformers_utils/utils.py index 8dff1b612..66c8fb797 100644 --- a/vllm/transformers_utils/utils.py +++ b/vllm/transformers_utils/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json from functools import cache diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 9f14a907a..0fcf5d15a 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.triton_utils.importing import (HAS_TRITON, TritonLanguagePlaceholder, TritonPlaceholder) diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 8cf2e01a3..068fa3031 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import types from importlib.util import find_spec diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 90af0c63c..c14963763 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import datetime import json diff --git a/vllm/utils.py b/vllm/utils.py index b4152e6b2..41336b80e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9ed3dec7f..9e989df1c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project """Attention layer with FlashAttention.""" from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 1c4f7f62f..8bd998eba 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashInfer.""" from __future__ import annotations diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 1edfab26b..96befca5a 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ # MLA Common Components diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index e6594c6b6..060a7c9d8 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index d1e823bbe..8925b5a5c 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 2e6b619db..0857fc133 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ 
b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any, Optional diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 8187e457d..896f1394c 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Any, Optional diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index a97bb8500..6a3314dd8 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with PagedAttention and Triton prefix prefill.""" from typing import TYPE_CHECKING, Any, Optional diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 10a771e83..2e65619ed 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import torch diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index a0a065df9..27eaca497 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from collections.abc import Iterable from typing import Callable, Optional diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 05d70bb9b..16dc67b9b 100644 --- 
a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import TYPE_CHECKING diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 59e07382b..91999d300 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections import defaultdict from dataclasses import dataclass diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3ccad97e9..61476362e 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" import os from collections import deque diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index 055ce4460..dd5052a34 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections.abc import Iterable from typing import TYPE_CHECKING, Optional, Union diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 257234430..b404c70eb 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ce16a1ed5..e510a0626 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 3a0028a59..1397c5f4c 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.v1.request import Request, RequestStatus diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index e69e9ac9f..233c73e88 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from collections import defaultdict from typing import Callable diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0c9f61a76..d1bec2523 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import time diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 4b235c596..0e3696321 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import AsyncGenerator, Mapping from copy import copy diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index b84d4b144..4f6ba099c 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import time 
import weakref diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7253d1dc6..f36a491a1 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import queue import signal diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index fa01998aa..adb0709c8 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio import contextlib import queue diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index dca327cc5..c6fe2d339 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod from typing import Optional diff --git a/vllm/v1/engine/exceptions.py b/vllm/v1/engine/exceptions.py index 97dd31d5e..692ba9dc8 100644 --- a/vllm/v1/engine/exceptions.py +++ b/vllm/v1/engine/exceptions.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project class EngineGenerateError(Exception): """Raised when a AsyncLLM.generate() fails. 
Recoverable.""" pass diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c856e2645..736ffd8b4 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from copy import copy diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 03d82b6bb..edc3be5b0 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools from collections.abc import Iterable diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 45fb5cd23..abe98a13d 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence from typing import Optional diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 293c291b4..1dcfbab30 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio from collections.abc import Iterable diff --git a/vllm/v1/engine/parallel_sampling.py b/vllm/v1/engine/parallel_sampling.py index 4df7ca597..1e9911152 100644 --- a/vllm/v1/engine/parallel_sampling.py +++ b/vllm/v1/engine/parallel_sampling.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from copy import copy from typing import Optional diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 64a756148..5c0d01d9b 100644 --- 
a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from collections.abc import Mapping, Sequence diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 3b9feb0d3..50b9634a4 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future from typing import Callable, Union diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index eb5f9d4bf..0bd7383b5 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import os import pickle diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index 320ebfd37..257564793 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future from typing import Union diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 2747fc7fa..cf2eb3b95 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy from dataclasses import dataclass diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 665e5873d..2d621ec31 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging import time diff --git a/vllm/v1/metrics/prometheus.py b/vllm/v1/metrics/prometheus.py index a364b286d..61ba5d66c 100644 --- a/vllm/v1/metrics/prometheus.py +++ b/vllm/v1/metrics/prometheus.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os import tempfile diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index a51c3ed7f..18c8dcf0a 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from typing import Optional, Union diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py index 5ab78129a..4d6e59984 100644 --- a/vllm/v1/metrics/reader.py +++ b/vllm/v1/metrics/reader.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 8fe163061..50c8b07fe 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time from dataclasses import dataclass, field diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index e8ce0df5e..17a299d57 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import NamedTuple, Optional diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 42c75ef96..53fd70fab 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,4 +1,5 @@ # 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum from typing import TYPE_CHECKING, Any, Optional, Union diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index e97e1235f..ab13b288a 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass from typing import Optional diff --git a/vllm/v1/sample/ops/bad_words.py b/vllm/v1/sample/ops/bad_words.py index 2984d4e48..1b699565f 100644 --- a/vllm/v1/sample/ops/bad_words.py +++ b/vllm/v1/sample/ops/bad_words.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/v1/sample/ops/penalties.py b/vllm/v1/sample/ops/penalties.py index ed05e3f48..48423b9b4 100644 --- a/vllm/v1/sample/ops/penalties.py +++ b/vllm/v1/sample/ops/penalties.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index 4a5fbb10d..30396f159 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py index 17b870fed..b2354c533 100644 --- a/vllm/v1/sample/rejection_sampler.py +++ b/vllm/v1/sample/rejection_sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 
16561d30a..8ba3c2087 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A layer that samples the next tokens from the model's outputs.""" import torch diff --git a/vllm/v1/sample/tpu/metadata.py b/vllm/v1/sample/tpu/metadata.py index a1c7dcdb1..4c1ac4895 100644 --- a/vllm/v1/sample/tpu/metadata.py +++ b/vllm/v1/sample/tpu/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Optional diff --git a/vllm/v1/sample/tpu/sampler.py b/vllm/v1/sample/tpu/sampler.py index 7c31a2984..1056eb1d7 100644 --- a/vllm/v1/sample/tpu/sampler.py +++ b/vllm/v1/sample/tpu/sampler.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Sampler layer implementing TPU supported operations.""" import torch diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 78f37c1e8..ab6653a78 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import pickle diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1ca856423..416bc8af1 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import torch import torch.nn as nn diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index fdac2ef64..f516bf486 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM 
project import torch import torch.nn as nn diff --git a/vllm/v1/spec_decode/metadata.py b/vllm/v1/spec_decode/metadata.py index 1cf650d5f..b1efb4061 100644 --- a/vllm/v1/spec_decode/metadata.py +++ b/vllm/v1/spec_decode/metadata.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass import numpy as np diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 36091bef2..b4bc3058c 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass, field from typing import Optional diff --git a/vllm/v1/spec_decode/ngram_proposer.py b/vllm/v1/spec_decode/ngram_proposer.py index 704153d43..6b90d0970 100644 --- a/vllm/v1/spec_decode/ngram_proposer.py +++ b/vllm/v1/spec_decode/ngram_proposer.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import numpy as np diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py index 334258e7f..5c37333ce 100644 --- a/vllm/v1/spec_decode/utils.py +++ b/vllm/v1/spec_decode/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.triton_utils import tl, triton from vllm.v1.worker.gpu_input_batch import InputBatch diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 07b422814..b2b0ee796 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import multiprocessing diff --git 
a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py index 55c5f6090..02e7fc33f 100644 --- a/vllm/v1/structured_output/backend_guidance.py +++ b/vllm/v1/structured_output/backend_guidance.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py index 09f6cdf73..d500783aa 100644 --- a/vllm/v1/structured_output/backend_types.py +++ b/vllm/v1/structured_output/backend_types.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index f2570221d..88544565e 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 9a7e30d41..fc365f125 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations import dataclasses diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 111e92dc0..7adee7237 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from __future__ import annotations diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 
d347efc42..5b497e66c 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import multiprocessing diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 576086ebe..958262c49 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import numpy as np import torch diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index b3e65917d..bb986b604 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Datastructures defining an input batch from dataclasses import dataclass diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9f7c474c7..c96ad0c01 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy import gc diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index dd06e7296..f36cf5d5c 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" import gc import os diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py index eb8ed6221..afa41a37e 100644 --- a/vllm/v1/worker/lora_model_runner_mixin.py +++ b/vllm/v1/worker/lora_model_runner_mixin.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to 
the vLLM project """ Define LoRA functionality mixin for model runners. """ diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index c5171b973..48ea3cb7b 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import bisect import gc import time diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index bf0a5777c..8d2f8112d 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A TPU worker class.""" import os from typing import Optional diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 91548a52c..b23b28c1d 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional import torch diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index 487a49b62..9c93754f9 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional diff --git a/vllm/version.py b/vllm/version.py index 8329d7bec..6c88b1b5a 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project try: from ._version import __version__, __version_tuple__ diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d48a6957c..530907012 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """CacheEngine class for managing the KV cache.""" from typing import List diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 82eeeb570..677d66357 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, cast diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index fb436a079..6213cf760 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import weakref diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 2a60e5126..174f86f48 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1436a4043..b04a9a1eb 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A CPU worker class.""" import os from typing import Dict, List, Optional, Set, Tuple, Type diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 3957e5608..a3e7b0147 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: 
Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import itertools diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e2261cbb2..17123d2b4 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 533fead0e..6d76ea499 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8c968faa7..75501e0f7 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import gc diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 935325cb2..d567ce4a6 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from abc import ABC, abstractmethod diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py index 2c5e2eac7..f0210c13c 100644 --- a/vllm/worker/multi_step_hpu_worker.py +++ b/vllm/worker/multi_step_hpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project ############################################################################### # Copyright (C) 2025 Habana Labs, Ltd. an Intel Company diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index f8d5acf58..cc0cc855e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import functools diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index aafb7ab7c..336e41649 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from importlib.util import find_spec from typing import List, Optional diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index 3a9c0993e..de9827723 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional import torch diff --git a/vllm/worker/multi_step_tpu_worker.py b/vllm/worker/multi_step_tpu_worker.py index 387119998..ed9f00166 100644 --- a/vllm/worker/multi_step_tpu_worker.py +++ b/vllm/worker/multi_step_tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Dict, Optional, Tuple diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 3518ab2f6..ea16e14f9 100644 --- 
a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from dataclasses import dataclass diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 3aff3e01a..28855bb46 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from dataclasses import dataclass diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 64daee31b..662bde6bc 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A Neuron worker class.""" import os from typing import List, Optional, Set, Tuple diff --git a/vllm/worker/neuronx_distributed_model_runner.py b/vllm/worker/neuronx_distributed_model_runner.py index 9cd4f88d3..2a0f4e77c 100644 --- a/vllm/worker/neuronx_distributed_model_runner.py +++ b/vllm/worker/neuronx_distributed_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import List, Optional, Set diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 912e04c43..be6b3d137 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses from typing import Any, Dict, List, Optional, Tuple, Type, Union diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index e0cca9072..5f1535271 100644 --- a/vllm/worker/tpu_model_runner.py +++ 
b/vllm/worker/tpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum import time diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 4bb9bea02..ad5ed19e2 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from typing import List, Optional, Tuple, Union diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index e2854bcb3..1a5f62cb3 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project ''' Worker-related helper functions. ''' diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2a4317271..9a9286326 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A GPU worker class.""" import gc import os diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e5662e693..db1ca2d8f 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import os diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 79fa7d2c7..ecbb63d91 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import dataclasses import time diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index a5109a982..fe321c059 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """A XPU worker class.""" import gc import os -- GitLab From 19bdaf32b139656627c8b311361a0fa38ae98f4b Mon Sep 17 00:00:00 2001 From: SorenDreano <71752785+SorenDreano@users.noreply.github.com> Date: Tue, 3 Jun 2025 20:50:55 +0200 Subject: [PATCH 141/274] [Doc] Readme standardization (#18695) Co-authored-by: Soren Dreano --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 67f6b957e..ec16d7583 100644 --- a/README.md +++ b/README.md @@ -58,8 +58,8 @@ vLLM is fast with: - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html) - Continuous batching of incoming requests - Fast model execution with CUDA/HIP graph -- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8. -- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer. +- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8 +- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer - Speculative decoding - Chunked prefill @@ -72,14 +72,14 @@ vLLM is flexible and easy to use with: - Tensor parallelism and pipeline parallelism support for distributed inference - Streaming outputs - OpenAI-compatible API server -- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron. 
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron - Prefix caching support - Multi-LoRA support vLLM seamlessly supports most popular open-source models on HuggingFace, including: - Transformer-like LLMs (e.g., Llama) - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3) -- Embedding Models (e.g. E5-Mistral) +- Embedding Models (e.g., E5-Mistral) - Multi-modal LLMs (e.g., LLaVA) Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html). @@ -162,4 +162,4 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs ## Media Kit -- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit). +- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit) -- GitLab From 01eee4053606458b2596818acd1fffee699ed75d Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Wed, 4 Jun 2025 03:08:21 +0800 Subject: [PATCH 142/274] [doc] update docker version (#19074) Signed-off-by: reidliu41 Co-authored-by: reidliu41 --- docs/deployment/docker.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 9e506d3d7..93d9e80f5 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -46,11 +46,11 @@ You can add any other [engine-args][engine-args] you need after the image tag (` create a custom Dockerfile on top of the base image with an extra layer that installs them: ```Dockerfile - FROM vllm/vllm-openai:v0.8.3 + FROM vllm/vllm-openai:v0.9.0 # e.g. install the `audio` optional dependencies # NOTE: Make sure the version of vLLM matches the base image! - RUN uv pip install --system vllm[audio]==0.8.3 + RUN uv pip install --system vllm[audio]==0.9.0 ``` !!! 
tip -- GitLab From fa98d77773c649de05a4bda9847682c80287aa36 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Tue, 3 Jun 2025 15:30:02 -0400 Subject: [PATCH 143/274] [Kernel] DeepEP dispatch-combine kernel integration (#18434) Signed-off-by: Varun Co-authored-by: Varun Sundar Rabindranath --- csrc/moe/topk_softmax_kernels.cu | 16 +- tests/kernels/moe/__init__.py | 0 tests/kernels/moe/deepep_utils.py | 188 +++++++ tests/kernels/moe/test_deepep_deepgemm_moe.py | 371 ++++++++++++++ tests/kernels/moe/test_deepep_moe.py | 459 ++++++++++++++++++ vllm/config.py | 2 + .../device_communicators/all2all.py | 146 +++++- .../device_communicators/cuda_communicator.py | 8 + vllm/envs.py | 2 + .../layers/fused_moe/deep_gemm_moe.py | 32 +- .../fused_moe/deepep_ht_prepare_finalize.py | 236 +++++++++ .../fused_moe/deepep_ll_prepare_finalize.py | 152 ++++++ .../layers/fused_moe/fused_batched_moe.py | 57 ++- .../layers/fused_moe/fused_moe.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 148 ++++-- .../layers/fused_moe/modular_kernel.py | 162 +++++-- .../layers/fused_moe/moe_permute_unpermute.py | 5 +- .../layers/fused_moe/pplx_prepare_finalize.py | 11 +- .../layers/fused_moe/prepare_finalize.py | 12 +- .../layers/fused_moe/triton_deep_gemm_moe.py | 7 +- vllm/model_executor/layers/fused_moe/utils.py | 4 +- .../model_executor/layers/quantization/fp8.py | 41 +- vllm/platforms/cuda.py | 15 + 23 files changed, 1952 insertions(+), 124 deletions(-) create mode 100644 tests/kernels/moe/__init__.py create mode 100644 tests/kernels/moe/deepep_utils.py create mode 100644 tests/kernels/moe/test_deepep_deepgemm_moe.py create mode 100644 tests/kernels/moe/test_deepep_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py create mode 100644 vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index a93790322..10be47966 100644 --- 
a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -516,9 +516,8 @@ void topk_softmax( topk, stream); } - else + else if (topk_indices.scalar_type() == at::ScalarType::UInt32) { - assert(topk_indices.scalar_type() == at::ScalarType::UInt32); vllm::moe::topkGatingSoftmaxKernelLauncher( gating_output.data_ptr(), topk_weights.data_ptr(), @@ -530,4 +529,17 @@ void topk_softmax( topk, stream); } + else { + assert(topk_indices.scalar_type() == at::ScalarType::Int64); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/tests/kernels/moe/__init__.py b/tests/kernels/moe/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/kernels/moe/deepep_utils.py b/tests/kernels/moe/deepep_utils.py new file mode 100644 index 000000000..2bc9b657d --- /dev/null +++ b/tests/kernels/moe/deepep_utils.py @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +DeepEP test utilities +""" +import dataclasses +import importlib +import traceback +from typing import Callable, Optional + +import torch +from torch.distributed import ProcessGroup +from torch.multiprocessing import ( + spawn) # pyright: ignore[reportPrivateImportUsage] +from typing_extensions import Concatenate, ParamSpec + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + DeepEPLLPrepareAndFinalize) + +## Parallel Processes Utils + +P = ParamSpec("P") + + +@dataclasses.dataclass +class ProcessGroupInfo: + world_size: int + world_local_size: int + rank: int + node_rank: int + local_rank: int + device: torch.device + + +def 
_worker_parallel_launch( + local_rank: int, + world_size: int, + world_local_size: int, + node_rank: int, + init_method: str, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + rank = node_rank * world_local_size + local_rank + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + torch.distributed.init_process_group( + backend="cpu:gloo,cuda:nccl", + init_method=init_method, + rank=rank, + world_size=world_size, + device_id=device, + ) + barrier = torch.tensor([rank], device=device) + torch.distributed.all_reduce(barrier) + + try: + worker( + ProcessGroupInfo( + world_size=world_size, + world_local_size=world_local_size, + rank=rank, + node_rank=node_rank, + local_rank=local_rank, + device=device, + ), + *args, + **kwargs, + ) + except Exception as ex: + print(ex) + traceback.print_exc() + raise + finally: + torch.distributed.destroy_process_group() + + +def parallel_launch( + world_size: int, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + assert not kwargs + spawn( + _worker_parallel_launch, + args=( + world_size, + world_size, + 0, + "tcp://localhost:29500", + worker, + ) + args, + nprocs=world_size, + join=True, + ) + + +## DeepEP specific utils + + +@dataclasses.dataclass +class DeepEPHTArgs: + num_local_experts: int + + +@dataclasses.dataclass +class DeepEPLLArgs: + max_tokens_per_rank: int + hidden_size: int + num_experts: int + use_fp8_dispatch: bool + + +def make_deepep_ht_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + ht_args: DeepEPHTArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + + import deep_ep + + # high throughput a2a + num_nvl_bytes = 1024 * 1024 * 1024 # 1GB + num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1 + buffer = deep_ep.Buffer(group=pg, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + 
low_latency_mode=low_latency_mode, + num_qps_per_rank=num_qps_per_rank) + return DeepEPHTPrepareAndFinalize(buffer=buffer, + world_size=pgi.world_size, + rank=pgi.rank, + dp_size=dp_size, + rank_expert_offset=pgi.rank * + ht_args.num_local_experts, + quant_dtype=q_dtype, + block_shape=block_shape) + + +def make_deepep_ll_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + deepep_ll_args: DeepEPLLArgs, + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + + import deep_ep + + # low-latency a2a + num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( + deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size, + pgi.world_size, deepep_ll_args.num_experts) + + buffer = deep_ep.Buffer(group=pg, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=deepep_ll_args.num_experts // + pgi.world_size) + return DeepEPLLPrepareAndFinalize( + buffer=buffer, + world_size=pgi.world_size, + dp_size=dp_size, + max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank, + quant_dtype=q_dtype, + use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch, + ) + + +def make_deepep_a2a(pg: ProcessGroup, + pgi: ProcessGroupInfo, + dp_size: int, + deepep_ht_args: Optional[DeepEPHTArgs], + deepep_ll_args: Optional[DeepEPLLArgs], + q_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + if deepep_ht_args is not None: + assert deepep_ll_args is None + return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype, + block_shape) + + assert deepep_ll_args is not None + return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype) diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py new file mode 100644 index 000000000..a1fdc1d5f --- /dev/null +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -0,0 +1,371 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Test DeepEP + DeepGEMM integration +""" + +import dataclasses +import 
importlib +from typing import Optional + +import pytest +import torch.distributed +from torch.distributed import ProcessGroup +from typing_extensions import ParamSpec + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.platforms import current_platform + +from .deepep_utils import ProcessGroupInfo, parallel_launch + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None + +try: + import deep_gemm + has_deep_gemm = True +except ImportError: + has_deep_gemm = False + +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + + from .deepep_utils import DeepEPHTArgs, make_deepep_a2a + +if has_deep_gemm: + from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + DeepGemmExperts) + +requires_deep_ep = pytest.mark.skipif( + not has_deep_ep, + reason="Requires deep_ep kernels", +) + +requires_deep_gemm = pytest.mark.skipif( + not has_deep_gemm, + reason="Requires deep_gemm kernels", +) + +P = ParamSpec("P") + + +def per_block_cast_to_fp8( + x: torch.Tensor, + block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros( + (deep_gemm.ceil_div(m, 128) * 128, + deep_gemm.ceil_div(n, block_size_n) * block_size_n), + dtype=x.dtype, + device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) + x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous() + scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2)) + return 
x_scaled_sub, scales + + +def make_block_quant_fp8_weights( + e: int, + n: int, + k: int, + block_size: list[int], +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Return weights w1, w2, w1q, w2q, w1_scale, w2_scale + """ + dtype = torch.bfloat16 + + fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + w1_bf16 = torch.randn((e, 2 * n, k), dtype=dtype) / 10 + w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) + + w2_bf16 = torch.randn((e, k, n), dtype=dtype) / 10 + w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype) + + block_n, block_k = block_size[0], block_size[1] + n_tiles_w1 = ((2 * n) + block_n - 1) // block_n + k_tiles_w1 = (k + block_k - 1) // block_k + n_tiles_w2 = (k + block_n - 1) // block_n + k_tiles_w2 = (n + block_k - 1) // block_k + + w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn) + w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn) + + w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1), + device="cuda", + dtype=torch.float32) + w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2), + device="cuda", + dtype=torch.float32) + + assert w1_s.shape == (e, (2 * n + 127) // 128, (k + 127) // 128) + assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2] + + for i in range(e): + w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i]) + w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i]) + + return w1, w2, w1_s, w2_s + + +@dataclasses.dataclass +class TestConfig: + topk: int + m: int + k: int + n: int + num_experts: int + block_size: list[int] + + +@dataclasses.dataclass +class TestTensors: + rank_tokens: torch.Tensor # all ranks make this many tokens + rank_token_scales: Optional[torch.Tensor] + topk: torch.Tensor + topk_weights: torch.Tensor + config: TestConfig + + @staticmethod + def make(config: TestConfig, rank) -> "TestTensors": + + dtype = torch.bfloat16 + topk, m, k, block_size = (config.topk, config.m, config.k, + config.block_size) + + 
fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_max, fp8_min = fp8_info.max, fp8_info.min + + rank_tokens = torch.randn( + (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 + rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max) + + block_k = block_size[1] + _, rank_token_scales = per_token_group_quant_fp8(rank_tokens, block_k) + + topk_ids = torch.randint( + low=0, + high=config.num_experts, + size=(m, topk), + device=torch.cuda.current_device()).to(dtype=torch.int64) + + topk_weights = torch.randn(topk_ids.shape, + dtype=torch.float32, + device=torch.cuda.current_device()) + + return TestTensors(rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk_ids, + topk_weights=topk_weights, + config=config) + + +def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + num_local_experts: int, q_dtype: Optional[torch.dtype], + block_shape: list[int]) -> FusedMoEModularKernel: + + a2a: DeepEPHTPrepareAndFinalize = make_deepep_a2a( + pg=pg, + pgi=pgi, + dp_size=dp_size, + deepep_ht_args=DeepEPHTArgs(num_local_experts=num_local_experts), + deepep_ll_args=None, + q_dtype=q_dtype, + block_shape=block_shape) + + fused_experts = DeepGemmExperts() + mk = FusedMoEModularKernel(prepare_finalize=a2a, + fused_experts=fused_experts) + return mk + + +def deep_ep_moe_impl(pg: ProcessGroup, pgi: ProcessGroupInfo, dp_size: int, + test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + num_experts: int) -> torch.Tensor: + + num_local_experts = w1.size(0) + + def build_expert_map(): + num_local_experts = w1.size(0) + expert_map = torch.full((num_experts, ), + fill_value=-1, + dtype=torch.int32) + s = pgi.rank * num_local_experts + e = s + num_local_experts + expert_map[s:e] = torch.tensor(list(range(num_local_experts))) + return expert_map.to(device=torch.cuda.current_device(), + dtype=torch.int32) + + q_dtype = torch.float8_e4m3fn + + # Make 
modular kernel + mk: FusedMoEModularKernel = make_modular_kernel( + pg, pgi, dp_size, num_local_experts, q_dtype, + test_tensors.config.block_size) + + a1_scale = test_tensors.rank_token_scales + + out = mk.forward(hidden_states=test_tensors.rank_tokens, + w1=w1, + w2=w2, + topk_weights=test_tensors.topk_weights, + topk_ids=test_tensors.topk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=a1_scale, + a2_scale=None, + apply_router_weight_on_input=False) + return out + + +def triton_impl(a: torch.Tensor, topk_ids: torch.Tensor, + topk_weights: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, + w1_scale: torch.Tensor, w2_scale: torch.Tensor, + a1_scale: torch.Tensor, block_shape: list[int]): + + return fused_experts( + hidden_states=a, + w1=w1, + w2=w2, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=False, + use_fp8_w8a8=True, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + block_shape=block_shape, + # Make sure this is set to False so we + # dont end up comparing the same implementation. 
+ allow_deep_gemm=False) + + +def _deep_ep_moe( + pgi: ProcessGroupInfo, + dp_size: int, + config: TestConfig, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, +): + current_platform.seed_everything(pgi.rank) + + w1 = w1.to(device=torch.cuda.current_device()) + w2 = w2.to(device=torch.cuda.current_device()) + w1_scale = w1_scale.to(device=torch.cuda.current_device()) + w2_scale = w2_scale.to(device=torch.cuda.current_device()) + + pg = torch.distributed.new_group(list(range(pgi.world_size))) + test_tensors = TestTensors.make(config, pgi.rank) + block_shape = [ + w1.size(1) // w1_scale.size(1), + w1.size(2) // w1_scale.size(2) + ] + + with set_current_vllm_config(VllmConfig()): + # Reference + triton_moe = triton_impl(a=test_tensors.rank_tokens, + topk_ids=test_tensors.topk, + topk_weights=test_tensors.topk_weights, + w1=w1, + w2=w2, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=test_tensors.rank_token_scales, + block_shape=block_shape) + + # Slice experts for this rank. 
+ num_local_experts = config.num_experts // pgi.world_size + e_start = num_local_experts * pgi.rank + e_end = e_start + num_local_experts + w1_ep = w1[e_start:e_end] + w2_ep = w2[e_start:e_end] + w1_scale_ep = w1_scale[e_start:e_end] + w2_scale_ep = w2_scale[e_start:e_end] + + deepep_moe = deep_ep_moe_impl( + pg, + pgi, + dp_size, + test_tensors, + w1_ep, + w2_ep, + w1_scale_ep, + w2_scale_ep, + config.num_experts, + ) + + torch.testing.assert_close( + triton_moe, + deepep_moe, + atol=6e-2, + rtol=6e-2, + ) + + +MNKs = [ + (8, 128, 128), + (8, 128, 512), + (8, 512, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (129, 128, 256), + (129, 1024, 2048), + (222, 1024, 2048), +] + + +@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@requires_deep_ep +@requires_deep_gemm +def test_deep_ep_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, + world_dp_size: tuple[int, int]): + + m, n, k = mnk + current_platform.seed_everything(7) + + if topk > num_experts: + pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") + + block_m = deep_gemm.get_m_alignment_for_contiguous_layout() + block_size = [block_m, block_m] + + world_size, dp_size = world_dp_size + config = TestConfig( + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts, + block_size=block_size, + ) + + w1, w2, w1_scale, w2_scale = make_block_quant_fp8_weights( + num_experts, n, k, block_size) + + parallel_launch(world_size, _deep_ep_moe, dp_size, config, w1, w2, + w1_scale, w2_scale) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py new file mode 100644 index 000000000..7e029ea95 --- /dev/null +++ b/tests/kernels/moe/test_deepep_moe.py @@ -0,0 +1,459 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Test deepep dispatch-combine logic +""" + +import dataclasses +import importlib +from typing import 
Optional, Union + +import pytest +import torch.distributed +from torch.distributed import ProcessGroup + +from vllm import _custom_ops as ops +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import TritonExperts +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + per_token_group_quant_fp8) +from vllm.platforms import current_platform + +from .deepep_utils import ProcessGroupInfo, parallel_launch + +has_deep_ep = importlib.util.find_spec("deep_ep") is not None + +if has_deep_ep: + from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 + DeepEPHTPrepareAndFinalize) + from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 + DeepEPLLPrepareAndFinalize) + + from .deepep_utils import DeepEPHTArgs, DeepEPLLArgs, make_deepep_a2a + +requires_deep_ep = pytest.mark.skipif( + not has_deep_ep, + reason="Requires deep_ep kernels", +) + +MAX_TOKENS_PER_RANK = 64 + + +def make_weights( + e, n, k, dtype +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Return weights w1, w2, w1_scale, w2_scale + """ + if dtype in [torch.float16, torch.bfloat16]: + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + return w1, w2, None, None + + # per-out-channel weight quantization + assert dtype == torch.float8_e4m3fn + w1 = torch.empty((e, 2 * n, k), device="cuda", dtype=torch.float16) + w2 = torch.empty((e, k, n), device="cuda", dtype=torch.float16) + + n_b_scales = 2 * n + k_b_scales = k + w1_q = torch.empty_like(w1, dtype=dtype) + w2_q = torch.empty_like(w2, dtype=dtype) + w1_scale = torch.empty((e, 
n_b_scales, 1), + device="cuda", + dtype=torch.float32) + w2_scale = torch.empty((e, k_b_scales, 1), + device="cuda", + dtype=torch.float32) + for expert in range(e): + w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant( + w1[expert], use_per_token_if_dynamic=True) + w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant( + w2[expert], use_per_token_if_dynamic=True) + return w1_q, w2_q, w1_scale, w2_scale + + +@dataclasses.dataclass +class TestConfig: + dtype: torch.dtype + topk: int + m: int + k: int + n: int + num_experts: int + + +@dataclasses.dataclass +class TestTensors: + rank_tokens: torch.Tensor # all ranks make this many tokens + rank_token_scales: Optional[torch.Tensor] + topk: torch.Tensor + topk_weights: torch.Tensor + config: TestConfig + + @staticmethod + def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors": + # TODO (varun) - check that float16 works ? + assert config.dtype in [torch.bfloat16, torch.float8_e4m3fn] + token_dtype = (torch.bfloat16 if config.dtype == torch.float8_e4m3fn + else config.dtype) + rank_tokens = torch.randn( + (config.m, config.k), device="cuda", dtype=token_dtype) / 10 + rank_token_scales = None + if config.dtype == torch.float8_e4m3fn: + # low_latency_mode kernels dont support per-token quant. 
+ _, rank_token_scales = ops.scaled_fp8_quant( + rank_tokens, use_per_token_if_dynamic=not low_latency_mode) + + topk = torch.randint(low=0, + high=config.num_experts, + size=(config.m, config.topk), + device="cuda").to(dtype=torch.int64) + topk_weights = torch.randn(topk.shape, + dtype=torch.float32, + device="cuda") + return TestTensors(rank_tokens=rank_tokens, + rank_token_scales=rank_token_scales, + topk=topk, + topk_weights=topk_weights, + config=config) + + +def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo, + low_latency_mode: bool, hidden_size: int, dp_size: int, + num_experts: int, num_local_experts: int, + q_dtype: Optional[torch.dtype], + use_fp8_dispatch: bool) -> FusedMoEModularKernel: + + is_quantized = q_dtype is not None + + ht_args: Optional[DeepEPHTArgs] = None + ll_args: Optional[DeepEPLLArgs] = None + + if low_latency_mode: + ll_args = DeepEPLLArgs(max_tokens_per_rank=MAX_TOKENS_PER_RANK, + hidden_size=hidden_size, + num_experts=num_experts, + use_fp8_dispatch=use_fp8_dispatch) + else: + assert not use_fp8_dispatch, ( + "FP8 Dispatch is valid only for low-latency kernels") + ht_args = DeepEPHTArgs(num_local_experts=num_local_experts) + + a2a : Union[DeepEPHTPrepareAndFinalize, DeepEPLLPrepareAndFinalize] = \ + make_deepep_a2a(pg = pg, + pgi = pgi, + dp_size = dp_size, + q_dtype = q_dtype, + block_shape = None, + deepep_ht_args = ht_args, + deepep_ll_args = ll_args) + + if low_latency_mode: + fused_experts = BatchedTritonExperts( + max_num_tokens=MAX_TOKENS_PER_RANK, + world_size=pgi.world_size, + dp_size=dp_size, + use_fp8_w8a8=is_quantized, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False) + else: + fused_experts = TritonExperts(use_fp8_w8a8=is_quantized, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + per_channel_quant=False) + + mk = FusedMoEModularKernel(prepare_finalize=a2a, + fused_experts=fused_experts) + return mk + + +def deep_ep_moe_impl(pg: ProcessGroup, pgi: 
ProcessGroupInfo, + low_latency_mode: bool, dp_size: int, + test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], num_experts: int, + use_fp8_dispatch: bool) -> torch.Tensor: + + num_local_experts = w1.size(0) + + def build_expert_map(): + num_local_experts = w1.size(0) + expert_map = torch.full((num_experts, ), + fill_value=-1, + dtype=torch.int32) + s = pgi.rank * num_local_experts + e = s + num_local_experts + expert_map[s:e] = torch.tensor(list(range(num_local_experts))) + return expert_map.to(device=torch.cuda.current_device(), + dtype=torch.int32) + + hidden_size = test_tensors.rank_tokens.size(1) + is_quantized = w1.dtype == torch.float8_e4m3fn + q_dtype = None + if is_quantized: + q_dtype = torch.float8_e4m3fn + + # Make modular kernel + mk: FusedMoEModularKernel = make_modular_kernel(pg, pgi, low_latency_mode, + hidden_size, dp_size, + num_experts, + num_local_experts, q_dtype, + use_fp8_dispatch) + + out_hidden_states = torch.empty_like(test_tensors.rank_tokens) + total_num_tokens = test_tensors.rank_tokens.size(0) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + rank_tokens_chunk = test_tensors.rank_tokens[chunk_start:chunk_end] + topk_weights_chunk = test_tensors.topk_weights[chunk_start:chunk_end] + topk_chunk = test_tensors.topk[chunk_start:chunk_end] + rank_token_scales_chunk = test_tensors.rank_token_scales + if rank_token_scales_chunk is not None and rank_token_scales_chunk.size( + 0) == total_num_tokens: + # per act token + rank_token_scales_chunk = rank_token_scales_chunk[ + chunk_start:chunk_end] + + out = mk.forward(hidden_states=rank_tokens_chunk, + w1=w1, + w2=w2, + topk_weights=topk_weights_chunk, + topk_ids=topk_chunk, + inplace=False, + activation="silu", + global_num_experts=num_experts, + expert_map=build_expert_map(), + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=None, + w2_zp=None, + a1_scale=rank_token_scales_chunk, + 
a2_scale=None, + apply_router_weight_on_input=False) + + if not skip_result_store: + out_hidden_states[chunk_start:chunk_end, :].copy_( + out, non_blocking=True) + + max_num_tokens_per_dp = (MAX_TOKENS_PER_RANK + if low_latency_mode else total_num_tokens) + + for chunk_start_ in range(0, total_num_tokens, max_num_tokens_per_dp): + chunk_start = chunk_start_ + chunk_end = min(chunk_start + max_num_tokens_per_dp, total_num_tokens) + # clamp start and end + chunk_start = min(chunk_start, total_num_tokens - 1) + chunk_end = min(chunk_end, total_num_tokens) + + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= total_num_tokens) + + return out_hidden_states + + +def torch_moe_impl(test_tensors: TestTensors, w1: torch.Tensor, + w2: torch.Tensor, w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], using_fp8_dispatch: bool): + + a, topk_ids, topk_weights = (test_tensors.rank_tokens, test_tensors.topk, + test_tensors.topk_weights) + if using_fp8_dispatch: + # The DeepEP implementation is requested to dispatch using FP8. + # For numerical stability for testing, emulate the fp8 dispatch by + # blockwise quant and de-quant. 
+ a = test_tensors.rank_tokens + aq, aq_scale = per_token_group_quant_fp8(a, 128) + a = (aq.view(-1, 128).to(torch.float32) * aq_scale.view(-1, 1)).view( + a.shape).to(a.dtype) + + is_quantized = w1.dtype == torch.float8_e4m3fn + a_dtype = a.dtype + if is_quantized: + w1 = w1.to(dtype=torch.float32) * w1_scale + w2 = w2.to(dtype=torch.float32) * w2_scale + a = a.to(dtype=torch.float32) + + m, _ = a.shape + topk = topk_ids.size(1) + out = torch.zeros_like(a) + + for i in range(m): + a_i = a[i] + o_i = out[i] + for j in range(topk): + e = topk_ids[i][j] + e_w = topk_weights[i][j] + w1_e = w1[e] + w2_e = w2[e] + o_i += (SiluAndMul() + (a_i @ w1_e.transpose(0, 1)) @ w2_e.transpose(0, 1)) * e_w + + if is_quantized: + out = out.to(dtype=a_dtype) + + return out + + +def _deep_ep_moe( + pgi: ProcessGroupInfo, + low_latency_mode: bool, + dp_size: int, + config: TestConfig, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + use_fp8_dispatch: bool, +): + + if not low_latency_mode: + assert not use_fp8_dispatch, ( + "FP8 dispatch interface is available only in low-latency mode") + + is_quantized = w1.dtype == torch.float8_e4m3fn + w1 = w1.to(device=torch.cuda.current_device()) + w2 = w2.to(device=torch.cuda.current_device()) + if is_quantized: + w1_scale = w1_scale.to( # type: ignore + device=torch.cuda.current_device()) + w2_scale = w2_scale.to( # type: ignore + device=torch.cuda.current_device()) + + pg = torch.distributed.new_group(list(range(pgi.world_size))) + test_tensors = TestTensors.make(config, low_latency_mode) + + with set_current_vllm_config(VllmConfig()): + # Reference + torch_combined = torch_moe_impl(test_tensors, w1, w2, w1_scale, + w2_scale, use_fp8_dispatch) + + # Splice experts for this rank. 
+ num_local_experts = config.num_experts // pgi.world_size + e_start = num_local_experts * pgi.rank + e_end = e_start + num_local_experts + w1_ep = w1[e_start:e_end] + w2_ep = w2[e_start:e_end] + + w1_scale_ep, w2_scale_ep = None, None + if is_quantized: + w1_scale_ep = w1_scale[e_start:e_end] # type: ignore + w2_scale_ep = w2_scale[e_start:e_end] # type: ignore + deepep_combined = deep_ep_moe_impl( + pg, + pgi, + low_latency_mode, + dp_size, + test_tensors, + w1_ep, + w2_ep, + w1_scale_ep, + w2_scale_ep, + config.num_experts, + use_fp8_dispatch, + ) + + torch.testing.assert_close( + torch_combined, + deepep_combined, + atol=6e-2, + rtol=6e-2, + ) + + +MNKs = [ + (1, 128, 128), + (2, 128, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (222, 1024, 2048), +] + +DTYPES = [torch.bfloat16, torch.float8_e4m3fn] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("mnk", MNKs) +@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@requires_deep_ep +def test_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], + num_experts: int, topk: int, world_dp_size: tuple[int, + int]): + low_latency_mode = False + use_fp8_dispatch = False + m, n, k = mnk + + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + config = TestConfig(dtype=dtype, + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts) + + w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) + + parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, + config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch) + + +MNKs = [ + (1, 128, 2560), + (2, 128, 2560), + (3, 1024, 2560), + (32, 128, 2560), + (45, 512, 2560), + (64, 1024, 2560), + (222, 1024, 2560), +] +DTYPES = [torch.float8_e4m3fn, torch.bfloat16] +USE_FP8_DISPATCH = [True, False] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("mnk", MNKs) 
+@pytest.mark.parametrize("num_experts", [32]) +@pytest.mark.parametrize("topk", [6]) +@pytest.mark.parametrize("world_dp_size", [(2, 1)]) +@pytest.mark.parametrize("use_fp8_dispatch", USE_FP8_DISPATCH) +@requires_deep_ep +def test_low_latency_deep_ep_moe(dtype: torch.dtype, mnk: tuple[int, int, int], + num_experts: int, topk: int, + world_dp_size: tuple[int, int], + use_fp8_dispatch: bool): + + low_latency_mode = True + m, n, k = mnk + + if (low_latency_mode + and k not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES): + pytest.skip( + f"Skipping test as hidden size {k} is not in list of supported " + f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" + ) + + current_platform.seed_everything(7) + world_size, dp_size = world_dp_size + config = TestConfig(dtype=dtype, + topk=topk, + m=m, + k=k, + n=n, + num_experts=num_experts) + + w1, w2, w1_scale, w2_scale = make_weights(num_experts, n, k, dtype) + + parallel_launch(world_size, _deep_ep_moe, low_latency_mode, dp_size, + config, w1, w2, w1_scale, w2_scale, use_fp8_dispatch) diff --git a/vllm/config.py b/vllm/config.py index d99e501ca..f6ca9328b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1856,6 +1856,8 @@ class ParallelConfig: factors.append(self.pipeline_parallel_size) factors.append(self.tensor_parallel_size) factors.append(self.enable_expert_parallel) + factors.append(self.data_parallel_size) + factors.append(envs.VLLM_ALL2ALL_BACKEND) return hashlib.sha256(str(factors).encode()).hexdigest() def __post_init__(self) -> None: diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index ae7590299..2ab3779ec 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import importlib.util -from typing import TYPE_CHECKING +from typing import 
TYPE_CHECKING, Any import torch import torch.distributed as dist @@ -129,3 +129,147 @@ class PPLXAll2AllManager(All2AllManagerBase): from pplx_kernels.nvshmem import nvshmem_finalize logger.debug("PPLX NVSHMEM finalize") nvshmem_finalize() + + +class DeepEPAll2AllManagerBase(All2AllManagerBase): + """ + All2All communication based on DeepEP High-Throughput kernels. + """ + + def __init__(self, cpu_group): + has_deepep = importlib.util.find_spec("deep_ep") is not None + assert has_deepep, "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install DeepEP kernels." # noqa + super().__init__(cpu_group) + self.handle_cache = Cache() + + # This is the DeepEP default. Stick to it till we can establish + # reasonable defaults based on profiling. + self.num_sms = 20 + + def get_handle(self, kwargs): + raise NotImplementedError + + def dispatch(self, hidden_states: torch.Tensor, + router_logits: torch.Tensor): + raise NotImplementedError + + def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + def destroy(self): + pass + + +class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): + """ + All2All communication based on DeepEP High-Throughput kernels. + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def _make_all2all_kwargs(self) -> dict[Any, Any]: + # Defaults for internode and intranode are taken from DeepEP tests. 
+ num_nvl_bytes = 1024 * 1024 * 1024 + num_rdma_bytes = None + num_qps_per_rank = None + + if self.internode: + num_rdma_bytes = 1024 * 1024 * 1024 + num_qps_per_rank = self.num_sms // 2 + else: + assert self.intranode + num_rdma_bytes = 0 + num_qps_per_rank = 1 + + assert num_rdma_bytes is not None + assert num_qps_per_rank is not None + return dict(group=self.cpu_group, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=False, + num_qps_per_rank=num_qps_per_rank) + + def get_handle(self, kwargs): + + assert len(kwargs) == 0, ( + "DeepEPHTAll2AllManager expects no arguments. All the required " + "args are computed in the Manager itself.") + + import deep_ep + buffer_kwargs = self._make_all2all_kwargs() + logger.debug("DeepEP all2all args %s", buffer_kwargs) + handle: deep_ep.Buffer = self.handle_cache.get_or_create( + buffer_kwargs, deep_ep.Buffer) + # It is dangerous to set num sms outside this function. num_sms is not + # a part of the hash-key that identifies this object. If we are in a + # situation where we make objects with different num_sms, the hash key + # in get_or_create must be updated. + handle.set_num_sms(self.num_sms) + return handle + + +class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): + """ + All2All communication based on DeepEP Low-Latency kernels. + """ + + def __init__(self, cpu_group): + super().__init__(cpu_group) + + def _make_all2all_kwargs( + self, + max_num_tokens_per_dp_rank: int, + token_hidden_size: int, + num_ep_ranks: int, + num_global_experts: int, + num_local_experts: int, + ) -> dict[Any, Any]: + """ + max_num_tokens_per_dp_rank : the maximum number of tokens a DP rank + can dispatch all the ranks must hold the same value. + token_hidden_size: the hidden dimension of each token. + num_ep_ranks: the number of EP group ranks. + num_global_experts: Number of experts in the model. + num_local_experts: Number of experts in an EP rank. 
+ """ + import deep_ep + + # Defaults for internode and intranode are taken from DeepEP tests. + num_nvl_bytes = 1024 * 1024 * 1024 + num_qps_per_rank = num_local_experts + num_rdma_bytes = None + + if self.internode: + num_rdma_bytes = 1024 * 1024 * 1024 + else: + assert self.intranode + num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( + num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, + hidden=token_hidden_size, + num_ranks=num_ep_ranks, + num_experts=num_global_experts) + + assert num_rdma_bytes is not None + return dict(group=self.cpu_group, + num_nvl_bytes=num_nvl_bytes, + num_rdma_bytes=num_rdma_bytes, + low_latency_mode=True, + num_qps_per_rank=num_qps_per_rank) + + def get_handle(self, kwargs): + """ + The kwargs for DeepEPLLAll2AllManager is dictated by + _make_all2all_kwargs. + """ + import deep_ep + buffer_kwargs = self._make_all2all_kwargs(**kwargs) + logger.debug("DeepEP all2all args %s", buffer_kwargs) + handle: deep_ep.Buffer = self.handle_cache.get_or_create( + buffer_kwargs, deep_ep.Buffer) + # It is dangerous to set num sms outside this function. num_sms is not + # a part of the hash-key that identifies this object. If we are in a + # situation where we make objects with different num_sms, the hash key + # in get_or_create must be updated. 
+ handle.set_num_sms(self.num_sms) + return handle diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 0eebdf873..055d91690 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -67,6 +67,14 @@ class CudaCommunicator(DeviceCommunicatorBase): from .all2all import PPLXAll2AllManager self.all2all_manager = PPLXAll2AllManager(self.cpu_group) logger.info("Using PPLX all2all manager.") + elif all2all_backend == "deepep_high_throughput": + from .all2all import DeepEPHTAll2AllManager + self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group) + logger.info("Using DeepEP High-Throughput all2all manager.") + elif all2all_backend == "deepep_low_latency": + from .all2all import DeepEPLLAll2AllManager + self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group) + logger.info("Using DeepEP Low-Latency all2all manager.") else: raise ValueError(f"Unknown all2all backend: {all2all_backend}") diff --git a/vllm/envs.py b/vllm/envs.py index 2e3d6eeb5..08bf2dad4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -826,6 +826,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Available options: # - "naive": naive all2all implementation using all-reduce # - "pplx": use pplx kernels + # - "deepep_high_throughput", use deepep high-throughput kernels + # - "deepep_low_latency", use deepep low-latency kernels "VLLM_ALL2ALL_BACKEND": lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"), diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 331544d64..97b4a49c0 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -12,8 +12,8 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( _moe_permute) from 
vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, - _resize_cache) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, per_token_group_quant_fp8) from vllm.utils import round_up logger = init_logger(__name__) @@ -34,10 +34,8 @@ def _valid_deep_gemm_shape(M: int, N: int, K: int): return align <= M and N % align == 0 and K % align == 0 -def _valid_deep_gemm(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - expert_map: Optional[torch.Tensor] = None) -> bool: +def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, + w2: torch.Tensor) -> bool: """ Check if the given problem size is supported by the DeepGemm grouped gemm kernel. All of M, N, K and the quantization block_shape must be @@ -47,10 +45,6 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, logger.debug("DeepGemm disabled: deep_gemm not available.") return False - if expert_map is not None: - logger.debug("DeepGemm disabled: expert map NYI.") - return False - M = hidden_states.size(0) _, K, N = w2.size() if not _valid_deep_gemm_shape(M, N, K): @@ -116,7 +110,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): a1q = hidden_states _, N, K = w1.size() - assert global_num_experts != -1 + if global_num_experts == -1: + global_num_experts = w1.size(0) + assert w2.size(1) == K a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute( @@ -128,6 +124,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): self.block_shape[0], ) + if expert_map is not None: + # DeepGemm (Grouped Contiguous) kernel needs a valid B index + # for all rows of A. To that effect, simply compute with + # the 0th weight matrix. + # Note that this relies on the fact that corresponding topk + # weights would be 0 during weight multiplication. 
+ expert_ids = torch.where(expert_ids == -1, 0, expert_ids) + # Note: M_sum is different than the pre-permuted shape of a1q. M_sum = a1q.size(0) workspace1 = _resize_cache(workspace13, (M_sum, N)) @@ -140,9 +144,9 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): self.activation(activation, workspace2, workspace1.view(-1, N)) a2q_scale: Optional[torch.Tensor] = None - - a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False, - self.block_shape) + a2q, a2q_scale = per_token_group_quant_fp8(workspace2, + self.block_shape[1], + column_major_scales=True) dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py new file mode 100644 index 000000000..48cf01638 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import deep_ep +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +class DeepEPHTPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + Prepare/Finalize using DeepEP High-Throughput kernels. + """ + + def __init__(self, + buffer: deep_ep.Buffer, + world_size: int, + rank: int, + dp_size: int, + rank_expert_offset: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + super().__init__() + self.buffer = buffer + self.world_size = world_size + self.rank = rank + self.dp_size = dp_size + self.rank_expert_offset = rank_expert_offset + self.quant_dtype = quant_dtype + self.block_shape = block_shape + # The dispatch function returns a handle that the combine function + # requires. We store the handle here so it is available to the + # combine function. 
+ self.handle = None + + # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164 + self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160] + + def max_num_tokens_per_rank(self) -> Optional[int]: + return None + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.int64 + + def _get_dispatch_config(self) -> Optional[deep_ep.Config]: + if self.dp_size not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_dispatch_config(self.dp_size) + + def _get_combine_config(self) -> Optional[deep_ep.Config]: + if self.dp_size not in self.available_rank_configs: + return None + return deep_ep.Buffer.get_combine_config(self.dp_size) + + def _do_quant(self, tokens: torch.Tensor, + token_scales: Optional[torch.Tensor], per_act_token: bool): + tokens, token_scales = moe_kernel_quantize_input( + tokens, token_scales, self.quant_dtype, per_act_token, + self.block_shape) + return tokens, token_scales + + def _do_dispatch(self, tokens: torch.Tensor, + token_scales: Optional[torch.Tensor], + rank_topk_ids: torch.Tensor, + rank_topk_weights: torch.Tensor, num_experts: int): + + has_scales = token_scales is not None + + (num_tokens_per_rank, num_tokens_per_rdma_rank, expert_num_tokens, + is_token_in_rank, event) = self.buffer.get_dispatch_layout( + topk_idx=rank_topk_ids, + num_experts=num_experts, + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + + token_data = tokens + if has_scales: + token_data = (tokens, token_scales) + + ( + token_data, expert_topk_ids, expert_topk_weights, + expert_num_tokens_per_expert_list, self.handle, event + ) = self.buffer.dispatch( + x=token_data, + handle=None, + num_tokens_per_rank=num_tokens_per_rank, + num_tokens_per_rdma_rank=num_tokens_per_rdma_rank, + is_token_in_rank=is_token_in_rank, + num_tokens_per_expert=expert_num_tokens, + topk_idx=rank_topk_ids, + topk_weights=rank_topk_weights, + # expert_alignment 
rounds the number of tokens per expert + # to this value. + expert_alignment=1, + config=self._get_dispatch_config(), + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + + if has_scales: + expert_x, expert_x_scale = token_data + else: + expert_x, expert_x_scale = token_data, None + + # The existing MOE kernels assume that all entries of topk_ids are + # valid. To that effect, set the -1s in expert_topk_ids to some expert + # outside this rank so the expert_map can remap it to -1 when safe. + # With Expert Parallel, the experts are divided amongst the rank + # sequentially. For rank 0, set it to num_experts - 1 and for all other + # ranks set it to 0 as we know that expert_map will have a -1 in those + # regions for those ranks. + # + # DeepEP's topk_ids output refers to the local experts directly. Offset + # the topk_ids to move it back to the global experts space so it aligns + # with existing vLLM interfaces. + expert_topk_ids = torch.where( + expert_topk_ids == -1, + num_experts - 1 if self.rank_expert_offset == 0 else 0, + expert_topk_ids + self.rank_expert_offset) + + return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + # Check if there is a block_shape / or if we can infer the quantization + # schemes from the scales. 
+ per_token_quant = None + if all([x is None for x in [self.block_shape, a1_scale, a2_scale] + ]) and self.quant_dtype is not None: + # Quantization required despite none of the inputs suggesting + # quantization. Fallback to per_token_dynamic quant. + per_token_quant = True + else: + per_token_quant = ((self.block_shape is not None) or + (a1_scale is not None and a1_scale.numel() != 1) + or (a2_scale is not None + and a2_scale.numel() != 1)) + + if per_token_quant: + a1q, a1q_scale = self._do_quant(a1, a1_scale, per_act_token=True) + (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) = self._do_dispatch( + tokens=a1q, + token_scales=a1q_scale, + rank_topk_ids=rank_topk_ids, + rank_topk_weights=rank_topk_weights, + num_experts=num_experts) + else: + # DeepEP kernels only support dispatching per-token-quant + # quantization. dispatch in bfloat16. + (expert_x, _, expert_num_tokens, expert_topk_ids, + expert_topk_weights) = self._do_dispatch( + tokens=a1, + token_scales=None, + rank_topk_ids=rank_topk_ids, + rank_topk_weights=rank_topk_weights, + num_experts=num_experts) + # quantize now + expert_x_scale = None + if expert_x.numel() != 0: + expert_x, expert_x_scale = self._do_quant(expert_x, + a1_scale, + per_act_token=False) + + return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids, + expert_topk_weights) + + def _apply_weights_and_reduce(self, num_tokens: int, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + apply_router_weight_on_input: bool, + output_dtype: torch.dtype): + + if fused_expert_output.ndim == 2: + hidden_dim = fused_expert_output.size(-1) + fused_expert_output = fused_expert_output.view( + num_tokens, -1, hidden_dim) + + if not apply_router_weight_on_input: + # The DeepEP combine kernels don't do the topk weight + # multiplication. We multiply the weights locally. 
+ fused_expert_output = fused_expert_output.to(torch.float32) + fused_expert_output = fused_expert_output * topk_weights.view( + fused_expert_output.size(0), -1, 1) + fused_expert_output = fused_expert_output.to(output_dtype) + + return fused_expert_output.sum(dim=1).to(output_dtype) + + def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + apply_router_weight_on_input: bool) -> None: + + assert self.handle is not None + + # fused_expert_output can have 0 tokens - This happens when none of the + # tokens from the all2all reach this EP rank. + if fused_expert_output.numel() != 0: + fused_expert_output = self._apply_weights_and_reduce( + num_tokens=topk_ids.size(0), + fused_expert_output=fused_expert_output, + topk_weights=topk_weights, + apply_router_weight_on_input=apply_router_weight_on_input, + output_dtype=output.dtype) + + combined_x, _, event = self.buffer.combine( + x=fused_expert_output, + handle=self.handle, + topk_weights=None, + config=self._get_combine_config(), + previous_event=None, + async_finish=False, + allocate_on_comm_stream=False) + # Respect inplace outputs. + output.copy_(combined_x, non_blocking=True) diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py new file mode 100644 index 000000000..b9d817a14 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import deep_ep +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + +# DeepEP kernels quantize dispatch inputs in 128 element chunks. 
+DEEPEP_QUANT_BLOCK_SIZE = 128 + + +def dequant_fp8(expert_x_fp8: torch.Tensor, + expert_x_scales: torch.Tensor) -> torch.Tensor: + """ + Return dequantized tensor in fp32 + """ + # TODO (varun) : Optimize leverage num_tokens_per_expert counts + assert expert_x_fp8.is_contiguous() + expert_x_scales = expert_x_scales.contiguous() + num_experts = expert_x_fp8.size(0) + + expert_x_fp32 = expert_x_fp8.to(torch.float32).view( + num_experts, -1, DEEPEP_QUANT_BLOCK_SIZE) + expert_x_scales = expert_x_scales.view(num_experts, -1, 1) + return (expert_x_fp32 * expert_x_scales).view(expert_x_fp8.shape) + + +class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + Prepare/Finalize using DeepEP low-latency kernels. + """ + + # DeepEP low-latency kernels are compiled only for certain + # specific hidden sizes. + SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168] + + def __init__(self, + buffer: deep_ep.Buffer, + world_size: int, + dp_size: int, + max_tokens_per_rank: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None, + use_fp8_dispatch: bool = False): + super().__init__() + + self.buffer = buffer + self.world_size = world_size + self.dp_size = dp_size + self.quant_dtype = quant_dtype + self.block_shape = block_shape + self.max_tokens_per_rank = max_tokens_per_rank + self.use_fp8_dispatch = use_fp8_dispatch + # The dispatch function returns a handle that the combine function + # requires. We store the handle here so it is available to the + # combine function. 
+ self.handle = None + + def max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_tokens_per_rank + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.int64 + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + + hidden_size = a1.size(1) + assert hidden_size in self.SUPPORTED_HIDDEN_SIZES, \ + (f"Hidden Size {hidden_size} not in supported list of hidden sizes" + f"{self.SUPPORTED_HIDDEN_SIZES}") + + if self.use_fp8_dispatch: + assert hidden_size % 128 == 0, \ + "DeepEP kernels quantize the inputs in blocks of shape 128" + + # Quantize + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + assert not per_act_token, ( + "low_latency kernels don't support per-act-token quant") + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + # Dispatch + expert_x, expert_num_tokens, self.handle, event, hook = \ + self.buffer.low_latency_dispatch(a1, + rank_topk_ids, + self.max_tokens_per_rank, + num_experts, + use_fp8=self.use_fp8_dispatch, + async_finish=False, + return_recv_hook=False) + + if self.use_fp8_dispatch: + # TODO (varun) : In the case of dynamic quantization, we could + # probably skip the quant below and use the results directly. + # Although note that the deepep quant is per token 128 elements. 
+ expert_x_fp8, expert_x_scales = expert_x + expert_x = dequant_fp8(expert_x_fp8, + expert_x_scales).to(dtype=a1.dtype) + + num_experts = expert_x.size(0) + hidden_dim = expert_x.size(-1) + + expert_x = expert_x.view((-1, expert_x.size(-1))) + expert_x, expert_x_scale = moe_kernel_quantize_input( + expert_x, a1_scale, self.quant_dtype, per_act_token, + self.block_shape) + expert_x = expert_x.view((num_experts, -1, hidden_dim)) + + return (expert_x, expert_x_scale, expert_num_tokens, None, None) + + def finalize(self, output: torch.Tensor, fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, topk_ids: torch.Tensor, + apply_router_weight_on_input: bool) -> None: + + assert self.handle is not None + + combine_topk_weights = topk_weights + if apply_router_weight_on_input: + # weights have already been applied. + combine_topk_weights = torch.ones_like(topk_weights) + + # TODO (varun) : Enable zero copy mode + _, event, hook = self.buffer.low_latency_combine( + fused_expert_output, + topk_ids, + combine_topk_weights, + self.handle, + async_finish=False, + zero_copy=False, + return_recv_hook=False, + out=output) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 205a95e7f..7490a192d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -10,7 +10,8 @@ import triton.language as tl import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.fused_moe import ( get_config_dtype_str, try_get_optimal_moe_config) -from vllm.model_executor.layers.fused_moe.utils import _resize_cache +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, moe_kernel_quantize_input) @triton.jit @@ -397,6 +398,12 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.rank = rank self.max_num_tokens = max_num_tokens + def 
max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_num_tokens + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return None + def prepare( self, a1: torch.Tensor, @@ -407,7 +414,8 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: assert a1.dim() == 2 assert topk_ids.dim() == 2 assert topk_ids.size(0) == a1.size(0) @@ -450,7 +458,7 @@ class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): first_expert, :rows, :] = a1[:topks.numel()][topks] tokens_per_expert[expert_id - first_expert] = rows - return b_a1, a1_scale, tokens_per_expert + return b_a1, a1_scale, tokens_per_expert, None, None def finalize( self, @@ -601,6 +609,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int8_w8a8: bool = False, use_int8_w8a16: bool = False, use_int4_w4a16: bool = False, + per_channel_quant: bool = False, block_shape: Optional[list[int]] = None, world_size: int = 1, dp_size: int = 1, @@ -611,12 +620,15 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): self.use_int4_w4a16 = use_int4_w4a16 self.use_int8_w8a16 = use_int8_w8a16 self.block_shape = block_shape + self.per_channel_quant = per_channel_quant self.max_num_tokens = max_num_tokens - assert not use_int8_w8a8, "NYI" - assert not use_int4_w4a16, "NYI" self.world_size = world_size self.dp_size = dp_size + assert not use_int8_w8a8, "NYI" + assert not use_int4_w4a16, "NYI" + assert self.block_shape is None, "NYI" + def workspace_shapes( self, a: torch.Tensor, @@ -670,8 +682,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn ] - # TODO: num_tokens -> max_num_tokens? 
- E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + E, max_num_tokens, N, K, top_k_num = mk._moe_problem_size( hidden_states, w1, w2, topk_ids) assert w1.size(0) == E @@ -687,7 +698,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): w2.size(), top_k_num, config_dtype, - num_tokens, + max_num_tokens, block_shape=self.block_shape, ) @@ -706,10 +717,12 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}") # We can reuse the memory between these because by the time we need # cache3, we're done with cache1 - intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N)) + intermediate_cache1 = _resize_cache(workspace13, + (E, max_num_tokens, N)) intermediate_cache2 = _resize_cache(workspace2, - (E, num_tokens, N // 2)) - intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K)) + (E, max_num_tokens, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, + (E, max_num_tokens, K)) # MM1 invoke_moe_batched_triton_kernel(A=hidden_states, @@ -731,15 +744,20 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): self.activation(activation, intermediate_cache2.view(-1, N // 2), intermediate_cache1.view(-1, N)) - #qintermediate_cache2 = intermediate_cache2 - a2q_scale = a2_scale - # TODO (varun) : support w8a8 - assert not self.use_fp8_w8a8 - #if self.use_fp8_w8a8: - # qintermediate_cache2, a2q_scale = _fp8_quantize( - # intermediate_cache2, a2_scale, self.block_shape) + ic2_hidden_size = intermediate_cache2.size(-1) + intermediate_cache2 = intermediate_cache2.view(-1, ic2_hidden_size) + + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + A=intermediate_cache2, + A_scale=a2_scale, + qtype=torch.float8_e4m3fn if self.use_fp8_w8a8 else None, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) - invoke_moe_batched_triton_kernel(A=intermediate_cache2, + qintermediate_cache2 = qintermediate_cache2.view( 
+ (E, -1, ic2_hidden_size)) + + invoke_moe_batched_triton_kernel(A=qintermediate_cache2, B=w2, C=intermediate_cache3, expert_num_tokens=expert_num_tokens, @@ -752,5 +770,4 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): use_int4_w4a16=self.use_int4_w4a16, config=config, block_shape=self.block_shape) - return intermediate_cache3 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 883a48c98..de7a9a8d0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1164,7 +1164,7 @@ def fused_experts(hidden_states: torch.Tensor, # permute/unpermute ops are available. N = w1.shape[1] if (allow_deep_gemm and use_fp8_w8a8 and N > 512 - and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + and _valid_deep_gemm(hidden_states, w1, w2)): assert apply_router_weight_on_input is False return deep_gemm_moe_fp8( hidden_states=hidden_states, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3ce4cbc28..1812f3b67 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -5,7 +5,7 @@ import importlib from abc import abstractmethod from dataclasses import dataclass from enum import Enum -from typing import Callable, Optional +from typing import Callable, Optional, Union import torch import torch.nn.functional as F @@ -30,16 +30,19 @@ from vllm.platforms.interface import CpuArchEnum from vllm.utils import direct_register_custom_op has_pplx = importlib.util.find_spec("pplx_kernels") is not None +has_deepep = importlib.util.find_spec("deep_ep") is not None if current_platform.is_cuda_alike(): - from .fused_batched_moe import (BatchedPrepareAndFinalize, - BatchedTritonExperts) + from .fused_batched_moe import BatchedTritonExperts from .fused_moe import TritonExperts, fused_experts from .modular_kernel import 
(FusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize) if has_pplx: from .pplx_prepare_finalize import PplxPrepareAndFinalize + if has_deepep: + from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize + from .deepep_ll_prepare_finalize import DeepEPLLPrepareAndFinalize else: fused_experts = None # type: ignore FusedMoEPermuteExpertsUnpermute = None # type: ignore @@ -71,10 +74,24 @@ class FusedMoEParallelConfig: use_ep: bool # whether to use EP or not + @property + def use_all2all_kernels(self): + return self.dp_size > 1 and self.use_ep + @property def use_pplx_kernels(self): - return self.dp_size > 1 and self.use_ep and \ - envs.VLLM_ALL2ALL_BACKEND == "pplx" + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "pplx") + + @property + def use_deepep_ht_kernels(self): + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput") + + @property + def use_deepep_ll_kernels(self): + return (self.use_all2all_kernels + and envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency") @staticmethod def make(tp_size_: int, dp_size_: int, @@ -231,6 +248,14 @@ class MoEConfig: def use_pplx_kernels(self): return self.moe_parallel_config.use_pplx_kernels + @property + def use_deepep_ht_kernels(self): + return self.moe_parallel_config.use_deepep_ht_kernels + + @property + def use_deepep_ll_kernels(self): + return self.moe_parallel_config.use_deepep_ll_kernels + class FusedMoeWeightScaleSupported(Enum): TENSOR = "tensor" @@ -252,7 +277,16 @@ class FusedMoEMethodBase(QuantizeMethodBase): all2all_manager = get_ep_group().device_communicator.all2all_manager assert all2all_manager is not None - prepare_finalize = None + quant_dtype = None + act_quant_block_size = None + from vllm.model_executor.layers.quantization.fp8 import Fp8Config + if isinstance(quant_config, Fp8Config): + act_quant_block_size = quant_config.weight_block_size + quant_dtype = torch.float8_e4m3fn + + prepare_finalize: 
Optional[Union[PplxPrepareAndFinalize, + DeepEPHTPrepareAndFinalize, + DeepEPLLPrepareAndFinalize]] = None if moe.use_pplx_kernels: all_to_all_args = dict( max_num_tokens=moe.max_num_tokens, @@ -288,8 +322,49 @@ class FusedMoEMethodBase(QuantizeMethodBase): dp_size=all2all_manager.tp_group.world_size, quant_dtype=moe.in_dtype, ) + elif moe.use_deepep_ht_kernels: + assert moe.dp_size == all2all_manager.dp_world_size + + all_to_all_args = dict() + handle = all2all_manager.get_handle(all_to_all_args) + prepare_finalize = DeepEPHTPrepareAndFinalize( + handle, + world_size=all2all_manager.world_size, + rank=all2all_manager.rank, + dp_size=all2all_manager.dp_world_size, + rank_expert_offset=all2all_manager.rank * + moe.num_local_experts, + quant_dtype=quant_dtype, + block_shape=act_quant_block_size, + ) + + elif moe.use_deepep_ll_kernels: + assert moe.dp_size == all2all_manager.dp_world_size + all_to_all_args = dict( + max_num_tokens_per_dp_rank=moe.max_num_tokens, + token_hidden_size=moe.hidden_dim, + num_ep_ranks=all2all_manager.world_size, + num_global_experts=moe.num_experts, + num_local_experts=moe.num_experts // + all2all_manager.world_size) + handle = all2all_manager.get_handle(all_to_all_args) + + # Note (varun): Whether to use FP8 dispatch or not needs some + # profiling. Turning it off for now. 
+ prepare_finalize = DeepEPLLPrepareAndFinalize( + handle, + world_size=all2all_manager.world_size, + dp_size=all2all_manager.dp_world_size, + max_tokens_per_rank=moe.max_num_tokens, + quant_dtype=quant_dtype, + block_shape=act_quant_block_size, + use_fp8_dispatch=False, + ) + + self.topk_indices_dtype = None if prepare_finalize is not None: + self.topk_indices_dtype = prepare_finalize.topk_indices_dtype() experts = self.select_gemm_impl(prepare_finalize) self.fused_experts = FusedMoEModularKernel( prepare_finalize, @@ -297,7 +372,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): ) def select_gemm_impl( - self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize] + self, prepare_finalize: FusedMoEPrepareAndFinalize ) -> FusedMoEPermuteExpertsUnpermute: # based on the all2all implementation, select the appropriate # gemm implementation @@ -334,6 +409,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def __init__(self, moe: MoEConfig): super().__init__() self.fused_experts = fused_experts # type: ignore + self.topk_indices_dtype = None self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() @@ -343,8 +419,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): else: self.rocm_aiter_fused_experts = None # type: ignore - def select_gemm_impl( - self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]): + def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize): assert self.fused_experts == fused_experts @@ -353,11 +428,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): experts: Optional[FusedMoEPermuteExpertsUnpermute] = None - if isinstance(prepare_finalize, - (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + use_batched_experts = prepare_finalize.max_num_tokens_per_rank( + ) is not None + if use_batched_experts: logger.debug("BatchedTritonExperts %s", self.moe) + assert self.moe.dp_size == all2all_manager.dp_world_size experts = BatchedTritonExperts( - 
max_num_tokens=MOE_DP_CHUNK_SIZE, + max_num_tokens=self.moe.max_num_tokens, world_size=all2all_manager.world_size, # dp_size actually means tp_size, bug in pplx kernels dp_size=all2all_manager.tp_group.world_size, @@ -366,6 +443,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): use_int8_w8a16=False, use_int4_w4a16=False, block_shape=None, + per_channel_quant=False, ) else: logger.debug("TritonExperts %s", self.moe) @@ -494,6 +572,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -505,7 +584,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, - indices_type=torch.uint32 if self.moe.use_pplx_kernels else None) + indices_type=self.topk_indices_dtype) if self.rocm_aiter_moe_enabled: assert expert_map is None @@ -806,11 +885,8 @@ class FusedMoE(torch.nn.Module): # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. 
quant_method: Optional[QuantizeMethodBase] = None - - if quant_config is None: - quant_method = UnquantizedFusedMoEMethod(moe) - else: - quant_method = quant_config.get_quant_method(self, prefix) + quant_method = (UnquantizedFusedMoEMethod(moe) if quant_config is None + else quant_config.get_quant_method(self, prefix)) assert quant_method is not None assert isinstance(quant_method, FusedMoEMethodBase) @@ -836,7 +912,8 @@ class FusedMoE(torch.nn.Module): # Chunked all2all staging tensor self.batched_hidden_states: Optional[torch.Tensor] = None self.batched_router_logits: Optional[torch.Tensor] = None - if self.moe_parallel_config.use_pplx_kernels: + if (self.moe_parallel_config.use_pplx_kernels + or self.moe_parallel_config.use_deepep_ll_kernels): act_dtype = vllm_config.model_config.dtype self.batched_hidden_states = torch.zeros( (MOE_DP_CHUNK_SIZE, self.hidden_size), @@ -880,6 +957,14 @@ class FusedMoE(torch.nn.Module): def use_pplx_kernels(self): return self.moe_parallel_config.use_pplx_kernels + @property + def use_deepep_ht_kernels(self): + return self.moe_parallel_config.use_deepep_ht_kernels + + @property + def use_deepep_ll_kernels(self): + return self.moe_parallel_config.use_deepep_ll_kernels + def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, @@ -1210,19 +1295,21 @@ class FusedMoE(torch.nn.Module): When just tensor-parallel is used, it is not required to reduce the shared_experts results immediately. Instead we reduce at the once at the end of the MoE op. (Refer to DeepSeekV2MoE module) - With EP and the pplx kernels - this is no longer viable as all + With EP and all2all kernels - this is no longer viable as all GPU ranks in DP, produce the complete set of hidden_states. Therefore it is required that we reduce the shared_experts output early. 
""" - return self.use_pplx_kernels + return (self.use_pplx_kernels or self.use_deepep_ht_kernels + or self.use_deepep_ll_kernels) def maybe_all_reduce_tensor_model_parallel( self, final_hidden_states: torch.Tensor): """ The pplx combine kernel reduces across GPU ranks by default. """ - if self.use_pplx_kernels: + if (self.use_pplx_kernels or self.use_deepep_ht_kernels + or self.use_deepep_ll_kernels): return final_hidden_states else: return tensor_model_parallel_all_reduce(final_hidden_states) @@ -1289,7 +1376,7 @@ class FusedMoE(torch.nn.Module): ctx = get_forward_context() max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu - moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE + moe_dp_chunk_size_per_rank = self.moe_config.max_num_tokens num_tokens = full_hidden_states.size(0) for chunk_start_ in range(0, max_tokens_across_dp, @@ -1310,12 +1397,17 @@ class FusedMoE(torch.nn.Module): def forward_impl(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): assert self.quant_method is not None - if self.moe_parallel_config.use_pplx_kernels: + if (self.moe_parallel_config.use_pplx_kernels + or self.moe_parallel_config.use_deepep_ll_kernels): return self.forward_impl_chunked(hidden_states, router_logits) - if self.dp_size > 1: + do_naive_dispatch_combine: bool = ( + self.dp_size > 1 + and not self.moe_parallel_config.use_deepep_ht_kernels) + if do_naive_dispatch_combine: hidden_states, router_logits = get_ep_group().dispatch( hidden_states, router_logits) + # Matrix multiply. final_hidden_states = self.quant_method.apply( layer=self, @@ -1335,12 +1427,12 @@ class FusedMoE(torch.nn.Module): apply_router_weight_on_input=self.apply_router_weight_on_input, ) - if self.dp_size > 1: + if do_naive_dispatch_combine: final_hidden_states = get_ep_group().combine(final_hidden_states) if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1): - # Default set to False. (May have to add shared expert outputs.) 
- final_hidden_states = tensor_model_parallel_all_reduce( + # Default set to False. (May have to add shared expert outputs. + final_hidden_states = self.maybe_all_reduce_tensor_model_parallel( final_hidden_states) return final_hidden_states diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 5e321c9b4..2c27d31eb 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -94,7 +94,8 @@ class FusedMoEPrepareAndFinalize(ABC): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform any quantization (and/or) dispatching needed for this kernel. @@ -113,6 +114,10 @@ class FusedMoEPrepareAndFinalize(ABC): Returns a tuple of: - quantized + dispatched a. - quantized + dispatched a1_scales. + - Optional tensor as big as number of local experts that contains the + number of tokens assigned to each local expert. + - Optional dispatched expert topk IDs + - Optional dispatched expert topk weight """ raise NotImplementedError @@ -138,6 +143,27 @@ class FusedMoEPrepareAndFinalize(ABC): """ raise NotImplementedError + @abstractmethod + def topk_indices_dtype(self) -> Optional[torch.dtype]: + """ + The PrepareFinalize All2All implementations generally constrain the + dtype of the topk_ids they support. This function returns the + required topk indices dtype so it can be respected. + Return None if there are no such restrictions. + """ + raise NotImplementedError + + @abstractmethod + def max_num_tokens_per_rank(self) -> Optional[int]: + """ + Some PrepareFinalize All2All implementations are batched. Meaning, + they can processes only as set of tokens at a time. 
This + function returns the batch size i.e the maximum number of tokens + the implementation can process at a time. + Return None if there are no such restrictions. + """ + raise NotImplementedError + class FusedMoEPermuteExpertsUnpermute(ABC): """ @@ -261,6 +287,61 @@ class FusedMoEModularKernel(torch.nn.Module): self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts + def _do_fused_experts( + self, + a1: torch.Tensor, # input to forward fn + a1q: torch.Tensor, # output of prepare fn + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + expert_num_tokens: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor]) -> torch.Tensor: + + _, M, N, K, top_k = _moe_problem_size(a1q, w1, w2, topk_ids) + + # Use a1 here to decipher the correct workspace datatype + workspace13_shape, workspace2_shape, workspace_dtype = ( + self.fused_experts.workspace_shapes(a1, M, N, K, top_k, + global_num_experts)) + + # We can reuse the memory between cache1 and cache3 because by the time + # we need cache3, we're done with cache1 + workspace13 = torch.zeros(workspace13_shape, + device=a1.device, + dtype=workspace_dtype) + workspace2 = torch.zeros(workspace2_shape, + device=a1.device, + dtype=workspace_dtype) + + fused_out = self.fused_experts.apply( + a1q, + w1, + w2, + topk_ids, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=a1q_scale, + a2_scale=a2_scale, + workspace13=workspace13, + workspace2=workspace2, + expert_num_tokens=expert_num_tokens, + ) + + return fused_out + def forward( self, hidden_states: torch.Tensor, @@ -315,49 +396,48 @@ class 
FusedMoEModularKernel(torch.nn.Module): Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ - a1 = hidden_states - E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids) - - if global_num_experts == -1: - global_num_experts = E + a1 = hidden_states output = a1 if inplace else torch.zeros_like(a1) - workspace13_shape, workspace2_shape, workspace_dtype = ( - self.fused_experts.workspace_shapes(a1, M, N, K, top_k, - global_num_experts)) - - # We can reuse the memory between cache1 and cache3 because by the time - # we need cache3, we're done with cache1 - workspace13 = torch.zeros(workspace13_shape, - device=a1.device, - dtype=workspace_dtype) - workspace2 = torch.zeros(workspace2_shape, - device=a1.device, - dtype=workspace_dtype) - - a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare( - a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts, - expert_map, apply_router_weight_on_input) - - fused_out = self.fused_experts.apply( - a1q, - w1, - w2, - topk_ids, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - w1_scale=w1_scale, - w2_scale=w2_scale, - w1_zp=w1_zp, - w2_zp=w2_zp, - a1q_scale=a1q_scale, - a2_scale=a2_scale, - workspace13=workspace13, - workspace2=workspace2, - expert_num_tokens=expert_num_tokens, - ) + if global_num_experts == -1: + global_num_experts = w1.size(0) + + (a1q, a1q_scale, expert_num_tokens, _expert_topk_ids, + _expert_topk_weights) = self.prepare_finalize.prepare( + a1, a1_scale, a2_scale, topk_weights, topk_ids, + global_num_experts, expert_map, apply_router_weight_on_input) + # Maybe prepare gathered topk_ids and topk_weights from other EP ranks. + topk_ids = topk_ids if _expert_topk_ids is None else _expert_topk_ids + topk_weights = (topk_weights if _expert_topk_weights is None else + _expert_topk_weights) + + fused_out = None + if a1q.numel() == 0: + # This happens when none of the tokens from the all2all reach this + # EP rank. 
Also, note that this is only relevant for CUDAGraph + # incompatible all2all kernels like the DeepEP high-throughput + # kernels. CUDAGraph compatible all2all kernels like the pplx + # kernels and the DeepEP low-latency kernels are always batched + # and can never run into the tensor.numel() == 0 case. + fused_out = torch.empty_like(a1q).to(dtype=a1.dtype) + else: + fused_out = self._do_fused_experts( + a1=a1, + a1q=a1q, + w1=w1, + w2=w2, + topk_ids=topk_ids, + expert_num_tokens=expert_num_tokens, + activation=activation, + global_num_experts=global_num_experts, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_zp=w1_zp, + w2_zp=w2_zp, + a1q_scale=a1q_scale, + a2_scale=a2_scale) self.prepare_finalize.finalize(output, fused_out, topk_weights, topk_ids, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py index da7871434..89481e5bd 100644 --- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py +++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py @@ -25,7 +25,7 @@ def _moe_permute( """ top_k_num = curr_topk_ids.size(1) - tokens_in_chunk = curr_hidden_states.sizze(0) + tokens_in_chunk = curr_hidden_states.size(0) sorted_token_ids, expert_ids, num_tokens_post_padded = ( moe_align_block_size(curr_topk_ids, @@ -37,11 +37,12 @@ def _moe_permute( inv_perm: Optional[torch.Tensor] = None num_tokens = top_k_num * tokens_in_chunk - sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0) inv_perm = torch.argsort(sorted_token_ids)[:num_tokens] # Permute according to sorted token ids. 
+ sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) + curr_hidden_states = _fp8_perm(curr_hidden_states, sorted_token_ids // top_k_num) diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 8405603cf..1170a16f3 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -32,6 +32,12 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): self.dp_size = dp_size self.quant_dtype = quant_dtype + def max_num_tokens_per_rank(self) -> Optional[int]: + return self.max_num_tokens + + def topk_indices_dtype(self) -> Optional[torch.dtype]: + return torch.uint32 + def prepare( self, a1: torch.Tensor, @@ -42,7 +48,8 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: num_tokens = a1.size(0) # M hidden_dim = a1.size(-1) # K @@ -115,7 +122,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): bound_m=bound_m, ) - return expert_x, expert_x_scale, expert_num_tokens + return expert_x, expert_x_scale, expert_num_tokens, None, None def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py index 77a9686c9..9ed95e1de 100644 --- a/vllm/model_executor/layers/fused_moe/prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -24,6 +24,12 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): self.block_shape = block_shape self.quant_dtype = quant_dtype + def max_num_tokens_per_rank(self) -> Optional[int]: + return None + + def topk_indices_dtype(self) -> 
Optional[torch.dtype]: + return None + def prepare( self, a1: torch.Tensor, @@ -34,7 +40,9 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): num_experts: int, expert_map: Optional[torch.Tensor], apply_router_weight_on_input: bool = False, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor], Optional[torch.Tensor]]: + if apply_router_weight_on_input: topk = topk_ids.size(1) # TODO: this only works for topK=1, will need to update for topK>1 @@ -47,7 +55,7 @@ class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): self.per_channel_quant, self.block_shape) - return a1q, a1q_scale, None + return a1q, a1q_scale, None, None, None def finalize( self, diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 373e8ab39..920931a93 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -29,9 +29,10 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): per_channel_quant=per_channel_quant, block_shape=block_shape, block_m=block_m) - self.deep_gemm_expert = DeepGemmExperts() self.allow_deep_gemm = allow_deep_gemm self.use_fp8_w8a8 = use_fp8_w8a8 + self.deep_gemm_expert = DeepGemmExperts( + ) if self.allow_deep_gemm else None def workspace_shapes( self, @@ -46,6 +47,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. 
if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K): + assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( a, M, N, K, topk, num_experts) else: @@ -73,7 +75,8 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): ) -> torch.Tensor: N = w1.size(1) if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 - and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + and _valid_deep_gemm(hidden_states, w1, w2)): + assert self.deep_gemm_expert is not None return self.deep_gemm_expert.apply( hidden_states, w1, diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index c3a584782..692482c2e 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -18,8 +18,8 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor: Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches. """ - assert prod( - v) <= x.numel(), f"{prod(v)} <= {x.numel()}" # CUDAGRAPH unfriendly? + assert prod(v) <= x.numel( + ), f"{v} ({prod(v)}) <= {x.shape} ({x.numel()})" # CUDAGRAPH unfriendly? return x.flatten()[:prod(v)].view(*v) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index cea4d26a4..2438ec30b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -3,7 +3,7 @@ import functools import importlib.util -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, Union import torch import torch.nn.functional as F @@ -452,6 +452,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): if envs.VLLM_USE_DEEP_GEMM: if not has_deep_gemm: logger.warning_once("Failed to import DeepGemm kernels.") + elif not self.block_quant: + logger.warning_once("Model is not block quantized. 
Not using " + " DeepGemm kernels") elif (current_platform.is_cuda() and current_platform.has_device_capability(90)): logger.info_once("Using DeepGemm kernels for Fp8MoEMethod.") @@ -460,8 +463,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): logger.warning_once( "DeepGemm not supported on the current platform.") + self.topk_indices_dtype = None self.fused_experts = functools.partial( # type: ignore fused_experts, + use_fp8_w8a8=True, block_shape=self.quant_config.weight_block_size, allow_deep_gemm=self.allow_deep_gemm) @@ -765,18 +770,39 @@ class Fp8MoEMethod(FusedMoEMethodBase): del layer.w2_input_scale def select_gemm_impl(self, prepare_finalize): + + from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedTritonExperts) from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts) assert not self.use_marlin and not self.rocm_aiter_moe_enabled, ( "Marlin and ROCm AITER are not supported with all2all yet.") - experts = TritonOrDeepGemmExperts( - use_fp8_w8a8=True, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - ) + experts: Optional[Union[BatchedTritonExperts, + TritonOrDeepGemmExperts]] = None + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + use_batched_experts = max_num_tokens_per_rank is not None + if use_batched_experts: + experts = BatchedTritonExperts( + max_num_tokens=max_num_tokens_per_rank, + world_size=prepare_finalize.world_size, + dp_size=prepare_finalize.dp_size, + use_fp8_w8a8=True, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + experts = TritonOrDeepGemmExperts( + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + ) + + assert experts is not None return experts def apply( @@ -797,6 +823,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> 
torch.Tensor: + topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -808,6 +835,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, ) if self.rocm_aiter_moe_enabled: @@ -855,7 +883,6 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_ids=topk_ids, inplace=True, activation=activation, - use_fp8_w8a8=True, global_num_experts=global_num_experts, apply_router_weight_on_input=apply_router_weight_on_input, expert_map=expert_map, diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index e2d9424de..07ae470fa 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -154,6 +154,21 @@ class CudaPlatformBase(Platform): logger.info( "Forcing kv cache block size to 64 for FlashMLA backend.") + if (envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput" + and parallel_config.data_parallel_size > 1 + and vllm_config.compilation_config.use_cudagraph): + logger.info( + "Data Parallel: Forcing enforce eager to be True since DP " + "with DeepEP high-throughput kernels are not CUDA Graph " + "compatible. The DeepEP low-latency kernels are CUDA Graph " + "compatible. Set the all_to_all backend to deepep_low_latency " + "to use those kernels instead.") + vllm_config.compilation_config.use_cudagraph = False + vllm_config.model_config.enforce_eager = True + # TODO (varun): Turning this ON gives incorrect results for the + # Deepseek-V2-lite model. 
+ vllm_config.compilation_config.use_inductor = False + @classmethod def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None -- GitLab From bdf13965ab4a528d30cb82854487910189865d9d Mon Sep 17 00:00:00 2001 From: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:33:07 -0700 Subject: [PATCH 144/274] [V1] Support cross-layer KV sharing (#18212) Signed-off-by: Yong Hoon Shin --- tests/v1/tpu/worker/test_tpu_model_runner.py | 227 +++++++++++++++- tests/v1/worker/test_gpu_model_runner.py | 244 +++++++++++++++++- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 3 + vllm/attention/backends/cpu_mla.py | 3 +- .../backends/dual_chunk_flash_attn.py | 3 + vllm/attention/backends/flash_attn.py | 3 + vllm/attention/backends/flashinfer.py | 3 + vllm/attention/backends/flashmla.py | 3 +- vllm/attention/backends/hpu_attn.py | 3 + vllm/attention/backends/ipex_attn.py | 3 + vllm/attention/backends/mla/common.py | 3 + vllm/attention/backends/pallas.py | 3 + vllm/attention/backends/rocm_aiter_mla.py | 3 +- vllm/attention/backends/rocm_flash_attn.py | 3 + vllm/attention/backends/torch_sdpa.py | 3 + vllm/attention/backends/triton_mla.py | 3 +- vllm/attention/backends/xformers.py | 3 + vllm/attention/layer.py | 17 +- vllm/v1/attention/backends/flash_attn.py | 36 +-- vllm/v1/attention/backends/flashinfer.py | 36 +-- vllm/v1/attention/backends/mla/common.py | 4 + vllm/v1/attention/backends/mla/flashmla.py | 3 +- .../attention/backends/mla/rocm_aiter_mla.py | 3 +- vllm/v1/attention/backends/mla/triton_mla.py | 3 +- vllm/v1/attention/backends/pallas.py | 6 +- vllm/v1/attention/backends/triton_attn.py | 51 ++-- vllm/v1/attention/backends/utils.py | 33 +++ vllm/v1/worker/gpu_model_runner.py | 31 ++- vllm/v1/worker/tpu_model_runner.py | 30 ++- vllm/v1/worker/utils.py | 36 +++ 31 files changed, 733 insertions(+), 73 deletions(-) diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py 
b/tests/v1/tpu/worker/test_tpu_model_runner.py index 230c97e78..bc54b6ecc 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -4,8 +4,13 @@ import unittest.mock as mock import pytest -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.attention.layer import Attention +from vllm.config import (CacheConfig, ModelConfig, SchedulerConfig, VllmConfig, + set_current_vllm_config) from vllm.sampling_params import SamplingParams +from vllm.utils import GiB_bytes +from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, + get_kv_cache_config) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.worker.tpu_model_runner import ( @@ -363,3 +368,223 @@ def test_get_req_paddings(): assert _get_req_paddings(1, 32) == [8, 16, 32] assert _get_req_paddings(8, 32) == [8, 16, 32] assert _get_req_paddings(8, 36) == [8, 16, 32, 36] + + +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} must come before the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + kv_sharing_target_layer_name=layer_1, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + invalid_layer = "model.layers.0.cross_attn.attn" + error_msg = f"{invalid_layer} is not a valid Attention layer in the model" + with pytest.raises(ValueError, 
match=error_msg): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + # invalid layer: cross_attn.atn doesn't exist! + kv_sharing_target_layer_name=invalid_layer, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_same_as_current(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} cannot be the same as the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_without_kv_sharing(model_runner): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = model_runner.vllm_config + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + kv_cache_spec = model_runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 2 + assert len(model_runner.shared_kv_cache_layers) == 0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB 
+ num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 2 + assert kv_cache_config.tensors[layer_0].size == available_memory // 2 + assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 2 block worth of memory (2 * 32kb) + kv_cache_config.num_blocks = 1 + for layer in kv_cache_config.tensors: + kv_cache_config.tensors[layer].size =\ + kv_cache_spec[layer].page_size_bytes + + model_runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache does NOT share memory with layer 0 + assert id(layer_1_kv) != id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 + + +def test_init_kv_cache_with_kv_sharing_valid(model_runner): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = model_runner.vllm_config + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name="model.layers.0.self_attn.attn", + ) + } + # suppress var not used error + assert 
fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + kv_cache_spec = model_runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 1 + assert layer_0 in kv_cache_spec + assert model_runner.shared_kv_cache_layers[layer_1] == layer_0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + # with KV sharing, we can allocate (available_mem//page_size//1) blocks + # which is twice as many as without KV sharing + num_expected_blocks = 655360 # 20GB / 32KB + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 1 + # Each layer now has twice the available memory for KV cache + # compared to no KV sharing + assert kv_cache_config.tensors[layer_0].size == available_memory + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 2 * 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 1 block worth of memory (32kb) + kv_cache_config.num_blocks = 1 + kv_cache_config.tensors[layer_0].size =\ + kv_cache_spec[layer_0].page_size_bytes + + model_runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache shares memory with layer 0 + assert id(layer_1_kv) == id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] 
== layer_1 diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index ceb9d4df2..5e2fd2fbf 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -7,8 +7,11 @@ import pytest from vllm.attention import Attention from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig, VllmConfig) + SchedulerConfig, VllmConfig, set_current_vllm_config) from vllm.sampling_params import SamplingParams +from vllm.utils import GiB_bytes +from vllm.v1.core.kv_cache_utils import (estimate_max_model_len, + get_kv_cache_config) from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, @@ -19,6 +22,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner BLOCK_SIZE = 16 NUM_BLOCKS = 10 +DEVICE = "cuda" def initialize_kv_cache(runner: GPUModelRunner): @@ -55,8 +59,7 @@ def initialize_kv_cache(runner: GPUModelRunner): runner.initialize_attn_backend(kv_cache_config) -@pytest.fixture -def model_runner(): +def get_vllm_config(): scheduler_config = SchedulerConfig( max_num_seqs=10, max_num_batched_tokens=512, @@ -84,13 +87,18 @@ def model_runner(): scheduler_config=scheduler_config, parallel_config=parallel_config, ) - num_heads = model_config.get_num_kv_heads(parallel_config) + return vllm_config + + +@pytest.fixture +def model_runner(): + vllm_config = get_vllm_config() + model_config = vllm_config.model_config + num_heads = model_config.get_num_kv_heads(vllm_config.parallel_config) head_size = model_config.get_head_size() vllm_config.compilation_config.static_forward_context[ "layer.0"] = Attention(num_heads, head_size, 0.1) - - device = "cuda" - runner = GPUModelRunner(vllm_config, device) + runner = GPUModelRunner(vllm_config, DEVICE) initialize_kv_cache(runner) return runner @@ -385,3 +393,225 @@ def test_load_model_weights_inplace(dist_init, model_runner, 
model_runner_2): model_runner_2.load_model() # Load real weights inplace assert str(model_runner.get_model().state_dict()) == str( model_runner_2.get_model().state_dict()) + + +def test_init_kv_cache_with_kv_sharing_invalid_target_layer_order(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} must come before the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + kv_sharing_target_layer_name=layer_1, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_layer_not_exist(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + invalid_layer = "model.layers.0.cross_attn.attn" + error_msg = f"{invalid_layer} is not a valid Attention layer in the model" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + # invalid layer: cross_attn.atn doesn't exist! 
+ kv_sharing_target_layer_name=invalid_layer, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_with_kv_sharing_target_same_as_current(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + error_msg = f"{layer_1} cannot be the same as the current layer" + with pytest.raises(ValueError, match=error_msg): + fwd_context = { + # initialization below will fail because target layer is invalid; + # the target layer needs to come before layer 1 + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + + +def test_init_kv_cache_without_kv_sharing(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = get_vllm_config() + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + runner = GPUModelRunner(vllm_config, DEVICE) + kv_cache_spec = runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 2 + assert len(runner.shared_kv_cache_layers) == 0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + num_expected_blocks = 327680 # 20GB / 32KB / 2 (num layers) + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert 
len(kv_cache_config.tensors) == 2 + assert kv_cache_config.tensors[layer_0].size == available_memory // 2 + assert kv_cache_config.tensors[layer_1].size == available_memory // 2 + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 2 block worth of memory (2 * 32kb) + kv_cache_config.num_blocks = 1 + for layer in kv_cache_config.tensors: + kv_cache_config.tensors[layer].size =\ + kv_cache_spec[layer].page_size_bytes + + runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache does NOT share memory with layer 0 + assert id(layer_1_kv) != id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 + + +def test_init_kv_cache_with_kv_sharing_valid(): + layer_0 = "model.layers.0.self_attn.attn" + layer_1 = "model.layers.1.self_attn.attn" + vllm_config = get_vllm_config() + with set_current_vllm_config(vllm_config): + fwd_context = { + layer_0: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_0, + ), + layer_1: + Attention( + num_heads=8, + head_size=64, + scale=1.0, + prefix=layer_1, + kv_sharing_target_layer_name="model.layers.0.self_attn.attn", + ) + } + # suppress var not used error + assert fwd_context is not None + # Set high context length to test max context length estimation + vllm_config.model_config.max_model_len = 3_000_000 + vllm_ctx = vllm_config.compilation_config.static_forward_context + runner = 
GPUModelRunner(vllm_config, DEVICE) + kv_cache_spec = runner.get_kv_cache_spec() + assert len(kv_cache_spec) == 1 + assert layer_0 in kv_cache_spec + assert runner.shared_kv_cache_layers[layer_1] == layer_0 + + available_memory = 20 * GiB_bytes + # page size for layer 0's kv_cache_spec is 32KB + # with KV sharing, we can allocate (available_mem//page_size//1) blocks + # which is twice as many as without KV sharing + num_expected_blocks = 655360 # 20GB / 32KB + kv_cache_config = get_kv_cache_config(vllm_config, kv_cache_spec, + available_memory) + assert kv_cache_config.num_blocks == num_expected_blocks + assert len(kv_cache_config.tensors) == 1 + # Each layer now has twice the available memory for KV cache + # compared to no KV sharing + assert kv_cache_config.tensors[layer_0].size == available_memory + + max_context_len =\ + estimate_max_model_len(vllm_config, kv_cache_spec, 5 * GiB_bytes) + # max context len with KV sharing should be 2x as large as without + assert max_context_len == 2 * 1310720 + + # important: override tensor size to prevent large mem alloc during test + # this will only allocate 1 block worth of memory (32kb) + kv_cache_config.num_blocks = 1 + kv_cache_config.tensors[layer_0].size =\ + kv_cache_spec[layer_0].page_size_bytes + + runner.initialize_kv_cache(kv_cache_config) + + layer_0_kv = vllm_ctx[layer_0].kv_cache[0] + layer_1_kv = vllm_ctx[layer_1].kv_cache[0] + # check layer 1 kv cache shares memory with layer 0 + assert id(layer_1_kv) == id(layer_0_kv) + + # check layer 1 added to kv cache group's layer names + assert len(kv_cache_config.kv_cache_groups) == 1 + assert len(kv_cache_config.kv_cache_groups[0].layer_names) == 2 + assert kv_cache_config.kv_cache_groups[0].layer_names[0] == layer_0 + assert kv_cache_config.kv_cache_groups[0].layer_names[1] == layer_1 diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index deb3951d6..0ba5a5bf9 100644 --- a/vllm/attention/backends/abstract.py +++ 
b/vllm/attention/backends/abstract.py @@ -270,6 +270,7 @@ class AttentionImpl(ABC, Generic[T]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index a2fd557f8..c1663516d 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -306,7 +306,10 @@ class BlocksparseFlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") assert blocksparse_params is not None assert alibi_slopes is None, ValueError( "Alibi not support for blocksparse flash attention.") diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index 39e667bca..cf7883e12 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -206,12 +206,13 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 3548df88d..963bccdf2 100644 --- 
a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -290,9 +290,12 @@ class DualChunkFlashAttentionImpl(FlashAttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, layer_idx: int = -1, dual_chunk_attention_config: Optional[Dict[str, Any]] = None, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 26be2c04f..73e377268 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -618,8 +618,11 @@ class FlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "FlashAttention does not support block-sparse attention.") diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 7ae7ea37f..a3937760f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -936,8 +936,11 @@ class FlashInferImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if 
use_irope: logger.warning_once( "Using irope in FlashInfer is not supported yet, it will fall" diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index 9a6b8a40e..e185d0260 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -184,12 +184,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str] = None, # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 5128e4975..9bd513fd8 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -110,9 +110,12 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module): blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: super(AttentionImpl, self).__init__() + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in HPU is not supported yet, it will fall back " diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 30441b3ad..5051c6a7c 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -123,8 +123,11 @@ class IpexAttnBackendImpl(AttentionImpl[IpexAttnMetadata]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = 
AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in Ipex is not supported yet, it will fall" diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 50842abd3..78cf95288 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -1000,6 +1000,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments q_lora_rank: Optional[int], kv_lora_rank: int, @@ -1009,6 +1010,8 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]): v_head_dim: int, kv_b_proj: ColumnParallelLinear, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing not supported in V0.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index a6823ac05..7ad67615d 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -109,8 +109,11 @@ class PallasAttentionBackendImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in Pallas is not supported yet, it will fall back " diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index 855036071..1edf34351 100644 --- 
a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -370,12 +370,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 755e0da06..4b460dc0b 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -494,8 +494,11 @@ class ROCmFlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if use_irope: logger.warning_once( "Using irope in ROCm Flash Attention is not supported yet, it " diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 760634004..f3fb5adcf 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -405,8 +405,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV 
sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "Torch SPDA does not support block-sparse attention.") diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index d9fff8fac..e06f7d54e 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -38,12 +38,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8355e0397..04ef928b7 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -390,8 +390,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]): blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported in V0.") if blocksparse_params is not None: raise ValueError( "XFormers does not support block-sparse attention.") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 6c5b05a5c..a5fbd1a1c 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import 
_Backend, current_platform from vllm.utils import direct_register_custom_op +from vllm.v1.attention.backends.utils import validate_kv_sharing_target class Attention(nn.Module): @@ -50,6 +51,7 @@ class Attention(nn.Module): use_mla: bool = False, prefix: str = "", attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, **extra_impl_args, ) -> None: """ @@ -135,7 +137,7 @@ class Attention(nn.Module): self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **extra_impl_args) + kv_sharing_target_layer_name, **extra_impl_args) self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype @@ -153,6 +155,19 @@ class Attention(nn.Module): compilation_config.static_forward_context[prefix] = self self.layer_name = prefix self.attn_type = attn_type + + if kv_sharing_target_layer_name is not None: + if not envs.VLLM_USE_V1: + raise NotImplementedError( + "Cross-layer KV sharing is not supported in V0.") + + validate_kv_sharing_target( + prefix, + kv_sharing_target_layer_name, + compilation_config.static_forward_context, + ) + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name + # use a placeholder kv cache tensor during init, which will be replaced # by bind_kv_cache # this variable will not be accessed if use_direct_call is True diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9e989df1c..a92c51883 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -485,6 +485,7 @@ class FlashAttentionImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: if blocksparse_params is not None: @@ -506,6 +507,7 @@ class 
FlashAttentionImpl(AttentionImpl): # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -569,22 +571,26 @@ class FlashAttentionImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the input keys and values and store them in the cache. - # NOTE(woosuk): Here, key and value are padded while slot_mapping is - # not padded. However, we don't need to do key[:num_actual_tokens] and - # value[:num_actual_tokens] because the reshape_and_cache_flash op uses - # the slot_mapping's shape to determine the number of actual tokens. key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. 
+ torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(torch.float8_e4m3fn) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8bd998eba..f1b61c152 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -507,6 +507,7 @@ class FlashInferImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -521,6 +522,7 @@ class FlashInferImpl(AttentionImpl): self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -568,21 +570,25 @@ class FlashInferImpl(AttentionImpl): # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the input keys and values and store them in the cache. - # NOTE(woosuk): Here, key and value are padded while slot_mapping is - # not padded. However, we don't need to do key[:num_actual_tokens] and - # value[:num_actual_tokens] because the reshape_and_cache_flash op uses - # the slot_mapping's shape to determine the number of actual tokens. 
- torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[:, 0], - kv_cache[:, 1], - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] + # and value[:num_actual_tokens] because the reshape_and_cache_flash + # op uses the slot_mapping's shape to determine the number of + # actual tokens. + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[:, 0], + kv_cache[:, 1], + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) window_left = (self.sliding_window[0] if self.sliding_window is not None else -1) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 96befca5a..06acbb909 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -586,6 +586,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments q_lora_rank: Optional[int], kv_lora_rank: int, @@ -595,6 +596,9 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]): v_head_dim: int, kv_b_proj: ColumnParallelLinear, ) -> None: + if kv_sharing_target_layer_name is not None: + raise NotImplementedError("KV sharing is not supported for MLA") + self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 060a7c9d8..318b8ede1 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ 
b/vllm/v1/attention/backends/mla/flashmla.py @@ -93,12 +93,13 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 8925b5a5c..1f0406a7a 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -139,12 +139,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) assert (num_heads == 16 or num_heads == 128), ( f"Aiter MLA only supports 16 or 128 number of heads.\n" f"Provided {num_heads} number of heads.\n" diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 0857fc133..e26d79091 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -41,12 +41,13 @@ class TritonMLAImpl(MLACommonImpl[MLACommonMetadata]): blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, + kv_sharing_target_layer_name: Optional[str], # MLA Specific Arguments **mla_args) -> None: 
super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap, attn_type, - **mla_args) + kv_sharing_target_layer_name, **mla_args) unsupported_features = [ alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 896f1394c..0f956ba88 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -113,6 +113,7 @@ class PallasAttentionBackendImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: if use_irope: @@ -128,6 +129,7 @@ class PallasAttentionBackendImpl(AttentionImpl): self.num_kv_heads = num_kv_heads self.sliding_window = sliding_window self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -181,7 +183,9 @@ class PallasAttentionBackendImpl(AttentionImpl): num_tokens, hidden_size = query.shape query = query.view(num_tokens, self.num_heads, self.head_size) - if kv_cache.numel() > 0: + if self.kv_sharing_target_layer_name is None and kv_cache.numel() > 0: + # Write input keys and values to the KV cache. + # Skip this if sharing KV cache with an earlier attention layer. 
slot_mapping = attn_metadata.slot_mapping write_to_kv_cache(key, value, kv_cache, slot_mapping) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 6a3314dd8..968f13701 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -88,6 +88,7 @@ class TritonAttentionImpl(AttentionImpl): blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, + kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: if blocksparse_params is not None: @@ -109,6 +110,7 @@ class TritonAttentionImpl(AttentionImpl): # In flash-attn, setting logits_soft_cap as 0 means no soft cap. logits_soft_cap = 0 self.logits_soft_cap = logits_soft_cap + self.kv_sharing_target_layer_name = kv_sharing_target_layer_name self.use_irope = use_irope @@ -178,31 +180,34 @@ class TritonAttentionImpl(AttentionImpl): if use_prefill_decode_attn: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) - - # Reshape the input keys and values and store them in the cache. - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - else: key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + + if self.kv_sharing_target_layer_name is None: + # Reshape the input keys and values and store them in the cache. + # Skip this if sharing KV cache with an earlier attention layer. 
+ if use_prefill_decode_attn: + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + else: + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(self.fp8_dtype) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 2e65619ed..72c764353 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -17,3 +17,36 @@ class CommonAttentionMetadata: seq_lens: torch.Tensor """(batch_size,), the length of each request including both computed tokens and newly scheduled tokens""" + + +def validate_kv_sharing_target(current_layer_name, target_layer_name, + static_forward_context): + error_msg = (f"Specified KV sharing target layer for {current_layer_name} " + f"is not valid: target layer {target_layer_name} ") + + if current_layer_name == target_layer_name: + raise ValueError(error_msg + + "cannot be the same as the current layer.") + + if target_layer_name not in static_forward_context: + from vllm.model_executor.models.utils import extract_layer_index + + # If target layer name is not in the static fwd context, it means either + # a) the target layer does not come BEFORE the current layer, or + # b) the target layer is not an Attention layer that exists in the model + current_layer_idx = extract_layer_index(current_layer_name) + target_layer_idx = extract_layer_index(target_layer_name) + if current_layer_idx <= target_layer_idx: + raise ValueError(error_msg + "must come before the current layer.") + else: + raise ValueError(error_msg + + "is not a valid Attention layer in the model.") + + # Currently KV sharing is only supported between layers of the same type + target_layer_attn_type = 
static_forward_context[ + target_layer_name].attn_type + expected = static_forward_context[current_layer_name].attn_type + if target_layer_attn_type != expected: + raise ValueError( + error_msg + + f"must be the same type as the current layer ({expected}).") diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c96ad0c01..b7448be26 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -59,8 +59,8 @@ from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, - scatter_mm_placeholders) +from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, + sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: import xgrammar as xgr @@ -276,6 +276,12 @@ class GPUModelRunner(LoRAModelRunnerMixin): pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + # Layer pairings for cross-layer KV sharing. + # If an Attention layer `layer_name` is in the keys of this dict, it + # means this layer will perform attention using the keys and values + # from the KV cache of `shared_kv_cache_layers[layer_name]`. + self.shared_kv_cache_layers: dict[str, str] = {} + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention @@ -2097,6 +2103,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): # KV cache specs. 
raise ValueError("Unknown KV cache spec type.") + # Setup `kv_cache_config` and `kv_caches` for models + # with cross-layer KV sharing + if self.shared_kv_cache_layers: + initialize_kv_cache_for_kv_sharing( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + kv_caches, + ) + if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) # validate all draft model layers belong to the same kv cache @@ -2125,6 +2140,18 @@ class GPUModelRunner(LoRAModelRunnerMixin): use_mla = self.vllm_config.model_config.use_mla kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in layers.items(): + if (kv_tgt_layer := + attn_module.kv_sharing_target_layer_name) is not None: + # The layer doesn't need its own KV cache and will use that of + # the target layer. We skip creating a KVCacheSpec for it, so + # that KV cache management logic will act as this layer does + # not exist, and doesn't allocate KV cache for the layer. This + # enables the memory saving of cross-layer kv sharing, allowing + # a given amount of memory to accommodate longer context lengths + # or enable more requests to be processed simultaneously. 
+ self.shared_kv_cache_layers[layer_name] = kv_tgt_layer + continue + # TODO: Support other attention modules, e.g., cross-attention if attn_module.attn_type == AttentionType.DECODER: if attn_module.sliding_window is not None: diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 48ea3cb7b..f15234f49 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -44,7 +44,8 @@ from vllm.v1.utils import bind_kv_cache from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import sanity_check_mm_encoder_outputs +from .utils import (initialize_kv_cache_for_kv_sharing, + sanity_check_mm_encoder_outputs) if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput @@ -238,6 +239,12 @@ class TPUModelRunner(LoRAModelRunnerMixin): self.num_reqs_paddings = _get_req_paddings( min_req_size=MIN_NUM_SEQS, max_req_size=self.max_num_reqs) + # Layer pairings for cross-layer KV sharing. + # If an Attention layer `layer_name` is in the keys of this dict, it + # means this layer will perform attention using the keys and values + # from the KV cache of `shared_kv_cache_layers[layer_name]`. + self.shared_kv_cache_layers: dict[str, str] = {} + # tensors for structured decoding self.grammar_bitmask_cpu = torch.zeros( (self.max_num_reqs, cdiv(self.vocab_size, 32)), @@ -455,6 +462,18 @@ class TPUModelRunner(LoRAModelRunnerMixin): block_size = self.vllm_config.cache_config.block_size kv_cache_spec: dict[str, KVCacheSpec] = {} for layer_name, attn_module in layers.items(): + if (kv_tgt_layer := + attn_module.kv_sharing_target_layer_name) is not None: + # The layer doesn't need its own KV cache and will use that of + # the target layer. We skip creating a KVCacheSpec for it, so + # that KV cache management logic will act as this layer does + # not exist, and doesn't allocate KV cache for the layer. 
This + # enables the memory saving of cross-layer kv sharing, allowing + # a given amount of memory to accommodate longer context lengths + # or enable more requests to be processed simultaneously. + self.shared_kv_cache_layers[layer_name] = kv_tgt_layer + continue + if attn_module.attn_type == AttentionType.DECODER: if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( @@ -1376,6 +1395,15 @@ class TPUModelRunner(LoRAModelRunnerMixin): else: raise NotImplementedError + # Setup `kv_cache_config` and `kv_caches` for models + # with cross-layer KV sharing + if self.shared_kv_cache_layers: + initialize_kv_cache_for_kv_sharing( + self.shared_kv_cache_layers, + kv_cache_config.kv_cache_groups, + kv_caches, + ) + bind_kv_cache( kv_caches, self.vllm_config.compilation_config.static_forward_context, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b23b28c1d..055cf0153 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -4,6 +4,8 @@ from typing import Optional import torch +from vllm.v1.kv_cache_interface import KVCacheGroupSpec + def sanity_check_mm_encoder_outputs( mm_embeddings: object, @@ -73,3 +75,37 @@ def gather_mm_placeholders( return placeholders return placeholders[is_embed] + + +def initialize_kv_cache_for_kv_sharing( + shared_kv_cache_layers: dict[str, str], + kv_cache_groups: list[KVCacheGroupSpec], + kv_caches: dict[str, torch.Tensor], +) -> None: + """ + Sets up KV cache sharing by reusing the allocated KV caches in `kv_caches` + for layers that do not allocate its own KV cache, based on the mapping in + `shared_kv_cache_layers`. Adds these layers to the corresponding KV cache + group, which is needed to ensure that attention metadata is assigned later. + + Args: + shared_kv_cache_layers: Layer pairings for cross-layer KV sharing. 
+ If an Attention layer `layer_name` is in the keys of this dict, it + means this layer will perform attention using the keys and values + from the KV cache of `shared_kv_cache_layers[layer_name]`. + kv_cache_groups: The KV cache groups of the model. + kv_caches: The allocated kv_caches with layer names as keys. + Note that layers in shared_kv_cache_layers.keys() are not + originally included as it only contains layers which have its own + KV cache allocation. + """ + # Record index of KV cache group for each layer that allocates a KV cache. + layer_to_kv_cache_group_idx: dict[str, int] = {} + for i, kv_cache_group in enumerate(kv_cache_groups): + for layer_name in kv_cache_group.layer_names: + layer_to_kv_cache_group_idx[layer_name] = i + + for layer_name, target_layer_name in shared_kv_cache_layers.items(): + kv_caches[layer_name] = kv_caches[target_layer_name] + group_idx = layer_to_kv_cache_group_idx[target_layer_name] + kv_cache_groups[group_idx].layer_names.append(layer_name) -- GitLab From e31446b6c8d887cdca031abf8527555adee46058 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Jun 2025 16:48:25 -0400 Subject: [PATCH 145/274] [Perf] Tune `scaled_fp8_quant` by increasing vectorization (#18844) Signed-off-by: mgoin --- csrc/quantization/fp8/common.cu | 35 ++++--- csrc/quantization/fp8/common.cuh | 68 ++++++------- .../fused_kernels/layernorm_utils.cuh | 99 ++++++++++--------- csrc/quantization/vectorization.cuh | 23 +++-- 4 files changed, 115 insertions(+), 110 deletions(-) diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index eceb3a8ea..f3f9f669e 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -39,8 +39,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( fp8_type* __restrict__ token_output = &out[offset]; // For vectorization, token_input and token_output pointers need to be - // aligned at 8-byte and 4-byte addresses respectively. 
- bool const can_vectorize = hidden_size % 4 == 0; + // aligned at 32-byte and 16-byte addresses respectively. + bool const can_vectorize = hidden_size % 16 == 0; float absmax_val = 0.0f; if (can_vectorize) { @@ -48,24 +48,24 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( } else { for (int i = tid; i < hidden_size; i += blockDim.x) { float const x = static_cast(token_input[i]); - absmax_val = max(absmax_val, fabs(x)); + absmax_val = fmaxf(absmax_val, fabsf(x)); } } - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage reduceStorage; float const block_absmax_val_maybe = BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x); __shared__ float token_scale; if (tid == 0) { if (scale_ub) { - token_scale = min(block_absmax_val_maybe, *scale_ub); + token_scale = fminf(block_absmax_val_maybe, *scale_ub); } else { token_scale = block_absmax_val_maybe; } // token scale computation - token_scale = max(token_scale / quant_type_max_v, - min_scaling_factor::val()); + token_scale = fmaxf(token_scale / quant_type_max_v, + min_scaling_factor::val()); scale[token_idx] = token_scale; } __syncthreads(); @@ -88,10 +88,11 @@ void static_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor const& scale) // [1] { - int64_t num_tokens = input.numel() / input.size(-1); - int64_t num_elems = input.numel(); - dim3 grid(num_tokens); - dim3 block(1024); + int const block_size = 256; + int const num_tokens = input.numel() / input.size(-1); + int const num_elems = input.numel(); + dim3 const grid(num_tokens); + dim3 const block(block_size); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( @@ -110,10 +111,11 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, // [..., d] torch::Tensor const& input, // [..., d] torch::Tensor& scale) // [1] { 
- int64_t num_tokens = input.numel() / input.size(-1); - int64_t num_elems = input.numel(); - dim3 grid(num_tokens); - dim3 block(1024); + int const block_size = 256; + int const num_tokens = input.numel() / input.size(-1); + int const num_elems = input.numel(); + dim3 const grid(num_tokens); + dim3 const block(block_size); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); VLLM_DISPATCH_FLOATING_TYPES( @@ -141,8 +143,9 @@ void dynamic_per_token_scaled_fp8_quant( int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; + int const block_size = 256; dim3 const grid(num_tokens); - dim3 const block(std::min(hidden_size, 1024)); + dim3 const block(std::min(hidden_size, block_size)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh index def8b31b2..d36f94a8f 100644 --- a/csrc/quantization/fp8/common.cuh +++ b/csrc/quantization/fp8/common.cuh @@ -46,7 +46,7 @@ __device__ __forceinline__ fp8_type scaled_fp8_conversion(float const val, } float r = - fmax(-quant_type_max_v, fmin(x, quant_type_max_v)); + fmaxf(-quant_type_max_v, fminf(x, quant_type_max_v)); #ifndef USE_ROCM return static_cast(r); #else @@ -65,7 +65,7 @@ template __global__ void segmented_max_reduction(float* __restrict__ scale, const scalar_t* __restrict__ input, int64_t num_elems) { - __shared__ float cache[1024]; + __shared__ float cache[256]; int64_t i = blockDim.x * blockIdx.x + threadIdx.x; // First store maximum for all values processes by @@ -73,7 +73,7 @@ __global__ void segmented_max_reduction(float* __restrict__ scale, scalar_t tmp = 0.0; while (i < num_elems) { float x = static_cast(input[i]); - tmp = max(tmp, fabs(x)); + tmp = fmaxf(tmp, fabsf(x)); i += blockDim.x * gridDim.x; } cache[threadIdx.x] = tmp; @@ -100,25 +100,27 @@ 
template __device__ float thread_max_vec(scalar_t const* __restrict__ input, int64_t const num_elems, int const tid, int const step) { + constexpr size_t VEC_SIZE = 16; + using scalarxN_t = vec_n_t; // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); + auto const* vectorized_in = reinterpret_cast(input); - int64_t const num_vec_elems = num_elems >> 2; + // num_elems / VEC_SIZE (which is 16) + int64_t const num_vec_elems = num_elems >> 4; float absmax_val = 0.0f; -#pragma unroll 4 +#pragma unroll for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - absmax_val = max(absmax_val, fabs(in_vec.x)); - absmax_val = max(absmax_val, fabs(in_vec.y)); - absmax_val = max(absmax_val, fabs(in_vec.z)); - absmax_val = max(absmax_val, fabs(in_vec.w)); + scalarxN_t in_vec = vectorized_in[i]; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + absmax_val = fmaxf(absmax_val, fabsf(in_vec.val[j])); + } } - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - absmax_val = max(absmax_val, fabs(input[i])); + // Handle the remaining elements if num_elems is not divisible by VEC_SIZE + for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { + absmax_val = fmaxf(absmax_val, fabsf(input[i])); } return absmax_val; @@ -130,31 +132,31 @@ __device__ void scaled_fp8_conversion_vec(fp8_type* __restrict__ out, float const scale, int64_t const num_elems, int const tid, int const step) { - using float8x4_t = q8x4_t; + constexpr size_t VEC_SIZE = 16; + using scalarxN_t = vec_n_t; + using float8xN_t = q8_n_t; // Vectorized input/output to better utilize memory bandwidth. 
- auto const* vectorized_in = reinterpret_cast const*>(input); - auto* vectorized_out = reinterpret_cast(out); + auto const* vectorized_in = reinterpret_cast(input); + auto* vectorized_out = reinterpret_cast(out); - int64_t const num_vec_elems = num_elems >> 2; + // num_elems / VEC_SIZE (which is 16) + int64_t const num_vec_elems = num_elems >> 4; -#pragma unroll 4 +#pragma unroll for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - float8x4_t out_vec; - - out_vec.x = scaled_fp8_conversion( - static_cast(in_vec.x), scale); - out_vec.y = scaled_fp8_conversion( - static_cast(in_vec.y), scale); - out_vec.z = scaled_fp8_conversion( - static_cast(in_vec.z), scale); - out_vec.w = scaled_fp8_conversion( - static_cast(in_vec.w), scale); + scalarxN_t in_vec = vectorized_in[i]; + float8xN_t out_vec; + +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + out_vec.val[j] = scaled_fp8_conversion( + static_cast(in_vec.val[j]), scale); + } vectorized_out[i] = out_vec; } - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + // Handle the remaining elements if num_elems is not divisible by VEC_SIZE + for (int64_t i = num_vec_elems * VEC_SIZE + tid; i < num_elems; i += step) { out[i] = scaled_fp8_conversion( static_cast(input[i]), scale); } diff --git a/csrc/quantization/fused_kernels/layernorm_utils.cuh b/csrc/quantization/fused_kernels/layernorm_utils.cuh index e6d23cd24..3f188872d 100644 --- a/csrc/quantization/fused_kernels/layernorm_utils.cuh +++ b/csrc/quantization/fused_kernels/layernorm_utils.cuh @@ -140,6 +140,7 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, // sum of squares float ss = 0.0f; + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; #pragma unroll 4 @@ -147,22 +148,23 @@ __device__ void compute_rms(float* rms, scalar_t const* __restrict__ input, vec4_t in = vec_input[i]; vec4_t x; 
- x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); + } + if constexpr (has_residual) { vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } } - ss += x.x * x.x; - ss += x.y * x.y; - ss += x.z * x.z; - ss += x.w * x.w; +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + ss += x.val[j] * x.val[j]; + } } using BlockReduce = cub::BlockReduce; @@ -203,6 +205,7 @@ __device__ void compute_dynamic_per_token_scales( constexpr scalar_out_t qmax{quant_type_max_v}; + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; float block_absmax_val_maybe = 0.0f; @@ -212,26 +215,25 @@ __device__ void compute_dynamic_per_token_scales( vec4_t const w = vec_weight[i]; vec4_t x; - x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); + } + if constexpr (has_residual) { vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } } - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.x * rms) * w.x)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.y * rms) * w.y)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.z * rms) * w.z)); - block_absmax_val_maybe = fmaxf( - block_absmax_val_maybe, fabs(static_cast(x.w * rms) * w.w)); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + block_absmax_val_maybe = + fmaxf(block_absmax_val_maybe, + 
fabs(static_cast(x.val[j] * rms) * w.val[j])); + } } using BlockReduce = cub::BlockReduce; @@ -282,6 +284,7 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, vec_residual = reinterpret_cast*>(&residual[token_offset]); } + const int VEC_SIZE = 4; int32_t const num_vec_elems = hidden_size >> 2; // TODO(luka/varun) extract into type-agnostic vectorized quant function to @@ -292,33 +295,31 @@ __device__ void norm_and_quant(scalar_out_t* __restrict__ output, vec4_t const w = vec_weight[i]; vec4_t x; - x.x = static_cast(in.x); - x.y = static_cast(in.y); - x.z = static_cast(in.z); - x.w = static_cast(in.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] = static_cast(in.val[j]); + } + if constexpr (has_residual) { vec4_t r = vec_residual[i]; - x.x += static_cast(r.x); - x.y += static_cast(r.y); - x.z += static_cast(r.z); - x.w += static_cast(r.w); - // Update residual - r.x = static_cast(x.x); - r.y = static_cast(x.y); - r.z = static_cast(x.z); - r.w = static_cast(x.w); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + x.val[j] += static_cast(r.val[j]); + } +// Update residual +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + r.val[j] = static_cast(x.val[j]); + } vec_residual[i] = r; } q8x4_t out; - out.x = ScaledQuant::quant_fn( - static_cast(x.x * rms) * w.x, scale); - out.y = ScaledQuant::quant_fn( - static_cast(x.y * rms) * w.y, scale); - out.z = ScaledQuant::quant_fn( - static_cast(x.z * rms) * w.z, scale); - out.w = ScaledQuant::quant_fn( - static_cast(x.w * rms) * w.w, scale); +#pragma unroll + for (int j = 0; j < VEC_SIZE; ++j) { + out.val[j] = ScaledQuant::quant_fn( + static_cast(x.val[j] * rms) * w.val[j], scale); + } vec_output[i] = out; } } diff --git a/csrc/quantization/vectorization.cuh b/csrc/quantization/vectorization.cuh index 866da10b5..11d57a5fa 100644 --- a/csrc/quantization/vectorization.cuh +++ b/csrc/quantization/vectorization.cuh @@ -10,23 +10,22 @@ namespace vllm { // Vectorization containers 
-template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; +template +struct __align__(vec_size * sizeof(scalar_t)) vec_n_t { + scalar_t val[vec_size]; }; -template -struct __align__(4) q8x4_t { +template +struct __align__(vec_size * sizeof(quant_type_t)) q8_n_t { static_assert(std::is_same_v || std::is_same_v || std::is_same_v); - quant_type_t x; - quant_type_t y; - quant_type_t z; - quant_type_t w; + quant_type_t val[vec_size]; }; +template +using vec4_t = vec_n_t; +template +using q8x4_t = q8_n_t; + } // namespace vllm -- GitLab From 6865fe0074771ed56c1cb2eca047a8e74ab53ce9 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Tue, 3 Jun 2025 22:07:19 +0100 Subject: [PATCH 146/274] Fix interaction between `Optional` and `Annotated` in CLI typing (#19093) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Yikun Jiang --- tests/engine/test_arg_utils.py | 18 +++++++++++++++--- vllm/engine/arg_utils.py | 26 +++++++++++++++++++------- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index ab78aa7da..cfbc7c245 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -5,14 +5,14 @@ import json from argparse import ArgumentError, ArgumentTypeError from contextlib import nullcontext from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Annotated, Literal, Optional import pytest from vllm.config import CompilationConfig, config from vllm.engine.arg_utils import (EngineArgs, contains_type, get_kwargs, - get_type, is_not_builtin, is_type, - literal_to_kwargs, nullable_kvs, + get_type, get_type_hints, is_not_builtin, + is_type, literal_to_kwargs, nullable_kvs, optional_type, parse_type) from vllm.utils import FlexibleArgumentParser @@ -160,6 +160,18 @@ def test_is_not_builtin(type_hint, expected): assert 
is_not_builtin(type_hint) == expected +@pytest.mark.parametrize( + ("type_hint", "expected"), [ + (Annotated[int, "annotation"], {int}), + (Optional[int], {int, type(None)}), + (Annotated[Optional[int], "annotation"], {int, type(None)}), + (Optional[Annotated[int, "annotation"]], {int, type(None)}), + ], + ids=["Annotated", "Optional", "Annotated_Optional", "Optional_Annotated"]) +def test_get_type_hints(type_hint, expected): + assert get_type_hints(type_hint) == expected + + def test_get_kwargs(): kwargs = get_kwargs(DummyConfig) print(kwargs) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 587a23134..2197d44ca 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -15,7 +15,7 @@ from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional, import regex as re import torch -from pydantic import SkipValidation, TypeAdapter, ValidationError +from pydantic import TypeAdapter, ValidationError from typing_extensions import TypeIs, deprecated import vllm.envs as envs @@ -151,17 +151,29 @@ def is_not_builtin(type_hint: TypeHint) -> bool: return type_hint.__module__ != "builtins" +def get_type_hints(type_hint: TypeHint) -> set[TypeHint]: + """Extract type hints from Annotated or Union type hints.""" + type_hints: set[TypeHint] = set() + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is Annotated: + type_hints.update(get_type_hints(args[0])) + elif origin is Union: + for arg in args: + type_hints.update(get_type_hints(arg)) + else: + type_hints.add(type_hint) + + return type_hints + + def get_kwargs(cls: ConfigType) -> dict[str, Any]: cls_docs = get_attr_docs(cls) kwargs = {} for field in fields(cls): # Get the set of possible types for the field - type_hints: set[TypeHint] = set() - if get_origin(field.type) in {Union, Annotated}: - predicate = lambda arg: not isinstance(arg, SkipValidation) - type_hints.update(filter(predicate, get_args(field.type))) - else: - type_hints.add(field.type) + 
type_hints: set[TypeHint] = get_type_hints(field.type) # If the field is a dataclass, we can use the model_validate_json generator = (th for th in type_hints if is_dataclass(th)) -- GitLab From 6cac54f4d1673991a415b9897d610c132104155b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 05:41:36 +0800 Subject: [PATCH 147/274] [v1] Re-init input batch for multiple kv cache groups (#18654) Signed-off-by: Chen Zhang --- tests/v1/worker/test_gpu_input_batch.py | 29 ++------------- tests/v1/worker/test_gpu_model_runner.py | 4 ++- vllm/v1/worker/block_table.py | 3 +- vllm/v1/worker/gpu_input_batch.py | 18 +++++----- vllm/v1/worker/gpu_model_runner.py | 46 ++++++++++++++++++++---- vllm/v1/worker/tpu_model_runner.py | 7 ++-- 6 files changed, 61 insertions(+), 46 deletions(-) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index e932e4b32..72547e86b 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -10,8 +10,6 @@ import torch from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -25,27 +23,6 @@ CUDA_DEVICES = [ MAX_NUM_PROMPT_TOKENS = 64 -def get_kv_cache_config() -> KVCacheConfig: - return KVCacheConfig( - num_blocks=10, - tensors={ - "layer.0": KVCacheTensor(size=1024), - }, - kv_cache_groups=[ - KVCacheGroupSpec( - layer_names=["layer.0"], - kv_cache_spec=FullAttentionSpec( - block_size=1, - num_kv_heads=1, - head_size=16, - dtype=torch.float16, - use_mla=False, - ), - ), - ], - ) - - def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names 
= set([ @@ -252,7 +229,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -342,7 +319,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, @@ -351,7 +328,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, - block_size=1, + block_sizes=[1], ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 5e2fd2fbf..0553d94de 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -54,7 +54,9 @@ def initialize_kv_cache(runner: GPUModelRunner): device=runner.device, pin_memory=runner.pin_memory, vocab_size=runner.model_config.get_vocab_size(), - block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size, + block_sizes=[ + kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size + ], ) runner.initialize_attn_backend(kv_cache_config) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 958262c49..5cd5674fb 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -105,10 +105,11 @@ class MultiGroupBlockTable: def __init__(self, max_num_reqs: int, max_model_len: int, max_num_batched_tokens: int, pin_memory: bool, - device: torch.device, block_size: int) -> None: + device: torch.device, block_sizes: list[int]) -> None: self.block_tables = [ BlockTable(max_num_reqs, cdiv(max_model_len, block_size), max_num_batched_tokens, pin_memory, device) + for block_size in block_sizes ] def 
append_row(self, block_ids: list[list[int]], row_idx: int) -> None: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index bb986b604..34737029f 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -56,14 +56,14 @@ class CachedRequestState: class InputBatch: def __init__( - self, - max_num_reqs: int, - max_model_len: int, - max_num_batched_tokens: int, - device: torch.device, - pin_memory: bool, - vocab_size: int, - block_size: int, + self, + max_num_reqs: int, + max_model_len: int, + max_num_batched_tokens: int, + device: torch.device, + pin_memory: bool, + vocab_size: int, + block_sizes: list[int], # The block_size of each kv cache group ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -105,7 +105,7 @@ class InputBatch: max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, - block_size=block_size, + block_sizes=block_sizes, ) # Sampling-related. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b7448be26..6a566a602 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -143,7 +143,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.input_batch: InputBatch # Persistent batch. # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -173,6 +172,15 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Request states. self.requests: dict[str, CachedRequestState] = {} + # Input Batch + # NOTE(Chen): Ideally, we should initialize the input batch inside + # `initialize_kv_cache` based on the kv cache config. 
However, as in + # https://github.com/vllm-project/vllm/pull/18298, due to some unknown + # reasons, we have to initialize the input batch before `load_model`, + # quantization + weight offloading will fail otherwise. As a temporary + # solution, we initialize the input batch here, and re-initialize it + # in `initialize_kv_cache` if the block_sizes here is different from + # the block_sizes in the kv cache config. self.input_batch = InputBatch( max_num_reqs=self.max_num_reqs, max_model_len=self.max_model_len, @@ -180,7 +188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=self.cache_config.block_size, + block_sizes=[self.cache_config.block_size], ) self.use_cuda_graph = (self.vllm_config.compilation_config.level @@ -2040,6 +2048,35 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.attn_backends.append(attn_backend_i) self.attn_metadata_builders.append(attn_metadata_builder_i) + def may_reinitialize_input_batch(self, + kv_cache_config: KVCacheConfig) -> None: + """ + Re-initialize the input batch if the block sizes are different from + `[self.cache_config.block_size]`. This usually happens when there + are multiple KV cache groups. + + Args: + kv_cache_config: The KV cache configuration. + """ + block_sizes = [ + kv_cache_group.kv_cache_spec.block_size + for kv_cache_group in kv_cache_config.kv_cache_groups + ] + if block_sizes != [self.cache_config.block_size]: + assert self.cache_config.cpu_offload_gb == 0, ( + "Cannot re-initialize the input batch when CPU weight " + "offloading is enabled. 
See https://github.com/vllm-project/vllm/pull/18298 " # noqa: E501 + "for more details.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + block_sizes=block_sizes, + ) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -2047,11 +2084,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.kv_cache_groups) > 1: - raise NotImplementedError( - "Hybrid models with more than one KV cache type are not " - "supported yet.") self.kv_cache_config = kv_cache_config + self.may_reinitialize_input_batch(kv_cache_config) self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index f15234f49..73c445d14 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -200,7 +200,7 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=self.block_size, + block_sizes=[self.block_size], ) # Cached torch/numpy tensor @@ -1358,8 +1358,9 @@ class TPUModelRunner(LoRAModelRunnerMixin): device=self.device, pin_memory=self.pin_memory, vocab_size=self.model_config.get_vocab_size(), - block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec. 
- block_size, + block_sizes=[ + kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size + ], ) # Verify dtype compatibility between block_table_cpu and input_batch assert self.block_table_cpu.dtype == self.input_batch.block_table[ -- GitLab From 135cf55cd1d83cd4e18266e343a59e6d9f87856f Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Tue, 3 Jun 2025 18:26:33 -0400 Subject: [PATCH 148/274] [V1][Spec Decode][Ngram] 1.35x gain -> 1.95x gain on InstructCoder with prompt fix (#18971) --- benchmarks/benchmark_dataset.py | 10 +++++++++- vllm/benchmarks/datasets.py | 14 +++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 80a9246aa..5d2a26cd4 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -865,7 +865,15 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." + + # apply template + prompt = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + add_generation_prompt=True, + tokenize=False, + ) prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 0ef3e0254..f795a1256 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -880,7 +880,19 @@ class InstructCoderDataset(HuggingFaceDataset): for item in self.data: if len(sampled_requests) >= num_requests: break - prompt = f"{item['instruction']}:\n{item['input']}" + prompt = f"{item['input']}\n\n{item['instruction']} Just output \ + the code, do not include any explanation." 
+ + # apply template + prompt = tokenizer.apply_chat_template( + [{ + "role": "user", + "content": prompt + }], + add_generation_prompt=True, + tokenize=False, + ) + prompt_len = len(tokenizer(prompt).input_ids) sampled_requests.append( SampleRequest( -- GitLab From b5fd9506c14bed640210a7f3d0adb03a024afdbe Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 06:30:55 +0800 Subject: [PATCH 149/274] [Bugfix] get_num_blocks_to_allocate with null_block (#19031) Signed-off-by: Chen Zhang --- tests/v1/core/test_specialized_manager.py | 23 ++++++++++++++++++++ vllm/v1/core/block_pool.py | 5 +++-- vllm/v1/core/kv_cache_utils.py | 3 +++ vllm/v1/core/single_type_kv_cache_manager.py | 5 +++-- 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index c6f7481dd..92ce8ea8b 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -144,3 +144,26 @@ def test_sliding_window_remove_skipped_blocks(): # of removed blocks should be [1003, 1002]. 
manager.remove_skipped_blocks("test", 11) assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:]) + + +def test_get_num_blocks_to_allocate(): + block_size = 2 + sliding_window_spec = SlidingWindowSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + sliding_window=4, # Placeholder value, not related to test result + use_mla=False, + ) + + block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + manager = get_sliding_window_manager(sliding_window_spec, block_pool) + cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] + cached_blocks_2 = [block_pool.null_block for _ in range(5) + ] + [KVCacheBlock(i + 1) for i in range(5)] + + assert manager.get_num_blocks_to_allocate("1", 20 * block_size, + cached_blocks_1) == 20 + assert manager.get_num_blocks_to_allocate("2", 20 * block_size, + cached_blocks_2) == 15 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 27eaca497..5118e4d8e 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -63,6 +63,7 @@ class BlockPool: # The ref_cnt of null_block is not maintained, needs special care to # avoid freeing it. self.null_block = self.free_block_queue.popleft() + self.null_block.is_null = True self.enable_kv_cache_events = enable_kv_cache_events self.kv_event_queue: list[KVCacheEvent] = [] @@ -252,7 +253,7 @@ class BlockPool: for block in blocks: # ref_cnt=0 means this block is in the free list (i.e. eviction # candidate), so remove it. - if block.ref_cnt == 0 and block != self.null_block: + if block.ref_cnt == 0 and not block.is_null: self.free_block_queue.remove(block) block.incr_ref() @@ -267,7 +268,7 @@ class BlockPool: for block in ordered_blocks: block.decr_ref() # null_block should not be added to the free list. 
- if block.ref_cnt == 0 and block != self.null_block: + if block.ref_cnt == 0 and not block.is_null: self.free_block_queue.append(block) def reset_prefix_cache(self) -> bool: diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 61476362e..3b5a37926 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -125,6 +125,9 @@ class KVCacheBlock: prev_free_block: Optional["KVCacheBlock"] = None next_free_block: Optional["KVCacheBlock"] = None + # Whether the block is a null block that should never be cached. + is_null: bool = False + def incr_ref(self): self.ref_cnt += 1 diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 233c73e88..a529cde09 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -83,8 +83,9 @@ class SingleTypeKVCacheManager(ABC): # free queue and ref_cnt == 0), it will be changed from a free block # to a computed block when the request is allocated, so we also count # it as needed to be allocated. 
- num_evictable_computed_blocks = sum(blk.ref_cnt == 0 - for blk in new_computed_blocks) + num_evictable_computed_blocks = sum( + blk.ref_cnt == 0 and not blk.is_null + for blk in new_computed_blocks) return ((num_new_blocks + num_evictable_computed_blocks) * self.num_kv_cache_groups) -- GitLab From 4de790fcad85abb0969da18bc9125889407c432a Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 4 Jun 2025 07:27:24 +0800 Subject: [PATCH 150/274] [Bugfix]: Fix the incompatibility issue with tool_choice 'required' when Thinking is enabled (#19075) Signed-off-by: chaunceyjiang --- .../test_completion_with_function_calling.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index dbea2dc0b..5c1f07832 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -9,7 +9,7 @@ import pytest_asyncio from ...utils import RemoteOpenAIServer # any model with a chat template should work here -MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" +MODEL_NAME = "Qwen/Qwen3-0.6B" @pytest.fixture(scope="module") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e514d660..777b7f5bc 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -320,10 +320,13 @@ class OpenAIServingChat(OpenAIServing): def extract_tool_call_required_streaming( self, previous_text: str, - current_text: str, + current_text: Optional[str], delta_text: str, function_name_returned: bool, ) -> tuple[Optional[DeltaMessage], bool]: + if current_text is None or current_text == "": + # if the current text is empty, we cannot parse it + return None, function_name_returned try: obj = partial_json_parser.loads(current_text) except 
partial_json_parser.core.exceptions.MalformedJSON: @@ -650,10 +653,18 @@ class OpenAIServingChat(OpenAIServing): current_text = previous_text + delta_text fn_name_returned = function_name_returned[i] + if self.reasoning_parser: + _, content = \ + reasoning_parser.extract_reasoning_content( + current_text, + request + ) + else: + content = current_text delta_message, function_name_returned[i] = ( self.extract_tool_call_required_streaming( previous_text=previous_text, - current_text=current_text, + current_text=content, delta_text=delta_text, function_name_returned=fn_name_returned)) @@ -981,8 +992,9 @@ class OpenAIServingChat(OpenAIServing): # the fields of FunctionDefinition are a superset of the # tool call outputs and can be used for parsing + assert content is not None tool_calls = TypeAdapter( - list[FunctionDefinition]).validate_json(output.text) + list[FunctionDefinition]).validate_json(content) message = ChatMessage( role=role, content="", -- GitLab From 5d96533e2235c37e64ef381fafa244db197b25dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Wed, 4 Jun 2025 01:53:16 +0200 Subject: [PATCH 151/274] [Bugfix][P/D] Fix Prefix Cache Bug (#18411) Signed-off-by: nicklucche Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> --- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 3f0b0e295..fd2228012 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -739,7 +739,8 @@ class NixlConnectorWorker: # just notify P worker that we have the blocks we need. 
num_local_blocks = len(local_block_ids) if num_local_blocks == 0: - self.nixl_wrapper.send_notif(dst_engine_id, + agent_name = self._remote_agents[dst_engine_id] + self.nixl_wrapper.send_notif(agent_name, notif_msg=request_id.encode("utf-8")) return -- GitLab From a8da78eac92b5e79947a6fdd51bec0d1e5cea0a7 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 4 Jun 2025 08:14:06 +0800 Subject: [PATCH 152/274] [Bugfix] Max concurrency estimation and check_enough_kv_cache_memory for models with sliding window layers (#19029) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 90 +++++++++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 61 +++++++++++++------ 2 files changed, 125 insertions(+), 26 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ad34becb1..71ea43383 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -12,13 +12,11 @@ from vllm.utils import GiB_bytes, sha256 from vllm.v1.core.kv_cache_manager import KVCacheManager # disable yapf here as it formats differently than isort such that both fail # yapf: disable -from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock, - PrefixCachingMetrics, - estimate_max_model_len, - generate_block_hash_extra_keys, - hash_block_tokens, - hash_request_tokens, - unify_kv_cache_configs) +from vllm.v1.core.kv_cache_utils import ( + FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, + estimate_max_model_len, generate_block_hash_extra_keys, + get_max_concurrency_for_kv_cache_config, hash_block_tokens, + hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheTensor, SlidingWindowSpec) @@ -597,6 +595,84 @@ def test_estimate_max_model_len(model_id, max_model_len, assert estimated_max_len == want_estimated_max_len +def test_get_max_concurrency_for_kv_cache_config(): + # Create a VllmConfig + 
model_id = "Qwen/Qwen1.5-7B" + max_model_len = 16384 + model_config = ModelConfig( + model_id, + task="generate", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + max_model_len=max_model_len, + ) + scheduler_config = SchedulerConfig(max_num_batched_tokens=1024, + enable_chunked_prefill=True) + + vllm_config = VllmConfig( + model_config=model_config, + scheduler_config=scheduler_config, + ) + + full_attention_spec = FullAttentionSpec( + block_size=16, + num_kv_heads=32, + head_size=128, + dtype=torch.float16, + use_mla=False, + ) + + sliding_window_spec = SlidingWindowSpec( + block_size=16, + num_kv_heads=32, + head_size=128, + dtype=torch.float16, + use_mla=False, + sliding_window=1024, + ) + + kv_cache_config_full_attention = KVCacheConfig( + num_blocks=int(1024 * 1.5), + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + full_attention_spec), + ], + ) + max_concurrency_full_attention = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_full_attention) + assert max_concurrency_full_attention == 1.5 + + kv_cache_config_sliding_window = KVCacheConfig( + num_blocks=129 * 3, + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + sliding_window_spec), + ], + ) + max_concurrency_sliding_window = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_sliding_window) + assert max_concurrency_sliding_window == 3 + + kv_cache_config_hybrid_model = KVCacheConfig( + num_blocks=(1024 + 129) * 3, + tensors={}, + kv_cache_groups=[ + KVCacheGroupSpec([f"layer_{i}" for i in range(32)], + full_attention_spec), + KVCacheGroupSpec([f"layer_{i}" for i in range(32, 64)], + sliding_window_spec), + ], + ) + max_concurrency_hybrid_model = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config_hybrid_model) + assert max_concurrency_hybrid_model == 3 + + def test_allocate_with_lookahead(): """Verify that 
lookahead tokens correctly affect block allocation""" block_size = 4 diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 3b5a37926..ad3c21f79 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -3,13 +3,13 @@ """KV-Cache Utilities.""" import os from collections import deque -from collections.abc import Sequence +from collections.abc import Iterable, Sequence from dataclasses import dataclass from typing import Any, Callable, NamedTuple, Optional from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import GiB_bytes, sha256 +from vllm.utils import GiB_bytes, cdiv, sha256 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor, SlidingWindowSpec) @@ -468,6 +468,15 @@ def hash_request_tokens(hash_function: Any, block_size: int, return ret +def max_memory_usage_bytes(vllm_config: VllmConfig, + kv_cache_specs: Iterable[KVCacheSpec]) -> int: + """ + Get the maximum memory usage in bytes for the given KV cache specs. 
+ """ + return sum( + spec.max_memory_usage_bytes(vllm_config) for spec in kv_cache_specs) + + def estimate_max_model_len(vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], available_memory: int) -> int: @@ -489,11 +498,8 @@ def estimate_max_model_len(vllm_config: VllmConfig, # Modify the max_model_len for this calculation vllm_config.model_config.max_model_len = model_len # Calculate memory needed for the given model length - memory_needed = sum( - (layer_spec.max_memory_usage_bytes(vllm_config) - for layer_spec in kv_cache_spec.values()), - start=0, - ) + memory_needed = max_memory_usage_bytes(vllm_config, + kv_cache_spec.values()) return memory_needed <= available_memory # Binary search for the maximum model length @@ -538,9 +544,7 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig, "initializing the engine.") max_model_len = vllm_config.model_config.max_model_len - needed_memory = 0 - for layer_spec in kv_cache_spec.values(): - needed_memory += layer_spec.max_memory_usage_bytes(vllm_config) + needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values()) if needed_memory > available_memory: # Estimate the maximum model length that can fit in the available memory @@ -606,6 +610,24 @@ def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: return len(layer_keys) == 1 +def get_max_concurrency_for_kv_cache_config( + vllm_config: VllmConfig, kv_cache_config: KVCacheConfig) -> float: + """ + Get the maximum concurrency for the given KV cache configuration. 
+ """ + num_layer_per_group = max( + len(group.layer_names) for group in kv_cache_config.kv_cache_groups) + max_memory_usage_per_request = num_layer_per_group * max_memory_usage_bytes( + vllm_config, + (group.kv_cache_spec for group in kv_cache_config.kv_cache_groups)) + memory_per_block = kv_cache_config.kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes * num_layer_per_group + num_block_per_request = cdiv(max_memory_usage_per_request, + memory_per_block) + max_concurrency = kv_cache_config.num_blocks / num_block_per_request + return max_concurrency + + def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec], available_memory: int) -> KVCacheConfig: @@ -637,14 +659,6 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) num_blocks = num_gpu_blocks_override - num_tokens = num_blocks * vllm_config.cache_config.block_size - num_tokens_str = f"{num_tokens:,}" - logger.info("GPU KV cache size: %s tokens", num_tokens_str) - max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" - max_concurrency = num_tokens / vllm_config.model_config.max_model_len - logger.info("Maximum concurrency for %s tokens per request: %.2fx", - max_model_len_str, max_concurrency) - per_layer_size = page_size * num_blocks # All layers have the same KV cache spec, so we create one kv cache group # for all layers. 
@@ -659,6 +673,15 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig, kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec, grouped_layer_names), ) + + num_tokens = num_blocks * vllm_config.cache_config.block_size + num_tokens_str = f"{num_tokens:,}" + logger.info("GPU KV cache size: %s tokens", num_tokens_str) + max_model_len_str = f"{vllm_config.model_config.max_model_len:,}" + max_concurrency = get_max_concurrency_for_kv_cache_config( + vllm_config, kv_cache_config) + logger.info("Maximum concurrency for %s tokens per request: %.2fx", + max_model_len_str, max_concurrency) return kv_cache_config @@ -705,8 +728,8 @@ def get_kv_cache_config(vllm_config: VllmConfig, Returns: The generated KVCacheConfigs """ - check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) unify_hybrid_kv_cache_specs(kv_cache_spec) + check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) if is_kv_cache_type_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. 
Allocate the same amount of memory for -- GitLab From b712be98c790794479030313f2c2b9dae17ea7de Mon Sep 17 00:00:00 2001 From: Yan Ru Pei Date: Tue, 3 Jun 2025 17:14:20 -0700 Subject: [PATCH 153/274] feat: add data parallel rank to KVEventBatch (#18925) --- .buildkite/test-pipeline.yaml | 2 + tests/distributed/conftest.py | 107 +++++++----- tests/distributed/test_events.py | 69 +++++++- tests/v1/engine/test_engine_core_client.py | 189 +++++++++++++++++---- vllm/distributed/kv_events.py | 77 ++++++++- vllm/v1/core/sched/scheduler.py | 4 +- 6 files changed, 362 insertions(+), 86 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 5fb8ceaac..8ab96b3b7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -145,6 +145,7 @@ steps: - examples/offline_inference/rlhf_colocate.py - tests/examples/offline_inference/data_parallel.py - tests/v1/test_async_llm_dp.py + - tests/v1/engine/test_engine_core_client.py commands: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py @@ -154,6 +155,7 @@ steps: # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py diff --git a/tests/distributed/conftest.py b/tests/distributed/conftest.py index 95f085788..666a715cc 100644 --- a/tests/distributed/conftest.py +++ b/tests/distributed/conftest.py @@ -13,11 +13,13 @@ from vllm.distributed.kv_events import EventPublisherFactory from .test_events import SampleBatch +DP_RANK = 0 + @pytest.fixture def random_port(): """Generate a random port number for testing""" - return random.randint(10000, 60000) + return random.randint(10000, 59900) @pytest.fixture @@ -30,21 +32,23 @@ def 
publisher_config(random_port, request): replay_endpoint = endpoint + "-replay" else: endpoint = f"tcp://*:{random_port}" - replay_endpoint = f"tcp://*:{random_port + 1}" + replay_endpoint = f"tcp://*:{random_port + 100}" - return KVEventsConfig(enable_kv_cache_events=True, - publisher="zmq", - endpoint=endpoint, - replay_endpoint=replay_endpoint, - buffer_steps=100, - hwm=1000, - topic="test") + return KVEventsConfig( + enable_kv_cache_events=True, + publisher="zmq", + endpoint=endpoint, + replay_endpoint=replay_endpoint, + buffer_steps=100, + hwm=1000, + topic="test", + ) @pytest.fixture def publisher(publisher_config): """Create and return a publisher instance""" - pub = EventPublisherFactory.create(publisher_config) + pub = EventPublisherFactory.create(publisher_config, DP_RANK) yield pub pub.shutdown() @@ -60,7 +64,11 @@ def subscriber(publisher_config): if replay_endpoint and replay_endpoint.startswith("tcp://*"): replay_endpoint = replay_endpoint.replace("*", "127.0.0.1") - sub = MockSubscriber(endpoint, replay_endpoint, publisher_config.topic) + sub = MockSubscriber( + [endpoint], + [replay_endpoint] if replay_endpoint else None, + publisher_config.topic, + ) yield sub sub.close() @@ -68,26 +76,37 @@ def subscriber(publisher_config): class MockSubscriber: """Helper class to receive and verify published events""" - def __init__(self, - pub_endpoint: str, - replay_endpoint: Optional[str] = None, - topic: str = "", - decode_type=SampleBatch): + def __init__( + self, + pub_endpoints: Union[str, list[str]], + replay_endpoints: Optional[Union[str, list[str]]] = None, + topic: str = "", + decode_type=SampleBatch, + ): self.ctx = zmq.Context.instance() - # Set up subscriber socket - self.sub = self.ctx.socket(zmq.SUB) - self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode('utf-8')) - self.sub.connect(pub_endpoint) + # Convert single endpoint to list for consistency + if isinstance(pub_endpoints, str): + pub_endpoints = [pub_endpoints] + if isinstance(replay_endpoints, 
str): + replay_endpoints = [replay_endpoints] - # Set up replay socket if provided - self.replay = None - if replay_endpoint: - self.replay = self.ctx.socket(zmq.REQ) - self.replay.connect(replay_endpoint) + # Set up subscriber socket - connect to all endpoints + self.sub = self.ctx.socket(zmq.SUB) + self.sub.setsockopt(zmq.SUBSCRIBE, topic.encode("utf-8")) + for endpoint in pub_endpoints: + self.sub.connect(endpoint) + + # Set up replay sockets if provided + self.replay_sockets = [] + if replay_endpoints: + for replay_endpoint in replay_endpoints: + replay = self.ctx.socket(zmq.REQ) + replay.connect(replay_endpoint) + self.replay_sockets.append(replay) self.topic = topic - self.topic_bytes = topic.encode('utf-8') + self.topic_bytes = topic.encode("utf-8") self.received_msgs: list[tuple[int, SampleBatch]] = [] self.last_seq = -1 self.decoder = msgspec.msgpack.Decoder(type=decode_type) @@ -107,25 +126,31 @@ class MockSubscriber: self.received_msgs.append((seq, data)) return seq, data - def request_replay(self, start_seq: int) -> None: + def request_replay(self, start_seq: int, socket_idx: int = 0) -> None: """Request replay of messages starting from start_seq""" - if not self.replay: - raise ValueError("Replay socket not initialized") - - self.replay.send(start_seq.to_bytes(8, "big")) - - def receive_replay(self) -> list[tuple[int, SampleBatch]]: - """Receive replayed messages""" - if not self.replay: - raise ValueError("Replay socket not initialized") - + if not self.replay_sockets: + raise ValueError("Replay sockets not initialized") + if socket_idx >= len(self.replay_sockets): + raise ValueError(f"Invalid socket index {socket_idx}") + + self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big")) + + def receive_replay(self, + socket_idx: int = 0) -> list[tuple[int, SampleBatch]]: + """Receive replayed messages from a specific replay socket""" + if not self.replay_sockets: + raise ValueError("Replay sockets not initialized") + if socket_idx >= 
len(self.replay_sockets): + raise ValueError(f"Invalid socket index {socket_idx}") + + replay_socket = self.replay_sockets[socket_idx] replayed: list[tuple[int, SampleBatch]] = [] while True: try: - if not self.replay.poll(1000): + if not replay_socket.poll(1000): break - frames = self.replay.recv_multipart() + frames = replay_socket.recv_multipart() if not frames or not frames[-1]: # End of replay marker break @@ -142,5 +167,5 @@ class MockSubscriber: def close(self): """Clean up resources""" self.sub.close() - if self.replay: - self.replay.close() + for replay in self.replay_sockets: + replay.close() diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py index ec1e5a2d6..8be9ee0a1 100644 --- a/tests/distributed/test_events.py +++ b/tests/distributed/test_events.py @@ -9,6 +9,8 @@ import pytest from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory, NullEventPublisher) +DP_RANK = 0 + class EventSample( msgspec.Struct, @@ -121,7 +123,7 @@ def test_topic_filtering(publisher_config): publisher_config.replay_endpoint = None publisher_config.topic = "foo" - pub = EventPublisherFactory.create(publisher_config) + pub = EventPublisherFactory.create(publisher_config, DP_RANK) from .conftest import MockSubscriber sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo") @@ -185,9 +187,72 @@ def test_high_volume(publisher, subscriber): def test_null_publisher(): """Test that NullEventPublisher can be used without errors""" - publisher = NullEventPublisher() + publisher = NullEventPublisher(DP_RANK) # This should not raise any errors batch = create_test_events(5) publisher.publish(batch) publisher.shutdown() + + +def test_data_parallel_rank_tagging(publisher_config): + """Test that events are properly tagged with their data parallel rank""" + + publisher_config.topic = "foo" + pub_0 = EventPublisherFactory.create(publisher_config, DP_RANK) + pub_1 = EventPublisherFactory.create(publisher_config, DP_RANK + 1) + + # Hardcode 
the expected endpoints based on port offsetting behavior + # Both ranks get offsets according to _offset_endpoint_port function + base_endpoint = publisher_config.endpoint + if "tcp://" in base_endpoint: + # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558 + expected_endpoint_0 = base_endpoint # rank 0 gets port + 0 = same port + expected_endpoint_1 = base_endpoint.replace( + ":5557", ":5558") # rank 1 gets port + 1 + else: + # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1 + expected_endpoint_0 = base_endpoint # rank 0 gets base + expected_endpoint_1 = base_endpoint + "_dp1" # rank 1 gets _dp1 + + from .conftest import MockSubscriber + sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic) + sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic) + + try: + time.sleep(0.1) # Let publishers start up + + # Publish events from different ranks + batch_0 = create_test_events(2) + batch_1 = create_test_events(3) + + pub_0.publish(batch_0) + pub_1.publish(batch_1) + + # Receive events from rank 0 + result_0 = sub_0.receive_one(timeout=200) + assert result_0 is not None, "No message received from rank 0" + seq_0, received_0 = result_0 + + # Receive events from rank 1 + result_1 = sub_1.receive_one(timeout=200) + assert result_1 is not None, "No message received from rank 1" + seq_1, received_1 = result_1 + + # Verify DP rank tagging + assert received_0.data_parallel_rank == 0, ( + f"Expected DP rank 0, got {received_0.data_parallel_rank}") + assert received_1.data_parallel_rank == 1, ( + f"Expected DP rank 1, got {received_1.data_parallel_rank}") + + # Verify event content is correct + assert len( + received_0.events) == 2, "Wrong number of events from rank 0" + assert len( + received_1.events) == 3, "Wrong number of events from rank 1" + + finally: + pub_0.shutdown() + pub_1.shutdown() + sub_0.close() + sub_1.close() diff --git 
a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index a01b205df..47181d36f 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -12,8 +12,10 @@ from typing import Optional import pytest from transformers import AutoTokenizer +from tests.utils import multi_gpu_test from vllm import SamplingParams -from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.distributed.kv_events import (BlockStored, KVEventBatch, + ZmqEventPublisher) from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext @@ -37,10 +39,15 @@ PROMPT = "Hello my name is Robert and I love quantization kernels" PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids -def make_request(params: SamplingParams) -> EngineCoreRequest: +def make_request( + params: SamplingParams, + prompt_tokens_ids: Optional[list[int]] = None) -> EngineCoreRequest: + if not prompt_tokens_ids: + prompt_tokens_ids = PROMPT_TOKENS + return EngineCoreRequest( request_id=str(uuid.uuid4()), - prompt_token_ids=PROMPT_TOKENS, + prompt_token_ids=prompt_tokens_ids, mm_inputs=None, mm_hashes=None, mm_placeholders=None, @@ -88,6 +95,25 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: dict): break +async def loop_until_fully_done_async(client: EngineCoreClient, outputs: dict): + + while True: + engine_core_outputs = (await client.get_output_async()).outputs + + if len(engine_core_outputs) == 0: + continue + + # Add outputs to the dict + for out in engine_core_outputs: + outputs[out.request_id].append(out) + + # Check if all request IDs in outputs have finished + if all(outs and outs[-1].finished for outs in outputs.values()): + break + + await asyncio.sleep(0.1) + + # Dummy utility function to monkey-patch into engine core. 
def echo(self, msg: str, err_msg: Optional[str] = None) -> str: print(f"echo util function called: {msg}, {err_msg}") @@ -273,10 +299,12 @@ def test_kv_cache_events( block_size = 16 num_blocks = 2 - engine_args = EngineArgs(model=MODEL_NAME, - enforce_eager=True, - enable_prefix_caching=True, - block_size=block_size) + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + block_size=block_size, + ) engine_args.kv_events_config = publisher_config vllm_config = engine_args.create_engine_config( @@ -297,19 +325,8 @@ def test_kv_cache_events( try: custom_tokens = list(range(num_blocks * block_size)) - request = EngineCoreRequest( - request_id=str(uuid.uuid4()), - prompt_token_ids=custom_tokens, - mm_inputs=None, - mm_hashes=None, - mm_placeholders=None, - sampling_params=SamplingParams( - max_tokens=1), # Short completion for speed - eos_token_id=None, - arrival_time=time.time(), - lora_request=None, - cache_salt=None, - ) + sampling_params = SamplingParams(max_tokens=1) + request = make_request(sampling_params, custom_tokens) client.add_request(request) outputs: dict[str, list] = {request.request_id: []} @@ -321,24 +338,130 @@ def test_kv_cache_events( seq, received = result assert seq == 0, "Sequence number mismatch" - assert len(received.events) == 1, ( - "We should have exactly one BlockStored event") + assert (len(received.events) == 1 + ), "We should have exactly one BlockStored event" event = received.events[0] assert isinstance( - event, BlockStored), ("We should have a BlockStored event") - assert len(event.block_hashes) == num_blocks, ( - "We should have a BlockStored event with 2 block_hashes") - assert event.block_size == block_size, ( - "Block size should be the same as the block size") - assert event.parent_block_hash is None, ( - "Parent block hash should be None") + event, BlockStored), "We should have a BlockStored event" + assert (len(event.block_hashes) == num_blocks + ), "We should have a BlockStored 
event with 2 block_hashes" + assert (event.block_size == block_size + ), "Block size should be the same as the block size" + assert (event.parent_block_hash + is None), "Parent block hash should be None" assert event.lora_id is None, "Lora id should be None" - assert len(event.token_ids) == num_blocks * block_size, ( - "Token ids should be the same as the custom tokens") - assert event.token_ids == custom_tokens, ( - "Token ids should be the same as the custom tokens") + assert (len(event.token_ids) == num_blocks * block_size + ), "Token ids should be the same as the custom tokens" + assert (event.token_ids == custom_tokens + ), "Token ids should be the same as the custom tokens" + finally: + client.shutdown() + subscriber.close() + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "multiprocessing_mode,publisher_config", + [(True, "tcp")], + indirect=["publisher_config"], +) +@multi_gpu_test(num_gpus=4) +async def test_kv_cache_events_dp( + monkeypatch: pytest.MonkeyPatch, + multiprocessing_mode: bool, + publisher_config, +): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + block_size = 16 + num_blocks = 2 + dp_size = 2 + tp_size = 2 + + engine_args = EngineArgs( + model=MODEL_NAME, + enforce_eager=True, + enable_prefix_caching=True, + data_parallel_size=dp_size, + tensor_parallel_size=tp_size, + block_size=block_size, + ) + engine_args.kv_events_config = publisher_config + + vllm_config = engine_args.create_engine_config( + UsageContext.UNKNOWN_CONTEXT) + + executor_class = Executor.get_class(vllm_config) + client = EngineCoreClient.make_client( + multiprocess_mode=multiprocessing_mode, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) + await asyncio.sleep(1) + + # Build endpoints for all DP ranks + base_endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") + endpoints = [] + for i in range(dp_size): + offset_endpoint = ZmqEventPublisher.offset_endpoint_port( + base_endpoint, 
i) + endpoints.append(offset_endpoint) + + subscriber = MockSubscriber(endpoints, + topic=publisher_config.topic, + decode_type=KVEventBatch) + + try: + custom_tokens = list(range(num_blocks * block_size)) + sampling_params = SamplingParams(max_tokens=1) + all_request_ids = [] + + # Create and add 25 requests + # NOTE: attempts to force routing to both dp groups but can be flaky + for i in range(25): + await asyncio.sleep(0.01) + request = make_request(sampling_params, custom_tokens) + await client.add_request_async(request) + all_request_ids.append(request.request_id) + + await asyncio.sleep(0.1) + + # Initialize outputs dict for all requests + outputs: dict[str, list] = { + req_id: [] + for req_id in all_request_ids + } + + print("processing requests...") + await asyncio.wait_for(loop_until_fully_done_async( + client, outputs), + timeout=20.0) + + # Receive from subscriber until no more messages + print("collecting results...") + results = [] + while True: + result = subscriber.receive_one(timeout=1) + print(result) + if result is None: + break + results.append(result) + + # Collect all events and data_parallel_ranks from all results + all_dp_ranks = [ + received.data_parallel_rank for (_, received) in results + ] + unique_dps = set(all_dp_ranks) + assert ( + len(unique_dps) == 2 + ), f"Expected 2 unique data_parallel_ranks, got {len(unique_dps)}" + finally: client.shutdown() + subscriber.close() @pytest.mark.timeout(20) diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 9bf1c058a..2d7935773 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -28,6 +28,7 @@ class EventBatch( ): ts: float events: list[Any] + data_parallel_rank: Optional[int] = None class KVCacheEvent( @@ -60,7 +61,22 @@ class KVEventBatch(EventBatch): class EventPublisher(ABC): - """Lightweight publisher for EventBatch batches.""" + """Lightweight publisher for EventBatch batches with data parallelism + support. 
+ + In data parallel setups, each DP rank runs its own EventPublisher instance + to avoid duplicate events and ensure proper event attribution: + + - Each DP rank creates a separate publisher + - Publishers automatically annotate events with their data_parallel_rank + - This allows consumers to distinguish events from different DP ranks + + The publisher is responsible for adding DP metadata since the scheduler + operates independently of DP topology and shouldn't need DP awareness. + """ + + def __init__(self, data_parallel_rank: int = 0) -> None: + self._data_parallel_rank = data_parallel_rank @abstractmethod def publish(self, events: EventBatch) -> None: @@ -113,6 +129,7 @@ class ZmqEventPublisher(EventPublisher): def __init__( self, + data_parallel_rank: int, endpoint: str = "tcp://*:5557", replay_endpoint: Optional[str] = None, buffer_steps: int = 10_000, @@ -121,6 +138,7 @@ class ZmqEventPublisher(EventPublisher): topic: str = "", ) -> None: # Storage + super().__init__(data_parallel_rank) self._event_queue = Queue[Optional[EventBatch]](maxsize=max_queue_size) self._buffer = deque[tuple[int, bytes]](maxlen=buffer_steps) @@ -128,8 +146,11 @@ class ZmqEventPublisher(EventPublisher): self._ctx = zmq.Context.instance() self._pub: Optional[zmq.Socket] = None self._replay: Optional[zmq.Socket] = None - self._endpoint = endpoint - self._replay_endpoint = replay_endpoint + self._dp_rank = data_parallel_rank + + self._endpoint = self.offset_endpoint_port(endpoint, self._dp_rank) + self._replay_endpoint = self.offset_endpoint_port( + replay_endpoint, self._dp_rank) self._hwm = hwm self._socket_setup() @@ -149,6 +170,8 @@ class ZmqEventPublisher(EventPublisher): def publish(self, events: EventBatch) -> None: if not self._running: raise RuntimeError("Publisher is closed") + if events.data_parallel_rank is None: + events.data_parallel_rank = self._data_parallel_rank self._event_queue.put(events) def shutdown(self) -> None: @@ -191,11 +214,12 @@ class 
ZmqEventPublisher(EventPublisher): self._pub.set_hwm(self._hwm) # Heuristic: bind if wildcard / * present, else connect. # bind stable, connect volatile convention - if ("*" in self._endpoint or "::" in self._endpoint - or self._endpoint.startswith("ipc://") - or self._endpoint.startswith("inproc://")): + if (self._endpoint is not None + and ("*" in self._endpoint or "::" in self._endpoint + or self._endpoint.startswith("ipc://") + or self._endpoint.startswith("inproc://"))): self._pub.bind(self._endpoint) - else: + elif self._endpoint is not None: self._pub.connect(self._endpoint) # Set up replay socket: use ROUTER @@ -266,6 +290,38 @@ class ZmqEventPublisher(EventPublisher): # receiving payload is (-1, b""") self._replay.send_multipart((client_id, b"", self.END_SEQ, b"")) + @staticmethod + def offset_endpoint_port(endpoint: Optional[str], + data_parallel_rank: int) -> Optional[str]: + """Helper function to offset the port in an endpoint by + the data parallel rank. + + Args: + endpoint: The endpoint string + (e.g., "tcp://*:5557" or "inproc://cache") + data_parallel_rank: The data parallel rank to offset by + + Returns: + The endpoint with the port offset by data_parallel_rank + or suffix appended + """ + # Do nothing if input is None or data_parallel_rank is 0 + if not endpoint or data_parallel_rank == 0: + return endpoint + + if "inproc" in endpoint: + return f"{endpoint}_dp{data_parallel_rank}" + if "tcp" in endpoint: + if endpoint and ":" in endpoint: + # Get everything after the last colon (the port) + last_colon_idx = endpoint.rfind(":") + base_addr = endpoint[:last_colon_idx] + base_port = int(endpoint[last_colon_idx + 1:]) + new_port = base_port + data_parallel_rank + return f"{base_addr}:{new_port}" + return endpoint + raise ValueError("Invalid endpoint: must contain 'inproc' or 'tcp'") + class EventPublisherFactory: _registry: dict[str, Callable[..., EventPublisher]] = { @@ -281,7 +337,9 @@ class EventPublisherFactory: cls._registry[name] = ctor 
@classmethod - def create(cls, config: Optional[KVEventsConfig]) -> EventPublisher: + def create(cls, + config: Optional[KVEventsConfig], + data_parallel_rank: int = 0) -> EventPublisher: """Create publisher from a config mapping.""" if not config: return NullEventPublisher() @@ -294,4 +352,5 @@ class EventPublisherFactory: constructor = cls._registry[kind] except KeyError as exc: raise ValueError(f"Unknown event publisher '{kind}'") from exc - return constructor(**config_dict) + return constructor(data_parallel_rank=data_parallel_rank, + **config_dict) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index e510a0626..32d03b311 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -80,7 +80,9 @@ class Scheduler(SchedulerInterface): config=self.vllm_config, role=KVConnectorRole.SCHEDULER) self.kv_event_publisher = EventPublisherFactory.create( - self.kv_events_config) + self.kv_events_config, + vllm_config.parallel_config.data_parallel_rank, + ) num_gpu_blocks = self.cache_config.num_gpu_blocks assert num_gpu_blocks is not None and num_gpu_blocks > 0 -- GitLab From abd7df2fca570998693fa8c1ae39d83fb789ef27 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Tue, 3 Jun 2025 17:15:18 -0700 Subject: [PATCH 154/274] [Misc] Fix path and python alias errors in disagg_prefill exmaples (#18919) --- .../disagg_prefill_lmcache_v1/disagg_example_nixl.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh index df8a41293..0b6c9213e 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh @@ -33,7 +33,7 @@ check_num_gpus() { ensure_python_library_installed() { echo "Checking if $1 is installed..." 
- python -c "import $1" > /dev/null 2>&1 + python3 -c "import $1" > /dev/null 2>&1 if [ $? -ne 0 ]; then if [ "$1" == "nixl" ]; then echo "$1 is not installed. Please refer to https://github.com/ai-dynamo/nixl for installation." @@ -121,8 +121,8 @@ main() { echo "All servers are up. Starting benchmark..." # begin benchmark - cd ../../../benchmarks/ - python benchmark_serving.py --port 9000 --seed $(date +%s) \ + cd ../../../../benchmarks/ + python3 benchmark_serving.py --port 9000 --seed $(date +%s) \ --model meta-llama/Llama-3.1-8B-Instruct \ --dataset-name random --random-input-len 7500 --random-output-len 200 \ --num-prompts 200 --burstiness 100 --request-rate 3.6 | tee benchmark.log -- GitLab From 52dceb172d6fe762bb60b670df61866fe86b6f17 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 3 Jun 2025 21:09:13 -0400 Subject: [PATCH 155/274] [Docs] Add developer doc about CI failures (#18782) Signed-off-by: Russell Bryant Co-authored-by: Mark McLoughlin Co-authored-by: Cyrus Leung --- docs/contributing/ci-failures.md | 120 +++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 docs/contributing/ci-failures.md diff --git a/docs/contributing/ci-failures.md b/docs/contributing/ci-failures.md new file mode 100644 index 000000000..4d8f78197 --- /dev/null +++ b/docs/contributing/ci-failures.md @@ -0,0 +1,120 @@ +# CI Failures + +What should I do when a CI job fails on my PR, but I don't think my PR caused +the failure? + +- Check the dashboard of current CI test failures: + 👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20) + +- If your failure **is already listed**, it's likely unrelated to your PR. + Help fixing it is always welcome! + - Leave comments with links to additional instances of the failure. + - React with a 👍 to signal how many are affected. + +- If your failure **is not listed**, you should **file an issue**. 
+ +## Filing a CI Test Failure Issue + +- **File a bug report:** + 👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml) + +- **Use this title format:** + + ``` + [CI Failure]: failing-test-job - regex/matching/failing:test + ``` + +- **For the environment field:** + + ``` + Still failing on main as of commit abcdef123 + ``` + +- **In the description, include failing tests:** + + ``` + FAILED failing/test.py:failing_test1 - Failure description + FAILED failing/test.py:failing_test2 - Failure description + https://github.com/orgs/vllm-project/projects/20 + https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml + FAILED failing/test.py:failing_test3 - Failure description + ``` + +- **Attach logs** (collapsible section example): +
+ Logs: + + ```text + ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data + --- Logging error --- + Traceback (most recent call last): + File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model + return self.model_executor.execute_model(scheduler_output) + ... + FAILED failing/test.py:failing_test1 - Failure description + FAILED failing/test.py:failing_test2 - Failure description + FAILED failing/test.py:failing_test3 - Failure description + ``` + +
+ +## Logs Wrangling + +Download the full log file from Buildkite locally. + +Strip timestamps and colorization: + +```bash +# Strip timestamps +sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' ci.log + +# Strip colorization +sed -i -r 's/\x1B\[[0-9;]*[mK]//g' ci.log +``` + +Use a tool for quick copy-pasting: + +```bash +tail -525 ci_build.log | wl-copy +``` + +## Investigating a CI Test Failure + +1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main) +2. Bisect to find the first build that shows the issue. +3. Add your findings to the GitHub issue. +4. If you find a strong candidate PR, mention it in the issue and ping contributors. + +## Reproducing a Failure + +CI test failures may be flaky. Use a bash loop to run repeatedly: + +```bash +COUNT=1; while pytest -sv tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]; do + COUNT=$[$COUNT + 1]; echo "RUN NUMBER ${COUNT}"; +done +``` + +## Submitting a PR + +If you submit a PR to fix a CI failure: + +- Link the PR to the issue: + Add `Closes #12345` to the PR description. +- Add the `ci-failure` label: + This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20). + +## Other Resources + +- 🔍 [Test Reliability on `main`](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&order=ASC&sort_by=reliability) +- 🧪 [Latest Buildkite CI Runs](https://buildkite.com/vllm/ci/builds?branch=main) + +## Daily Triage + +Use [Buildkite analytics (2-day view)](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&period=2days) to: + +- Identify recent test failures **on `main`**. +- Exclude legitimate test failures on PRs. +- (Optional) Ignore tests with 0% reliability. + +Compare to the [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20). 
-- GitLab From 4555143ea7fdd2b2f0106e40889bfbab49879237 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 4 Jun 2025 09:43:01 +0800 Subject: [PATCH 156/274] [CPU] V1 support for the CPU backend (#16441) --- .../scripts/hardware_ci/run-cpu-test.sh | 13 +- docs/usage/v1_guide.md | 2 + requirements/cpu.txt | 3 + .../attention/test_attention_selector.py | 5 +- .../models/language/generation/test_common.py | 1 - vllm/attention/backends/cpu_mla.py | 6 +- vllm/attention/backends/torch_sdpa.py | 16 +- vllm/compilation/wrapper.py | 7 +- vllm/engine/arg_utils.py | 4 +- vllm/platforms/cpu.py | 67 +++++-- vllm/v1/attention/backends/cpu_attn.py | 163 ++++++++++++++++++ vllm/v1/worker/cpu_model_runner.py | 86 +++++++++ vllm/v1/worker/cpu_worker.py | 101 +++++++++++ vllm/v1/worker/gpu_model_runner.py | 28 +-- vllm/v1/worker/gpu_worker.py | 3 +- 15 files changed, 465 insertions(+), 40 deletions(-) create mode 100644 vllm/v1/attention/backends/cpu_attn.py create mode 100644 vllm/v1/worker/cpu_model_runner.py create mode 100644 vllm/v1/worker/cpu_worker.py diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index 0a1193560..61aa7df13 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,6 +6,7 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} export CMAKE_BUILD_PARALLEL_LEVEL=32 @@ -23,10 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -56,7 +55,7 @@ function cpu_tests() { # Run AWQ test docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - pytest -s -v \ + VLLM_USE_V1=0 pytest -s -v \ tests/quantization/test_ipex_quant.py" # Run chunked-prefill and prefix-cache test @@ -68,8 +67,6 @@ function cpu_tests() { # online serving docker exec cpu-test-"$NUMA_NODE" bash -c " set -e - export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ @@ -89,4 +86,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 40 mins. 
export -f cpu_tests -timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" +timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index a2321bf98..7c4909cb5 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -40,6 +40,8 @@ This living user guide outlines a few known **important changes and limitations* | **NVIDIA** | 🚀 Natively Supported | | **AMD** | 🚧 WIP | | **TPU** | 🚧 WIP | +| **CPU** | 🚧 WIP | + #### Feature / Model | Feature / Model | Status | diff --git a/requirements/cpu.txt b/requirements/cpu.txt index 121330158..e43b44397 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -1,6 +1,9 @@ # Common dependencies -r common.txt +numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding +numba == 0.61.2; python_version > '3.9' + # Dependencies for CPUs packaging>=24.2 setuptools>=77.0.3,<80.0.0 diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 435fe6225..f3e641557 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -85,7 +85,10 @@ def test_env( CpuPlatform()): backend = get_attn_backend(16, torch.float16, torch.float16, block_size, False) - assert backend.get_name() == "TORCH_SDPA" + if use_v1: + assert backend.get_name() == "TORCH_SDPA_VLLM_V1" + else: + assert backend.get_name() == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.current_platform", diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index ed9e54722..f656f90c4 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -87,7 +87,6 @@ AITER_MODEL_LIST = [ pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( "TitanML/tiny-mixtral", # mixtral - 
marks=[pytest.mark.cpu_model], ) ]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/vllm/attention/backends/cpu_mla.py b/vllm/attention/backends/cpu_mla.py index cf7883e12..793cb87b7 100644 --- a/vllm/attention/backends/cpu_mla.py +++ b/vllm/attention/backends/cpu_mla.py @@ -178,7 +178,7 @@ class CPUMLAMetadataBuilder(AttentionMetadataBuilder[CPUMLAMetadata]): seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_kv_len=max_kv_len, - query_start_loc=query_start_loc, + prefill_query_start_loc=query_start_loc, kv_start_loc=kv_start_loc, max_decode_seq_len=input_data.max_decode_seq_len, num_prefills=input_data.num_prefills, @@ -264,8 +264,8 @@ class CPUMLAImpl(MLACommonImpl[CPUMLAMetadata]): key=k, value=v_padded, out=output, - seqlen_q=prefill_metadata.query_start_loc, - seqlen_k=prefill_metadata.query_start_loc, + seqlen_q=prefill_metadata.prefill_query_start_loc, + seqlen_k=prefill_metadata.prefill_query_start_loc, max_seqlen_q=prefill_metadata.max_query_len, max_seqlen_k=prefill_metadata.max_query_len, pdropout=0.0, diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index f3fb5adcf..23231c323 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -87,10 +87,13 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): # For chunked prefill only max_query_len: Optional[int] = None max_kv_len: Optional[int] = None - query_start_loc: Optional[torch.Tensor] = None + prefill_query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None prefill_block_tables: Optional[torch.Tensor] = None + # For V1 logits index only + query_start_loc: Optional[torch.Tensor] = None + # Begin encoder attn & enc/dec cross-attn fields... 
# Encoder sequence lengths representation encoder_seq_lens: Optional[List[int]] = None @@ -375,7 +378,7 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_kv_len=max_kv_len, - query_start_loc=query_start_loc, + prefill_query_start_loc=query_start_loc, kv_start_loc=kv_start_loc, max_decode_seq_len=input_data.max_decode_seq_len, num_prefills=input_data.num_prefills, @@ -470,6 +473,11 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): Returns: shape = [num_tokens, num_heads * head_size] """ + + # For warming-up + if attn_metadata is None: + return query + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -537,8 +545,8 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): output = torch.empty_like(query) if prefill_meta := attn_metadata.prefill_metadata: - assert attn_metadata.seq_lens is not None if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + assert attn_metadata.seq_lens is not None self._run_sdpa_forward(output, query, key, @@ -555,7 +563,7 @@ class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): query[:prefill_meta.num_prefill_tokens, :, :], key_cache, value_cache, - prefill_meta.query_start_loc, + prefill_meta.prefill_query_start_loc, prefill_meta.kv_start_loc, prefill_meta.max_query_len, prefill_meta.max_kv_len, diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 8c8d0b5cb..2a261c84c 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -41,11 +41,16 @@ class TorchCompileWrapperWithCustomDispatcher: # compiling the forward method backend = vllm_config.compilation_config.init_backend(vllm_config) + options = None + if isinstance(backend, str) and backend == "inductor": + options = get_current_vllm_config( + ).compilation_config.inductor_compile_config compiled_callable = torch.compile( 
self.forward, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2197d44ca..b1c4b27a0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1399,6 +1399,7 @@ class EngineArgs: "FLASHINFER", "FLASHINFER_VLLM_V1", "ROCM_AITER_MLA", + "TORCH_SDPA_VLLM_V1", ] if (envs.is_set("VLLM_ATTENTION_BACKEND") and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): @@ -1431,7 +1432,8 @@ class EngineArgs: # Non-[CUDA, TPU] may be supported on V1, but off by default for now. v0_hardware = not any( - (current_platform.is_cuda(), current_platform.is_tpu())) + (current_platform.is_cuda(), current_platform.is_tpu(), + current_platform.is_cpu())) if v0_hardware and _warn_or_fallback( # noqa: SIM103 current_platform.device_name): return False diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 2739f5c8c..265959d62 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -57,7 +57,10 @@ class CpuPlatform(Platform): logger.info("Using CPU MLA backend.") return "vllm.attention.backends.cpu_mla.CPUMLABackend" logger.info("Using Torch SDPA backend.") - return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" + if use_v1: + return "vllm.v1.attention.backends.cpu_attn.TorchSDPABackend" + else: + return "vllm.attention.backends.torch_sdpa.TorchSDPABackend" @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: @@ -81,6 +84,8 @@ class CpuPlatform(Platform): if not model_config.enforce_eager: model_config.enforce_eager = True + model_config.disable_cascade_attn = True + cache_config = vllm_config.cache_config ipex_available = find_spec("intel_extension_for_pytorch") is not None @@ -128,7 +133,8 @@ class CpuPlatform(Platform): f" {kv_cache_space}, expect a positive integer value.") parallel_config = 
vllm_config.parallel_config - if (parallel_config.distributed_executor_backend is not None + if (parallel_config.world_size > 1 + and parallel_config.distributed_executor_backend is not None and parallel_config.distributed_executor_backend != "mp"): logger.warning(("%s is not supported on CPU, fallback to mp " "distributed executor backend."), @@ -141,7 +147,38 @@ class CpuPlatform(Platform): parallel_config.sd_worker_cls = \ "vllm.worker.cpu_worker.CPUWorker" else: - parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.cpu_worker.CPUWorker" + else: + parallel_config.worker_cls = \ + "vllm.worker.cpu_worker.CPUWorker" + + # Note: workaround for v1 gpu_model_runner + from vllm.config import CompilationLevel + vllm_config.compilation_config.cudagraph_capture_sizes = [] + + compilation_config = vllm_config.compilation_config + if (envs.VLLM_USE_V1 and vllm_config.compilation_config.level + == CompilationLevel.PIECEWISE): + compilation_config.level = CompilationLevel.DYNAMO_ONCE + compilation_config.backend = "eager" + compilation_config.custom_ops += ["none"] + compilation_config.inductor_compile_config.update({ + "dce": + True, + "size_asserts": + False, + "nan_asserts": + False, + "memory_planning": + True, + "epilogue_fusion": + True, + }) + + if vllm_config.lora_config is not None: + compilation_config.level = CompilationLevel.NO_COMPILATION assert vllm_config.device_config.device_type == "cpu" @@ -149,6 +186,12 @@ class CpuPlatform(Platform): # Environment variables for CPU executor # + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + # Note: to avoid the error 'nthreads cannot be larger than environment + # variable "NUMEXPR_MAX_THREADS" (64)'. 
+ os.environ["NUMEXPR_MAX_THREADS"] = str(len(os.sched_getaffinity(0))) + # Set default threads num for OpenMP parallel os.environ["OMP_NUM_THREADS"] = str(torch.get_num_threads()) @@ -171,13 +214,6 @@ class CpuPlatform(Platform): # To hint IPEX uses shared memory based AllReduce os.environ["LOCAL_WORLD_SIZE"] = str( vllm_config.parallel_config.tensor_parallel_size) - if sys.platform == "darwin" and \ - envs.VLLM_WORKER_MULTIPROC_METHOD == "fork": - if os.environ.get('VLLM_WORKER_MULTIPROC_METHOD', None) is None: - logger.warning( - "Default to spawn method on MacOS. If this is not desired," - " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.") - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' if vllm_config.model_config and vllm_config.model_config.use_mla: logger.info( @@ -204,3 +240,14 @@ class CpuPlatform(Platform): Get device specific communicator class for distributed communication. """ return "vllm.distributed.device_communicators.cpu_communicator.CpuCommunicator" # noqa + + @classmethod + def supports_structured_output(cls) -> bool: + return True + + @classmethod + def supports_v1(cls, model_config) -> bool: + """Returns whether the current platform can support v1 for the supplied + model configuration. 
+ """ + return True diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py new file mode 100644 index 000000000..d7a580c28 --- /dev/null +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import torch + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.attention.backends.torch_sdpa import (TorchSDPABackendImpl, + TorchSDPAMetadata) +from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.ipex_attn import PagedAttention +from vllm.v1.attention.backends.utils import CommonAttentionMetadata +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.block_table import BlockTable +from vllm.v1.worker.cpu_model_runner import CPUModelRunner +from vllm.v1.worker.gpu_input_batch import InputBatch + + +class TorchSDPABackend: + accept_output_buffer: bool = False + + @staticmethod + def get_name() -> str: + return "TORCH_SDPA_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["TorchSDPABackendImpl"]: + return TorchSDPABackendImpl + + @staticmethod + def get_metadata_cls() -> type["AttentionMetadata"]: + return TorchSDPAMetadata + + @staticmethod + def get_state_cls() -> type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_builder_cls() -> type["TorchSDPAMetadataBuilderV1"]: + return TorchSDPAMetadataBuilderV1 + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> tuple[int, ...]: + return PagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +class TorchSDPAMetadataBuilderV1: + + def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec, + block_table: BlockTable) -> None: + self.runner = runner + 
self.block_table = block_table + + # For reorder + self.reorder_prompt_req_index_list = np.empty(self.runner.max_num_reqs, + dtype=np.int64) + self.reorder_decode_req_index_list = np.empty(self.runner.max_num_reqs, + dtype=np.int64) + self.num_prompt_req: int = 0 + + self.seq_start_loc_cpu = torch.zeros( + runner.max_num_reqs + 1, + dtype=torch.int32, + device="cpu", + ) + self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() + + def reorder_batch(self, input_batch: InputBatch, + scheduler_output: SchedulerOutput) -> bool: + prompt_list_idx = 0 + decode_list_idx = 0 + for req_index in range(input_batch.num_reqs): + if input_batch.num_computed_tokens_cpu[ + req_index] < input_batch.num_prompt_tokens[req_index]: + # prompt stage + self.reorder_prompt_req_index_list[prompt_list_idx] = req_index + prompt_list_idx += 1 + else: + # decode stage + self.reorder_decode_req_index_list[decode_list_idx] = req_index + decode_list_idx += 1 + assert decode_list_idx + prompt_list_idx == input_batch.num_reqs + + # Update prompt requests number + self.num_prompt_req = prompt_list_idx + + reorder_req_num = 0 + for req_index in range(decode_list_idx): + if self.reorder_decode_req_index_list[req_index] < prompt_list_idx: + reorder_req_num += 1 + else: + break + + if reorder_req_num == 0: + return False + + reorder_prompt_list = ( + self.reorder_prompt_req_index_list[:prompt_list_idx] + [-reorder_req_num:]) + reorder_decode_list = ( + self.reorder_decode_req_index_list[:decode_list_idx] + [:reorder_req_num]) + assert reorder_decode_list.size == reorder_prompt_list.size + + for idx in range(reorder_req_num): + prompt_req_index = reorder_prompt_list[idx].item() + decode_req_index = reorder_decode_list[idx].item() + input_batch.swap_states(prompt_req_index, decode_req_index) + + return True + + def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata): + runner = self.runner + block_table = 
self.block_table + seq_lens_np = runner.seq_lens_np[:num_reqs] + num_prompt_req = self.num_prompt_req + max_prefill_seq_len = seq_lens_np[:num_prompt_req].max().item( + ) if num_prompt_req > 0 else 0 + max_decode_seq_len = seq_lens_np[num_prompt_req:num_reqs].max().item( + ) if num_prompt_req < num_reqs else 0 + self.seq_start_loc_np[0] = 0 + np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1]) + num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item() + num_decode_tokens = runner.query_start_loc_np[num_reqs].item( + ) - num_prefill_tokens + slot_mapping = block_table.slot_mapping_cpu[:num_actual_tokens].long() + block_table_tensor = block_table.get_device_tensor() + attn_metadata = TorchSDPAMetadata( + num_prefills=num_prompt_req, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping, + seq_lens_tensor=runner. + seq_lens_cpu[num_prompt_req:num_reqs], # decode + max_decode_seq_len=max_decode_seq_len, # decode + block_tables=block_table_tensor[num_prompt_req:num_reqs], # decode + chunked_prefill=True, + max_query_len=max_query_len, + max_kv_len=max_prefill_seq_len, + prefill_query_start_loc=runner. 
+ query_start_loc_cpu[:num_prompt_req + 1], # prefill + kv_start_loc=self.seq_start_loc_cpu[:num_prompt_req + + 1], # prefill + prefill_block_tables=block_table_tensor[: + num_prompt_req], # prefill + query_start_loc=runner.query_start_loc_cpu[:num_reqs + + 1], # for logits index + multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + ) + + return attn_metadata diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py new file mode 100644 index 000000000..607cfc0ef --- /dev/null +++ b/vllm/v1/worker/cpu_model_runner.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: Apache-2.0 +from contextlib import contextmanager +from typing import Any + +import torch + +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.v1.worker.gpu_model_runner import GPUModelRunner + +logger = init_logger(__name__) + + +class CPUModelRunner(GPUModelRunner): + + def __init__(self, vllm_config: VllmConfig, device: torch.device): + super().__init__(vllm_config, device) + + assert device == torch.device("cpu") + assert self.speculative_config is None, "spec decode is not supported." 
+ + self.use_cuda_graph = False + self.cascade_attn_enabled = False + + self._postprocess_tenosrs() + + def _postprocess_tenosrs(self) -> None: + # Note: replace device tensors with cpu tensors + def replace_tensor(obj: Any, cpu_attr_name: str, + device_attr_name) -> None: + cpu_tensor = getattr(obj, cpu_attr_name, None) + device_tensor = getattr(obj, device_attr_name, None) + if cpu_tensor is not None and device_tensor is not None: + assert isinstance(cpu_tensor, torch.Tensor) + assert isinstance(device_tensor, torch.Tensor) + setattr(obj, device_attr_name, cpu_tensor) + + for k, v in vars(self).items(): + if k.endswith("_cpu") and isinstance(v, torch.Tensor): + replace_tensor(self, k, k[:-4]) + + for k, v in vars(self.input_batch).items(): + if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): + replace_tensor(self.input_batch, k, k[:-11]) + + for k, v in vars(self.input_batch.block_table).items(): + if k.endswith("_cpu") and isinstance(v, torch.Tensor): + replace_tensor(self.input_batch.block_table, k, k[:-4]) + + def load_model(self) -> None: + logger.info("Starting to load model %s...", self.model_config.model) + self.model = get_model(vllm_config=self.vllm_config) + + if self.lora_config: + self.model = self.load_lora_model(self.model, self.model_config, + self.scheduler_config, + self.lora_config, self.device) + + def warming_up_model(self) -> None: + logger.info("Warming up model for the compilation...") + # Only generate graph for the generic shape + self._dummy_run(max(16, self.max_num_reqs)) + logger.info("Warming up done.") + + def _init_device_properties(self) -> None: + pass + + def _sync_device(self) -> None: + pass + + +@contextmanager +def _set_global_compilation_settings(): + import torch._inductor.config + + # Note: The CPPGEMM backend requires freezing parameters. 
+ freezing_value = torch._inductor.config.freezing + torch._inductor.config.freezing = True + # Note: workaround for "ValueError: fast mode: can't pickle cyclic objects + # including object type dict" + force_disable_caches = torch._inductor.config.force_disable_caches + torch._inductor.config.force_disable_caches = True + yield + torch._inductor.config.freezing = freezing_value + torch._inductor.config.force_disable_caches = force_disable_caches diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py new file mode 100644 index 000000000..0b710b7bc --- /dev/null +++ b/vllm/v1/worker/cpu_worker.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: Apache-2.0 +import os +from typing import Optional + +import torch + +from vllm import envs +from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group, get_tp_group +from vllm.logger import init_logger +from vllm.model_executor.utils import set_random_seed +from vllm.sequence import IntermediateTensors +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.worker.cpu_model_runner import CPUModelRunner +from vllm.v1.worker.gpu_worker import (Worker, + init_worker_distributed_environment) + +logger = init_logger(__name__) + + +class CPUWorker(Worker): + + def __init__(self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False): + super().__init__(vllm_config, + local_rank, + rank, + distributed_init_method, + is_driver_worker=is_driver_worker) + + self.parallel_config.disable_custom_all_reduce = True + + def init_device(self): + # Setup OpenMP threads affinity. 
+ omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND + if omp_cpuids == "all": + self.local_omp_cpuid = "all" + else: + self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] + ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) + if ret: + logger.info(ret) + + # Note: unique identifier for creating allreduce shared memory + os.environ["VLLM_DIST_IDENT"] = self.distributed_init_method.split( + ":")[-1] + # Initialize the distributed environment. + init_worker_distributed_environment(self.vllm_config, self.rank, + self.distributed_init_method, + self.local_rank, "gloo") + # Set random seed. + set_random_seed(self.model_config.seed) + + # Construct the model runner + self.model_runner: CPUModelRunner = CPUModelRunner( + self.vllm_config, torch.device("cpu")) + + def sleep(self, level: int = 1) -> None: + logger.warning("sleep mode is not supported on CPU, ignore it.") + pass + + def wake_up(self, tags: Optional[list[str]] = None) -> None: + logger.warning("sleep mode is not supported on CPU, ignore it.") + pass + + def determine_available_memory(self) -> int: + return self.cache_config.cpu_kvcache_space_bytes # type: ignore + + def compile_or_warm_up_model(self) -> None: + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. 
+ set_random_seed(self.model_config.seed) + self.model_runner.warming_up_model() + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> Optional[ModelRunnerOutput]: + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = IntermediateTensors( + get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group())) + + output = self.model_runner.execute_model(scheduler_output, + intermediate_tensors) + + if not get_pp_group().is_last_rank: + assert isinstance(output, IntermediateTensors) + get_pp_group().send_tensor_dict(output.tensors, + all_gather_group=get_tp_group()) + return None + + assert isinstance(output, ModelRunnerOutput) + return output if self.is_driver_worker else None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6a566a602..6ea6bb020 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import copy import gc import time import weakref -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np import torch @@ -38,7 +38,6 @@ from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, GiB_bytes, LazyLoader, async_tensor_h2d, cdiv, check_use_alibi, is_pin_memory_available) -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec, @@ -203,8 +202,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.vllm_config.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. 
- self.device_properties = torch.cuda.get_device_properties(self.device) - self.num_sms = self.device_properties.multi_processor_count + self._init_device_properties() # Persistent buffers for CUDA graphs. self.input_ids = torch.zeros(self.max_num_tokens, @@ -315,6 +313,17 @@ class GPUModelRunner(LoRAModelRunnerMixin): self.input_batch, scheduler_output) return batch_reordered + # Note: used for model runner override. + def _init_device_properties(self) -> None: + """Initialize attributes from torch.cuda.get_device_properties + """ + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + + # Note: used for model runner override. + def _sync_device(self) -> None: + torch.cuda.synchronize() + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -538,8 +547,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> tuple[dict[str, FlashAttentionMetadata], torch.Tensor, - Optional[SpecDecodeMetadata]]: + ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -652,7 +660,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata: dict[str, FlashAttentionMetadata] = {} + attn_metadata: dict[str, Any] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. for kv_cache_group_id, kv_cache_group_spec in enumerate( @@ -1710,7 +1718,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # Must synchronize the non-blocking GPU->CPU transfers. 
if prompt_logprobs_dict: - torch.cuda.synchronize() + self._sync_device() return prompt_logprobs_dict @@ -1740,7 +1748,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): dtype=np.int32) if skip_attn: - attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None + attn_metadata: Optional[dict[str, Any]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1964,7 +1972,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): sampler_output = self._dummy_sampler_run(hidden_states) else: sampler_output = None - torch.cuda.synchronize() + self._sync_device() del hidden_states, sampler_output self.encoder_cache.clear() gc.collect() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f36cf5d5c..3bf3b2221 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -342,13 +342,14 @@ def init_worker_distributed_environment( rank: int, distributed_init_method: Optional[str] = None, local_rank: int = -1, + backend: str = "nccl", ) -> None: """Initialize the distributed environment.""" parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank) + distributed_init_method, local_rank, backend) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) -- GitLab From 1409ef913446aa282f6426efbb0ed02a59320467 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Jun 2025 04:24:56 +0100 Subject: [PATCH 157/274] [Core] Cast multimodal input in hf processor (#18862) Signed-off-by: Lukas Geiger --- vllm/inputs/registry.py | 26 +++++++++++++++++-- vllm/multimodal/inputs.py | 8 +----- vllm/spec_decode/draft_model_runner.py | 1 - vllm/v1/worker/gpu_model_runner.py | 2 -- vllm/v1/worker/tpu_model_runner.py | 2 -- vllm/worker/cpu_enc_dec_model_runner.py | 1 - 
vllm/worker/cpu_model_runner.py | 1 - vllm/worker/cpu_pooling_model_runner.py | 1 - vllm/worker/enc_dec_model_runner.py | 1 - vllm/worker/model_runner.py | 1 - vllm/worker/multi_step_neuron_model_runner.py | 1 - ...i_step_neuronx_distributed_model_runner.py | 1 - vllm/worker/neuron_model_runner.py | 2 -- vllm/worker/pooling_model_runner.py | 1 - vllm/worker/xpu_model_runner.py | 1 - 15 files changed, 25 insertions(+), 25 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 73d19aecd..3dad021e3 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -4,9 +4,12 @@ from collections.abc import Mapping from dataclasses import dataclass from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union +import torch from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import TypeVar +from vllm.jsontree import JSONTree, json_map_leaves +from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_processor_from_config from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import resolve_mm_processor_kwargs @@ -21,6 +24,8 @@ _T = TypeVar("_T") _C = TypeVar("_C", bound=PretrainedConfig, default=PretrainedConfig) _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) +logger = init_logger(__name__) + @dataclass(frozen=True) class InputContext: @@ -134,7 +139,7 @@ class InputProcessingContext(InputContext): hf_processor: ProcessorMixin, data: Mapping[str, object], kwargs: Mapping[str, object] = {}, - ) -> BatchFeature: + ) -> Union[BatchFeature, JSONTree]: """ Call `hf_processor` on the prompt `data` (text, image, audio...) with configurable options `kwargs`. 
@@ -154,8 +159,25 @@ class InputProcessingContext(InputContext): allow_var_kwargs=True, ) + def maybe_cast_dtype(x): + # This mimics the behavior of transformers.BatchFeature + if isinstance(x, torch.Tensor) and x.is_floating_point(): + return x.to(dtype=self.model_config.dtype) + return x + try: - return hf_processor(**data, **merged_kwargs, return_tensors="pt") + output = hf_processor(**data, **merged_kwargs, return_tensors="pt") + # this emulates output.to(dtype=self.model_config.dtype) + cast_output = json_map_leaves(maybe_cast_dtype, output) + if isinstance(output, BatchFeature): + return BatchFeature(cast_output) + + logger.warning_once( + f"{type(hf_processor).__name__} did not return `BatchFeature`. " + "Make sure to match the behaviour of `ProcessorMixin` when " + "implementing custom processors.") + return cast_output + except Exception as exc: msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 35d2a6e8c..0bf5b1cf1 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -747,17 +747,11 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): batched_inputs: BatchedTensorInputs, *, device: torch.types.Device, - dtype: Optional[torch.dtype] = None, ) -> BatchedTensorInputs: json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - def maybe_cast_dtype(x: torch.Tensor): - # This mimics the behavior of transformers.BatchFeature - return x.to(dtype=dtype) if x.is_floating_point() else x - json_mapped = json_map_leaves( - # NOTE: Cast the dtype before sending it to device - lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True), + lambda x: x.to(device=device, non_blocking=True), json_inputs, ) diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 8ccfefea1..96646ec94 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ 
b/vllm/spec_decode/draft_model_runner.py @@ -297,7 +297,6 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_runner.model_config.dtype, device=self.device, ), **model_execute_kwargs, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6ea6bb020..9ac33a149 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -957,7 +957,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) batched_mm_inputs = MultiModalKwargs.as_kwargs( batched_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) @@ -1951,7 +1950,6 @@ class GPUModelRunner(LoRAModelRunnerMixin): [dummy_mm_kwargs] * max_num_mm_items) batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 73c445d14..94e438fb4 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -718,7 +718,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) batched_mm_inputs = MultiModalKwargs.as_kwargs( batched_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) @@ -1560,7 +1559,6 @@ class TPUModelRunner(LoRAModelRunnerMixin): batch_size) return MultiModalKwargs.as_kwargs( batched_dummy_mm_inputs, - dtype=self.model_config.dtype, device=self.device, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 677d66357..c99e2652a 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -300,7 +300,6 @@ class CPUEncoderDecoderModelRunner( model_input.encoder_input_positions, **MultiModalKwargs.as_kwargs( 
model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), "intermediate_tensors": diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 6213cf760..68cdf65ca 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -630,7 +630,6 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ) execute_model_kwargs = {} diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 174f86f48..203fdf225 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -53,7 +53,6 @@ class CPUPoolingModelRunner( model_input.input_positions, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), **cross_enc_kwargs, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index a3e7b0147..8d92edc5b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -205,7 +205,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), **seqlen_agnostic_kwargs, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 75501e0f7..82db6617b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1848,7 +1848,6 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), **seqlen_agnostic_kwargs, diff --git 
a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py index 336e41649..25f588077 100644 --- a/vllm/worker/multi_step_neuron_model_runner.py +++ b/vllm/worker/multi_step_neuron_model_runner.py @@ -73,7 +73,6 @@ class MultiStepNeuronModelRunner(NeuronModelRunner): input_block_ids=model_input.input_block_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py index de9827723..dd521dd67 100644 --- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py +++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py @@ -52,7 +52,6 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner): sampling_params=sampling_params, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 28855bb46..7ccf1a2c0 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -395,7 +395,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): adapter_ids=model_input.adapter_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) @@ -408,7 +407,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): input_block_ids=model_input.input_block_ids, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index be6b3d137..f80955f71 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -122,7 +122,6 @@ class PoolingModelRunner( 
intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( multi_modal_kwargs, - dtype=self.model_config.dtype, device=self.device, ), **cross_enc_kwargs, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index ecbb63d91..b2d3ce852 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -565,7 +565,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs or {}, - dtype=self.model_config.dtype, device=self.device, ), ) -- GitLab From 5d6d1adf15aca59cb135853d0f11308af4bbd6e3 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Wed, 4 Jun 2025 08:13:01 +0400 Subject: [PATCH 158/274] [KERNEL] Sampler. CUDA kernel for applying repetition penalty (#18437) --- CMakeLists.txt | 1 + csrc/ops.h | 5 ++ csrc/sampler.cu | 86 +++++++++++++++++++ csrc/torch_bindings.cpp | 7 ++ .../test_apply_repetition_penalties.py | 76 ++++++++++++++++ vllm/_custom_ops.py | 39 +++++++++ vllm/model_executor/layers/utils.py | 13 +-- 7 files changed, 218 insertions(+), 9 deletions(-) create mode 100644 csrc/sampler.cu create mode 100644 tests/kernels/test_apply_repetition_penalties.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 87aa23c08..f11d28590 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -242,6 +242,7 @@ set(VLLM_EXT_SRC "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" "csrc/layernorm_quant_kernels.cu" + "csrc/sampler.cu" "csrc/cuda_view.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" diff --git a/csrc/ops.h b/csrc/ops.h index 7044b4588..297f32b4a 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -92,6 +92,11 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& 
weight, double epsilon); +void apply_repetition_penalties_(torch::Tensor& logits, + const torch::Tensor& prompt_mask, + const torch::Tensor& output_mask, + const torch::Tensor& repetition_penalties); + void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, torch::Tensor& scale, double epsilon); diff --git a/csrc/sampler.cu b/csrc/sampler.cu new file mode 100644 index 000000000..ee5793dda --- /dev/null +++ b/csrc/sampler.cu @@ -0,0 +1,86 @@ +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +template <typename scalar_t> +__global__ void apply_repetition_penalties_kernel( + scalar_t* __restrict__ logits, // [num_seqs, vocab_size] + const bool* __restrict__ prompt_mask, // [num_seqs, vocab_size] + const bool* __restrict__ output_mask, // [num_seqs, vocab_size] + const scalar_t* __restrict__ repetition_penalties, // [num_seqs] + const int num_seqs, const int vocab_size, const int tile_size) { + // Each block handles one sequence and a tile of vocab + const int seq_idx = blockIdx.x; + if (seq_idx >= num_seqs) return; + + const int tile_start = blockIdx.y * tile_size; + const int tile_end = min(tile_start + tile_size, vocab_size); + + // Load repetition penalty for this sequence + const scalar_t penalty = repetition_penalties[seq_idx]; + + // Each thread processes multiple vocab items within the tile + for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end; + vocab_idx += blockDim.x) { + const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx; + const bool is_repeated = prompt_mask[idx] || output_mask[idx]; + if (is_repeated) { + scalar_t logit = logits[idx]; + if (logit > 0) { + logits[idx] = logit / penalty; + } else { + logits[idx] = logit * penalty; + } + } + } +} + +} // namespace vllm + +void apply_repetition_penalties_( + torch::Tensor& logits, // [num_seqs, vocab_size], in-place + const torch::Tensor& prompt_mask, // [num_seqs, vocab_size] +
const torch::Tensor& output_mask, // [num_seqs, vocab_size] + const torch::Tensor& repetition_penalties) { // [num_seqs] + TORCH_CHECK(logits.is_contiguous()); + TORCH_CHECK(prompt_mask.is_contiguous()); + TORCH_CHECK(output_mask.is_contiguous()); + TORCH_CHECK(repetition_penalties.is_contiguous()); + + int vocab_size = logits.size(-1); + int num_seqs = logits.size(0); + + // Get number of SMs on the current device + int sms = 0; + cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, + logits.get_device()); + + // Compute tile_num and tile_size + int tile_num = + std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs)); + int tile_size = (vocab_size + tile_num - 1) / tile_num; + + // Each block handles one sequence and a tile of vocab + dim3 grid(num_seqs, tile_num); + dim3 block(std::min(tile_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(logits)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + logits.scalar_type(), "apply_repetition_penalties_kernel", [&] { + vllm::apply_repetition_penalties_kernel<scalar_t> + <<<grid, block, 0, stream>>>( + logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(), + output_mask.data_ptr<bool>(), + repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size, + tile_size); + }); +} \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 371894c56..3fffaf290 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -170,6 +170,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Apply repetition penalties to logits in-place + ops.def( + "apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, " + "Tensor output_mask, Tensor repetition_penalties) -> ()"); + ops.impl("apply_repetition_penalties_", torch::kCUDA, + &apply_repetition_penalties_); + // Layernorm-quant // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py new file mode 100644 index 000000000..9115949a1 --- /dev/null +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm._custom_ops import (apply_repetition_penalties_cuda, + apply_repetition_penalties_torch) +from vllm.platforms import current_platform + +NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] +# [stress, stress, stress, Qwen, llama 4] +VOCAB_SIZES = [17, 256, 1019, 151936, 202048] +REPETITION_PENALTY_VALUES = [1.05] +SEEDS = [0] +DTYPES = [torch.float32, torch.float16] + + +@pytest.mark.parametrize("num_seqs", NUM_SEQS) +@pytest.mark.parametrize("vocab_size", VOCAB_SIZES) +@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="This test for checking CUDA kernel") +@torch.inference_mode() +def test_apply_repetition_penalties( + num_seqs: int, + vocab_size: int, + repetition_penalty: float, + dtype: torch.dtype, + seed: int, +) -> None: + """ + Test the apply_repetition_penalties custom op + against a reference implementation. 
+ """ + current_platform.seed_everything(seed) + torch.set_default_device("cuda:0") + + # Create test data + logits = torch.randn(num_seqs, vocab_size, dtype=dtype) + + # Create masks with some random tokens marked as repeated + prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool) + + # Mark some tokens as repeated in prompt and output + prompt_indices = torch.randint(0, vocab_size, + (num_seqs, max(1, vocab_size // 200))) + output_indices = torch.randint(0, vocab_size, + (num_seqs, max(1, vocab_size // 200))) + + for i in range(num_seqs): + prompt_mask[i, prompt_indices[i]] = True + output_mask[i, output_indices[i]] = True + + # Create repetition penalties tensor + repetition_penalties = torch.full((num_seqs, ), + repetition_penalty, + dtype=dtype) + + # Run both implementations (torch reference and CUDA kernel) + logits_torch = logits.clone() + logits_cuda = logits.clone() + + apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask, + repetition_penalties) + apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask, + repetition_penalties) + + # Compare the CUDA output to the torch reference + torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3) + + # Test the operator by applying the opcheck utility + opcheck(torch.ops._C.apply_repetition_penalties_, + (logits.clone(), prompt_mask, output_mask, repetition_penalties)) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 008a7aa94..3282edf41 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -282,6 +282,45 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) + +def apply_repetition_penalties_torch( + logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: + repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat( + 1, logits.size(1)) + # If token
appears in prompt or output, apply, otherwise use 1.0 for no-op. + penalties = torch.where(prompt_mask | output_mask, repetition_penalties, + 1.0) + # If logits are positive, divide by penalty, otherwise multiply by penalty. + scaling = torch.where(logits > 0, 1.0 / penalties, penalties) + logits *= scaling + + +def apply_repetition_penalties_cuda( + logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: + torch.ops._C.apply_repetition_penalties_(logits, prompt_mask, output_mask, + repetition_penalties) + + +def apply_repetition_penalties(logits: torch.Tensor, prompt_mask: torch.Tensor, + output_mask: torch.Tensor, + repetition_penalties: torch.Tensor) -> None: + """Apply repetition penalties to logits in-place. + + Args: + logits: The logits tensor of shape [num_seqs, vocab_size]. + prompt_mask: A boolean tensor indicating which tokens appear in the prompt. + output_mask: A boolean tensor indicating which tokens appear in the output. + repetition_penalties: The repetition penalties of shape (num_seqs, ). 
+ """ + if current_platform.is_cuda() and logits.is_contiguous(): + apply_repetition_penalties_cuda(logits, prompt_mask, output_mask, + repetition_penalties) + else: + apply_repetition_penalties_torch(logits, prompt_mask, output_mask, + repetition_penalties) + + def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, input_tokens: torch.Tensor, sampled_token_ids: torch.Tensor, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index d97d84238..41b5253dc 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -50,16 +50,11 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, vocab_size, num_seqs) output_bin_counts, output_mask = get_token_bin_counts_and_mask( output_tokens_tensor, vocab_size, num_seqs) - repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat( - 1, vocab_size) - # If token appears in prompt or output, apply, otherwise use 1.0 for no-op. - penalties = torch.where(prompt_mask | output_mask, repetition_penalties, - 1.0) - - # If logits are positive, divide by penalty, otherwise multiply by penalty. - scaling = torch.where(logits > 0, 1.0 / penalties, penalties) - logits *= scaling + # Apply repetition penalties as a custom op + from vllm._custom_ops import apply_repetition_penalties + apply_repetition_penalties(logits, prompt_mask, output_mask, + repetition_penalties) # We follow the definition in OpenAI API. 
# Refer to https://platform.openai.com/docs/api-reference/parameter-details -- GitLab From 8d646c2e53d3d840a3442bdd00845a6b57eb666f Mon Sep 17 00:00:00 2001 From: Calvin Chen <45745657+calvin0327@users.noreply.github.com> Date: Wed, 4 Jun 2025 12:23:26 +0800 Subject: [PATCH 159/274] [Cleanup][v1]:remote guided-decoding-backend for example (#19059) Signed-off-by: calvin chen <120380290@qq.com> --- .../online_serving/openai_chat_completion_structured_outputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py index 64379083d..5c55d5313 100644 --- a/examples/online_serving/openai_chat_completion_structured_outputs.py +++ b/examples/online_serving/openai_chat_completion_structured_outputs.py @@ -139,7 +139,6 @@ def extra_backend_options_completion(client: OpenAI, model: str): extra_body={ "guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"], - "guided_decoding_backend": "xgrammar", "guided_decoding_disable_fallback": True, }, ) -- GitLab From 41aa5784287f00b026f3ba225ac18ab3caccc622 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 4 Jun 2025 12:40:26 +0800 Subject: [PATCH 160/274] [NVIDIA] Add Cutlass MLA backend (#17625) --- csrc/attention/mla/cutlass_mla_kernels.cu | 2 +- tests/kernels/test_cutlass_mla_decode.py | 4 +- vllm/engine/arg_utils.py | 1 + vllm/platforms/cuda.py | 8 ++ vllm/platforms/interface.py | 1 + vllm/v1/attention/backends/mla/common.py | 2 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 96 +++++++++++++++++++ 7 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 vllm/v1/attention/backends/mla/cutlass_mla.py diff --git a/csrc/attention/mla/cutlass_mla_kernels.cu b/csrc/attention/mla/cutlass_mla_kernels.cu index 6743af0cf..f4b6b19f4 100644 --- a/csrc/attention/mla/cutlass_mla_kernels.cu +++ b/csrc/attention/mla/cutlass_mla_kernels.cu @@ -119,7 +119,7 @@ typename T::Fmha::Arguments 
args_from_options( {static_cast(out.data_ptr()), stride_O, static_cast(nullptr), stride_LSE}, hw_info, - -1, // split_kv + 1, // split_kv nullptr, // is_var_split_kv }; // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute diff --git a/tests/kernels/test_cutlass_mla_decode.py b/tests/kernels/test_cutlass_mla_decode.py index c56024b75..2b745b84d 100644 --- a/tests/kernels/test_cutlass_mla_decode.py +++ b/tests/kernels/test_cutlass_mla_decode.py @@ -76,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int, pack_factor = 128 // block_size block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor - q = torch.randn(bs, h_q, d) + # Amplify input values to ensure test coverage of edge cases where CUTLASS + # kernel errors occur with split_k settings. + q = torch.randn(bs, h_q, d) * 100 block_table = torch.randint(0, bs * block_num, (bs, block_num), dtype=torch.int32) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b1c4b27a0..901346831 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1395,6 +1395,7 @@ class EngineArgs: "PALLAS_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TRITON_MLA", + "CUTLASS_MLA_VLLM_V1", "FLASHMLA", "FLASHINFER", "FLASHINFER_VLLM_V1", diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 07ae470fa..bde606f0c 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -183,6 +183,14 @@ class CudaPlatformBase(Platform): if use_mla: # TODO(lucas): refactor to be more concise # we should probably consider factoring out V1 here + if selected_backend == _Backend.CUTLASS_MLA_VLLM_V1: + if use_v1: + logger.info_once("Using Cutlass MLA backend on V1 engine.") + return ("vllm.v1.attention.backends.mla." 
+ "cutlass_mla.CutlassMLABackend") + else: + logger.warning( + "Cutlass MLA backend is only supported on V1 engine") if selected_backend == _Backend.TRITON_MLA or block_size != 64: if use_v1: logger.info_once("Using Triton MLA backend on V1 engine.") diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 1ec9c78a3..7fef697d8 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -51,6 +51,7 @@ class _Backend(enum.Enum): TRITON_MLA_VLLM_V1 = enum.auto() FLASHMLA_VLLM_V1 = enum.auto() FLASHMLA = enum.auto() # Supported by V1 + CUTLASS_MLA_VLLM_V1 = enum.auto() HPU_ATTN = enum.auto() PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 06acbb909..e6b4f6404 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -350,7 +350,7 @@ class MLACommonMetadataBuilder(Generic[M]): self.num_heads = model_config.get_num_attention_heads( runner.parallel_config) self.mla_dims = get_mla_dims(model_config) - self.aot_schedule = is_vllm_fa and (get_flash_attn_version() == 3) + self.aot_schedule = current_platform.is_cuda() self.kv_cache_spec = kv_cache_spec # Dont try to access the runner on AMD diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py new file mode 100644 index 000000000..70aee058e --- /dev/null +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Optional + +import torch + +import vllm._custom_ops as ops +from vllm.attention.backends.abstract import (AttentionType, + is_quantized_kv_cache) +from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import (MLACommonBackend, + MLACommonImpl, + MLACommonMetadata) + +logger = init_logger(__name__) + + +class CutlassMLABackend(MLACommonBackend): + + @staticmethod + def get_name() 
-> str: + return "CUTLASS_MLA_VLLM_V1" + + @staticmethod + def get_impl_cls() -> type["CutlassMLAImpl"]: + return CutlassMLAImpl + + +class CutlassMLAImpl(MLACommonImpl[MLACommonMetadata]): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[list[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[dict[str, Any]], + logits_soft_cap: Optional[float], + attn_type: str, + # MLA Specific Arguments + **mla_args) -> None: + super().__init__(num_heads, head_size, scale, num_kv_heads, + alibi_slopes, sliding_window, kv_cache_dtype, + blocksparse_params, logits_soft_cap, attn_type, + **mla_args) + + unsupported_features = [ + alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap + ] + if any(unsupported_features): + raise NotImplementedError( + "CutlassMLAImpl does not support one of the following: " + "alibi_slopes, sliding_window, blocksparse_params, " + "logits_soft_cap") + + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "CutlassMLAImpl") + + if is_quantized_kv_cache(self.kv_cache_dtype): + raise NotImplementedError( + "CutlassMLA V1 with FP8 KV cache not yet supported") + + def _forward_decode( + self, + q_nope: torch.Tensor, + q_pe: torch.Tensor, + kv_c_and_k_pe_cache: torch.Tensor, + attn_metadata: MLACommonMetadata, + ) -> torch.Tensor: + assert kv_c_and_k_pe_cache.numel() > 0 + assert attn_metadata.decode is not None + + if self.kv_cache_dtype.startswith("fp8"): + raise NotImplementedError("FP8 Cutlass MLA not yet supported") + + B = q_nope.shape[0] + + o = torch.empty((B, self.num_heads, self.kv_lora_rank), + dtype=q_nope.dtype, + device=q_nope.device) + + # Run MLA + # Clone q_nope and q_pe to make sure strides computation is correct. 
+ q_nope = q_nope.clone() + q_pe = q_pe.clone() + ops.cutlass_mla_decode(o, q_nope, q_pe, kv_c_and_k_pe_cache, + attn_metadata.decode.seq_lens, + attn_metadata.decode.block_table, self.scale) + + return self._v_up_proj(o) -- GitLab From b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 3 Jun 2025 23:10:15 -0700 Subject: [PATCH 161/274] [Bugfix] Fix FA3 full cuda graph correctness (#19106) Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 1 + .../compile/piecewise/test_full_cudagraph.py | 7 +++-- vllm/v1/attention/backends/flash_attn.py | 29 ++++++++++++++----- vllm/v1/worker/gpu_model_runner.py | 5 ++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8ab96b3b7..4ee6b499b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -320,6 +320,7 @@ steps: # these tests need to be separated, cannot combine - pytest -v -s compile/piecewise/test_simple.py - pytest -v -s compile/piecewise/test_toy_llama.py + - pytest -v -s compile/piecewise/test_full_cudagraph.py - label: PyTorch Fullgraph Test # 18min mirror_hardwares: [amdexperimental, amdproduction] diff --git a/tests/compile/piecewise/test_full_cudagraph.py b/tests/compile/piecewise/test_full_cudagraph.py index 3188ea40f..134bade48 100644 --- a/tests/compile/piecewise/test_full_cudagraph.py +++ b/tests/compile/piecewise/test_full_cudagraph.py @@ -7,6 +7,7 @@ import pytest from vllm import LLM, SamplingParams from vllm.config import CompilationConfig +from vllm.platforms import current_platform MODEL = "Qwen/Qwen2-1.5B-Instruct" @@ -37,7 +38,7 @@ def full_cudagraph_llm(): "VLLM_FLASH_ATTN_VERSION": "3" }): return LLM(model=MODEL, - gpu_memory_utilization=0.2, + gpu_memory_utilization=0.3, compilation_config=CompilationConfig(full_cuda_graph=True)) @@ -48,7 +49,7 @@ def piecewise_llm(): "VLLM_FLASH_ATTN_VERSION": "3" }): return LLM(model=MODEL, - 
gpu_memory_utilization=0.5, + gpu_memory_utilization=0.6, compilation_config=CompilationConfig()) @@ -61,6 +62,8 @@ def generate_text(llm: LLM, batch_size: int, max_tokens: int): return llm.generate(prompts, sampling_params) +@pytest.mark.skipif(current_platform.get_device_capability() != (9, 0), + reason="Only Hopper GPUs support FlashAttention 3") @pytest.mark.parametrize(("batch_size", "max_tokens"), [(1, 10), (7, 10), (16, 10), (25, 10), (32, 10), (45, 10), diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a92c51883..a9f748d02 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -307,13 +307,14 @@ class FlashAttentionMetadataBuilder: self.kv_cache_spec = kv_cache_spec self.block_table = block_table - if get_flash_attn_version() == 3: - self.aot_schedule = not compilation_config.full_cuda_graph - if not self.aot_schedule: - logger.warning( - "AOT Schedule is disabled when using full_cuda_graph") - else: - self.aot_schedule = False + self.aot_schedule = (get_flash_attn_version() == 3) + self.use_full_cuda_graph = compilation_config.full_cuda_graph + if self.use_full_cuda_graph and not self.aot_schedule: + raise ValueError("Full CUDA graph mode requires AOT scheduling, " + "which requires FlashAttention 3.") + self.scheduler_metadata = torch.zeros(self.runner.max_num_reqs + 1, + dtype=torch.int32, + device=self.runner.device) # Sliding window size to be used with the AOT scheduler will be # populated on first build() call. 
@@ -326,7 +327,7 @@ class FlashAttentionMetadataBuilder: def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata): - max_seq_len = self.runner.seq_lens_np[:num_reqs].max() + max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max()) query_start_loc = common_attn_metadata.query_start_loc seq_lens = common_attn_metadata.seq_lens block_table = self.block_table @@ -448,6 +449,18 @@ class FlashAttentionMetadataBuilder: max_seq_len=max_seq_len, causal=True) + if self.use_full_cuda_graph: + assert scheduler_metadata is not None + n = scheduler_metadata.shape[0] + self.scheduler_metadata[:n].copy_(scheduler_metadata, + non_blocking=True) + # NOTE(woosuk): We should zero out the rest of the scheduler + # metadata to guarantee the correctness. Otherwise, some thread + # blocks may use the invalid scheduler metadata and overwrite the + # output buffer. + self.scheduler_metadata[n:] = 0 + scheduler_metadata = self.scheduler_metadata[:n] + attn_metadata = FlashAttentionMetadata( num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9ac33a149..4a67e3778 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1750,6 +1750,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): attn_metadata: Optional[dict[str, Any]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] + # Make sure max_model_len is used at the graph capture time. 
+ self.seq_lens_np[:num_reqs] = self.max_model_len + self.seq_lens_np[num_reqs:] = 0 + self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs], + non_blocking=True) seq_lens = self.seq_lens[:num_reqs] common_attn_metadata = CommonAttentionMetadata( -- GitLab From 3336c8cfbef6c7d6688ca1e5b0b26424baef02c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Wed, 4 Jun 2025 16:42:06 +0800 Subject: [PATCH 162/274] Fix #19130 (#19132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- .../vision_language_multi_image.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index de6365c0d..ea7a793d0 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -593,21 +593,21 @@ def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." 
) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2-VL-7B-Instruct" # Tested on L40 engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -630,10 +630,18 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, @@ -644,20 +652,20 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData: def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: try: - from qwen_vl_utils import process_vision_info + from qwen_vl_utils import smart_resize except ModuleNotFoundError: print( "WARNING: `qwen-vl-utils` not installed, input images will not " "be automatically resized. You can enable this functionality by " "`pip install qwen-vl-utils`." 
) - process_vision_info = None + smart_resize = None model_name = "Qwen/Qwen2.5-VL-3B-Instruct" engine_args = EngineArgs( model=model_name, - max_model_len=32768 if process_vision_info is None else 4096, + max_model_len=32768 if smart_resize is None else 4096, max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -680,10 +688,18 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData: messages, tokenize=False, add_generation_prompt=True ) - if process_vision_info is None: + if smart_resize is None: image_data = [fetch_image(url) for url in image_urls] else: - image_data, _ = process_vision_info(messages, return_video_kwargs=False) + + def post_process_image(image: Image) -> Image: + width, height = image.size + resized_height, resized_width = smart_resize( + height, width, max_pixels=1024 * 28 * 28 + ) + return image.resize((resized_width, resized_height)) + + image_data = [post_process_image(fetch_image(url)) for url in image_urls] return ModelRequestData( engine_args=engine_args, -- GitLab From 8e972d9c44cc8a6b1d0a3596c41604c56a492977 Mon Sep 17 00:00:00 2001 From: Siyuan Liu Date: Wed, 4 Jun 2025 01:43:00 -0700 Subject: [PATCH 163/274] [TPU] Skip hanging tests (#19115) Signed-off-by: Siyuan Liu --- .buildkite/scripts/hardware_ci/run-tpu-v1-test.sh | 2 +- tests/v1/tpu/test_spmd_model_weight_loading.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh index 3212b660e..a394046d2 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh @@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \ run_and_track_test 10 "test_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" run_and_track_test 11 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v 
/workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k 'not test_structured_output_with_reasoning_matrices'" run_and_track_test 12 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 13 "test_lora.py" \ diff --git a/tests/v1/tpu/test_spmd_model_weight_loading.py b/tests/v1/tpu/test_spmd_model_weight_loading.py index d36edfc3f..916325e41 100644 --- a/tests/v1/tpu/test_spmd_model_weight_loading.py +++ b/tests/v1/tpu/test_spmd_model_weight_loading.py @@ -45,11 +45,14 @@ def _get_spmd_mesh(): return MESH -@pytest.mark.parametrize("model", [ - "Qwen/Qwen2-1.5B-Instruct", - "meta-llama/Llama-3.1-8B-Instruct", - "meta-llama/Llama-3.1-70B-Instruct", -]) +@pytest.mark.parametrize( + "model", + [ + "Qwen/Qwen2-1.5B-Instruct", + # Skip large models due to CI runner disk space limitations + # "meta-llama/Llama-3.1-8B-Instruct", + # "meta-llama/Llama-3.1-70B-Instruct", + ]) def test_tpu_model_loader(model): # Skip the 70B test if there are less than 8 chips # TODO: Query using torch xla API, the query API is not working -- GitLab From 2669a0d7b518371bb1d950425bd64a320010733f Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Wed, 4 Jun 2025 02:10:45 -0700 Subject: [PATCH 164/274] Fix ValueError: Missing value for tag key(s): model_name,engine. 
(#19113) Signed-off-by: Seiji Eicher --- tests/v1/metrics/test_ray_metrics.py | 5 ++++- vllm/v1/metrics/ray_wrappers.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index ea54038a2..0898ae65e 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -47,12 +47,15 @@ def test_engine_log_metrics_ray( engine_args, stat_loggers=[RayPrometheusStatLogger]) for i, prompt in enumerate(example_prompts): - engine.generate( + results = engine.generate( request_id=f"request-id-{i}", prompt=prompt, sampling_params=SamplingParams(max_tokens=max_tokens), ) + async for _ in results: + pass + # Create the actor and call the async method actor = EngineTestActor.remote() # type: ignore[attr-defined] ray.get(actor.run.remote()) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 18c8dcf0a..cce692d6c 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -31,6 +31,16 @@ class RayPrometheusMetric: self.metric.set_default_tags(labelskwargs) + if labels: + if len(labels) != len(self.metric._tag_keys): + raise ValueError( + "Number of labels must match the number of tag keys. 
" + f"Expected {len(self.metric._tag_keys)}, got {len(labels)}" + ) + + self.metric.set_default_tags( + dict(zip(self.metric._tag_keys, labels))) + return self -- GitLab From 8711bc5e684d43a333c0c20bef575a0d8ee8346f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 4 Jun 2025 19:18:48 +0800 Subject: [PATCH 165/274] [Misc] Add packages for benchmark as extra dependency (#19089) Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/cli/README.md | 2 ++ setup.py | 1 + vllm/benchmarks/datasets.py | 39 ++++++++++++++++--------------------- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/cli/README.md b/docs/cli/README.md index f43ce7663..df700fb74 100644 --- a/docs/cli/README.md +++ b/docs/cli/README.md @@ -77,6 +77,8 @@ vllm complete --quick "The future of AI is" Run benchmark tests for latency online serving throughput and offline inference throughput. +To use benchmark commands, please install with extra dependencies using `pip install vllm[bench]`. + Available Commands: ```bash diff --git a/setup.py b/setup.py index b07cdea30..ea7cd0169 100644 --- a/setup.py +++ b/setup.py @@ -688,6 +688,7 @@ setup( ext_modules=ext_modules, install_requires=get_requirements(), extras_require={ + "bench": ["pandas", "datasets"], "tensorizer": ["tensorizer>=2.9.0"], "fastsafetensors": ["fastsafetensors >= 0.1.10"], "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"], diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index f795a1256..4da9f7368 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -24,7 +24,6 @@ from io import BytesIO from typing import Any, Callable, Optional, Union import numpy as np -import pandas as pd from PIL import Image from transformers import PreTrainedTokenizerBase @@ -33,6 +32,23 @@ from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict from vllm.multimodal.image import convert_image_mode from vllm.transformers_utils.tokenizer 
import AnyTokenizer, get_lora_tokenizer +from vllm.utils import PlaceholderModule + +try: + from datasets import load_dataset +except ImportError: + datasets = PlaceholderModule("datasets") + load_dataset = datasets.placeholder_attr("load_dataset") + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + +try: + import librosa +except ImportError: + librosa = PlaceholderModule("librosa") logger = logging.getLogger(__name__) @@ -636,13 +652,6 @@ class BurstGPTDataset(BenchmarkDataset): if self.dataset_path is None: raise ValueError("dataset_path must be provided for loading data.") - try: - import pandas as pd - except ImportError as e: - raise ImportError( - "Pandas is required for BurstGPTDataset. Please install it " - "using `pip install pandas`.") from e - df = pd.read_csv(self.dataset_path) # Filter to keep only GPT-4 rows. gpt4_df = df[df["Model"] == "GPT-4"] @@ -717,13 +726,6 @@ class HuggingFaceDataset(BenchmarkDataset): def load_data(self) -> None: """Load data from HuggingFace datasets.""" - try: - from datasets import load_dataset - except ImportError as e: - raise ImportError( - "Hugging Face datasets library is required for this dataset. " - "Please install it using `pip install datasets`.") from e - self.data = load_dataset( self.dataset_path, name=self.dataset_subset, @@ -1147,13 +1149,6 @@ class ASRDataset(HuggingFaceDataset): output_len: Optional[int] = None, **kwargs, ) -> list: - try: - import librosa - except ImportError as e: - raise ImportError( - "librosa is required for ASRDataset. 
Please install it " - "using `pip install librosa`.") from e - output_len = (output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN) prompt = ASRDataset.TRANSCRIPTION_PREAMBLE -- GitLab From 35cf32df304770b9dd3878438544b3a1a1cc79a5 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Wed, 4 Jun 2025 19:48:57 +0800 Subject: [PATCH 166/274] Improve the output precision of embedding models (#19092) --- tests/models/language/pooling/embed_utils.py | 6 +-- tests/models/language/pooling/mteb_utils.py | 12 ++--- tests/models/language/pooling/test_gte.py | 7 --- .../models/language/pooling/test_intfloat.py | 46 +++++++++++++++++++ tests/models/language/pooling/test_jina.py | 3 +- tests/models/language/pooling/test_nomic.py | 3 -- vllm/model_executor/models/bert.py | 13 ++++-- vllm/model_executor/models/bert_with_rope.py | 7 ++- 8 files changed, 69 insertions(+), 28 deletions(-) create mode 100644 tests/models/language/pooling/test_intfloat.py diff --git a/tests/models/language/pooling/embed_utils.py b/tests/models/language/pooling/embed_utils.py index 07bc9f447..dabd7bee7 100644 --- a/tests/models/language/pooling/embed_utils.py +++ b/tests/models/language/pooling/embed_utils.py @@ -56,14 +56,10 @@ def correctness_test_embed_models(hf_runner, max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) - vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - model_dtype = getattr( - vllm_model.model.llm_engine.model_config.hf_config, "torch_dtype", - vllm_dtype) with hf_runner( model_info.name, - dtype=model_dtype, + dtype="float32", is_sentence_transformer=True, ) as hf_model: diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 2705be25e..0a047951d 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -7,7 +7,6 @@ import numpy as np import pytest from tests.models.utils import EmbedModelInfo -from 
vllm.model_executor.model_loader.utils import set_default_torch_dtype # Most models on the STS12 task (See #17175): # - Model implementation and minor changes in tensor dtype @@ -104,17 +103,18 @@ def mteb_test_embed_models(hf_runner, MTEB_EMBED_TASKS) vllm_dtype = vllm_model.model.llm_engine.model_config.dtype - with set_default_torch_dtype(vllm_dtype) and hf_runner( - model_info.name, is_sentence_transformer=True, - dtype=vllm_dtype) as hf_model: + with hf_runner(model_info.name, + is_sentence_transformer=True, + dtype="float32") as hf_model: if hf_model_callback is not None: hf_model_callback(hf_model) st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS) + st_dtype = next(hf_model.model.parameters()).dtype - print("VLLM:", vllm_main_score) - print("SentenceTransformers:", st_main_score) + print("VLLM:", vllm_dtype, vllm_main_score) + print("SentenceTransformers:", st_dtype, st_main_score) print("Difference:", st_main_score - vllm_main_score) assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_EMBED_TOL) diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py index 2178a815b..05bd479f4 100644 --- a/tests/models/language/pooling/test_gte.py +++ b/tests/models/language/pooling/test_gte.py @@ -11,27 +11,21 @@ MODELS = [ ########## BertModel EmbedModelInfo("thenlper/gte-large", architecture="BertModel", - dtype="float32", enable_test=True), EmbedModelInfo("thenlper/gte-base", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-small", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-large-zh", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", - dtype="float32", enable_test=False), EmbedModelInfo("thenlper/gte-small-zh", architecture="BertModel", - dtype="float32", enable_test=False), ########### NewModel 
EmbedModelInfo("Alibaba-NLP/gte-multilingual-base", @@ -46,7 +40,6 @@ MODELS = [ ########### Qwen2ForCausalLM EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", architecture="Qwen2ForCausalLM", - dtype="float32", enable_test=True), ########## ModernBertModel EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", diff --git a/tests/models/language/pooling/test_intfloat.py b/tests/models/language/pooling/test_intfloat.py new file mode 100644 index 000000000..b6e83857f --- /dev/null +++ b/tests/models/language/pooling/test_intfloat.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest + +from ...utils import EmbedModelInfo +from .embed_utils import correctness_test_embed_models +from .mteb_utils import mteb_test_embed_models + +MODELS = [ + ########## BertModel + EmbedModelInfo("intfloat/e5-small", + architecture="BertModel", + enable_test=True), + EmbedModelInfo("intfloat/e5-base", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("intfloat/e5-large", + architecture="BertModel", + enable_test=False), + EmbedModelInfo("intfloat/multilingual-e5-small", + architecture="BertModel", + enable_test=False), + ########## XLMRobertaModel + EmbedModelInfo("intfloat/multilingual-e5-base", + architecture="XLMRobertaModel", + enable_test=True), + EmbedModelInfo("intfloat/multilingual-e5-large", + architecture="XLMRobertaModel", + enable_test=False), + EmbedModelInfo("intfloat/multilingual-e5-large-instruct", + architecture="XLMRobertaModel", + enable_test=False), +] + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_mteb(hf_runner, vllm_runner, + model_info: EmbedModelInfo) -> None: + mteb_test_embed_models(hf_runner, vllm_runner, model_info) + + +@pytest.mark.parametrize("model_info", MODELS) +def test_embed_models_correctness(hf_runner, vllm_runner, + model_info: EmbedModelInfo, + example_prompts) -> None: + correctness_test_embed_models(hf_runner, vllm_runner, model_info, + example_prompts) diff --git 
a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 2adf34b29..33255021a 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -32,8 +32,7 @@ TEXTS_2 = [ EMBEDDING_MODELS = [ EmbedModelInfo("jinaai/jina-embeddings-v3", architecture="XLMRobertaModel", - is_matryoshka=True, - dtype="float32") + is_matryoshka=True) ] diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py index 59dbd74fb..e16ec239a 100644 --- a/tests/models/language/pooling/test_nomic.py +++ b/tests/models/language/pooling/test_nomic.py @@ -9,18 +9,15 @@ from .mteb_utils import mteb_test_embed_models MODELS = [ EmbedModelInfo("nomic-ai/nomic-embed-text-v1", architecture="NomicBertModel", - dtype="float32", enable_test=True), EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5", architecture="NomicBertModel", - dtype="float32", enable_test=False), EmbedModelInfo("nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False), EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe", architecture="NomicBertModel", - dtype="float32", enable_test=True) ] diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 389393987..cacec7342 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -414,10 +414,15 @@ class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.model(input_ids=input_ids, - position_ids=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) + hidden_states = self.model(input_ids=input_ids, + position_ids=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + + # convert the embedding output to float32, + # otherwise precision will be lost significantly + hidden_states 
= hidden_states.to(torch.float32) + return hidden_states def pooler( self, diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 0f22393c7..d1b84a9f0 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -432,7 +432,12 @@ class BertWithRope(nn.Module, SupportsV0Only, SupportsQuant): else: hidden_states = self.embeddings(input_ids=input_ids, token_type_ids=token_type_ids) - return self.encoder(positions, hidden_states) + hidden_states = self.encoder(positions, hidden_states) + + # convert the embedding output to float32, + # otherwise precision will be lost significantly + hidden_states = hidden_states.to(torch.float32) + return hidden_states def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: -- GitLab From 01dc9a76db7d314aaf51be9ffc6ff561bae5626f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 4 Jun 2025 19:49:20 +0800 Subject: [PATCH 167/274] [CI/Build][Bugfix] Ensure compatibility with transformers 4.52 (#18678) Signed-off-by: DarkLight1337 --- requirements/test.in | 2 +- requirements/test.txt | 2 +- .../multimodal/generation/test_common.py | 9 +++- .../multimodal/generation/test_florence2.py | 2 + .../generation/test_granite_speech.py | 2 +- .../multimodal/generation/test_phi4mm.py | 4 ++ .../generation/vlm_utils/model_utils.py | 18 ++++++- .../multimodal/processing/test_common.py | 2 +- tests/models/registry.py | 47 ++++++------------- tests/models/test_initialization.py | 11 +++++ vllm/config.py | 2 + vllm/model_executor/models/aya_vision.py | 12 +++-- vllm/model_executor/models/idefics3.py | 16 +++++-- 13 files changed, 82 insertions(+), 47 deletions(-) diff --git a/requirements/test.in b/requirements/test.in index 9b574a09f..bbbd41e16 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for 
minicpm3 test lm-eval[api]==0.4.8 # required for model evaluation test mteb>=1.38.11, <2 # required for mteb test -transformers==4.51.3 +transformers==4.52.4 tokenizers==0.21.1 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. schemathesis>=3.39.15 # Required for openai schema test. diff --git a/requirements/test.txt b/requirements/test.txt index 03aec80ac..fb0eede08 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -794,7 +794,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.51.3 +transformers==4.52.4 # via # -r requirements/test.in # genai-perf diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index a5bbcfc22..496850b19 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -226,6 +226,8 @@ VLM_TEST_SETTINGS = { img_idx_to_prompt=lambda idx: "", auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output, + # FIXME: https://github.com/huggingface/transformers/pull/38510 + marks=[pytest.mark.skip("Model is broken")], ), "chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], @@ -281,10 +283,10 @@ VLM_TEST_SETTINGS = { multi_image_prompt="Describe the two images in detail.", # noqa: E501 max_model_len=4096, max_num_seqs=2, - dtype="bfloat16", auto_cls=AutoModelForImageTextToText, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, + num_logprobs=10, ), "glm4v": VLMTestInfo( models=["THUDM/glm-4v-9b"], @@ -337,7 +339,8 @@ VLM_TEST_SETTINGS = { models=[ "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-2B", - "OpenGVLab/Mono-InternVL-2B", + # FIXME: Config cannot be loaded in transformers 4.52 + # "OpenGVLab/Mono-InternVL-2B", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: 
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 @@ -568,6 +571,8 @@ VLM_TEST_SETTINGS = { max_num_seqs=2, vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, + # FIXME: https://github.com/huggingface/transformers/issues/38358 + marks=[pytest.mark.skip("Model initialization fails")], ), "qwen2_vl": VLMTestInfo( models=["Qwen/Qwen2-VL-2B-Instruct"], diff --git a/tests/models/multimodal/generation/test_florence2.py b/tests/models/multimodal/generation/test_florence2.py index b048cec5e..a622957f9 100644 --- a/tests/models/multimodal/generation/test_florence2.py +++ b/tests/models/multimodal/generation/test_florence2.py @@ -100,6 +100,8 @@ def run_test( ) +# FIXME: https://github.com/huggingface/transformers/issues/38358 +@pytest.mark.skip("Model initialization fails") @pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize( diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 14552010d..c5ffa5f3a 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -29,7 +29,7 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs -MODEL_NAME = "ibm-granite/granite-speech-3.3-8b" +MODEL_NAME = "ibm-granite/granite-speech-3.3-2b" # Audio lora co-exists directly in the model directory, but # currently still needs to be passed directly to vLLM. 
audio_lora_path = MODEL_NAME diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py index e4cd476a9..4e8465778 100644 --- a/tests/models/multimodal/generation/test_phi4mm.py +++ b/tests/models/multimodal/generation/test_phi4mm.py @@ -122,6 +122,10 @@ def run_test( for prompts, images, audios in inputs ] + # This error occurs inside `get_peft_model` + # FIXME: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/75 + pytest.skip("HF impl is not compatible with current transformers") + hf_model_kwargs = {"_attn_implementation": "sdpa"} with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model: diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 1b087191f..af4c72f44 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -10,11 +10,12 @@ from typing import Optional, Union import numpy as np import numpy.typing as npt +import pytest import regex as re import torch from PIL.Image import Image from transformers import (AutoConfig, AutoTokenizer, BatchFeature, - GenerationConfig) + GenerationConfig, GenerationMixin) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side @@ -324,6 +325,16 @@ def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner: hf_model.processor = processor + orig_generate = hf_model.model.generate + + def _generate(self, *args, **kwargs): + # FIXME: https://github.com/huggingface/transformers/issues/38333 + kwargs["disable_compile"] = True + + return orig_generate(*args, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + return hf_model @@ -610,6 +621,11 @@ def _internvl_generate( if getattr(self, "use_visual_token_mask", False): visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype) 
forward_kwargs["visual_token_mask"] = visual_token_mask + + # e.g. InternVL2-2B + if not isinstance(self.language_model, GenerationMixin): + pytest.skip("HF impl is not compatible with current transformers") + outputs = self.language_model.generate( **forward_kwargs, **generate_kwargs, diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index be574435e..1e6608955 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -245,7 +245,7 @@ def _test_processing_correctness_one( "adept/fuyu-8b", "google/gemma-3-4b-it", "THUDM/glm-4v-9b", - "ibm-granite/granite-speech-3.3-8b", + "ibm-granite/granite-speech-3.3-2b", "h2oai/h2ovl-mississippi-800m", "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL3-1B", diff --git a/tests/models/registry.py b/tests/models/registry.py index ed49676a9..3e07dc0f3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -160,17 +160,12 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct", - is_available_online=False, - min_transformers_version="4.52.2"), + min_transformers_version="4.53"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"), "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), - "Glm4ForCausalLM": _HfExamplesInfo( - "THUDM/GLM-4-32B-0414", - is_available_online=False, - min_transformers_version="4.52.dev0" - ), + "Glm4ForCausalLM": _HfExamplesInfo("THUDM/GLM-4-9B-0414"), "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2", {"alias": "gpt2"}), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder", @@ -181,8 +176,7 @@ 
_TEXT_GENERATION_EXAMPLE_MODELS = { {"1b": "EleutherAI/pythia-1.4b"}), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), - "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501 - min_transformers_version="4.52.0"), # noqa: E501 + "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"), # noqa: E501 "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"), # noqa: E501 "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1", trust_remote_code=True), @@ -203,8 +197,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", is_available_online=False), "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), - "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1", - is_available_online=False), + "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"), "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", trust_remote_code=True), @@ -243,10 +236,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"), "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"), - "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", - is_available_online=False), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"), "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 - is_available_online=False), + v0_only=True), "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t", v0_only=True), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), @@ -256,7 +248,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { "TeleFLMForCausalLM": 
_HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407", trust_remote_code=True), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", - is_available_online=False, + tokenizer="meta-llama/Llama-2-7b", trust_remote_code=True), "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct"), "MiMoForCausalLM": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", @@ -275,8 +267,7 @@ _EMBEDDING_EXAMPLE_MODELS = { trust_remote_code=True), "GteNewModel": _HfExamplesInfo("Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True, - hf_overrides={"architectures": - ["GteNewModel"]}), + hf_overrides={"architectures": ["GteNewModel"]}), # noqa: E501 "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward", trust_remote_code=True), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501 @@ -298,10 +289,8 @@ _EMBEDDING_EXAMPLE_MODELS = { "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 - # The model on Huggingface is currently being updated, - # hence I temporarily mark it as not available online - "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 - is_available_online=False), + "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501 + is_available_online=False), # noqa: E501 } _CROSS_ENCODER_EXAMPLE_MODELS = { @@ -327,8 +316,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), - "GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-8b", # noqa: E501 - min_transformers_version="4.52.0"), # noqa: E501 + "GraniteSpeechForConditionalGeneration": 
_HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501 "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b", trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 @@ -347,7 +335,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, v0_only=True), "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501 - min_transformers_version="4.51", max_model_len=10240), "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501 @@ -360,8 +347,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", - max_transformers_version="4.48", - transformers_version_reason="Use of deprecated imports which have been removed.", # noqa: E501 trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501 @@ -399,10 +384,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501 - "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B", - min_transformers_version="4.52"), - "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ", # noqa: E501 - min_transformers_version="4.52"), + "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B"), + "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ"), # noqa: E501 "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"), "SmolVLMForConditionalGeneration": 
_HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"), # noqa: E501 "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501 @@ -413,8 +396,8 @@ _MULTIMODAL_EXAMPLE_MODELS = { # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer # Therefore, we borrow the BartTokenizer from the original Bart model "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 - tokenizer="Isotr0py/Florence-2-tokenizer", - trust_remote_code=True,), # noqa: E501 + tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 + trust_remote_code=True), # noqa: E501 "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 } diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index af023d903..98a58d01e 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -21,6 +21,10 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + # FIXME: Possible memory leak in the previous tests? 
+ if model_arch == "GraniteSpeechForConditionalGeneration": + pytest.skip("Avoid OOM") + # Avoid OOM and reduce initialization time by only using 1 layer def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: hf_config.update(model_info.hf_overrides) @@ -41,6 +45,13 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch): "num_hidden_layers": 1, }) + # e.g.: ibm-granite/granite-speech-3.3-2b + if hasattr(hf_config, "encoder_config"): + hf_config.encoder_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + }) + return hf_config # Avoid calling model.forward() diff --git a/vllm/config.py b/vllm/config.py index f6ca9328b..a07c41dda 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3139,6 +3139,8 @@ def _find_dtype( config_dtype = getattr(config.get_text_config(), "torch_dtype", None) if config_dtype is None and hasattr(config, "vision_config"): config_dtype = getattr(config.vision_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "encoder_config"): + config_dtype = getattr(config.encoder_config, "torch_dtype", None) # Try to read the dtype of the weights if they are in safetensors format if config_dtype is None: diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index 22efb707a..7e15e57a4 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -111,7 +111,13 @@ class AyaVisionProcessingInfo(BaseProcessingInfo): return self.ctx.get_hf_config(AyaVisionConfig) def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor: - return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + processor = self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs) + + # Temporary workaround since this processor has multiple image tokens + # See https://github.com/huggingface/transformers/issues/38350 + processor._check_special_mm_tokens = lambda *args, **kwargs: None + + return processor def 
get_image_processor(self) -> GotOcr2ImageProcessor: return self.get_hf_processor().image_processor @@ -188,9 +194,7 @@ class AyaVisionMultiModalProcessor( image_processor = hf_processor.image_processor # HF processor pops the `num_patches` kwarg, which is needed by vLLM - if (images := - mm_data.get("images")) is not None and '' in prompt: - assert isinstance(images, list) + if (images := mm_data.get("images")) is not None: parsed_images = (self._get_data_parser().parse_mm_data({ "image": images diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 4bc5e2a0c..de8596282 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -22,8 +22,8 @@ from typing import Literal, Optional, TypedDict, Union import torch from torch import nn -from transformers import (BatchFeature, Idefics3Config, Idefics3ImageProcessor, - Idefics3Processor) +from transformers import (AddedToken, BatchFeature, Idefics3Config, + Idefics3ImageProcessor, Idefics3Processor) from vllm.config import VllmConfig from vllm.model_executor.layers.linear import ReplicatedLinear @@ -199,13 +199,21 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): return grid_w * grid_h + 1 + # TODO: Remove after requiring transformers>=4.52 + def _get_content(self, token: Union[AddedToken, str]) -> str: + if isinstance(token, str): + return token + + return token.content + def _get_image_token( self, processor: Optional[Idefics3Processor]) -> tuple[str, str, str]: if processor is None: processor = self.get_hf_processor() - image_token = processor.image_token.content - fake_image_token = processor.fake_image_token.content + + image_token = self._get_content(processor.image_token) + fake_image_token = self._get_content(processor.fake_image_token) global_image_token = processor.global_image_tag return image_token, fake_image_token, global_image_token -- GitLab From 02658c2dfed40acaf04c8d2470b3493e8fead523 Mon Sep 17 00:00:00 2001 From: Xu 
Wenqing <121550081+Xu-Wenqing@users.noreply.github.com> Date: Wed, 4 Jun 2025 21:24:18 +0800 Subject: [PATCH 168/274] Add DeepSeek-R1-0528 function call chat template (#18874) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 许文卿 --- docs/features/tool_calling.md | 6 +- examples/tool_chat_template_deepseekr1.jinja | 92 ++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 examples/tool_chat_template_deepseekr1.jinja diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 6ee1060dd..3547069f7 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -238,9 +238,11 @@ Flags: `--tool-call-parser hermes` ### DeepSeek-V3 Models (`deepseek_v3`) Supported models: -* `deepseek-ai/DeepSeek-V3-0324` -Flags: `--tool-call-parser deepseek_v3 --chat-template examples/tool_chat_template_deepseekv3.jinja` +* `deepseek-ai/DeepSeek-V3-0324` (use with ) +* `deepseek-ai/DeepSeek-R1-0528` (use with ) + +Flags: `--tool-call-parser deepseek_v3 --chat-template {see_above}` ### Models with Pythonic Tool Calls (`pythonic`) diff --git a/examples/tool_chat_template_deepseekr1.jinja b/examples/tool_chat_template_deepseekr1.jinja new file mode 100644 index 000000000..9ae19341f --- /dev/null +++ b/examples/tool_chat_template_deepseekr1.jinja @@ -0,0 +1,92 @@ +{% if not add_generation_prompt is defined %} + {% set add_generation_prompt = false %} +{% endif %} +{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true, is_last_user=false) %} +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {%- if ns.is_first_sp %} + {% set ns.system_prompt = ns.system_prompt + message['content'] %} + {% set ns.is_first_sp = false %} + {%- else %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %} + {%- endif %} + {%- endif %} +{%- endfor %} + +{#- Adapted from 
https://github.com/sgl-project/sglang/blob/main/examples/chat_template/tool_chat_template_deepseekr1.jinja #} +{% if tools is defined and tools is not none %} + {% set tool_ns = namespace(text='You are a helpful assistant with tool calling capabilities. ' + 'When a tool call is needed, you MUST use the following format to issue the call:\n' + '<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>FUNCTION_NAME\n' + '```json\n{"param1": "value1", "param2": "value2"}\n```<|tool▁call▁end|><|tool▁calls▁end|>\n\n' + 'Make sure the JSON is valid.' + '## Tools\n\n### Function\n\nYou have the following functions available:\n\n') %} + {% for tool in tools %} + {% set tool_ns.text = tool_ns.text + '\n```json\n' + (tool | tojson) + '\n```\n' %} + {% endfor %} + {% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %} +{% endif %} + +{{ bos_token }} +{{ ns.system_prompt }} +{%- for message in messages %} + {% set content = message['content'] %} + {%- if message['role'] == 'user' %} + {%- set ns.is_tool = false -%} + {%- set ns.is_first = false -%} + {%- set ns.is_last_user = true -%} + {{'<|User|>' + content + '<|Assistant|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' %} + {% if '' in content %} + {% set content = content.split('')[-1] %} + {% endif %} + {% endif %} + {%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} + {%- endif %} + {%- set ns.is_first = false %} + {%- set ns.is_tool = false -%} + {%- set ns.is_output_first = true %} + {%- for tool in message['tool_calls'] %} + {%- if not ns.is_first %} + {%- if content is none %} + {{'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- else %} + {{content + 
'<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- set ns.is_first = true -%} + {%- else %} + {{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments']|tojson + '\n' + '```' + '<|tool▁call▁end|>'}} + {%- endif %} + {%- endfor %} + {{'<|tool▁calls▁end|><|end▁of▁sentence|>'}} + {%- endif %} + {%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none)%} + {%- set ns.is_last_user = false -%} + {%- if ns.is_tool %} + {{'<|tool▁outputs▁end|>' + content + '<|end▁of▁sentence|>'}} + {%- set ns.is_tool = false -%} + {%- else %} + {{content + '<|end▁of▁sentence|>'}} + {%- endif %} + {%- endif %} + {%- if message['role'] == 'tool' %} + {%- set ns.is_last_user = false -%} + {%- set ns.is_tool = true -%} + {%- if ns.is_output_first %} + {{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- set ns.is_output_first = false %} + {%- else %} + {{'\n<|tool▁output▁begin|>' + content + '<|tool▁output▁end|>'}} + {%- endif %} + {%- endif %} +{%- endfor -%} +{% if ns.is_tool %} + {{'<|tool▁outputs▁end|>'}} +{% endif %} +{% if add_generation_prompt and not ns.is_last_user and not ns.is_tool %} + {{'<|Assistant|>'}} +{% endif %} -- GitLab From 5f2cd251d212eed3052c5406875eb26811335d3e Mon Sep 17 00:00:00 2001 From: Lain Date: Wed, 4 Jun 2025 07:48:45 -0700 Subject: [PATCH 169/274] Sm100 blockwise fp8 swap ab (#18564) --- .../c3x/scaled_mm_blockwise_sm100_fp8.cu | 4 - ...scaled_mm_blockwise_sm100_fp8_dispatch.cuh | 206 ++++++++++++------ .../layers/quantization/utils/fp8_utils.py | 14 -- 3 files changed, 140 insertions(+), 84 deletions(-) diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu 
b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu index 84492553c..4a8a5ed02 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu @@ -9,10 +9,6 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - TORCH_CHECK( - a.size(0) % 4 == 0, - "Input tensor must have a number of rows that is a multiple of 4. ", - "but got: ", a.size(0), " rows."); if (out.dtype() == torch::kBFloat16) { cutlass_gemm_blockwise_sm100_fp8_dispatch( out, a, b, a_scales, b_scales); diff --git a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh index ef324364c..c841125db 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh @@ -1,5 +1,6 @@ #pragma once +#include "cuda_utils.h" #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" @@ -22,49 +23,49 @@ namespace vllm { using namespace cute; -template +// clang-format off +template struct cutlass_3x_gemm_fp8_blockwise { + static constexpr bool swap_ab = swap_ab_; using ElementAB = cutlass::float_e4m3_t; using ElementA = ElementAB; using LayoutA = cutlass::layout::RowMajor; + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; using ElementB = ElementAB; using LayoutB = cutlass::layout::ColumnMajor; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; - using ElementC = void; using ElementD = OutType; using LayoutD = cutlass::layout::RowMajor; + using LayoutD_Transpose = typename 
cutlass::layout::LayoutTranspose::type; static constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + using ElementC = void; // TODO: support bias using LayoutC = LayoutD; + using LayoutC_Transpose = LayoutD_Transpose; static constexpr int AlignmentC = AlignmentD; using ElementAccumulator = float; using ElementCompute = float; using ElementBlockScale = float; - // MMA and Cluster Tile Shapes - // Shape of the tile computed by tcgen05 MMA, could be across 2 SMs if Cluster - // Shape %2 == 0 using MmaTileShape_MNK = Shape<_128,_128,_128>; - static constexpr int ScaleMsPerTile = size<0>(ScalesPerTile{}); - static constexpr int ScaleGranularityM = - size<0>(MmaTileShape{}) / ScaleMsPerTile; - static constexpr int ScaleGranularityN = - size<1>(MmaTileShape{}) / size<1>(ScalesPerTile{}); - static constexpr int ScaleGranularityK = - size<2>(MmaTileShape{}) / size<2>(ScalesPerTile{}); - - // Shape of the threadblocks in a cluster - using ClusterShape_MNK = ClusterShape; - - using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig< - ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, - cute::UMMA::Major::MN, cute::UMMA::Major::K>; + using ScaleConfig = conditional_t, + cutlass::detail::Sm100BlockwiseScaleConfig< + ScaleGranularityM, ScaleGranularityN, ScaleGranularityK, + cute::UMMA::Major::MN, cute::UMMA::Major::K>>; + + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); @@ -73,7 +74,6 @@ struct cutlass_3x_gemm_fp8_blockwise { static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest; using ElementScalar = float; - // clang-format off using DefaultOperation = cutlass::epilogue::fusion::LinearCombination; using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, @@ -84,33 +84,47 @@ struct cutlass_3x_gemm_fp8_blockwise { ElementAccumulator, ElementCompute, ElementC, - LayoutC, + conditional_t, AlignmentC, ElementD, - LayoutD, + conditional_t, AlignmentD, EpilogueScheduler, DefaultOperation >::CollectiveOp; using StageCountType = cutlass::gemm::collective::StageCountAuto; - using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder< - ArchTag, - OperatorClass, - ElementA, - cute::tuple, - AlignmentA, - ElementB, - cute::tuple, - AlignmentB, - ElementAccumulator, - MmaTileShape, - ClusterShape, - + using CollectiveMainloop = conditional_t, + AlignmentB, + ElementA, + cute::tuple, + AlignmentA, + ElementAccumulator, + MmaTileShape, + ClusterShape, cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, - MainloopScheduler - >::CollectiveOp; - // clang-format on + MainloopScheduler + >::CollectiveOp, + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, + OperatorClass, + ElementA, + cute::tuple, + AlignmentA, + ElementB, + cute::tuple, + AlignmentB, + ElementAccumulator, + MmaTileShape, + ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout(sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopScheduler + >::CollectiveOp>; using KernelType = enable_sm100_only, CollectiveMainloop, CollectiveEpilogue>>; @@ -123,6 +137,7 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, 
torch::Tensor const& b_scales) { + static constexpr bool swap_ab = Gemm::swap_ab; using GemmKernel = typename Gemm::GemmKernel; using StrideA = typename Gemm::GemmKernel::StrideA; using StrideB = typename Gemm::GemmKernel::StrideB; @@ -136,7 +151,6 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, using ElementD = typename Gemm::ElementD; int32_t m = a.size(0), n = b.size(1), k = a.size(1); - auto prob_shape = cute::make_shape(m, n, k, 1); StrideA a_stride; StrideB b_stride; @@ -146,11 +160,13 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, b_stride = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); c_stride = - cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1)); + cutlass::make_cute_packed_stride(StrideC{}, swap_ab ? cute::make_shape(n, m, 1) : cute::make_shape(m, n, 1)); - LayoutSFA layout_SFA = + LayoutSFA layout_SFA = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFA(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1)); - LayoutSFB layout_SFB = + LayoutSFB layout_SFB = swap_ab ? + ScaleConfig::tile_atom_to_shape_SFB(make_shape(n, m, k, 1)) : ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1)); auto a_ptr = static_cast(a.data_ptr()); @@ -158,9 +174,22 @@ void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a, auto a_scales_ptr = static_cast(a_scales.data_ptr()); auto b_scales_ptr = static_cast(b_scales.data_ptr()); - typename GemmKernel::MainloopArguments mainloop_args{ - a_ptr, a_stride, b_ptr, b_stride, - a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB}; + auto mainloop_args = [&](){ + // layout_SFA and layout_SFB cannot be swapped since they are deduced. 
+ if (swap_ab) { + return typename GemmKernel::MainloopArguments{ + b_ptr, b_stride, a_ptr, a_stride, + b_scales_ptr, layout_SFA, a_scales_ptr, layout_SFB + }; + } + else { + return typename GemmKernel::MainloopArguments{ + a_ptr, a_stride, b_ptr, b_stride, + a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB + }; + } + }(); + auto prob_shape = swap_ab ? cute::make_shape(n, m, k, 1) : cute::make_shape(m, n, k, 1); auto c_ptr = static_cast(out.data_ptr()); typename GemmKernel::EpilogueArguments epilogue_args{ @@ -175,29 +204,74 @@ void cutlass_gemm_blockwise_sm100_fp8_dispatch(torch::Tensor& out, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales) { - auto m = a.size(0); - auto k = a.size(1); - auto n = b.size(1); - int sms; + int32_t m = a.size(0), n = b.size(1), k = a.size(1), sms; cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device()); - auto should_use_2sm = [&sms](int m, int n, int tile1SM = 128) { - return std::ceil(static_cast(m) / tile1SM) * - std::ceil(static_cast(n) / tile1SM) >= - sms; - }; - bool use_2sm = should_use_2sm(m, n); - if (use_2sm) { - cutlass_gemm_caller_blockwise, Shape<_256, _1, _1>, - Shape<_2, _2, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( - out, a, b, a_scales, b_scales); + constexpr int TILE_K = 128; + // TODO: better heuristics + bool swap_ab = (m < 16) || (m % 4 != 0); + bool use_tma_epilogue = (m * n) % 4 == 0; + if (!swap_ab) { + constexpr int TILE_N = 128; + int tile_m = 256; + if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 64) <= sms) { + tile_m = 64; + } + else if (cuda_utils::ceil_div(n, TILE_N) * cuda_utils::ceil_div(m, 128) <= sms) { + tile_m = 128; + } + if (tile_m == 64) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, 
b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else if (tile_m == 128) { + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + out, a, b, a_scales, b_scales); + } + } else { // tile_m == 256 + if (use_tma_epilogue) { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::TmaWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } else { + cutlass_gemm_caller_blockwise, Int>, + Shape<_2, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized2Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise2SmSm100>>( + out, a, b, a_scales, b_scales); + } + } } else { + // TODO: Test more tile N configs + constexpr int TILE_M = 128; + constexpr int TILE_N = 16; + // TMA epilogue isn't compatible with Swap A/B cutlass_gemm_caller_blockwise, Shape<_128, _1, _1>, - Shape<_1, _1, _1>, cutlass::epilogue::TmaWarpSpecialized1Sm, - cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100>>( + OutType, TILE_M, 1, TILE_K, Shape, Int, Int>, + Shape<_1, _1, _1>, cutlass::epilogue::NoSmemWarpSpecialized1Sm, + cutlass::gemm::KernelTmaWarpSpecializedBlockwise1SmSm100, true>>( out, a, b, a_scales, b_scales); } } diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 1ebd2a898..270979c8e 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ 
b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -136,24 +136,10 @@ def apply_w8a8_block_fp8_linear( use_cutlass, use_aiter_and_is_supported) if use_cutlass: - rows, cols = input_2d.shape - # Blackwell GPUs (SM100) require row dimensions to be multiple of 4 for - # optimal tensor core usage. Can be removed when targeting platforms - # without this constraint. - should_pad = current_platform.has_device_capability( - 100) and rows % 4 != 0 - if should_pad: - input_2d = torch.nn.functional.pad(input_2d, - (0, 0, 0, 4 - (rows % 4)), - value=0).contiguous() - q_input, x_scale = per_token_group_quant_fp8( input_2d, block_size[1], column_major_scales=use_cutlass) - output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale, block_size, input.dtype) - if should_pad: - output = output[:rows, :] else: q_input, x_scale = per_token_group_quant_fp8( -- GitLab From 8f4ffbd373cb19e8f8dcfa6dec1dbbe98fbeae96 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 4 Jun 2025 22:57:55 +0800 Subject: [PATCH 170/274] [Doc] Update V1 Guide for embedding models (#19141) Signed-off-by: DarkLight1337 --- docs/usage/v1_guide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 7c4909cb5..baeb5411b 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -55,7 +55,7 @@ This living user guide outlines a few known **important changes and limitations* | **Spec Decode** | 🚧 WIP ([PR #13933](https://github.com/vllm-project/vllm/pull/13933))| | **Prompt Logprobs with Prefix Caching** | 🟡 Planned ([RFC #13414](https://github.com/vllm-project/vllm/issues/13414))| | **Structured Output Alternative Backends** | 🟡 Planned | -| **Embedding Models** | 🚧 WIP ([PR #18015](https://github.com/vllm-project/vllm/pull/18015)) | +| **Embedding Models** | 🚧 WIP ([PR #16188](https://github.com/vllm-project/vllm/pull/16188)) | | **Mamba Models** | 🟡 Planned | | **Encoder-Decoder Models** | 🟠 Delayed | | 
**Request-level Structured Output Backend** | 🔴 Deprecated | @@ -145,9 +145,9 @@ vLLM V1 currently excludes model architectures with the `SupportsV0Only` protoco and the majority fall into the following categories. V1 support for these models will be added eventually. **Embedding Models** -Initially, we will create a [separate model runner](https://github.com/vllm-project/vllm/pull/18015) to provide V1 support without conflicting with other ongoing work. +The initial support will be provided by [PR #16188](https://github.com/vllm-project/vllm/pull/16188). -Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. [PR #16188](https://github.com/vllm-project/vllm/pull/16188) is the first step towards enabling this. +Later, we will consider using [hidden states processor](https://github.com/vllm-project/vllm/issues/12249), which is based on [global logits processor](https://github.com/vllm-project/vllm/pull/13360) to enable simultaneous generation and embedding using the same engine instance in V1. 
**Mamba Models** Models using selective state-space mechanisms (instead of standard transformer attention) -- GitLab From c8dcc159214a20650451dcd64b226f56671763f1 Mon Sep 17 00:00:00 2001 From: jmswen Date: Wed, 4 Jun 2025 08:26:47 -0700 Subject: [PATCH 171/274] Allow AsyncLLMEngine.generate to target a specific DP rank (#19102) Signed-off-by: Jon Swenson --- .../multi_instance_data_parallel.py | 58 +++++++++++++++++++ tests/tokenization/test_detokenize.py | 3 +- tests/v1/engine/test_engine_core.py | 1 + tests/v1/engine/test_engine_core_client.py | 1 + tests/v1/engine/test_output_processor.py | 5 ++ vllm/engine/async_llm_engine.py | 12 +++- vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/async_llm.py | 5 +- vllm/v1/engine/core_client.py | 14 ++++- vllm/v1/engine/processor.py | 2 + 10 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 examples/online_serving/multi_instance_data_parallel.py diff --git a/examples/online_serving/multi_instance_data_parallel.py b/examples/online_serving/multi_instance_data_parallel.py new file mode 100644 index 000000000..62b1ec71a --- /dev/null +++ b/examples/online_serving/multi_instance_data_parallel.py @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: Apache-2.0 +import asyncio +from typing import Optional + +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.outputs import RequestOutput +from vllm.sampling_params import SamplingParams + +""" +To run this example, run the following commands simultaneously with +different CUDA_VISIBLE_DEVICES: + python examples/online_serving/multi_instance_data_parallel.py + + vllm serve ibm-research/PowerMoE-3b -dp 2 -dpr 1 \ + --data-parallel-address 127.0.0.1 --data-parallel-rpc-port 62300 \ + --data-parallel-size-local 1 --enforce-eager --headless + +Once both instances have completed the handshake, this example will +send a request to the instance with DP rank 1. 
+""" + + +async def main(): + engine_args = AsyncEngineArgs( + model="ibm-research/PowerMoE-3b", + data_parallel_size=2, + dtype="auto", + max_model_len=2048, + data_parallel_address="127.0.0.1", + data_parallel_rpc_port=62300, + data_parallel_size_local=1, + enforce_eager=True, + ) + + engine_client = AsyncLLMEngine.from_engine_args(engine_args) + + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.9, + max_tokens=100, + ) + + prompt = "Who won the 2004 World Series?" + final_output: Optional[RequestOutput] = None + async for output in engine_client.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id="abcdef", + data_parallel_rank=1, + ): + final_output = output + if final_output: + print(final_output.outputs[0].text) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index b289dc972..9f2414eca 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -70,7 +70,8 @@ def _run_incremental_decode(tokenizer, None, 0.0, None, - cache_salt=None) + cache_salt=None, + data_parallel_rank=None) if fast is None: detokenizer = IncrementalDetokenizer.from_new_request( diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 3d7632a60..1cbbf3037 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -42,6 +42,7 @@ def make_request() -> EngineCoreRequest: arrival_time=time.time(), lora_request=None, cache_salt=None, + data_parallel_rank=None, ) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 47181d36f..c2dc3b473 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -56,6 +56,7 @@ def make_request( arrival_time=time.time(), lora_request=None, cache_salt=None, + data_parallel_rank=None, ) diff --git 
a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index a83454ee6..6b88b0cf1 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -59,6 +59,7 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -406,6 +407,7 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -569,6 +571,7 @@ def test_stop_token(include_stop_str_in_output: bool, eos_token_id=eos_token_id, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -666,6 +669,7 @@ def test_stop_string(include_stop_str_in_output: bool, eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams( skip_special_tokens=False, spaces_between_special_tokens=False, @@ -780,6 +784,7 @@ def test_iteration_stats(dummy_test_vectors): eos_token_id=None, lora_request=None, cache_salt=None, + data_parallel_rank=None, sampling_params=SamplingParams(), ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 6d8d97cf5..59971f5d6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -442,6 +442,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> None: ... 
@@ -456,6 +457,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> None: ... @@ -473,6 +475,7 @@ class _AsyncLLMEngine(LLMEngine): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: @@ -902,6 +905,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, PoolingRequestOutput], None]]: ... @@ -917,6 +921,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, PoolingRequestOutput], None]]: ... 
@@ -935,6 +940,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: @@ -967,6 +973,7 @@ class AsyncLLMEngine(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ) return stream.generator() @@ -980,6 +987,7 @@ class AsyncLLMEngine(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -999,7 +1007,8 @@ class AsyncLLMEngine(EngineClient): for generation, if any. priority: The priority of the request. Only applicable with priority scheduling. - + data_parallel_rank: The (global) data parallel rank that must + handle this request. Only applicable if DP is enabled. Yields: The output `RequestOutput` objects from the LLMEngine for the request. @@ -1057,6 +1066,7 @@ class AsyncLLMEngine(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ): yield LLMEngine.validate_output(output, RequestOutput) except asyncio.CancelledError: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index d1bec2523..59463f1ba 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -55,6 +55,7 @@ class EngineCoreRequest( arrival_time: float lora_request: Optional[LoRARequest] cache_salt: Optional[str] + data_parallel_rank: Optional[int] # Index of the client, used to ensure outputs are sent back to the same # client for this request when scaling out the front-end. 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0e3696321..61ea3c4c3 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -229,6 +229,7 @@ class AsyncLLM(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> RequestOutputCollector: """Add new request to the AsyncLLM.""" @@ -245,7 +246,7 @@ class AsyncLLM(EngineClient): prompt_str, request = self.processor.process_inputs( request_id, prompt, params, arrival_time, lora_request, tokenization_kwargs, trace_headers, prompt_adapter_request, - priority) + priority, data_parallel_rank) if params.n == 1: await self._add_request(request, prompt_str, None, 0, queue) @@ -291,6 +292,7 @@ class AsyncLLM(EngineClient): trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> AsyncGenerator[RequestOutput, None]: """ Main function called by the API server to kick off a request @@ -321,6 +323,7 @@ class AsyncLLM(EngineClient): trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, priority=priority, + data_parallel_rank=data_parallel_rank, ) # The output_handler task pushes items into the queue. 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index adb0709c8..0cd58d01d 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -982,7 +982,16 @@ class DPAsyncMPClient(AsyncMPClient): resources.stats_update_task = asyncio.create_task( run_engine_stats_update_task()) - def get_core_engine_for_request(self) -> CoreEngine: + def get_core_engine_for_request(self, + dp_rank: Optional[int] = None + ) -> CoreEngine: + if dp_rank is not None: + # engines are already in rank order + if dp_rank < 0 or dp_rank >= len(self.core_engines): + raise ValueError(f"Requested DP rank {dp_rank} is out of " + f"range [0, {len(self.core_engines)})") + return self.core_engines[dp_rank] + if not self.lb_engines: return self.core_engines[0] # TODO use P2C alg for larger DP sizes @@ -1018,7 +1027,8 @@ class DPAsyncMPClient(AsyncMPClient): request.current_wave = self.current_wave request.client_index = self.client_index - chosen_engine = self.get_core_engine_for_request() + chosen_engine = self.get_core_engine_for_request( + request.data_parallel_rank) self.reqs_in_flight[request.request_id] = chosen_engine to_await = self._send_input(EngineCoreRequestType.ADD, request, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c0d01d9b..546fc98d6 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -212,6 +212,7 @@ class Processor: trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + data_parallel_rank: Optional[int] = None, ) -> tuple[Optional[str], EngineCoreRequest]: # TODO(woosuk): Support pooling models. 
@@ -328,6 +329,7 @@ class Processor: arrival_time=arrival_time, lora_request=lora_request, cache_salt=decoder_inputs.get("cache_salt"), + data_parallel_rank=data_parallel_rank, ) def _validate_model_inputs(self, -- GitLab From d459fae0a2c464e28680bc6d564c1de1b295029e Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 4 Jun 2025 11:39:23 -0400 Subject: [PATCH 172/274] [Bugfix][EP+DP] Fix internode check (#19112) Signed-off-by: Tyler Michael Smith --- vllm/distributed/device_communicators/all2all.py | 6 ------ .../device_communicators/base_device_communicator.py | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 2ab3779ec..cab2496bf 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -84,10 +84,6 @@ class PPLXAll2AllManager(All2AllManagerBase): assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." 
# noqa super().__init__(cpu_group) - # TODO(tms): Disable pplx-a2a intranode as it fails with the error: - # failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa - self.internode = True - if self.internode: # inter-node communication needs nvshmem, # intra-node communication uses p2p mapping directly @@ -178,7 +174,6 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase): num_rdma_bytes = 1024 * 1024 * 1024 num_qps_per_rank = self.num_sms // 2 else: - assert self.intranode num_rdma_bytes = 0 num_qps_per_rank = 1 @@ -243,7 +238,6 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase): if self.internode: num_rdma_bytes = 1024 * 1024 * 1024 else: - assert self.intranode num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint( num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank, hidden=token_hidden_size, diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index 38370d4dc..1bc2d8e02 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -49,8 +49,7 @@ class All2AllManagerBase: # all2all communication often has separate implementations for # intra-node and inter-node communication - self.intranode = in_the_same_node_as(cpu_group, source_rank=0) - self.internode = not self.intranode + self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0)) def get_handle(self, kwargs): # get a handle for the all2all communication, -- GitLab From 53a5a0ce30dd623808ebd02947e5183f918b6c2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 4 Jun 2025 13:46:28 -0400 Subject: [PATCH 173/274] [Perf] Tunings for SM100 FP8 CUTLASS kernel (#18778) Signed-off-by: mgoin --- .../c3x/scaled_mm_sm100_fp8_dispatch.cuh | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git 
a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh index 468b77d95..6da2da634 100644 --- a/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh +++ b/csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh @@ -15,6 +15,7 @@ using c3x::cutlass_gemm_caller; template typename Epilogue> struct sm100_fp8_config_default { + // M in (128, inf) static_assert(std::is_same()); using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; @@ -25,6 +26,34 @@ struct sm100_fp8_config_default { KernelSchedule, EpilogueSchedule>; }; +template typename Epilogue> +struct sm100_fp8_config_M128 { + // M in (64, 128] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_128, _128, _64>; + using ClusterShape = Shape<_2, _2, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + +template typename Epilogue> +struct sm100_fp8_config_M64 { + // M in [1, 64] + static_assert(std::is_same()); + using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto; + using EpilogueSchedule = cutlass::epilogue::collective::EpilogueScheduleAuto; + using TileShape = Shape<_64, _64, _256>; + using ClusterShape = Shape<_1, _8, _1>; + using Cutlass3xGemm = + cutlass_3x_gemm_sm100; +}; + template typename Epilogue, typename... 
EpilogueArgs> @@ -39,8 +68,28 @@ inline void cutlass_gemm_sm100_fp8_dispatch(torch::Tensor& out, using Cutlass3xGemmDefault = typename sm100_fp8_config_default::Cutlass3xGemm; - return cutlass_gemm_caller( - out, a, b, std::forward(args)...); + using Cutlass3xGemmM64 = + typename sm100_fp8_config_M64::Cutlass3xGemm; + using Cutlass3xGemmM128 = + typename sm100_fp8_config_M128::Cutlass3xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(64), next_pow_2(m)); // next power of 2 + + if (mp2 <= 64) { + // m in [1, 64] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // m in (64, 128] + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + // m in (128, inf) + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } } template