Commit eefa41c1 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.18.0

parent 82155c76
...@@ -10,8 +10,6 @@ from transformers import AutoConfig ...@@ -10,8 +10,6 @@ from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
_moe_permute,
_moe_unpermute_and_reduce,
moe_permute, moe_permute,
moe_unpermute, moe_unpermute,
) )
...@@ -41,7 +39,6 @@ def benchmark_permute( ...@@ -41,7 +39,6 @@ def benchmark_permute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
...@@ -113,11 +110,9 @@ def benchmark_unpermute( ...@@ -113,11 +110,9 @@ def benchmark_unpermute(
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
num_iters: int = 100, num_iters: int = 100,
use_customized_permute: bool = False,
) -> float: ) -> float:
# init_dtype = torch.float16 if use_fp8_w8a8 else dtype # init_dtype = torch.float16 if use_fp8_w8a8 else dtype
hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype) hidden_states = torch.randn(num_tokens, hidden_size, dtype=dtype)
output_hidden_states = torch.empty_like(hidden_states)
if use_fp8_w8a8: if use_fp8_w8a8:
qhidden_states, scale = _fp8_quantize(hidden_states, None, None) qhidden_states, scale = _fp8_quantize(hidden_states, None, None)
else: else:
...@@ -151,36 +146,15 @@ def benchmark_unpermute( ...@@ -151,36 +146,15 @@ def benchmark_unpermute(
) )
def run(input: tuple): def run(input: tuple):
if use_customized_permute: (permuted_hidden_states, first_token_off, inv_perm_idx) = input
( output = torch.empty_like(hidden_states)
permuted_hidden_states, moe_unpermute(
first_token_off, output,
inv_perm_idx, permuted_hidden_states,
m_indices, topk_weights,
) = input inv_perm_idx,
output = torch.empty_like(hidden_states) first_token_off,
moe_unpermute( )
output,
permuted_hidden_states,
topk_weights,
inv_perm_idx,
first_token_off,
)
else:
(
permuted_hidden_states,
a1q_scale,
sorted_token_ids,
expert_ids,
inv_perm,
) = input
_moe_unpermute_and_reduce(
output_hidden_states,
permuted_hidden_states,
inv_perm,
topk_weights,
True,
)
# JIT compilation & warmup # JIT compilation & warmup
input = prepare() input = prepare()
...@@ -235,8 +209,7 @@ class BenchmarkWorker: ...@@ -235,8 +209,7 @@ class BenchmarkWorker:
dtype: torch.dtype, dtype: torch.dtype,
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
use_customized_permute: bool = False, ) -> tuple[float, float]:
) -> tuple[dict[str, int], float]:
set_random_seed(self.seed) set_random_seed(self.seed)
permute_time = benchmark_permute( permute_time = benchmark_permute(
...@@ -248,7 +221,6 @@ class BenchmarkWorker: ...@@ -248,7 +221,6 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
unpermute_time = benchmark_unpermute( unpermute_time = benchmark_unpermute(
num_tokens, num_tokens,
...@@ -259,7 +231,6 @@ class BenchmarkWorker: ...@@ -259,7 +231,6 @@ class BenchmarkWorker:
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
use_customized_permute=use_customized_permute,
) )
return permute_time, unpermute_time return permute_time, unpermute_time
...@@ -306,7 +277,6 @@ def main(args: argparse.Namespace): ...@@ -306,7 +277,6 @@ def main(args: argparse.Namespace):
dtype = torch.float16 if current_platform.is_rocm() else config.dtype dtype = torch.float16 if current_platform.is_rocm() else config.dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8" use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16" use_int8_w8a16 = args.dtype == "int8_w8a16"
use_customized_permute = args.use_customized_permute
if args.batch_size is None: if args.batch_size is None:
batch_sizes = [ batch_sizes = [
...@@ -358,7 +328,6 @@ def main(args: argparse.Namespace): ...@@ -358,7 +328,6 @@ def main(args: argparse.Namespace):
dtype, dtype,
use_fp8_w8a8, use_fp8_w8a8,
use_int8_w8a16, use_int8_w8a16,
use_customized_permute,
) )
for batch_size in batch_sizes for batch_size in batch_sizes
], ],
...@@ -378,7 +347,6 @@ if __name__ == "__main__": ...@@ -378,7 +347,6 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto" "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
) )
parser.add_argument("--use-customized-permute", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--trust-remote-code", action="store_true") parser.add_argument("--trust-remote-code", action="store_true")
......
...@@ -26,6 +26,12 @@ ...@@ -26,6 +26,12 @@
typedef __hip_bfloat16 __nv_bfloat16; typedef __hip_bfloat16 __nv_bfloat16;
#endif #endif
#if defined(__gfx942__)
constexpr float kFp8ScaleDivisor = 224.f;
#else
constexpr float kFp8ScaleDivisor = 448.f;
#endif
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
int64_t block_size_in_bytes, int64_t block_size_in_bytes,
const torch::Tensor& block_mapping) { const torch::Tensor& block_mapping) {
...@@ -403,8 +409,7 @@ __global__ void concat_and_cache_ds_mla_kernel( ...@@ -403,8 +409,7 @@ __global__ void concat_and_cache_ds_mla_kernel(
} }
// Compute the scale for the tile // Compute the scale for the tile
float tile_scale = max_abs / 448.f; float tile_scale = fmaxf(max_abs / kFp8ScaleDivisor, FLT_MIN);
tile_scale = fmaxf(tile_scale, FLT_MIN);
// The first lane of each half-warp writes the scale to kv_cache // The first lane of each half-warp writes the scale to kv_cache
if ((lane_idx == 0) || (lane_idx == 16)) { if ((lane_idx == 0) || (lane_idx == 16)) {
...@@ -473,11 +478,7 @@ __global__ void indexer_k_quant_and_cache_kernel( ...@@ -473,11 +478,7 @@ __global__ void indexer_k_quant_and_cache_kernel(
#endif #endif
} }
#if defined(__gfx942__) float scale = fmaxf(amax, 1e-4) / kFp8ScaleDivisor;
float scale = fmaxf(amax, 1e-4) / 224.0f;
#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
#endif
if (use_ue8m0) { if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale))); scale = exp2f(ceilf(log2f(scale)));
......
...@@ -358,13 +358,14 @@ void onednn_scaled_mm( ...@@ -358,13 +358,14 @@ void onednn_scaled_mm(
const std::optional<torch::Tensor>& azp, // [M] or [1] const std::optional<torch::Tensor>& azp, // [M] or [1]
const std::optional<torch::Tensor>& azp_adj, // [M] or [1] const std::optional<torch::Tensor>& azp_adj, // [M] or [1]
const std::optional<torch::Tensor>& bias, // [N] const std::optional<torch::Tensor>& bias, // [N]
int64_t handler) { const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_scaled_mm) CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.is_contiguous()); TORCH_CHECK(a.is_contiguous());
TORCH_CHECK(c.is_contiguous()); TORCH_CHECK(c.is_contiguous());
W8A8MatMulPrimitiveHandler* ptr = W8A8MatMulPrimitiveHandler* ptr =
reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler); reinterpret_cast<W8A8MatMulPrimitiveHandler*>(
handler_tensor.item<int64_t>());
const int32_t* azp_ptr = nullptr; const int32_t* azp_ptr = nullptr;
if (azp.has_value()) { if (azp.has_value()) {
azp_ptr = azp->data_ptr<int32_t>(); azp_ptr = azp->data_ptr<int32_t>();
...@@ -517,13 +518,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b, ...@@ -517,13 +518,14 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b,
void onednn_mm(torch::Tensor& c, // [M, OC], row-major void onednn_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major const torch::Tensor& a, // [M, IC], row-major
const std::optional<torch::Tensor>& bias, int64_t handler) { const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor) {
CPU_KERNEL_GUARD_IN(onednn_mm) CPU_KERNEL_GUARD_IN(onednn_mm)
TORCH_CHECK(a.dim() == 2); TORCH_CHECK(a.dim() == 2);
TORCH_CHECK(a.stride(-1) == 1); TORCH_CHECK(a.stride(-1) == 1);
TORCH_CHECK(c.stride(-1) == 1); TORCH_CHECK(c.stride(-1) == 1);
MatMulPrimitiveHandler* ptr = MatMulPrimitiveHandler* ptr =
reinterpret_cast<MatMulPrimitiveHandler*>(handler); reinterpret_cast<MatMulPrimitiveHandler*>(handler_tensor.item<int64_t>());
// ACL matmuls expect contiguous source tensors // ACL matmuls expect contiguous source tensors
#ifdef VLLM_USE_ACL #ifdef VLLM_USE_ACL
...@@ -565,4 +567,4 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major ...@@ -565,4 +567,4 @@ void onednn_mm(torch::Tensor& c, // [M, OC], row-major
ptr->execute(exec_args); ptr->execute(exec_args);
}); });
} }
\ No newline at end of file
...@@ -23,13 +23,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a, ...@@ -23,13 +23,14 @@ void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& azp, const std::optional<torch::Tensor>& azp,
const std::optional<torch::Tensor>& azp_adj, const std::optional<torch::Tensor>& azp_adj,
const std::optional<torch::Tensor>& bias, const std::optional<torch::Tensor>& bias,
int64_t handler); const torch::Tensor& handler_tensor);
int64_t create_onednn_mm_handler(const torch::Tensor& b, int64_t create_onednn_mm_handler(const torch::Tensor& b,
int64_t primitive_cache_size); int64_t primitive_cache_size);
void onednn_mm(torch::Tensor& c, const torch::Tensor& a, void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
const std::optional<torch::Tensor>& bias, int64_t handler); const std::optional<torch::Tensor>& bias,
const torch::Tensor& handler_tensor);
bool is_onednn_acl_supported(); bool is_onednn_acl_supported();
...@@ -200,7 +201,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -200,7 +201,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN GEMM // oneDNN GEMM
ops.def( ops.def(
"onednn_mm(Tensor! c, Tensor a, Tensor? bias, " "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
"int handler) -> ()"); "Tensor handler_tensor) -> ()");
ops.impl("onednn_mm", torch::kCPU, &onednn_mm); ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
// Check if oneDNN was built with ACL backend // Check if oneDNN was built with ACL backend
...@@ -216,7 +217,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -216,7 +217,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// oneDNN scaled_mm for W8A8 with static per-tensor activation quantization // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
ops.def( ops.def(
"onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, " "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
"Tensor? azp_adj, Tensor? bias, int handler) -> ()"); "Tensor? azp_adj, Tensor? bias, Tensor handler_tensor) -> ()");
ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm); ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
...@@ -335,4 +336,4 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -335,4 +336,4 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache); ops.impl("mla_decode_kvcache", torch::kCPU, &mla_decode_kvcache);
} }
REGISTER_EXTENSION(TORCH_EXTENSION_NAME) REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
\ No newline at end of file
...@@ -58,6 +58,11 @@ You can tune the performance by adjusting `max_num_batched_tokens`: ...@@ -58,6 +58,11 @@ You can tune the performance by adjusting `max_num_batched_tokens`:
- For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs. - For optimal throughput, we recommend setting `max_num_batched_tokens > 8192` especially for smaller models on large GPUs.
- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes). - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the V0 default scheduling policy (except that it still prioritizes decodes).
!!! warning
When chunked prefill is disabled, `max_num_batched_tokens` must be greater than or equal to `max_model_len`.
In that case, if `max_num_batched_tokens < max_model_len`, vLLM may crash at server startup.
```python ```python
from vllm import LLM from vllm import LLM
......
...@@ -71,7 +71,7 @@ class MyModel(nn.Module): ...@@ -71,7 +71,7 @@ class MyModel(nn.Module):
```python ```python
def forward( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor | None,
positions: torch.Tensor, positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None, intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None, inputs_embeds: torch.Tensor | None = None,
......
...@@ -43,28 +43,73 @@ Further update the model as follows: ...@@ -43,28 +43,73 @@ Further update the model as follows:
) )
``` ```
- Implement [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs. - Remove the embedding part from the [forward][torch.nn.Module.forward] method:
- Move the multi-modal embedding to [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal].
- The text embedding and embedding merge are handled automatically by a default implementation of [embed_input_ids][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids]. It does not need to be overridden in most cases.
```diff
def forward(
self,
input_ids: torch.Tensor | None,
- pixel_values: torch.Tensor,
positions: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
) -> torch.Tensor:
- if inputs_embeds is None:
- inputs_embeds = self.get_input_embeddings()(input_ids)
-
- if pixel_values is not None:
- image_features = self.get_image_features(
- pixel_values=pixel_values,
- )
- special_image_mask = self.get_placeholder_mask(
- input_ids,
- inputs_embeds=inputs_embeds,
- image_features=image_features,
- )
- inputs_embeds = inputs_embeds.masked_scatter(
- special_image_mask,
- image_features,
- )
hidden_states = self.language_model(
input_ids,
positions,
intermediate_tensors,
inputs_embeds=inputs_embeds,
)
...
+ def embed_multimodal(
+ self,
+ pixel_values: torch.Tensor,
+ ) -> MultiModalEmbeddings | None:
+ return self.get_image_features(
+ pixel_values=pixel_values,
+ )
```
??? code Below we provide a boilerplate of a typical implementation pattern of [embed_multimodal][vllm.model_executor.models.interfaces.SupportsMultiModal.embed_multimodal], but feel free to adjust it to your own needs.
```python ```python
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor: def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
image_features = self.vision_encoder(image_input) image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def embed_multimodal( def embed_multimodal(
self, self,
**kwargs: object, **kwargs: object,
) -> MultiModalEmbeddings | None: ) -> MultiModalEmbeddings | None:
# Validate the multimodal input keyword arguments # Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return None
# Run multimodal inputs through encoder and projector # Run multimodal inputs through encoder and projector
vision_embeddings = self._process_image_input(image_input) vision_embeddings = self._process_image_input(image_input)
return vision_embeddings return vision_embeddings
``` ```
!!! important !!! important
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
......
...@@ -10,7 +10,7 @@ receives a request for a LoRA adapter that hasn't been loaded yet, the resolver ...@@ -10,7 +10,7 @@ receives a request for a LoRA adapter that hasn't been loaded yet, the resolver
to locate and load the adapter from their configured storage locations. This enables: to locate and load the adapter from their configured storage locations. This enables:
- **Dynamic LoRA Loading**: Load adapters on-demand without server restarts - **Dynamic LoRA Loading**: Load adapters on-demand without server restarts
- **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, but custom resolvers can be implemented to fetch from any source. - **Multiple Storage Backends**: Support for filesystem, S3, and custom backends. The built-in `lora_filesystem_resolver` requires a local storage path, while the built-in `hf_hub_resolver` will pull LoRA adapters from Huggingface Hub and proceed in an identical manner. In general, custom resolvers can be implemented to fetch from any source.
- **Automatic Discovery**: Seamless integration with existing LoRA workflows - **Automatic Discovery**: Seamless integration with existing LoRA workflows
- **Scalable Deployment**: Centralized adapter management across multiple vLLM instances - **Scalable Deployment**: Centralized adapter management across multiple vLLM instances
......
...@@ -160,10 +160,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap ...@@ -160,10 +160,12 @@ Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adap
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds. You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory.](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers) You can either install existing plugins or implement your own. By default, vLLM comes with a [resolver plugin to load LoRA adapters from a local directory, as well as a resolver plugin to load LoRA adapters from repositories on Hugging Face Hub](https://github.com/vllm-project/vllm/tree/main/vllm/plugins/lora_resolvers)
To enable this resolver, set `VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True, set `VLLM_PLUGINS` to include `lora_filesystem_resolver`, and then set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`, To enable either of these resolvers, you must `set VLLM_ALLOW_RUNTIME_LORA_UPDATING` to True.
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and
that adapter will then be available for normal use on the server. - To leverage a local directory, set `VLLM_PLUGINS` to include `lora_filesystem_resolver` and set `VLLM_LORA_RESOLVER_CACHE_DIR` to a local directory. When vLLM receives a request using a LoRA adapter `foobar`,
it will first look in the local directory for a directory `foobar`, and attempt to load the contents of that directory as a LoRA adapter. If successful, the request will complete as normal and that adapter will then be available for normal use on the server.
- To leverage repositories on Hugging Face Hub, set `VLLM_PLUGINS` to include `lora_hf_hub_resolver` and set `VLLM_LORA_RESOLVER_HF_REPO_LIST` to a comma-separated list of repository IDs on Hugging Face Hub. When vLLM receives a request for the LoRA adapter `my/repo/subpath`, it will download the adapter at the `subpath` of `my/repo` if it exists and contains an `adapter_config.json`, then build a request pointing to the cached directory for the adapter, similar to the `lora_filesystem_resolver`. Please note that enabling remote downloads is insecure and not intended for use in production environments.
Alternatively, follow these example steps to implement your own plugin: Alternatively, follow these example steps to implement your own plugin:
......
...@@ -213,10 +213,19 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con ...@@ -213,10 +213,19 @@ Support use case: Prefill with 'HND' and decode with 'NHD' with experimental con
--kv-transfer-config '{..., "enable_permute_local_kv":"True"}' --kv-transfer-config '{..., "enable_permute_local_kv":"True"}'
``` ```
### Cross layers blocks
This feature is disabled by default. When it is enabled on an attention backend that supports it, each logical block is contiguous in physical memory, which reduces the number of buffers that need to be transferred.
To enable this feature:
```bash
--kv-transfer-config '{..., "kv_connector_extra_config": {"enable_cross_layers_blocks": "True"}}'
```
## Example Scripts/Code ## Example Scripts/Code
Refer to these example scripts in the vLLM repository: Refer to these example scripts in the vLLM repository:
- [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh) - [run_accuracy_test.sh](../../tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh)
- [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py) - [toy_proxy_server.py](../../tests/v1/kv_connector/nixl_integration/toy_proxy_server.py)
- [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py) - [test_accuracy.py](../../tests/v1/kv_connector/nixl_integration/test_accuracy.py)
\ No newline at end of file
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
This example shows how to use vLLM for running offline inference This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only). with the correct prompt format on Qwen3-Omni (thinker only).
""" """
from typing import NamedTuple from typing import NamedTuple
...@@ -112,23 +112,51 @@ def get_multi_audios_query() -> QueryResult: ...@@ -112,23 +112,51 @@ def get_multi_audios_query() -> QueryResult:
) )
def get_multi_images_query() -> QueryResult:
    """Build the two-image comparison query.

    The prompt carries one vision placeholder per image, followed by a
    question asking for the differences between the two pictures. The images
    are the ``cherry_blossom`` and ``stop_sign`` assets converted to RGB.

    Returns:
        A ``QueryResult`` holding the prompt, the image list, and a
        per-prompt image limit equal to the number of images supplied.
    """
    question = "What are the differences between these two images?"
    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"

    # Load every asset the same way so the placeholder count, the image list
    # and the multimodal limit cannot drift apart.
    asset_names = ("cherry_blossom", "stop_sign")
    images = [
        convert_image_mode(ImageAsset(asset_name).pil_image, "RGB")
        for asset_name in asset_names
    ]

    prompt_parts = [
        f"<|im_start|>system\n{default_system}<|im_end|>\n",
        "<|im_start|>user\n",
        image_placeholder * len(images),
        f"{question}<|im_end|>\n",
        "<|im_start|>assistant\n",
    ]
    inputs = {
        "prompt": "".join(prompt_parts),
        "multi_modal_data": {"image": images},
    }
    return QueryResult(
        inputs=inputs,
        limit_mm_per_prompt={"image": len(images)},
    )
query_map = { query_map = {
"mixed_modalities": get_mixed_modalities_query, "mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query, "use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query, "multi_audios": get_multi_audios_query,
"multi_images": get_multi_images_query,
} }
def main(args): def main(args):
model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct" model_name = args.model
query_result = query_map[args.query_type]() query_result = query_map[args.query_type]()
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=12800, max_model_len=args.max_model_len,
max_num_seqs=5, max_num_seqs=5,
limit_mm_per_prompt=query_result.limit_mm_per_prompt, limit_mm_per_prompt=query_result.limit_mm_per_prompt,
seed=args.seed, seed=args.seed,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
) )
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
...@@ -161,6 +189,31 @@ def parse_args(): ...@@ -161,6 +189,31 @@ def parse_args():
default=0, default=0,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
parser.add_argument(
"--model",
type=str,
default="Qwen/Qwen3-Omni-30B-A3B-Instruct",
help="Model name or path.",
)
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=1,
help="Tensor parallel size for distributed inference.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.9,
help="GPU memory utilization (0.0 to 1.0).",
)
parser.add_argument(
"--max-model-len",
type=int,
default=12800,
help="Maximum model context length.",
)
return parser.parse_args() return parser.parse_args()
......
...@@ -609,6 +609,42 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData: ...@@ -609,6 +609,42 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
) )
# GLM-OCR
def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the GLM-OCR example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. It selects the
            multimodal placeholder inserted before each question and is the
            key used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Validate before building anything: an unknown modality used to leave
    # `placeholder` unbound and fail later with an opaque UnboundLocalError.
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality for GLM-OCR: {modality!r}; "
            f"expected one of {sorted(placeholders)}"
        )
    placeholder = placeholders[modality]

    model_name = "zai-org/GLM-OCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# H2OVL-Mississippi # H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -2022,6 +2058,32 @@ def run_step3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -2022,6 +2058,32 @@ def run_step3(questions: list[str], modality: str) -> ModelRequestData:
) )
# StepVL10B
def run_step_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the Step3-VL-10B example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` with the engine arguments and the prompts.
    """
    assert modality == "image"

    engine_args = EngineArgs(
        model="stepfun-ai/Step3-VL-10B",
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
        reasoning_parser="deepseek_r1",
    )

    # Every prompt shares the same system/user header and assistant trailer;
    # only the question placed after the image patch token varies.
    header = "<|begin▁of▁sentence|> You are a helpful assistant.<|BOT|>user\n "
    trailer = " <|EOT|><|BOT|>assistant\n<think>\n"
    prompts = []
    for question in questions:
        prompts.append(f"{header}<im_patch>{question}{trailer}")

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# omni-research/Tarsier-7b # omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData: def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -2096,6 +2158,7 @@ model_example_map = { ...@@ -2096,6 +2158,7 @@ model_example_map = {
"glm4_1v": run_glm4_1v, "glm4_1v": run_glm4_1v,
"glm4_5v": run_glm4_5v, "glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8, "glm4_5v_fp8": run_glm4_5v_fp8,
"glm_ocr": run_glm_ocr,
"h2ovl_chat": run_h2ovl, "h2ovl_chat": run_h2ovl,
"hunyuan_vl": run_hunyuan_vl, "hunyuan_vl": run_hunyuan_vl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision, "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
...@@ -2143,6 +2206,7 @@ model_example_map = { ...@@ -2143,6 +2206,7 @@ model_example_map = {
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm, "smolvlm": run_smolvlm,
"step3": run_step3, "step3": run_step3,
"stepvl": run_step_vl,
"tarsier": run_tarsier, "tarsier": run_tarsier,
"tarsier2": run_tarsier2, "tarsier2": run_tarsier2,
} }
...@@ -2150,6 +2214,7 @@ model_example_map = { ...@@ -2150,6 +2214,7 @@ model_example_map = {
MODELS_NEED_VIDEO_METADATA = [ MODELS_NEED_VIDEO_METADATA = [
"glm4_1v", "glm4_1v",
"glm_ocr",
"glm4_5v", "glm4_5v",
"glm4_5v_fp8", "glm4_5v_fp8",
"molmo2", "molmo2",
...@@ -2481,4 +2546,4 @@ def main(args): ...@@ -2481,4 +2546,4 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
main(args) main(args)
\ No newline at end of file
...@@ -1208,6 +1208,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -1208,6 +1208,32 @@ def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_step_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request for the Step3-VL-10B example.

    Args:
        question: The question appended after the image patch tokens.
        image_urls: URLs of the images to fetch; one ``<im_patch>`` token is
            emitted per URL and the per-prompt image limit is set to match.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the single prompt,
        and the fetched images.
    """
    num_images = len(image_urls)

    engine_args = EngineArgs(
        model="stepfun-ai/Step3-VL-10B",
        max_num_batched_tokens=4096,
        limit_mm_per_prompt={"image": num_images},
        hf_overrides={"vision_config": {"enable_patch": False}},
        trust_remote_code=True,
        reasoning_parser="deepseek_r1",
    )

    image_tokens = "<im_patch>" * num_images
    prompt = "".join(
        [
            "<|begin▁of▁sentence|> You are a helpful assistant.<|BOT|>user\n ",
            f"{image_tokens}{question}<|EOT|><|BOT|>",
            "assistant\n<think>\n",
        ]
    )

    # Fetch in the order given so each image lines up with its patch token.
    image_data = []
    for url in image_urls:
        image_data.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData: def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "omni-research/Tarsier-7b" model_name = "omni-research/Tarsier-7b"
...@@ -1437,6 +1463,7 @@ model_example_map = { ...@@ -1437,6 +1463,7 @@ model_example_map = {
"rvl": load_r_vl, "rvl": load_r_vl,
"smolvlm": load_smolvlm, "smolvlm": load_smolvlm,
"step3": load_step3, "step3": load_step3,
"stepvl": load_step_vl,
"tarsier": load_tarsier, "tarsier": load_tarsier,
"tarsier2": load_tarsier2, "tarsier2": load_tarsier2,
"glm4_1v": load_glm4_1v, "glm4_1v": load_glm4_1v,
...@@ -1606,4 +1633,4 @@ def main(args: Namespace): ...@@ -1606,4 +1633,4 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
main(args) main(args)
\ No newline at end of file
...@@ -157,6 +157,37 @@ VLLM_CONFIGURE_LOGGING=0 \ ...@@ -157,6 +157,37 @@ VLLM_CONFIGURE_LOGGING=0 \
vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
``` ```
### Example 4: Disable access logs for health check endpoints
In production environments, health check endpoints like `/health`, `/metrics`,
and `/ping` are frequently called by load balancers and monitoring systems,
generating a large volume of repetitive access logs. To reduce log noise while
keeping logs for other endpoints, use the `--disable-access-log-for-endpoints`
option.
**Disable access logs for health and metrics endpoints:**
```bash
vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048 \
--disable-access-log-for-endpoints /health,/metrics,/ping
```
**Common endpoints to consider filtering:**
| Endpoint | Description | Typical Caller |
| ---------- | ---------------------- | ---------------------------------------------------- |
| `/health` | Health check | Kubernetes liveness/readiness probes, load balancers |
| `/metrics` | Prometheus metrics | Prometheus scraper (every 15-60s) |
| `/ping` | SageMaker health check | SageMaker infrastructure |
| `/load` | Server load metrics | Custom monitoring |
**Notes:**
- This option only affects uvicorn access logs, not vLLM application logs
- Specify multiple endpoints by separating them with commas (no spaces)
- The filter uses exact path matching; query parameters are ignored (e.g., `/health?verbose=true` matches `/health`)
- If you need to completely disable all access logs, use `--disable-uvicorn-access-log` instead
## Additional resources ## Additional resources
- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details) - [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
\ No newline at end of file
...@@ -43,6 +43,7 @@ vllm = "vllm.entrypoints.cli.main:main" ...@@ -43,6 +43,7 @@ vllm = "vllm.entrypoints.cli.main:main"
[project.entry-points."vllm.general_plugins"] [project.entry-points."vllm.general_plugins"]
lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver" lora_filesystem_resolver = "vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver"
lora_hf_hub_resolver = "vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver"
[tool.setuptools_scm] [tool.setuptools_scm]
# no extra settings needed, presence enables setuptools-scm # no extra settings needed, presence enables setuptools-scm
......
...@@ -524,6 +524,7 @@ class MockModelConfig: ...@@ -524,6 +524,7 @@ class MockModelConfig:
tokenizer_revision = None tokenizer_revision = None
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig() hf_config = MockHFConfig()
hf_text_config = MockHFConfig()
logits_processors: list[str] | None = None logits_processors: list[str] | None = None
diff_sampling_param: dict | None = None diff_sampling_param: dict | None = None
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
...@@ -1928,4 +1929,4 @@ class TestCreateRemainingArgsDelta: ...@@ -1928,4 +1929,4 @@ class TestCreateRemainingArgsDelta:
assert tc.id == "call_nofunc" assert tc.id == "call_nofunc"
assert tc.type == "function" assert tc.type == "function"
assert tc.function.name is None assert tc.function.name is None
assert tc.function.arguments == '{"data": "value"}' assert tc.function.arguments == '{"data": "value"}'
\ No newline at end of file
...@@ -43,7 +43,6 @@ from .mk_objects import ( ...@@ -43,7 +43,6 @@ from .mk_objects import (
TestMoEQuantConfig, TestMoEQuantConfig,
expert_info, expert_info,
make_fused_experts, make_fused_experts,
make_prepare_finalize,
prepare_finalize_info, prepare_finalize_info,
) )
from .parallel_utils import ProcessGroupInfo from .parallel_utils import ProcessGroupInfo
...@@ -586,10 +585,12 @@ def make_modular_kernel( ...@@ -586,10 +585,12 @@ def make_modular_kernel(
routing_method=RoutingMethodType.DeepSeekV3, routing_method=RoutingMethodType.DeepSeekV3,
) )
# make modular kernel prepare_finalize = maybe_make_prepare_finalize(
prepare_finalize = make_prepare_finalize( moe=moe,
config.prepare_finalize_type, config.all2all_backend(), moe, quant_config quant_config=quant_config,
allow_new_interface=True,
) )
assert prepare_finalize is not None
fused_experts = make_fused_experts( fused_experts = make_fused_experts(
config.fused_experts_type, config.fused_experts_type,
...@@ -673,4 +674,4 @@ def run_modular_kernel( ...@@ -673,4 +674,4 @@ def run_modular_kernel(
): ):
out = mk.apply(**mk_kwargs) out = mk.apply(**mk_kwargs)
return out return out
\ No newline at end of file
...@@ -7,9 +7,6 @@ import torch ...@@ -7,9 +7,6 @@ import torch
# Fused experts and PrepareFinalize imports # Fused experts and PrepareFinalize imports
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe import TritonExperts from vllm.model_executor.layers.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.all2all_utils import (
maybe_make_prepare_finalize,
)
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts, BatchedDeepGemmExperts,
) )
...@@ -435,23 +432,6 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe(): ...@@ -435,23 +432,6 @@ if cutlass_fp4_supported() or has_flashinfer_cutlass_fused_moe():
] ]
def make_prepare_finalize(
    prepare_finalize_type: mk.FusedMoEPrepareAndFinalize,
    backend: str | None,
    moe: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
) -> mk.FusedMoEPrepareAndFinalize:
    """Construct the prepare/finalize object for the given test setup.

    Selection order:
      1. If ``backend`` is set and is not ``"naive"``, delegate to
         ``maybe_make_prepare_finalize`` and require a non-None result.
      2. Otherwise, if ``prepare_finalize_type`` is
         ``FlashInferCutlassMoEPrepareAndFinalize``, build it via
         ``create_flashinfer_prepare_finalize``.
      3. Otherwise fall back to ``MoEPrepareAndFinalizeNoEP``.

    Args:
        prepare_finalize_type: Requested prepare/finalize class.
        backend: Backend name, or ``None``.
        moe: MoE configuration; its ``moe_parallel_config.dp_size`` is read
            on the FlashInfer path.
        quant_config: Quantization configuration forwarded on the
            backend path.

    Returns:
        The constructed prepare/finalize object.
    """
    has_non_naive_backend = backend is not None and backend != "naive"
    if has_non_naive_backend:
        made = maybe_make_prepare_finalize(moe, quant_config)
        assert made is not None
        return made

    if prepare_finalize_type == FlashInferCutlassMoEPrepareAndFinalize:
        # use_dp is enabled only when more than one data-parallel rank exists.
        multiple_dp_ranks = moe.moe_parallel_config.dp_size > 1
        return create_flashinfer_prepare_finalize(use_dp=multiple_dp_ranks)

    return MoEPrepareAndFinalizeNoEP()
def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor: def _slice(rank: int, num_local_experts: int, t: torch.Tensor) -> torch.Tensor:
s = rank * num_local_experts s = rank * num_local_experts
e = s + num_local_experts e = s + num_local_experts
...@@ -500,4 +480,4 @@ def make_fused_experts( ...@@ -500,4 +480,4 @@ def make_fused_experts(
torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80) torch.set_printoptions(threshold=1000, edgeitems=5, linewidth=80)
return experts return experts
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import numpy as np
import pytest import pytest
import torch import torch
from transformers import AutoModelForTokenClassification from transformers import AutoModelForTokenClassification
...@@ -40,7 +42,6 @@ def test_bert_like_models( ...@@ -40,7 +42,6 @@ def test_bert_like_models(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
...@@ -73,6 +74,7 @@ def test_bert_like_models( ...@@ -73,6 +74,7 @@ def test_bert_like_models(
@pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"]) @pytest.mark.parametrize("model", ["disham993/electrical-ner-ModernBERT-base"])
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.flaky(reruns=3)
@torch.inference_mode @torch.inference_mode
def test_modernbert_models( def test_modernbert_models(
hf_runner, hf_runner,
...@@ -81,6 +83,14 @@ def test_modernbert_models( ...@@ -81,6 +83,14 @@ def test_modernbert_models(
model: str, model: str,
dtype: str, dtype: str,
) -> None: ) -> None:
# NOTE: https://github.com/vllm-project/vllm/pull/32403
# `disham993/electrical-ner-ModernBERT-base` is a randomly initialized
# model, which can cause numerical precision variance and edge cases.
# We use @flaky(reruns=3) to mitigate intermittent failures.
print(
f"\n[NOTE] Testing {model} (randomly initialized weights) - "
"flaky tolerance enabled due to numerical precision variance."
)
with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.token_classify(example_prompts) vllm_outputs = vllm_model.token_classify(example_prompts)
...@@ -140,4 +150,4 @@ def test_auto_conversion( ...@@ -140,4 +150,4 @@ def test_auto_conversion(
for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
hf_output = hf_output.detach().clone().cpu().float() hf_output = hf_output.detach().clone().cpu().float()
vllm_output = vllm_output.detach().clone().cpu().float() vllm_output = vllm_output.detach().clone().cpu().float()
assert torch.allclose(hf_output, vllm_output, atol=1e-2) assert torch.allclose(hf_output, vllm_output, atol=1e-2)
\ No newline at end of file
...@@ -91,6 +91,19 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -91,6 +91,19 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor": True, "use_processor": True,
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm_ocr": {
"model_name": "zai-org/GLM-OCR",
"interface": "llm_generate",
"max_model_len": 131072,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "Text Recognition:",
},
"keye_vl": { "keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview", "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
"interface": "llm_generate", "interface": "llm_generate",
...@@ -427,4 +440,4 @@ def test_vit_backend_functionality( ...@@ -427,4 +440,4 @@ def test_vit_backend_functionality(
elif config["interface"] == "llm_generate": elif config["interface"] == "llm_generate":
run_llm_generate_test(config, mm_encoder_attn_backend, image_assets) run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
else: else:
raise ValueError(f"Unknown interface: {config['interface']}") raise ValueError(f"Unknown interface: {config['interface']}")
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment