Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -65,12 +65,7 @@ struct cutlass_sparse_3x_gemm {
typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
float>::type;
using EpilogueDescriptor =
cutlass::epilogue::collective::detail::EpilogueDescriptor<
TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
ElementD, EpilogueSchedule>;
using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
using Epilogue = Epilogue_<ElementAcc, ElementD, TileShape>;
using ElementC = void;
using LayoutC = cutlass::layout::RowMajor;
......
......@@ -58,7 +58,9 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a,
// Guard against compilation issues for sm90 kernels
#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
if (version_num >= 90) {
// We build for 9.0a which is not forward compatible, so restrict this to
// Hopper only
if (version_num == 90) {
cutlass_scaled_sparse_mm_sm90(c, a, bt_nzs, bt_meta, a_scales, b_scales,
bias);
return;
......@@ -82,7 +84,9 @@ std::vector<torch::Tensor> cutlass_sparse_compress(torch::Tensor const& a) {
// Guard against compilation issues for sm90 kernels
#if defined ENABLE_SPARSE_SCALED_MM_C3X && ENABLE_SPARSE_SCALED_MM_C3X
if (version_num >= 90) {
// We build for 9.0a which is not forward compatible, so restrict this to
// Hopper only
if (version_num == 90) {
std::vector<torch::Tensor> result_tensors;
auto [a_meta, a_nzs] = cutlass_sparse_compress_sm90(a);
......
......@@ -4,6 +4,7 @@
#include "core/registration.h"
#include <torch/library.h>
#include <torch/version.h>
// Note on op signatures:
// The X_meta signatures are for the meta functions corresponding to op X.
......@@ -17,6 +18,15 @@
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
//
// The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
// to override this for many GEMMs with the following tag. Otherwise,
// torch.compile will force all input tensors to be contiguous(), which
// will break many custom ops that require column-major weight matrices.
// TODO: remove this for PyTorch 2.8, when the default is planned to switch
// to match exact eager-mode strides.
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
......@@ -353,25 +363,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def(
"aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
"Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
"-> Tensor");
"-> Tensor",
{stride_tag});
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
// Decompression method for AQLM.
ops.def(
"aqlm_dequant(Tensor codes, Tensor codebooks, "
"int[] codebook_partition_sizes) -> Tensor");
"int[] codebook_partition_sizes) -> Tensor",
{stride_tag});
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
// Quantized GEMM for AWQ.
ops.def(
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, SymInt split_k_iters) -> Tensor");
"Tensor _zeros, SymInt split_k_iters) -> Tensor",
{stride_tag});
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
// Dequantization for AWQ.
ops.def(
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor");
"Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor",
{stride_tag});
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
// Note about marlin kernel 'workspace' arguments:
......@@ -392,7 +406,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def(
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> "
"Tensor");
"Tensor",
{stride_tag});
// conditionally compiled so impl in source file
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
......@@ -400,7 +415,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
"Tensor b_scales, Tensor workspace, "
"int b_q_type, "
"SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor");
"SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor",
{stride_tag});
// conditionally compiled so impl in source file
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
......@@ -426,7 +442,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor? channel_scales,"
" Tensor? token_scales,"
" str? schedule"
") -> Tensor");
") -> Tensor",
{stride_tag});
ops.def(
"machete_prepack_B("
" Tensor B,"
......@@ -445,7 +462,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
"int b_q_type, "
"SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
"bool has_zp, bool use_atomic_add, bool use_fp32_reduce, "
"bool is_zp_float) -> Tensor",
{stride_tag});
// conditionally compiled so impl registration is in source file
// gptq_marlin repack from GPTQ.
......@@ -476,12 +495,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
// moe kernel for GGML.
ops.def(
"ggml_moe_a8(Tensor X, Tensor W, "
"Tensor sorted_token_ids, Tensor expert_ids, Tensor "
"num_tokens_post_padded, "
"int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8);
ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);
#ifndef USE_ROCM
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def(
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
"SymInt size_k) -> Tensor");
"SymInt size_k) -> Tensor",
{stride_tag});
// conditionally compiled so impl registration is in source file
// marlin_qqq_gemm for QQQ.
......@@ -489,15 +519,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
"Tensor! workspace, SymInt size_m, SymInt size_n, "
"SymInt size_k) -> Tensor");
"SymInt size_k) -> Tensor",
{stride_tag});
// conditionally compiled so impl registration is in source file
// CUTLASS nvfp4 block scaled GEMM
ops.def(
"cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
" Tensor block_scale_a, Tensor block_scale_b,"
" Tensor alpha) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias
ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
" Tensor b_scales, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
// CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
......@@ -506,7 +546,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()");
" Tensor? azp, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);
// Check if cutlass scaled_mm is supported for CUDA devices of the given
......@@ -519,7 +560,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
"bool");
ops.impl("cutlass_scaled_mm_supports_block_fp8",
&cutlass_scaled_mm_supports_fp8);
&cutlass_scaled_mm_supports_block_fp8);
// Check if cutlass sparse scaled_mm is supported for CUDA devices of the
// given capability
......@@ -534,7 +575,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
" Tensor bt_nzs,"
" Tensor bt_meta, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
" Tensor b_scales, Tensor? bias) -> ()",
{stride_tag});
ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);
// CUTLASS sparse matrix compressor
......@@ -582,15 +624,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! output_scale, Tensor input_scale) -> ()");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
// Check if cutlass_scaled_mm_fp4 is supported for CUDA devices
// of the given capability
ops.def("cutlass_scaled_mm_supports_fp4(int cuda_device_capability) -> bool");
ops.impl("cutlass_scaled_mm_supports_fp4", &cutlass_scaled_mm_supports_fp4);
#endif
// Quantized GEMM for GPTQ.
// Note: even though the C++ inferred schema is correct for this op, it seems
// to prevent the meta function registry.
// ops.def(
// "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
// "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
// "-> Tensor");
// "-> Tensor",
// {stride_tag});
// ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
// Post processing for GPTQ.
......@@ -630,6 +678,25 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor!? azp) -> ()");
ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
&dynamic_scaled_int8_quant);
#ifndef USE_ROCM
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
ops.def(
"rearrange_kn_weight_as_n32k16_order(Tensor b_qweight, Tensor b_scales, "
"Tensor? b_zeros, "
"bool has_zp, Tensor! b_qweight_reorder, Tensor! b_scales_reorder, "
"Tensor!? b_zeros_reorder, "
"int K, int N, int N_32align) -> ()");
// conditionally compiled so impl in source file
// AllSpark quantization ops
ops.def(
"allspark_w8a16_gemm(Tensor a, Tensor b_qweight, Tensor b_scales, "
"Tensor? b_qzeros, "
"SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt "
"CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor");
// conditionally compiled so impl in source file
#endif
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
......@@ -699,6 +766,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
"str kv_cache_dtype) -> ()");
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
// Gather cache blocks from src_cache to dst.
cache_ops.def(
"gather_cache(Tensor src_cache, Tensor! dst, Tensor block_table, "
"Tensor cu_seq_lens, int batch_size, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_cache", torch::kCUDA, &gather_cache);
}
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
......@@ -715,7 +788,6 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
&get_max_shared_memory_per_block_device_attribute);
}
#ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels
custom_ar.def(
......@@ -733,7 +805,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.def("register_buffer", &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers);
custom_ar.def("allocate_shared_buffer_and_handle",
&allocate_shared_buffer_and_handle);
custom_ar.def("open_mem_handle(Tensor mem_handle) -> int", &open_mem_handle);
custom_ar.impl("open_mem_handle", torch::kCPU, &open_mem_handle);
custom_ar.def("free_shared_buffer", &free_shared_buffer);
}
#endif
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
......@@ -4,7 +4,7 @@
```bash
# Install dependencies.
pip install -r requirements-docs.txt
pip install -r ../requirements/docs.txt
# Build the docs.
make clean
......
......@@ -4,6 +4,8 @@
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)
- [The ninth vLLM meetup](https://lu.ma/h7g3kuj9), with Meta, February 27th 2025. [[Slides]](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing)
- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing)
- [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. [[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing)
- [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing)
......
......@@ -17,7 +17,6 @@ import inspect
import logging
import os
import sys
from typing import List
import requests
from sphinx.ext import autodoc
......@@ -58,7 +57,7 @@ templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns: List[str] = ["**/*.template.md", "**/*.inc.md"]
exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]
# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
......@@ -86,6 +85,7 @@ html_static_path = ["_static"]
html_js_files = ["custom.js"]
html_css_files = ["custom.css"]
myst_heading_anchors = 2
myst_url_schemes = {
'http': None,
'https': None,
......
......@@ -74,8 +74,6 @@ def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
...
```
......
......@@ -16,8 +16,6 @@ Further update the model as follows:
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
+ pixel_values: torch.Tensor,
) -> SamplerOutput:
```
......@@ -36,7 +34,8 @@ Further update the model as follows:
image_features = self.vision_encoder(image_input)
return self.multi_modal_projector(image_features)
def get_multimodal_embeddings(self, **kwargs: object) -> Optional[NestedTensors]:
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
# Validate the multimodal input keyword arguments
image_input = self._parse_and_validate_image_input(**kwargs)
......@@ -63,7 +62,7 @@ Further update the model as follows:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor:
# `get_input_embeddings` should already be implemented for the language
......@@ -722,13 +721,13 @@ def _get_mm_fields_config(
:::::
### Prompt replacements
### Prompt updates
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` to
return a list of {class}`~vllm.multimodal.processing.PromptReplacement` instances.
Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.
Each {class}`~vllm.multimodal.processing.PromptReplacement` instance specifies a find-and-replace
operation performed by the HF processor.
Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
(e.g.: insertion, replacement) performed by the HF processor.
::::{tab-set}
:::{tab-item} Basic example: LLaVA
......@@ -745,15 +744,15 @@ for sample in text:
```
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_replacements` as follows:
Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:
```python
def _get_prompt_replacements(
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
......@@ -861,7 +860,7 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
)
```
To accommodate this, instead of a string you can return an instance of `PromptReplacementDetails`
To accommodate this, instead of a string you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`
with different `full` and `feature` attributes:
```python
......@@ -880,7 +879,7 @@ def get_replacement_fuyu(item_idx: int):
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptReplacementDetails(
return PromptUpdateDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
)
......@@ -890,12 +889,12 @@ Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the
we can search for it to conduct the replacement at the start of the string:
```python
def _get_prompt_replacements(
def _get_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id
assert isinstance(bos_token_id, int)
......@@ -915,7 +914,7 @@ def _get_prompt_replacements(
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
[_NEWLINE_TOKEN_ID]) * nrows
return PromptReplacementDetails(
return PromptUpdateDetails(
full=image_tokens + [bos_token_id],
features=image_tokens,
)
......@@ -950,3 +949,35 @@ to register them to the multi-modal registry:
+ dummy_inputs=YourDummyInputsBuilder)
class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
```
## Notes
### Inserting feature tokens without replacement
Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
Examples:
- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
### Handling prompt updates unrelated to multi-modal data
{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing).
Examples:
- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
### Custom HF processor
Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`.
Examples:
- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
......@@ -23,7 +23,7 @@ Check out the [building from source](#build-from-source) documentation for detai
## Testing
```bash
pip install -r requirements-dev.txt
pip install -r requirements/dev.txt
# Linting, formatting and static type checking
pre-commit install --hook-type pre-commit --hook-type commit-msg
......@@ -145,6 +145,9 @@ review process:
- Please respond to all comments within a reasonable time frame. If a comment
isn't clear or you disagree with a suggestion, feel free to ask for
clarification or discuss the suggestion.
- Note that not all CI checks will be executed due to limited computational
resources. The reviewer will add `ready` label to the PR when the PR is
ready to merge or a full CI run is needed.
## Thank You
......
......@@ -4,6 +4,8 @@
Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
:::
## Profile with PyTorch Profiler
We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`
The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set.
......@@ -22,13 +24,13 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve
`export VLLM_RPC_TIMEOUT=1800000`
:::
## Example commands and usage
### Example commands and usage
### Offline Inference
#### Offline Inference
Refer to <gh-file:examples/offline_inference/simple_profiling.py> for an example.
### OpenAI Server
#### OpenAI Server
```bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
......@@ -39,3 +41,135 @@ benchmark_serving.py:
```bash
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
```
## Profile with NVIDIA Nsight Systems
Nsight systems is an advanced tool that exposes more profiling details, such as register and shared memory usage, annotated code regions and low-level CUDA APIs and events.
[Install nsight-systems](https://docs.nvidia.com/nsight-systems/InstallationGuide/index.html) using your package manager.
The following block is an example for Ubuntu.
```bash
apt update
apt install -y --no-install-recommends gnupg
echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
apt update
apt install nsight-systems-cli
```
### Example commands and usage
#### Offline Inference
For basic usage, you can just append `nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node` before any existing script you would run for offline inference.
The following is an example using the `benchmarks/benchmark_latency.py` script:
```bash
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node python benchmarks/benchmark_latency.py --model meta-llama/Llama-3.1-8B-Instruct --num-iters-warmup 5 --num-iters 1 --batch-size 16 --input-len 512 --output-len 8
```
#### OpenAI Server
To profile the server, you will want to prepend your `vllm serve` command with `nsys profile` just like for offline inference, however you must specify `--delay XX --duration YY` parameters according to the needs of your benchmark. After the duration time has been used up, the server will be killed.
```bash
# server
nsys profile -o report.nsys-rep --trace-fork-before-exec=true --cuda-graph-trace=node --delay 30 --duration 60 vllm serve meta-llama/Llama-3.1-8B-Instruct
# client
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Llama-3.1-8B-Instruct --num-prompts 1 --dataset-name random --random-input 1024 --random-output 512
```
In practice, you should set the `--duration` argument to a large value. Whenever you want the server to stop profiling, run:
```
nsys sessions list
```
to get the session id in the form of `profile-XXXXX`, then run:
```
nsys stop --session=profile-XXXXX
```
to manually kill the profiler and generate your `nsys-rep` report.
#### Analysis
You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).
CLI example:
```bash
nsys stats report1.nsys-rep
...
** CUDA GPU Kernel Summary (cuda_gpu_kern_sum):
Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Name
-------- --------------- --------- ----------- ----------- -------- --------- ----------- ----------------------------------------------------------------------------------------------------
46.3 10,327,352,338 17,505 589,965.9 144,383.0 27,040 3,126,460 944,263.8 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize128x128x64_warpgroupsize1x1x1_execute_segment_k_of…
14.8 3,305,114,764 5,152 641,520.7 293,408.0 287,296 2,822,716 867,124.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize256x128x64_warpgroupsize2x1x1_execute_segment_k_of…
12.1 2,692,284,876 14,280 188,535.4 83,904.0 19,328 2,862,237 497,999.9 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x128x64_warpgroupsize1x1x1_execute_segment_k_off…
9.5 2,116,600,578 33,920 62,399.8 21,504.0 15,326 2,532,285 290,954.1 sm90_xmma_gemm_bf16bf16_bf16f32_f32_tn_n_tilesize64x64x64_warpgroupsize1x1x1_execute_segment_k_off_…
5.0 1,119,749,165 18,912 59,208.4 9,056.0 6,784 2,578,366 271,581.7 void vllm::act_and_mul_kernel<c10::BFloat16, &vllm::silu_kernel<c10::BFloat16>, (bool)1>(T1 *, cons…
4.1 916,662,515 21,312 43,011.6 19,776.0 8,928 2,586,205 199,790.1 void cutlass::device_kernel<flash::enable_sm90_or_later<flash::FlashAttnFwdSm90<flash::CollectiveMa…
2.6 587,283,113 37,824 15,526.7 3,008.0 2,719 2,517,756 139,091.1 std::enable_if<T2>(int)0&&vllm::_typeConvert<T1>::exists, void>::type vllm::fused_add_rms_norm_kern…
1.9 418,362,605 18,912 22,121.5 3,871.0 3,328 2,523,870 175,248.2 void vllm::rotary_embedding_kernel<c10::BFloat16, (bool)1>(const long *, T1 *, T1 *, const T1 *, in
0.7 167,083,069 18,880 8,849.7 2,240.0 1,471 2,499,996 101,436.1 void vllm::reshape_and_cache_flash_kernel<__nv_bfloat16, __nv_bfloat16, (vllm::Fp8KVCacheDataType)0…
...
```
GUI example:
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
## Profiling vLLM Python Code
The Python standard library includes
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
used to profile a section of code.
### Example usage - decorator
The first helper is a Python decorator that can be used to profile a function.
If a filename is specified, the profile will be saved to that file. If no filename is
specified, profile data will be printed to stdout.
```python
import vllm.utils
@vllm.utils.cprofile("expensive_function.prof")
def expensive_function():
# some expensive code
pass
```
### Example Usage - context manager
The second helper is a context manager that can be used to profile a block of
code. Similar to the decorator, the filename is optional.
```python
import vllm.utils
def another_function():
# more expensive code
pass
with vllm.utils.cprofile_context("another_function.prof"):
another_function()
```
### Analyzing Profile Results
There are multiple tools available that can help analyze the profile results.
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
```bash
pip install snakeviz
snakeviz expensive_function.prof
```
......@@ -37,7 +37,7 @@ you may contact the following individuals:
## Slack Discussion
You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai)
You may use the `#security` channel in the [vLLM Slack](https://slack.vllm.ai)
to discuss security-related topics. However, please do not disclose any
vulnerabilities in this channel. If you need to report a vulnerability, please
use the GitHub security advisory system or contact a VMT member privately.
......
......@@ -27,6 +27,36 @@ container to access the host's shared memory. vLLM uses PyTorch, which uses shar
memory to share data between processes under the hood, particularly for tensor parallel inference.
:::
:::{note}
Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.8.0
# e.g. install the `audio` and `video` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install vllm[audio,video]==0.8.0
```
:::
:::{tip}
Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
To use the development version of `transformers`, create a custom Dockerfile on top of the base image
with an extra layer that installs their code from source:
```Dockerfile
FROM vllm/vllm-openai:latest
RUN uv pip install git+https://github.com/huggingface/transformers.git
```
:::
(deployment-docker-build-image-from-source)=
## Building vLLM's Docker Image from Source
......
......@@ -4,9 +4,9 @@
A Helm chart to deploy vLLM for Kubernetes
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLMm Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variables values.
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm install and documentation on architecture and values file.
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
## Prerequisites
......
......@@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on
deploying vLLM on Kubernetes using LWS.
## Prerequisites
* At least two Kubernetes nodes, each with 8 GPUs, are required.
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
## Deploy and Serve
Deploy the following yaml file `lws.yaml`
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
spec:
replicas: 2
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
image: docker.io/vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
```
```bash
kubectl apply -f lws.yaml
```
Verify the status of the pods:
```bash
kubectl get pods
```
Should get an output similar to this:
```bash
NAME READY STATUS RESTARTS AGE
vllm-0 1/1 Running 0 2s
vllm-0-1 1/1 Running 0 2s
vllm-1 1/1 Running 0 2s
vllm-1-1 1/1 Running 0 2s
```
Verify that the distributed tensor-parallel inference works:
```bash
kubectl logs vllm-0 |grep -i "Loading model weights took"
```
Should get something similar to this:
```text
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
```
## Access ClusterIP service
```bash
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
kubectl port-forward svc/vllm-leader 8080:8080
```
The output should be similar to the following:
```text
Forwarding from 127.0.0.1:8080 -> 8080
Forwarding from [::1]:8080 -> 8080
```
## Serve the model
Open another terminal and send a request
```text
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
```
The output should be similar to the following
```text
{
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```
......@@ -6,4 +6,6 @@
kserve
kubeai
llamastack
llmaz
production-stack
:::
(deployment-llmaz)=
# llmaz
[llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.
Please refer to the [Quick Start](https://github.com/InftyAI/llmaz?tab=readme-ov-file#quick-start) for more details.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment