Unverified Commit 31b25f65 authored by Didier Durand's avatar Didier Durand Committed by GitHub
Browse files
parent abb34ac4
...@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo: ...@@ -112,7 +112,7 @@ def parse_from_filename(file: str) -> WheelFileInfo:
def generate_project_list(subdir_names: list[str], comment: str = "") -> str: def generate_project_list(subdir_names: list[str], comment: str = "") -> str:
""" """
Generate project list HTML content linking to each project & variant sub-directory. Generate project list HTML content linking to each project & variant subdirectory.
""" """
href_tags = [] href_tags = []
for name in sorted(subdir_names): for name in sorted(subdir_names):
...@@ -168,23 +168,23 @@ def generate_index_and_metadata( ...@@ -168,23 +168,23 @@ def generate_index_and_metadata(
comment (str | None): Optional comment to include in the generated HTML files. comment (str | None): Optional comment to include in the generated HTML files.
First, parse all wheel files to extract metadata. First, parse all wheel files to extract metadata.
We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory). We need to collect all wheel files for each variant, and generate an index for it (in a subdirectory).
The index for the default variant (if any) is generated in the root index directory. The index for the default variant (if any) is generated in the root index directory.
If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
is purely a copy of the corresponding variant index, with only the links adjusted. is purely a copy of the corresponding variant index, with only the links adjusted.
Otherwise, all wheels without variant suffixes are treated as the default variant. Otherwise, all wheels without variant suffixes are treated as the default variant.
If `alias_to_default` is provided, an additional alias sub-directory is created, it has the same content If `alias_to_default` is provided, an additional alias subdirectory is created, it has the same content
as the default variant index, but the links are adjusted accordingly. as the default variant index, but the links are adjusted accordingly.
Index directory structure: Index directory structure:
index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/) index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories index.html # project list, linking to "vllm/" and other packages, and all variant subdirectories
vllm/ vllm/
index.html # package index, pointing to actual files in wheel_base_dir (relative path) index.html # package index, pointing to actual files in wheel_base_dir (relative path)
metadata.json # machine-readable metadata for all wheels in this package metadata.json # machine-readable metadata for all wheels in this package
cpu/ # cpu variant sub-directory cpu/ # cpu variant subdirectory
index.html index.html
vllm/ vllm/
index.html index.html
...@@ -194,7 +194,7 @@ def generate_index_and_metadata( ...@@ -194,7 +194,7 @@ def generate_index_and_metadata(
vllm/ vllm/
index.html index.html
metadata.json metadata.json
cu130/ # cu130 variant sub-directory cu130/ # cu130 variant subdirectory
index.html index.html
vllm/ vllm/
index.html index.html
......
...@@ -116,7 +116,7 @@ class Dequantizer4b { ...@@ -116,7 +116,7 @@ class Dequantizer4b {
scalar_vec_t output_vec_0(wb_0); scalar_vec_t output_vec_0(wb_0);
scalar_vec_t output_vec_1(wb_1); scalar_vec_t output_vec_1(wb_1);
// AMX needs to interlave K elements to pack as 32 bits // AMX needs to interleave K elements to pack as 32 bits
if constexpr (isa == ISA::AMX) { if constexpr (isa == ISA::AMX) {
vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight); vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
} else { } else {
......
...@@ -292,7 +292,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -292,7 +292,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor " "value_cache, Tensor(a3!) output, Tensor query_start_loc, Tensor "
"seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt " "seq_lens, float scale, bool causal, Tensor? alibi_slopes, SymInt "
"sliding_window_left, SymInt sliding_window_right, Tensor block_table, " "sliding_window_left, SymInt sliding_window_right, Tensor block_table, "
"float softcap, Tensor sheduler_metadata, Tensor? s_aux) -> ()", "float softcap, Tensor scheduler_metadata, Tensor? s_aux) -> ()",
&cpu_attention_with_kv_cache); &cpu_attention_with_kv_cache);
// placeholders // placeholders
......
...@@ -28,7 +28,7 @@ Furthermore, vLLM decides whether to enable or disable a `CustomOp` based on `co ...@@ -28,7 +28,7 @@ Furthermore, vLLM decides whether to enable or disable a `CustomOp` based on `co
!!! note !!! note
Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`. Note that `all` and `none` cannot coexist in `compilation_config.custom_ops`.
By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended into `compilation_config.custom_ops`, otherwise a `all` will be appended. In other words, this means `CustomOp` will be disabled in some platforms (i.e., those use `inductor` as dafault backend for `torch.compile`) when running with torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops. By default, if `compilation_config.backend == "inductor"` and `compilation_config.mode != CompilationMode.NONE`, a `none` will be appended to `compilation_config.custom_ops`; otherwise an `all` will be appended. In other words, `CustomOp` will be disabled on some platforms (i.e., those that use `inductor` as the default backend for `torch.compile`) when running with torch compile mode. In this case, Inductor generates (fused) Triton kernels for those disabled custom ops.
!!! note !!! note
For multi-modal models, vLLM has enforced the enabling of some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass an `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to force-enable it at the object level. For multi-modal models, vLLM has enforced the enabling of some custom ops to use device-specific deep-optimized kernels for better performance in ViT part, such as `MMEncoderAttention` and `ApplyRotaryEmb`. We can also pass an `enforce_enable=True` param to the `__init__()` method of the `CustomOp` to force-enable it at the object level.
......
...@@ -211,7 +211,7 @@ LLM(model, compilation_config=CompilationConfig( ...@@ -211,7 +211,7 @@ LLM(model, compilation_config=CompilationConfig(
These modes are stricter and reduce or eliminate the need of dynamic shapes guarding, which can help isolate issues: These modes are stricter and reduce or eliminate the need of dynamic shapes guarding, which can help isolate issues:
- `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added - `unbacked`: Uses unbacked symints which don't allow guards, making it easier to identify where guards are being incorrectly added
- `backed_size_oblivious`: Uses a mode that is more strict about guarding. - `backed_size_oblivious`: Uses a mode that is stricter about guarding.
For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping). For more details on dynamic shapes modes, see [Dynamic shapes and vLLM guard dropping](torch_compile.md#dynamic-shapes-and-vllm-guard-dropping).
......
...@@ -100,7 +100,7 @@ Every plugin has three parts: ...@@ -100,7 +100,7 @@ Every plugin has three parts:
- `_enum`: This property is the device enumeration from [PlatformEnum][vllm.platforms.interface.PlatformEnum]. Usually, it should be `PlatformEnum.OOT`, which means the platform is out-of-tree. - `_enum`: This property is the device enumeration from [PlatformEnum][vllm.platforms.interface.PlatformEnum]. Usually, it should be `PlatformEnum.OOT`, which means the platform is out-of-tree.
- `device_type`: This property should return the type of the device which pytorch uses. For example, `"cpu"`, `"cuda"`, etc. - `device_type`: This property should return the type of the device which pytorch uses. For example, `"cpu"`, `"cuda"`, etc.
- `device_name`: This property is set the same as `device_type` usually. It's mainly used for logging purposes. - `device_name`: This property is set the same as `device_type` usually. It's mainly used for logging purposes.
- `check_and_update_config`: This function is called very early in the vLLM's initialization process. It's used for plugins to update the vllm configuration. For example, the block size, graph mode config, etc, can be updated in this function. The most important thing is that the **worker_cls** should be set in this function to let vLLM know which worker class to use for the worker process. - `check_and_update_config`: This function is called very early in the vLLM's initialization process. It's used for plugins to update the vllm configuration. For example, the block size, graph mode config, etc., can be updated in this function. The most important thing is that the **worker_cls** should be set in this function to let vLLM know which worker class to use for the worker process.
- `get_attn_backend_cls`: This function should return the attention backend class's fully qualified name. - `get_attn_backend_cls`: This function should return the attention backend class's fully qualified name.
- `get_device_communicator_cls`: This function should return the device communicator class's fully qualified name. - `get_device_communicator_cls`: This function should return the device communicator class's fully qualified name.
...@@ -126,7 +126,7 @@ Every plugin has three parts: ...@@ -126,7 +126,7 @@ Every plugin has three parts:
5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations. 5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.v1.attention.backend.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations.
6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports: 6. Implement custom ops for high performance. Most ops can be run by the pytorch native implementation, but the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, vLLM supports the following kinds of custom ops:
- pytorch ops - pytorch ops
there are 3 kinds of pytorch ops: there are 3 kinds of pytorch ops:
......
...@@ -327,7 +327,7 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{ ...@@ -327,7 +327,7 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
}' }'
``` ```
Due to limitations in the the output schema, the output consists of a list of Due to limitations in the output schema, the output consists of a list of
token scores for each token for each input. This means that you'll have to call token scores for each token for each input. This means that you'll have to call
`/tokenize` as well to be able to pair tokens with scores. `/tokenize` as well to be able to pair tokens with scores.
Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how Refer to the tests in `tests/models/language/pooling/test_bge_m3.py` to see how
......
...@@ -9,7 +9,7 @@ Context parallel mainly solves the problem of serving long context requests. As ...@@ -9,7 +9,7 @@ Context parallel mainly solves the problem of serving long context requests. As
During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors. During prefill, for a long request with `T` new tokens, we need to compute query/key/value tensors for these new tokens. Say we have `N` GPUs, we can split the request into `N` chunks, and each GPU computes one chunk of the query/key/value tensors.
Depending on the use case, there're two possible strategies: Depending on the use case, there are two possible strategies:
1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk. 1. Partial query, full key/value: If the request token length is moderately long (we can afford holding the full key/value tensors), and the goal is to accelerate the prefill (and amortize the computation time of the prefill across query tokens), then we can gather the key/value tensors from all GPUs and let each GPU compute the attention output corresponding to the query tokens of its chunk.
2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk. 2. Partial query, partial key/value: If the request token length is too long, we cannot afford holding the full key/value tensors anymore, then we can only compute one chunk of query/key/value tensors for each GPU, and use techniques like [ring-attention](http://arxiv.org/abs/2310.01889) to send/recv key/value tensors chunk by chunk.
......
...@@ -693,7 +693,7 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -693,7 +693,7 @@ class OpenAIServingResponses(OpenAIServing):
except ValueError as e: except ValueError as e:
return self.create_error_response(e) return self.create_error_response(e)
# NOTE: Implementation of stauts is still WIP, but for now # NOTE: Implementation of status is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate. # we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now. # "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed" status: ResponseStatus = "completed"
......
...@@ -30,7 +30,7 @@ class SharedFusedMoE(FusedMoE): ...@@ -30,7 +30,7 @@ class SharedFusedMoE(FusedMoE):
# Disable shared expert overlap if: # Disable shared expert overlap if:
# - we are using eplb with non-default backend, because of correctness issues # - we are using eplb with non-default backend, because of correctness issues
# - we are using flashinfer with DP, since there nothint to gain # - we are using flashinfer with DP, since there is nothing to gain
# - we are using marlin kernels # - we are using marlin kernels
backend = self.moe_parallel_config.all2all_backend backend = self.moe_parallel_config.all2all_backend
self.use_overlapped = ( self.use_overlapped = (
......
...@@ -22,7 +22,7 @@ class ScaledMMLinearLayerConfig: ...@@ -22,7 +22,7 @@ class ScaledMMLinearLayerConfig:
@dataclass @dataclass
class Int8ScaledMMLinearLayerConfig(ScaledMMLinearLayerConfig): class Int8ScaledMMLinearLayerConfig(ScaledMMLinearLayerConfig):
# TODO: Chnage to QuantKey like FP8ScaledMMLinearLayerConfig # TODO: Change to QuantKey like FP8ScaledMMLinearLayerConfig
is_static_input_scheme: bool is_static_input_scheme: bool
is_channelwise: bool is_channelwise: bool
input_symmetric: bool input_symmetric: bool
......
...@@ -119,7 +119,7 @@ def choose_scaled_mm_linear_kernel( ...@@ -119,7 +119,7 @@ def choose_scaled_mm_linear_kernel(
config (_KernelConfigT): Description of the linear layer config (_KernelConfigT): Description of the linear layer
to be implemented. to be implemented.
possible_kernels (dict[PlatformEnum, list[_KernelT]]): A possible_kernels (dict[PlatformEnum, list[_KernelT]]): A
dictionary of platforms and their list list of possible kernels. dictionary of platforms and their list of possible kernels.
compute_capability (Optional[int], optional): The compute capability of compute_capability (Optional[int], optional): The compute capability of
the target device, if None uses `current_platform` to get the the target device, if None uses `current_platform` to get the
compute capability. Defaults to None. compute capability. Defaults to None.
......
...@@ -278,7 +278,7 @@ def find_loaded_library(lib_name: str) -> str | None: ...@@ -278,7 +278,7 @@ def find_loaded_library(lib_name: str) -> str | None:
According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
the file `/proc/self/maps` contains the memory maps of the process, which includes the the file `/proc/self/maps` contains the memory maps of the process, which includes the
shared libraries loaded by the process. We can use this file to find the path of the shared libraries loaded by the process. We can use this file to find the path of the
a loaded library. loaded library.
""" # noqa """ # noqa
found_line = None found_line = None
with open("/proc/self/maps") as f: with open("/proc/self/maps") as f:
......
...@@ -1159,7 +1159,7 @@ class Scheduler(SchedulerInterface): ...@@ -1159,7 +1159,7 @@ class Scheduler(SchedulerInterface):
break break
# Calculate the number of embeddings to schedule in the current range # Calculate the number of embeddings to schedule in the current range
# of scheduled encoder placholder tokens. # of scheduled encoder placeholder tokens.
start_idx_rel = max(0, num_computed_tokens - start_pos) start_idx_rel = max(0, num_computed_tokens - start_pos)
end_idx_rel = min( end_idx_rel = min(
num_encoder_tokens, num_computed_tokens + num_new_tokens - start_pos num_encoder_tokens, num_computed_tokens + num_new_tokens - start_pos
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment