Unverified commit 35bf1938, authored by Didier Durand and committed by GitHub
Browse files

[Doc]: fix typos in Python comments (#24294)


Signed-off-by: Didier Durand <durand.didier@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
parent 35efa702
...@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8): ...@@ -417,7 +417,7 @@ def create_sources(impl_configs: list[ImplConfig], num_impl_files=8):
)) ))
def prepacked_type_key(prepack_type: PrepackTypeConfig): def prepacked_type_key(prepack_type: PrepackTypeConfig):
# For now we we can just use the first accumulator type seen since # For now, we can just use the first accumulator type seen since
# the tensor core shapes/layouts don't vary based on accumulator # the tensor core shapes/layouts don't vary based on accumulator
# type so we can generate less code this way # type so we can generate less code this way
return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert) return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert)
......
...@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch ...@@ -180,7 +180,7 @@ Inference batch size is an important parameter for the performance. Larger batch
- Offline Inference: `256 * world_size` - Offline Inference: `256 * world_size`
- Online Serving: `128 * world_size` - Online Serving: `128 * world_size`
vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommend to use DP, TP and PP together if there are enough CPU sockets and memory nodes. vLLM CPU supports data parallel (DP), tensor parallel (TP) and pipeline parallel (PP) to leverage multiple CPU sockets and memory nodes. For more details of tuning DP, TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use DP, TP and PP together if there are enough CPU sockets and memory nodes.
### Which quantization configs does vLLM CPU support? ### Which quantization configs does vLLM CPU support?
......
...@@ -42,7 +42,7 @@ def run_test( ...@@ -42,7 +42,7 @@ def run_test(
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
vllm_embeddings: Optional[torch.Tensor] = None, vllm_embeddings: Optional[torch.Tensor] = None,
): ):
"""Modality agnostic test test executor for comparing HF/vLLM outputs.""" """Modality agnostic test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors # In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
......
...@@ -60,7 +60,7 @@ class CustomAllreduce: ...@@ -60,7 +60,7 @@ class CustomAllreduce:
group: the process group to work on. If None, it will use the group: the process group to work on. If None, it will use the
default process group. default process group.
device: the device to bind the CustomAllreduce to. If None, device: the device to bind the CustomAllreduce to. If None,
it will be bind to f"cuda:{local_rank}". it will be bound to f"cuda:{local_rank}".
It is the caller's responsibility to make sure each communicator It is the caller's responsibility to make sure each communicator
is bound to a unique device, and all communicators in this group is bound to a unique device, and all communicators in this group
are in the same node. are in the same node.
...@@ -158,7 +158,7 @@ class CustomAllreduce: ...@@ -158,7 +158,7 @@ class CustomAllreduce:
self.disabled = False self.disabled = False
# Buffers memory are owned by this Python class and passed to C++. # Buffers memory are owned by this Python class and passed to C++.
# Meta data composes of two parts: meta data for synchronization and a # Metadata is composed of two parts: metadata for synchronization and a
# temporary buffer for storing intermediate allreduce results. # temporary buffer for storing intermediate allreduce results.
self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size, self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size,
group=group, group=group,
......
...@@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser): ...@@ -35,7 +35,7 @@ class Internlm2ToolParser(ToolParser):
self, request: ChatCompletionRequest) -> ChatCompletionRequest: self, request: ChatCompletionRequest) -> ChatCompletionRequest:
if request.tools and request.tool_choice != 'none': if request.tools and request.tool_choice != 'none':
# do not skip special tokens because internlm use the special # do not skip special tokens because internlm use the special
# tokens to indicated the start and end of the tool calls # tokens to indicate the start and end of the tool calls
# information. # information.
request.skip_special_tokens = False request.skip_special_tokens = False
return request return request
...@@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser): ...@@ -60,8 +60,8 @@ class Internlm2ToolParser(ToolParser):
if '<|action_start|>' not in current_text: if '<|action_start|>' not in current_text:
self.position = len(current_text) self.position = len(current_text)
return DeltaMessage(content=delta_text) return DeltaMessage(content=delta_text)
# if the tool call is sended, return a empty delta message # if the tool call is sent, return an empty delta message
# to make sure the finish_reason will be send correctly. # to make sure the finish_reason will be sent correctly.
if self.current_tool_id > 0: if self.current_tool_id > 0:
return DeltaMessage(content='') return DeltaMessage(content='')
......
...@@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1064,7 +1064,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# vllm should use flashinfer fused allreduce. The variable should be a # vllm should use flashinfer fused allreduce. The variable should be a
# JSON with the following format: # JSON with the following format:
# { <world size>: <max size in mb> } # { <world size>: <max size in mb> }
# Unspecified world sizes will fallback to # Unspecified world sizes will fall back to
# { 2: 64, 4: 1, <everything else>: 0.5 } # { 2: 64, 4: 1, <everything else>: 0.5 }
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB": "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
lambda: json.loads(os.getenv( lambda: json.loads(os.getenv(
......
...@@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor, ...@@ -534,7 +534,7 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
EM = sorted_token_ids.size(0) EM = sorted_token_ids.size(0)
if A.size(0) < config["BLOCK_SIZE_M"]: if A.size(0) < config["BLOCK_SIZE_M"]:
# optimize for small batch_size. # optimize for small batch_size.
# We assume that top_ids of each token is unique, so # We assume that top_ids of each token is unique,
# so num_valid_experts <= batch_size <= BLOCK_SIZE_M, # so num_valid_experts <= batch_size <= BLOCK_SIZE_M,
# and we can skip some invalid blocks. # and we can skip some invalid blocks.
EM = min(sorted_token_ids.size(0), EM = min(sorted_token_ids.size(0),
......
...@@ -710,7 +710,7 @@ def determine_expert_map( ...@@ -710,7 +710,7 @@ def determine_expert_map(
# Create a tensor of size num_experts filled with -1 # Create a tensor of size num_experts filled with -1
expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32) expert_map = torch.full((global_num_experts, ), -1, dtype=torch.int32)
# Create a expert map for the local experts # Create an expert map for the local experts
start_idx = ep_rank * base_experts + min(ep_rank, remainder) start_idx = ep_rank * base_experts + min(ep_rank, remainder)
expert_map[start_idx:start_idx + local_num_experts] = torch.arange( expert_map[start_idx:start_idx + local_num_experts] = torch.arange(
0, local_num_experts, dtype=torch.int32) 0, local_num_experts, dtype=torch.int32)
...@@ -806,7 +806,7 @@ class FusedMoE(CustomOp): ...@@ -806,7 +806,7 @@ class FusedMoE(CustomOp):
self.global_num_experts = num_experts + num_redundant_experts self.global_num_experts = num_experts + num_redundant_experts
# we padding globally so EP buffer allocation works # we are padding globally so EP buffer allocation works
if quant_config and quant_config.get_name() == "mxfp4": if quant_config and quant_config.get_name() == "mxfp4":
from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501 from vllm.model_executor.layers.quantization.mxfp4 import ( # noqa: E501
should_use_flashinfer_mxfp4) should_use_flashinfer_mxfp4)
......
...@@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ...@@ -469,7 +469,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
) )
layer.register_parameter("w2_scales", w2_scales) layer.register_parameter("w2_scales", w2_scales)
set_weight_attrs(w2_scales, extra_weight_attrs) set_weight_attrs(w2_scales, extra_weight_attrs)
# dont shard the w2 scales when running act order # don't shard the w2 scales when running act order
set_weight_attrs(w2_scales, set_weight_attrs(w2_scales,
{"load_full_w2": self.quant_config.desc_act}) {"load_full_w2": self.quant_config.desc_act})
# up_proj scales # up_proj scales
...@@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): ...@@ -493,7 +493,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
) )
layer.register_parameter("w2_qzeros", w2_qzeros) layer.register_parameter("w2_qzeros", w2_qzeros)
set_weight_attrs(w2_qzeros, extra_weight_attrs) set_weight_attrs(w2_qzeros, extra_weight_attrs)
# dont shard the w2 scales when running act order # don't shard the w2 scales when running act order
set_weight_attrs(w2_qzeros, set_weight_attrs(w2_qzeros,
{"load_full_w2": self.quant_config.desc_act}) {"load_full_w2": self.quant_config.desc_act})
w13_g_idx = torch.nn.Parameter( w13_g_idx = torch.nn.Parameter(
......
...@@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl): ...@@ -687,7 +687,7 @@ class FlashInferImpl(AttentionImpl):
else: else:
raise ValueError(f"Unsupported output dtype: {output.dtype}") raise ValueError(f"Unsupported output dtype: {output.dtype}")
# TRTLLM attn kernel requires o scale to pass as a host scalar, # TRTLLM attn kernel requires the o scale to be passed as a host scalar,
# store the o scale as a host scalar in warmup run with cuda graph # store the o scale as a host scalar in warmup run with cuda graph
# not enabled # not enabled
if layer._o_scale_float is None: if layer._o_scale_float is None:
......
...@@ -439,7 +439,7 @@ class EngineCore: ...@@ -439,7 +439,7 @@ class EngineCore:
""" """
# Note on thread safety: no race condition. # Note on thread safety: no race condition.
# `mm_receiver_cache` is reset at the end of LLMEngine init, # `mm_receiver_cache` is reset at the end of LLMEngine init,
# and will only accessed in the input processing thread afterwards. # and will only be accessed in the input processing thread afterwards.
if self.mm_receiver_cache is not None and request.mm_features: if self.mm_receiver_cache is not None and request.mm_features:
request.mm_features = ( request.mm_features = (
self.mm_receiver_cache.get_and_update_features( self.mm_receiver_cache.get_and_update_features(
......
...@@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -2826,7 +2826,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# Disable cudagraph capturing globally, so any unexpected cudagraph # Disable cudagraph capturing globally, so any unexpected cudagraph
# capturing will be detected and raise an error after here. # capturing will be detected and raise an error after here.
# Note: We don't put it into graph_capture context manager because # Note: We don't put it into graph_capture context manager because
# we may doing lazy capturing in future that still allows capturing # we may do lazy capturing in future that still allows capturing
# after here. # after here.
set_cudagraph_capturing_enabled(False) set_cudagraph_capturing_enabled(False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment