Unverified commit 6a895197, authored by Jiayi Yan, committed by GitHub
Browse files

[Bugfix][CI] fix typos (#34934)


Signed-off-by: 1195343015 <1195343015@qq.com>
Signed-off-by: Jiayi Yan <66017932+1195343015@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8c760b6a
......@@ -3106,7 +3106,7 @@ def cpu_attn_get_scheduler_metadata(
isa: str,
enable_kv_split: bool,
) -> torch.Tensor:
sheduler_metadata = torch.ops._C.get_scheduler_metadata(
scheduler_metadata = torch.ops._C.get_scheduler_metadata(
num_reqs,
num_heads,
num_kv_heads,
......@@ -3119,7 +3119,7 @@ def cpu_attn_get_scheduler_metadata(
isa,
enable_kv_split,
)
return sheduler_metadata
return scheduler_metadata
def cpu_attn_reshape_and_cache(
......
......@@ -872,7 +872,7 @@ class CompilationConfig:
)
# Currently only the eager and inductor backends are supported
# for piecewise compilation. Custom backends are not suppported for
# for piecewise compilation. Custom backends are not supported for
# piecewise compilation. Update when more backends are supported.
if self.mode == CompilationMode.VLLM_COMPILE and self.backend not in [
"",
......
......@@ -59,7 +59,7 @@ class ObservabilityConfig:
enable_layerwise_nvtx_tracing: bool = False
"""Enable layerwise NVTX tracing. This traces the execution of each layer or
module in the model and attach informations such as input/output shapes to
module in the model and attach information such as input/output shapes to
nvtx range markers. Note that this doesn't work with CUDA graphs enabled."""
enable_mfu_metrics: bool = False
......
......@@ -592,7 +592,7 @@ class VllmConfig:
If the user configuration does not specify a value for a default field
and if the default field is still None after all user selections are
applied, then default values will be applied to the field. User speciied
applied, then default values will be applied to the field. User specified
fields will not be overridden by the default.
Args:
......
......@@ -44,7 +44,7 @@ class DefaultEplbPolicy(AbstractEplbPolicy):
rank_in_pack = np.zeros_like(pack_index, dtype=np.int64)
return pack_index, rank_in_pack
# Sort and get indices in decending order
# Sort and get indices in descending order
indices = np.argsort(-weight, axis=-1)
pack_index = np.full((num_layers, num_groups), -1, dtype=np.int64)
......
......@@ -129,7 +129,7 @@ class KVConnectorRole(enum.Enum):
class KVConnectorHandshakeMetadata(ABC): # noqa: B024
"""
Metadata used for out of band connector handshake between
P/D workers. This needs to serializeable.
P/D workers. This needs to be serializable.
"""
pass
......
......@@ -398,7 +398,7 @@ class ReqMeta:
)
def need_gpu_interm_buffer(lmcache_config: LMCacheEngineConfig):
def need_gpu_interim_buffer(lmcache_config: LMCacheEngineConfig):
return not lmcache_config.enable_pd
......@@ -497,7 +497,7 @@ def _init_lmcache_engine(
use_mla,
)
use_gpu = need_gpu_interm_buffer(lmcache_config)
use_gpu = need_gpu_interim_buffer(lmcache_config)
vllm_gpu_connector: (
VLLMBufferLayerwiseGPUConnector
| VLLMPagedMemGPUConnectorV2
......
......@@ -481,7 +481,7 @@ class MooncakeConnectorWorker:
)
self._remote_agents: dict[EngineId, dict[int, dict[int, str]]] = {}
self._pending_bootstrap_querys: dict[str, asyncio.Event] = {}
self._pending_bootstrap_queries: dict[str, asyncio.Event] = {}
self.side_channel_port: int = 0 # we will bind it in register_kv_caches()
self.engine_id: EngineId = engine_id
self.tp_rank = get_tensor_model_parallel_rank()
......@@ -1077,7 +1077,7 @@ class MooncakeConnectorWorker:
response = self._xfer_resp_decoder.decode(ret_msg)
if response.status == MooncakeXferResponseStatus.ERROR:
logger.error(
"Error happens during tranfering kvcache for %s: %s",
"Error happens during transferring kvcache for %s: %s",
req_ids,
response.err_msg,
)
......@@ -1140,8 +1140,8 @@ class MooncakeConnectorWorker:
)
# Always notify others regardless of connection success or failure.
self._pending_bootstrap_querys[remote_bootstrap_addr].set()
del self._pending_bootstrap_querys[remote_bootstrap_addr]
self._pending_bootstrap_queries[remote_bootstrap_addr].set()
del self._pending_bootstrap_queries[remote_bootstrap_addr]
def receive_kv(
self,
......@@ -1171,11 +1171,11 @@ class MooncakeConnectorWorker:
pull_metas: dict[ReqId, PullReqMeta],
):
remote_bootstrap_addr = next(iter(pull_metas.values())).remote_bootstrap_addr
if remote_bootstrap_addr not in self._pending_bootstrap_querys:
self._pending_bootstrap_querys[remote_bootstrap_addr] = asyncio.Event()
if remote_bootstrap_addr not in self._pending_bootstrap_queries:
self._pending_bootstrap_queries[remote_bootstrap_addr] = asyncio.Event()
await self._connect_to_prefiller_bootstrap(remote_bootstrap_addr)
else:
await self._pending_bootstrap_querys[remote_bootstrap_addr].wait()
await self._pending_bootstrap_queries[remote_bootstrap_addr].wait()
if remote_engine_id not in self._remote_agents:
logger.error(
......
......@@ -720,7 +720,7 @@ class OffloadPromMetrics(KVConnectorPromMetrics):
per_engine_labelvalues: dict[int, list[object]],
):
super().__init__(vllm_config, metric_types, labelnames, per_engine_labelvalues)
# (engine_idx, transfer_tupe) -> (metric with bounded labels)
# (engine_idx, transfer_type) -> (metric with bounded labels)
self.histogram_transfer_size: dict[tuple[int, str], PromMetricT] = {}
self.counter_kv_bytes: dict[tuple[int, str], PromMetricT] = {}
self.counter_kv_transfer_time: dict[tuple[int, str], PromMetricT] = {}
......
......@@ -1647,9 +1647,9 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: in streaming, we noticed this bug:
# https://github.com/vllm-project/vllm/issues/25697
await self._initialize_tool_sessions(request, context, exit_stack)
processer = self._process_harmony_streaming_events
processor = self._process_harmony_streaming_events
else:
processer = self._process_simple_streaming_events
processor = self._process_simple_streaming_events
# TODO Hanchen make sampling params to include the structural tag
initial_response = ResponsesResponse.from_request(
......@@ -1677,7 +1677,7 @@ class OpenAIServingResponses(OpenAIServing):
)
try:
async for event_data in processer(
async for event_data in processor(
request,
sampling_params,
result_generator,
......
......@@ -1520,7 +1520,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
os.getenv("VLLM_DEEPEP_BUFFER_SIZE_MB", "1024")
),
# Force DeepEP to use intranode kernel for inter-node communication in
# high throughput mode. This is useful archive higher prefill throuhgput
# high throughput mode. This is useful to achieve higher prefill throughput
# on systems that support multi-node nvlink (e.g. GB200).
"VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE": lambda: bool(
int(os.getenv("VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE", "0"))
......
......@@ -175,7 +175,7 @@ class DPMetadata:
# Get the cumulative tokens across sequence parallel ranks.
# In this case the input to the MoEs will be distributed w.r.t both
# DP and TP rank.
# When sp_size==1, this is just the cummulative num tokens across DP.
# When sp_size==1, this is just the cumulative num tokens across DP.
def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
num_tokens_across_sp_cpu = (
self.num_tokens_across_dp_cpu - 1 + sp_size
......
......@@ -57,10 +57,10 @@ class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
input_parallel = input_
else:
# TODO: simplify code below
splitted_input = split_tensor_along_last_dim(
split_input = split_tensor_along_last_dim(
input_, num_partitions=self.tp_size
)
input_parallel = splitted_input[self.tp_rank].contiguous()
input_parallel = split_input[self.tp_rank].contiguous()
# Matrix multiply.
bias_ = (
......
......@@ -11,7 +11,7 @@ from vllm.lora.lora_weights import LoRALayerWeights
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import (
get_lora_id,
is_base_embeddding_weights,
is_base_embedding_weights,
parse_fine_tuned_lora_name,
)
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
......@@ -86,7 +86,7 @@ class LoRAModel:
pin_memory = str(device) == "cpu" and is_pin_memory_available()
loras: dict[str, LoRALayerWeights] = {}
for tensor_name, tensor in tensors.items():
if is_base_embeddding_weights(tensor_name):
if is_base_embedding_weights(tensor_name):
continue
# Skip modules based on model-defined prefixes (e.g., MTP layers)
if skip_prefixes and cls._should_skip_module(tensor_name, skip_prefixes):
......@@ -162,7 +162,7 @@ class LoRAModel:
def check_unexpected_modules(modules: dict):
for lora_module in modules.keys(): # noqa
if is_base_embeddding_weights(lora_module):
if is_base_embedding_weights(lora_module):
continue
# Handle PEFT file format where experts.base_layer is the
# gate_up_proj and experts is the down_proj
......
......@@ -193,7 +193,7 @@ def parse_fine_tuned_lora_name(
raise ValueError(f"{name} is unsupported LoRA weight")
def is_base_embeddding_weights(name: str) -> bool:
def is_base_embedding_weights(name: str) -> bool:
# hardcoded suffixes for input & output embedding weights
embedding_suffixes = (
".embed_tokens.base_layer.weight",
......
......@@ -82,7 +82,7 @@ class CPUWNA16LinearKernel(MPLinearKernel):
weight = weight.permute(0, 2, 1).reshape(input_size, output_size).contiguous()
weight = pack_quantized_values_into_int32(weight, self.config.weight_type, 1)
# make every 16 output channels a block and transpose to make
# the block contigous
# the block contiguous
weight = (
weight.view(input_size, -1, 16 // pack_factor)
.permute(1, 0, 2)
......
......@@ -2540,7 +2540,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
)
# workspace
# |------- N tokens --------|--------- N*dcp_size tokens ----------|
# |<- use for loca_gather ->|<--------- use for allgather -------->|
# |<- use for local_gather ->|<--------- use for allgather -------->|
allgather_offset = workspace.shape[0] // (dcp_world_size + 1)
assert allgather_offset * (dcp_world_size + 1) == workspace.shape[0]
assert toks <= allgather_offset
......
......@@ -394,5 +394,5 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
# No support for LoRA in flashinfer_cutlass_fused_moe.
# See TODOs in flashinfer functions runMoe and runMoeMinLantency.
# See TODOs in flashinfer functions runMoe and runMoeMinLatency.
raise NotImplementedError("LoRA is not supported for flashinfer_cutlass_moe")
......@@ -409,7 +409,7 @@ def batched_fused_marlin_moe(
Note that the moe_align_block_size function indicates,
- What rows of the A matrix (hidden_states) to access during the
matmul, via sorted_ids output.
- What expert_id to use for each block matmul, via expert_ids ouptut.
- What expert_id to use for each block matmul, via expert_ids output.
In the batched version, the tokens are already grouped/batched by experts
they subscribe to. Due to this, we can represent the batched hidden_states
......
......@@ -606,7 +606,7 @@ class FusedMoEExperts(ABC):
"""
Whether the kernel supports deployment in particular parallel config.
Can be overriden if a kernel does not support EP, SP or some other
Can be overridden if a kernel does not support EP, SP or some other
configuration.
"""
raise NotImplementedError
......@@ -620,7 +620,7 @@ class FusedMoEExperts(ABC):
"""
Whether the kernel supports a routing method (e.g. GroupedTopK).
Can be overriden by monolithic kernels that execute the router
Can be overridden by monolithic kernels that execute the router
in addition to the experts if certain routers are not supported.
"""
return True
......@@ -633,7 +633,7 @@ class FusedMoEExperts(ABC):
"""
Whether a kernel supports a particular dtype for router logits input.
Can be overriden by monolithic kernels that execute the router
Can be overridden by monolithic kernels that execute the router
in addition to the experts if certain dtypes are not supported.
"""
return True
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment