"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "e76466dde2bc9525d55165ceaa600d298c7bf773"
Unverified Commit 41ae4a1e authored by Didier Durand, committed by GitHub
Browse files

[Doc]: fix typos in various files (#24798)


Signed-off-by: Didier Durand <durand.didier@gmail.com>
parent 4dad72f0
...@@ -42,7 +42,7 @@ def main(): ...@@ -42,7 +42,7 @@ def main():
llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct" llm_args["model"] = "meta-llama/Llama-3.1-8B-Instruct"
# Set `enforce_eager=True` to avoid ahead-of-time compilation. # Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforace_eager` should be `False`. # In real workloads, `enforce_eager` should be `False`.
llm = LLM(**llm_args) llm = LLM(**llm_args)
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
print("-" * 50) print("-" * 50)
......
...@@ -182,7 +182,7 @@ class NaiveBlockAllocator(BlockAllocator): ...@@ -182,7 +182,7 @@ class NaiveBlockAllocator(BlockAllocator):
# Increment refcount for each block. # Increment refcount for each block.
assert block.block_id is not None assert block.block_id is not None
refcount = self._refcounter.incr(block.block_id) refcount = self._refcounter.incr(block.block_id)
assert refcount != 1, "can't fork free'd block" assert refcount != 1, "can't fork freed block"
forked_block = self._block_pool.init_block( forked_block = self._block_pool.init_block(
prev_block=prev_block, prev_block=prev_block,
......
...@@ -58,7 +58,7 @@ class Evictor(ABC): ...@@ -58,7 +58,7 @@ class Evictor(ABC):
class BlockMetaData: class BlockMetaData:
"""Data structure for storing key data describe cached block, so that """Data structure for storing key data describe cached block, so that
evitor could use to make its decision which one to choose for eviction evictor could use to make its decision which one to choose for eviction
Here we use physical block id as the dict key, as there maybe several Here we use physical block id as the dict key, as there maybe several
blocks with the same content hash, but their physical id is unique. blocks with the same content hash, but their physical id is unique.
......
...@@ -379,7 +379,7 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -379,7 +379,7 @@ class LoggingStatLogger(StatLoggerBase):
if local_interval_elapsed(stats.now, self.last_local_log, if local_interval_elapsed(stats.now, self.last_local_log,
self.local_interval): self.local_interval):
# Compute summary metrics for tracked stats (and log them # Compute summary metrics for tracked stats (and log them
# to promethus if applicable). # to prometheus if applicable).
prompt_throughput = get_throughput(self.num_prompt_tokens, prompt_throughput = get_throughput(self.num_prompt_tokens,
now=stats.now, now=stats.now,
last_log=self.last_local_log) last_log=self.last_local_log)
...@@ -432,7 +432,7 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -432,7 +432,7 @@ class LoggingStatLogger(StatLoggerBase):
class PrometheusStatLogger(StatLoggerBase): class PrometheusStatLogger(StatLoggerBase):
"""PrometheusStatLogger is used LLMEngine to log to Promethus.""" """PrometheusStatLogger is used LLMEngine to log to Prometheus."""
_metrics_cls = Metrics _metrics_cls = Metrics
_gauge_cls = prometheus_client.Gauge _gauge_cls = prometheus_client.Gauge
......
...@@ -740,7 +740,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): ...@@ -740,7 +740,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
""" """
Handle special case for models where MLP layers are already Handle special case for models where MLP layers are already
fused on disk. In this case, we have no shard id. This function fused on disk. In this case, we have no shard id. This function
determmines the shard id by splitting these layers and then calls determines the shard id by splitting these layers and then calls
the weight loader using the shard id. the weight loader using the shard id.
An example of a model with these fused layers: An example of a model with these fused layers:
...@@ -914,7 +914,7 @@ class QKVParallelLinear(ColumnParallelLinear): ...@@ -914,7 +914,7 @@ class QKVParallelLinear(ColumnParallelLinear):
""" """
Handle special case for models where QKV layers are already Handle special case for models where QKV layers are already
fused on disk. In this case, we have no shard id. This function fused on disk. In this case, we have no shard id. This function
determmines the shard id by splitting these layers and then calls determines the shard id by splitting these layers and then calls
the weight loader using the shard id. the weight loader using the shard id.
An example of a model with these fused layers: An example of a model with these fused layers:
......
...@@ -258,7 +258,7 @@ class VocabParallelEmbedding(CustomOp): ...@@ -258,7 +258,7 @@ class VocabParallelEmbedding(CustomOp):
if params_dtype is None: if params_dtype is None:
params_dtype = torch.get_default_dtype() params_dtype = torch.get_default_dtype()
# Divide the weight matrix along the vocaburaly dimension. # Divide the weight matrix along the vocabulary dimension.
self.num_added_embeddings = self.num_embeddings - self.org_vocab_size self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
self.num_embeddings_per_partition = divide(self.num_embeddings_padded, self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
self.tp_size) self.tp_size)
......
...@@ -1446,7 +1446,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1446,7 +1446,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal,
return None return None
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor correspoending to a multimodal data item (image or video). # tensor corresponding to a multimodal data item (image or video).
multimodal_embeddings: tuple[torch.Tensor, ...] = () multimodal_embeddings: tuple[torch.Tensor, ...] = ()
# NOTE: It is important to iterate over the keys in this dictionary # NOTE: It is important to iterate over the keys in this dictionary
......
...@@ -586,10 +586,10 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -586,10 +586,10 @@ class Gemma3nForConditionalGeneration(nn.Module, SupportsMultiModal,
# ruff: noqa # ruff: noqa
# The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the # The Gemma3nProcessor expects all audio will be 30s in length and inserts 188 audio soft tokens into the
# text to account for this. However, the audio preprocessing and encoder do not gurarantee they will # text to account for this. However, the audio preprocessing and encoder do not guarantee they will
# produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens # produce 188 soft tokens; they will produce at most that many tokens, but they may produce fewer tokens
# depending on the length of the longest audio input in the batch. When we encounter this situation, we pad # depending on the length of the longest audio input in the batch. When we encounter this situation, we pad
# the audio feature out to 188 soft tokens with the emebedding of the last token in the embed_audio vocab. # the audio feature out to 188 soft tokens with the embedding of the last token in the embed_audio vocab.
# TODO precompute and cache padding # TODO precompute and cache padding
audio_padding_toks = torch.tensor([[self.vocab_size - 1]], audio_padding_toks = torch.tensor([[self.vocab_size - 1]],
dtype=torch.long, dtype=torch.long,
......
...@@ -560,7 +560,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -560,7 +560,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP,
return [] return []
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor correspoending to a multimodal data item (image). # tensor corresponding to a multimodal data item (image).
multimodal_embeddings: tuple[torch.Tensor, ...] = () multimodal_embeddings: tuple[torch.Tensor, ...] = ()
# NOTE: It is important to iterate over the keys in this dictionary # NOTE: It is important to iterate over the keys in this dictionary
......
...@@ -1154,7 +1154,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): ...@@ -1154,7 +1154,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
return None return None
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor correspoending to a multimodal data item (image or video). # tensor corresponding to a multimodal data item (image or video).
multimodal_embeddings: tuple[torch.Tensor, ...] = () multimodal_embeddings: tuple[torch.Tensor, ...] = ()
# NOTE: It is important to iterate over the keys in this dictionary # NOTE: It is important to iterate over the keys in this dictionary
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment