Unverified commit bcb06d7b, authored by Didier Durand, committed by GitHub
Browse files

[Doc]: fix typos in various files (#24726)


Signed-off-by: Didier Durand <durand.didier@gmail.com>
parent 0377802c
...@@ -56,7 +56,7 @@ def w8a8_block_matmul( ...@@ -56,7 +56,7 @@ def w8a8_block_matmul(
Bs: The per-block quantization scale for `B`. Bs: The per-block quantization scale for `B`.
block_size: The block size for per-block quantization. block_size: The block size for per-block quantization.
It should be 2-dim, e.g., [128, 128]. It should be 2-dim, e.g., [128, 128].
output_dytpe: The dtype of the returned tensor. output_dtype: The dtype of the returned tensor.
Returns: Returns:
torch.Tensor: The result of matmul. torch.Tensor: The result of matmul.
......
...@@ -12,7 +12,7 @@ namespace vec_op { ...@@ -12,7 +12,7 @@ namespace vec_op {
#define vec_sub(a, b) ((a) - (b)) #define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b)) #define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b)) #define vec_div(a, b) ((a) / (b))
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebaic #define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left #define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU // FIXME: FP16 is not fully supported in Torch-CPU
......
...@@ -215,7 +215,7 @@ int moe_align_block_size( ...@@ -215,7 +215,7 @@ int moe_align_block_size(
offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M); offsets[mb + 1] = sorted_id_size(sorted_ids + mb * BLOCK_M);
} }
}); });
// TODO: do we need to vecterize this ? // TODO: do we need to vectorize this ?
for (int mb = 0; mb < num_token_blocks; ++mb) { for (int mb = 0; mb < num_token_blocks; ++mb) {
offsets[mb + 1] += offsets[mb]; offsets[mb + 1] += offsets[mb];
} }
......
...@@ -8,7 +8,7 @@ page for information on known issues and how to solve them. ...@@ -8,7 +8,7 @@ page for information on known issues and how to solve them.
## Introduction ## Introduction
!!! important !!! important
The source code references are to the state of the code at the time of writing in December, 2024. The source code references are to the state of the code at the time of writing in December 2024.
The use of Python multiprocessing in vLLM is complicated by: The use of Python multiprocessing in vLLM is complicated by:
......
...@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata( ...@@ -901,7 +901,7 @@ def _get_query_key_seq_metadata(
attn_metadata.encoder_seq_start_loc, attn_metadata.encoder_seq_start_loc,
attn_metadata.max_encoder_seq_len) attn_metadata.max_encoder_seq_len)
elif attn_type == AttentionType.ENCODER: elif attn_type == AttentionType.ENCODER:
# For encoder attention both the query and the key are same i.e the # For encoder attention both the query and the key are same i.e. the
# encoder sequence. # encoder sequence.
return (attn_metadata.encoder_seq_start_loc, return (attn_metadata.encoder_seq_start_loc,
attn_metadata.max_encoder_seq_len, attn_metadata.max_encoder_seq_len,
......
...@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset): ...@@ -551,7 +551,7 @@ class RandomDataset(BenchmarkDataset):
[6880, 6881] -> ['Ġcalls', 'here'] -> [6880, 6881] -> ['Ġcalls', 'here'] ->
[1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
To avoid uncontrolled change of the prompt length, To avoid uncontrolled change of the prompt length,
the encoded sequence is truncated before being decode again. the encoded sequence is truncated before being decoded again.
""" """
# Build the inner sequence by sampling sequentially from the vocab # Build the inner sequence by sampling sequentially from the vocab
inner_seq = ((offset + index + np.arange(input_len)) inner_seq = ((offset + index + np.arange(input_len))
......
...@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors], ...@@ -242,7 +242,7 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
elif processors: elif processors:
raise ValueError( raise ValueError(
"The `logits_processors` argument is not supported by this " "The `logits_processors` argument is not supported by this "
"server. See --logits-processor-pattern engine argugment " "server. See --logits-processor-pattern engine argument "
"for more information.") "for more information.")
return None return None
......
...@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp): ...@@ -324,7 +324,7 @@ class MambaMixer2(MambaBase, CustomOp):
# - the weight already has a "weight_loader" attribute # - the weight already has a "weight_loader" attribute
# which set_weight_attrs will raise if we do not # which set_weight_attrs will raise if we do not
# delete before trying to override it # delete before trying to override it
# - ditto for the otther two weights below # - ditto for the other two weights below
delattr(self.conv1d.bias, "weight_loader") delattr(self.conv1d.bias, "weight_loader")
set_weight_attrs( set_weight_attrs(
self.conv1d.bias, self.conv1d.bias,
......
...@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -1117,7 +1117,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
def _process_multimodal_inputs(self, modalities: dict): def _process_multimodal_inputs(self, modalities: dict):
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor correspoending to a multimodal data item (image or video). # tensor corresponding to a multimodal data item (image or video).
multimodal_embeddings: tuple[torch.Tensor, ...] = () multimodal_embeddings: tuple[torch.Tensor, ...] = ()
# NOTE: It is important to iterate over the keys in this dictionary # NOTE: It is important to iterate over the keys in this dictionary
......
...@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): ...@@ -2659,7 +2659,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_tokens += num_pad num_tokens += num_pad
# If cudagraph_mode.decode_mode() == FULL and # If cudagraph_mode.decode_mode() == FULL and
# cudagraph_mode.seperate_routine(). This means that we are using # cudagraph_mode.separate_routine(). This means that we are using
# different graphs and/or modes for mixed prefill-decode batches vs. # different graphs and/or modes for mixed prefill-decode batches vs.
# uniform decode batches. A uniform decode batch means that all # uniform decode batches. A uniform decode batch means that all
# requests have identical query length, except a potential virtual # requests have identical query length, except a potential virtual
......
...@@ -392,7 +392,7 @@ class InputBatch: ...@@ -392,7 +392,7 @@ class InputBatch:
# NOTE: the following is unsafe # NOTE: the following is unsafe
# self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\ # self.token_ids_cpu[i1, ...], self.token_ids_cpu[i2, ...], =\
# self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...] # self.token_ids_cpu[i2, ...], self.token_ids_cpu[i1, ...]
# instead, we need to temporiarily copy the data for one of the indices # instead, we need to temporarily copy the data for one of the indices
# TODO(lucas): optimize this by only copying valid indices # TODO(lucas): optimize this by only copying valid indices
tmp = self.token_ids_cpu[i1, ...].copy() tmp = self.token_ids_cpu[i1, ...].copy()
self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment