Unverified commit 2ce87935, authored by applesaucethebun and committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
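
The commit title says it adds a typo checker to the pre-commit hooks; the hook configuration itself is not part of the hunks shown below, which only contain the spelling corrections across the tree. As an illustration only (the actual hook, repository, and revision chosen in #6179 are not visible in this excerpt), a spell-check hook such as codespell can be wired into .pre-commit-config.yaml roughly like this:

    # Illustrative sketch only -- the hook choice, revision, and args here are
    # assumptions, not the actual configuration added by this commit.
    repos:
      - repo: https://github.com/codespell-project/codespell
        rev: v2.3.0                             # placeholder revision
        hooks:
          - id: codespell
            args: ["--write-changes"]           # let the hook auto-fix simple misspellings
            additional_dependencies: ["tomli"]  # allow reading ignore/skip settings from pyproject.toml

With a hook like this installed, running "pre-commit run --all-files" flags misspellings such as "collum", "divisable", and "Avaliable", which are exactly the kinds of corrections shown in the hunks below.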
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
 f"{input_size_per_partition} is not divisible by "
 f"weight quantization block_k = {block_k}."
 )
-# Required by collum parallel or enabling merged weights
+# Required by column parallel or enabling merged weights
 if (
 tp_size > 1 and output_size // output_size_per_partition == tp_size
 ) or len(output_partition_sizes) > 1:
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
 self.quant_config.weight_block_size[1],
 )
 # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-# Required by collum parallel or enabling merged weights
+# Required by column parallel or enabling merged weights
 if intermediate_size % block_n != 0:
 raise ValueError(
 f"The output_size of gate's and up's weight = "
...
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
 y_s_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Avoid to divide zero
 eps,
@@ -342,7 +342,7 @@ def _static_quant_fp8(
 y_s_repeat_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Information for float8
 fp8_min,
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
 config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
 else:
 # Default config
-# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
 config = {
 "BLOCK_SIZE_M": 64,
 "BLOCK_SIZE_N": block_size[0],
...
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
 y_s_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Avoid to divide zero
 eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
 config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
 else:
 # Default config
-# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
 config = {
 "BLOCK_SIZE_M": 64,
 "BLOCK_SIZE_N": block_size[0],
...
@@ -100,7 +100,7 @@ class LoRAManager:
 self.configs[name] = LoRAConfig(path)
 self.hf_target_names.update(self.configs[name].target_modules)
-# Target lora weight names for lora_a and lora_b modules repectively.
+# Target lora weight names for lora_a and lora_b modules respectively.
 # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
 self.lora_weight_names: Set[Tuple[str]] = set(
 [get_stacked_name(module) for module in self.hf_target_names]
...
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
 self.uid_to_buffer_id: Dict[Optional[str], int] = {}
 # Buffer idx -> lora uid in memory pool
-# All uids are initalized as empty strings for empty buffer slots
-# Here we don't initalize to None since None is a valid uid
+# All uids are initialized as empty strings for empty buffer slots
+# Here we don't initialize to None since None is a valid uid
 self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
 def get_lora_A_shape(
 self, module_name: str, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
 c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
 self, module_name: str, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
 c = get_stacked_multiply(module_name)
...
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -79,7 +79,7 @@ def get_hidden_dim(
 module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 if hasattr(base_model, "get_hidden_dim"):
...
@@ -210,7 +210,7 @@ class DataParallelController:
 )
 # compute zmq ports for this dp rank
 rank_port_args = PortArgs.init_new(server_args, dp_rank)
-# Data parallelism resues the tensor parallelism group,
+# Data parallelism reuses the tensor parallelism group,
 # so all dp ranks should use the same nccl port.
 rank_port_args.nccl_port = port_args.nccl_port
...
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects transfered between different
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
...
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
 self, input_ids: List[int], mm_inputs: MultimodalInputs
 ) -> List[int]:
 """
-This function will replace the data-tokens inbetween with pad_values accordingly
+This function will replace the data-tokens in between with pad_values accordingly
 """
 pad_values = [item.pad_value for item in mm_inputs.mm_items]
 data_token_pairs = self.data_token_id_pairs
...
@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"{phase_str} out of memory. Try to lower your batch size.\n"
 f"Try to allocate {num_tokens} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 )
 logger.error(error_msg)
 if self.tree_cache is not None:
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"Prefill out of memory. Try to lower your batch size.\n"
 f"Try to allocate {extend_num_tokens} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 f"{self.token_to_kv_pool_allocator.available_size()=}\n"
 f"{self.tree_cache.evictable_size()=}\n"
 )
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"Decode out of memory. Try to lower your batch size.\n"
 f"Try to allocate {len(seq_lens)} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 f"{self.token_to_kv_pool_allocator.available_size()=}\n"
 f"{self.tree_cache.evictable_size()=}\n"
 )
...
@@ -1325,7 +1325,7 @@ class Scheduler(
 return None
 running_bs = len(self.running_batch.reqs)
-# Igore the check if self.chunked_req is not None.
+# Ignore the check if self.chunked_req is not None.
 # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
 # as the space for the chunked request has just been released.
 # In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
...
@@ -1273,7 +1273,7 @@ class TokenizerManager:
 self.model_update_result.set_result(recv_obj)
 else: # self.server_args.dp_size > 1
 self.model_update_tmp.append(recv_obj)
-# set future if the all results are recevied
+# set future if the all results are received
 if len(self.model_update_tmp) == self.server_args.dp_size:
 self.model_update_result.set_result(self.model_update_tmp)
...
@@ -296,12 +296,12 @@ class CudaGraphRunner:
 self.capture()
 except RuntimeError as e:
 raise Exception(
-f"Capture cuda graph failed: {e}\n"
+f"Capture CUDA graph failed: {e}\n"
 "Possible solutions:\n"
 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
 "3. disable torch compile by not using --enable-torch-compile\n"
-"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
 )
...
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
 DECODE = auto()
 # Contains both EXTEND and DECODE when doing chunked prefill.
 MIXED = auto()
-# No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
+# No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
 IDLE = auto()
 # Used in speculative decoding: verify a batch in the target model.
...
@@ -188,7 +188,7 @@ def trunc_normal_tf_(
 best when :math:`a \\leq \text{mean} \\leq b`.
 NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
 bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
-and the result is subsquently scaled and shifted by the mean and std args.
+and the result is subsequently scaled and shifted by the mean and std args.
 Args:
 tensor: an n-dimensional `torch.Tensor`
 mean: the mean of the normal distribution
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
 img_size: Input image size.
 patch_size: Patch size.
 in_chans: Number of image input channels.
-num_classes: Mumber of classes for classification head.
+num_classes: Number of classes for classification head.
 global_pool: Type of global pooling for final sequence (default: 'token').
 embed_dim: Transformer embedding dimension.
 depth: Depth of transformer.
...
@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
 # Fully Connected
 hidden_states = self.mlp(hidden_states)
-# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
 # Scatter
 if self.dp_size != 1:
 # important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 else:
 assert (
 self.n_share_experts_fusion == self.tp_size
-), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
+), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
 elif self.n_share_experts_fusion == 0:
 if (
 _is_cuda
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 if is_nextn:
 if hasattr(self.config, "num_nextn_predict_layers"):
 num_nextn_layers = self.config.num_nextn_predict_layers
-assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
+assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
 # compatible with old design
 nextn_layer_id = (
 0
...