Unverified Commit 2ce87935 authored by applesaucethebun, committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
@@ -129,7 +129,7 @@ def launch_server_process_and_send_one_request(
 def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
-    # Disbale cuda graph and torch compile to save time
+    # Disable cuda graph and torch compile to save time
     server_args.disable_cuda_graph = True
     server_args.enable_torch_compile = False
     print(f"Disable CUDA Graph and Torch Compile to save time...")
@@ -38,7 +38,7 @@ def extract_prefix_by_tracing(program, backend):
         with TracingScope(tracer):
             tracer.ret_value = program.func(tracer, **arguments)
     except (StopTracing, TypeError, AttributeError):
-        # Some exceptions may not be catched
+        # Some exceptions may not be caught
         pass

     # Run and cache prefix
@@ -27,7 +27,7 @@ completion_template_name = None
 class FimPosition:
-    """Postion of fim middle token."""
+    """Position of fim middle token."""

     MIDDLE = auto()
     END = auto()
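For context, FimPosition only records where the fill-in-the-middle "middle" sentinel sits in a rendered completion template. A minimal, self-contained sketch of how such a flag could be consumed when assembling a FIM prompt; the sentinel strings and the exact MIDDLE/END mapping below are illustrative assumptions, not the repository's actual templates:

```python
from enum import Enum, auto


class FimPosition(Enum):
    """Position of fim middle token."""

    MIDDLE = auto()
    END = auto()


def render_fim_prompt(prefix: str, suffix: str, position: FimPosition) -> str:
    # Hypothetical sentinels; real templates define model-specific tokens.
    if position is FimPosition.END:
        # Middle token last: the model generates the missing span after it.
        return f"<PRE>{prefix}<SUF>{suffix}<MID>"
    # Middle token between prefix and suffix.
    return f"<PRE>{prefix}<MID><SUF>{suffix}"
```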
@@ -416,9 +416,9 @@ class DeepseekVLV2Processor(ProcessorMixin):
         h = w = math.ceil(
             (self.image_size // self.patch_size) / self.downsample_ratio
         )
-        # global views tokens h * (w + 1), 1 is for line seperator
+        # global views tokens h * (w + 1), 1 is for line separator
         tokenized_image = [self.image_token_id] * h * (w + 1)
-        # add a seperator between global and local views
+        # add a separator between global and local views
         tokenized_image += [self.image_token_id]
         # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
         tokenized_image += (
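The comments above fully specify the image-token layout: h * (w + 1) tokens for the global view (one line-separator token per row), a single separator token, then (num_height_tiles * h) * (num_width_tiles * w + 1) tokens for the local views. A small arithmetic sketch of that layout, using made-up example values for the config fields and token id:

```python
import math

# Example values only; the real ones come from the processor's config.
image_size, patch_size, downsample_ratio = 384, 16, 2.0
image_token_id = 100015  # placeholder id
num_height_tiles, num_width_tiles = 2, 3

h = w = math.ceil((image_size // patch_size) / downsample_ratio)

# Global view: h rows of w image tokens plus one line-separator token per row.
global_view = [image_token_id] * (h * (w + 1))
# One separator token between the global and the local views.
separator = [image_token_id]
# Local views: (num_height_tiles * h) rows of (num_width_tiles * w) tokens,
# again with one line-separator token per row.
local_views = [image_token_id] * ((num_height_tiles * h) * (num_width_tiles * w + 1))

tokenized_image = global_view + separator + local_views
print(len(tokenized_image))  # 12 * 13 + 1 + 24 * 37 = 1045
```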
@@ -509,7 +509,7 @@ class SchedulerDisaggregationDecodeMixin:
     def event_loop_overlap_disagg_decode(self: Scheduler):
         result_queue = deque()

         self.last_batch: Optional[ScheduleBatch] = None
-        self.last_batch_in_queue = False  # last batch is modifed in-place, so we need another variable to track if it's extend
+        self.last_batch_in_queue = False  # last batch is modified in-place, so we need another variable to track if it's extend

         while True:
             recv_reqs = self.recv_requests()
@@ -54,7 +54,7 @@ class FakeKVSender(BaseKVSender):
             logger.info(f"FakeKVSender send success")
         else:
             self.has_sent = False
-            logger.info(f"FakeKVSender send fake transfering")
+            logger.info(f"FakeKVSender send fake transferring")

     def failure_exception(self):
         raise Exception("Fake KVSender Exception")
@@ -363,7 +363,7 @@ class MooncakeKVManager(BaseKVManager):
         self.request_status[bootstrap_room] = KVPoll.WaitingForInput

     def check_status(self, bootstrap_room: int):
-        # TOOD: do we really need the poll()?
+        # TODO: do we really need the poll()?
         return self.request_status[bootstrap_room]
@@ -112,7 +112,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
 def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
-    # 1. The page is guaruanteed to be full except the last page.
+    # 1. The page is guaranteed to be full except the last page.
     # 2. page index = kv_index // page_size
     # The return vector is kv_indices[::page_size] // page_size
     if page_size == 1:  # shortcut
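The numbered comments pin down the mapping exactly: with every page full except possibly the last, the page index of a KV index is kv_index // page_size, and sampling every page_size-th entry yields one index per page. A minimal NumPy sketch of that relationship:

```python
import numpy as np


def kv_to_page_indices(kv_indices: np.ndarray, page_size: int) -> np.ndarray:
    # Every page except possibly the last is full, so one KV index per page
    # is enough to identify it: take every page_size-th entry and divide.
    if page_size == 1:  # shortcut: KV index and page index coincide
        return kv_indices
    return kv_indices[::page_size] // page_size


# Example: 10 contiguous KV slots starting at slot 4, page_size = 4
print(kv_to_page_indices(np.arange(4, 14), 4))  # -> [1 2 3]
```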
@@ -86,8 +86,8 @@ class StructureInfo:
 _GetInfoFunc = Callable[[str], StructureInfo]
 """
-helper alias of function
-ususally it is a function that takes a name string and returns a StructureInfo object,
+Helper alias of function
+Usually it is a function that takes a name string and returns a StructureInfo object,
 which can be used to construct a structural_tag object
 """
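Since _GetInfoFunc is just a name-to-StructureInfo factory type, here is a self-contained sketch of the pattern; the dataclass fields used below are illustrative placeholders, not the repository's actual StructureInfo attributes:

```python
from dataclasses import dataclass
from typing import Callable


@dataclass
class StructureInfo:
    # Illustrative placeholder fields; the real class defines its own.
    begin: str
    end: str


_GetInfoFunc = Callable[[str], StructureInfo]


def make_info(name: str) -> StructureInfo:
    # Takes a tool/function name and returns the markers used to build
    # the corresponding structural_tag entry.
    return StructureInfo(begin=f"<{name}>", end=f"</{name}>")


get_info: _GetInfoFunc = make_info
print(get_info("get_weather"))
```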
@@ -308,7 +308,7 @@ class FlashAttentionBackend(AttentionBackend):
         ), "Sliding window and cross attention are not supported together"

         self.forward_metadata: FlashAttentionMetadata = None
-        # extra metdata for handling speculative decoding topk > 1, extended draft decode and verify
+        # extra metadata for handling speculative decoding topk > 1, extended draft decode and verify
         self.forward_metadata_spec_decode_expand: FlashAttentionMetadata = None
         self.max_context_len = model_runner.model_config.context_len
         self.device = model_runner.device
@@ -919,7 +919,7 @@ def _fwd_kernel(
         e_max = n_e_max

-    # stage 2: compute the trianlge part
+    # stage 2: compute the triangle part
     cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
     for start_n in range(0, cur_block_m_end, BLOCK_N):
@@ -201,7 +201,7 @@ def _dp_gather(
         global_tokens, local_tokens, 0, local_start_pos, local_num_tokens, False
     )

-    # Input IDs are in int 32. We should use inplace_all_reduce for local case becaues of custom all reduce.
+    # Input IDs are in int 32. We should use inplace_all_reduce for local case because of custom all reduce.
     NUM_GPUS_PER_NODE = 8
     if (
         not local_tokens.dtype.is_floating_point
@@ -76,7 +76,7 @@ class RMSNorm(CustomOp):
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         if not x.is_contiguous():
-            # NOTE: Romove this if aiter kernel supports discontinuous input
+            # NOTE: Remove this if aiter kernel supports discontinuous input
             x = x.contiguous()
         if residual is not None:
             fused_add_rms_norm(x, residual, self.weight.data, self.variance_epsilon)
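For reference, fused_add_rms_norm fuses the residual addition with RMSNorm in a single kernel. An unfused PyTorch sketch of the semantics assumed here (residual added first, then the sum is RMS-normalized and scaled); treat the exact in-place contract of the real aiter kernel as an assumption:

```python
from typing import Tuple

import torch


def unfused_add_rms_norm(
    x: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, eps: float
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Assumed contract of the fused kernel: add the residual first, then
    # RMS-normalize the sum and scale by the learned weight.
    residual = x + residual
    variance = residual.float().pow(2).mean(dim=-1, keepdim=True)
    normed = residual.float() * torch.rsqrt(variance + eps)
    return normed.to(x.dtype) * weight, residual
```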
@@ -116,7 +116,7 @@ def deepep_run_moe_deep_preprocess(topk_ids: torch.Tensor, num_experts: int):
     seg_indptr = torch.empty(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
     src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int64)

-    # Find offet
+    # Find offset
     expert_ids = torch.arange(
         num_experts + 1, device=topk_ids.device, dtype=reorder_topk_ids.dtype
     )
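seg_indptr here is a CSR-style offset array: once topk_ids is sorted by expert, tokens of expert e occupy [seg_indptr[e], seg_indptr[e + 1]). One common way to compute such offsets is a searchsorted over the sorted ids, sketched below; this is a plausible reconstruction for illustration, not necessarily the kernel this function actually launches:

```python
import torch

num_experts = 4
topk_ids = torch.tensor([2, 0, 3, 0, 2, 2])

# Sort token -> expert assignments by expert id.
reorder_topk_ids, reorder_ids = torch.sort(topk_ids.flatten())

# seg_indptr[e] = first position of expert e in the sorted ids,
# so expert e's tokens live in [seg_indptr[e], seg_indptr[e + 1]).
expert_ids = torch.arange(num_experts + 1, dtype=reorder_topk_ids.dtype)
seg_indptr = torch.searchsorted(reorder_topk_ids, expert_ids)
print(seg_indptr)  # tensor([0, 2, 2, 5, 6])
```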
@@ -611,7 +611,7 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
                 self.quant_config.weight_block_size[1],
             )
             # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
             if intermediate_size % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
@@ -994,7 +994,7 @@ def get_default_config(
             "num_stages": 2 if _is_hip else 4,
         }
     else:
-        # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
+        # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
         config = {
             "BLOCK_SIZE_M": 64,
             "BLOCK_SIZE_N": block_shape[0],
@@ -270,7 +270,7 @@ def select_experts(
     routed_scaling_factor: Optional[float] = None,
 ):
     n_share_experts_fusion = global_server_args_dict["n_share_experts_fusion"]
-    # DeekSeek V2/V3/R1 serices models uses grouped_top_k
+    # DeepSeek V2/V3/R1 series models use grouped_top_k
     if use_grouped_topk:
         assert topk_group is not None
         assert num_expert_group is not None
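grouped_top_k, used by the DeepSeek V2/V3/R1 family, first scores whole expert groups, keeps only the best topk_group groups, and then runs an ordinary top-k over the experts that survive. A simplified sketch of that routing idea (scoring each group by its max logit; the production path typically adds softmax/sigmoid scoring, bias correction, and renormalization):

```python
import torch


def grouped_topk(scores: torch.Tensor, num_expert_group: int, topk_group: int, top_k: int):
    # scores: [num_tokens, num_experts]; experts split into equal-sized groups.
    num_tokens, num_experts = scores.shape
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    # Keep only the topk_group best groups per token, mask out the rest.
    group_idx = group_scores.topk(topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
    score_mask = (
        group_mask.unsqueeze(-1)
        .expand(num_tokens, num_expert_group, num_experts // num_expert_group)
        .reshape(num_tokens, num_experts)
    )
    masked_scores = scores.masked_fill(score_mask == 0, float("-inf"))
    # Ordinary top-k over the experts that survived the group filter.
    topk_weights, topk_ids = masked_scores.topk(top_k, dim=-1)
    return topk_weights, topk_ids


# Example: 8 experts in 4 groups, keep 2 groups, then pick 3 experts.
w, ids = grouped_topk(torch.randn(2, 8), num_expert_group=4, topk_group=2, top_k=3)
```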
@@ -109,7 +109,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization in VLLM_QUANTIZATION_METHODS and not VLLM_AVAILABLE:
         raise ValueError(
             f"{quantization} quantization requires some operators from vllm. "
-            "Pleaes install vllm by `pip install vllm==0.8.4`"
+            "Please install vllm by `pip install vllm==0.8.4`"
         )
     return QUANTIZATION_METHODS[quantization]
@@ -152,7 +152,7 @@ class BlockInt8LinearMethod(LinearMethodBase):
                     f"{input_size_per_partition} is not divisible by "
                     f"weight quantization block_k = {block_k}."
                 )
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
             if (tp_size > 1 and output_size // output_size_per_partition == tp_size) or len(
                 output_partition_sizes
             ) > 1:
@@ -285,7 +285,7 @@ class BlockInt8MoEMethod:
                 self.quant_config.weight_block_size[1],
             )
             # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-            # Required by collum parallel or enabling merged weights
+            # Required by column parallel or enabling merged weights
             if intermediate_size % block_n != 0:
                 raise ValueError(
                     f"The output_size of gate's and up's weight = "
@@ -103,10 +103,10 @@ _INITIALIZATION_DICT: Dict[Tuple[DeepGemmKernelType, int, int, int], bool] = dic
 def _compile_warning_1():
     if not _IN_PRECOMPILE_STAGE and _IS_FIRST_RANK_ON_NODE:
         logger.warning(
-            "Entering DeepGEMM JIT Pre-Complie session. "
+            "Entering DeepGEMM JIT Pre-Compile session. "
             "And it may takes a long time(Typically 10-20 mins) "
             "if you have not run `sglang.compile_deep_gemm`. "
-            "Recommand to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
+            "It is recommended to run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
             " for pre-compilation to reduce the overhead if you have not run it before. "
             "For example: "
             "`python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code`"
@@ -115,7 +115,7 @@ def _compile_warning_1():
 def _compile_warning_2():
     logger.warning(
-        "Entering DeepGEMM JIT Single Kernel Complie session. "
+        "Entering DeepGEMM JIT Single Kernel Compile session. "
         "And it will makes inference throughput becomes flaky. "
         "Please run `sglang.compile_deep_gemm` with same args as `sglang.launch_server`"
         " for pre-compilation to solve this issue. "
@@ -298,7 +298,7 @@ def _maybe_compile_deep_gemm_one_type_all(
     logger.info(
         f"Try DeepGEMM JIT Compiling for "
         f"<{kernel_helper.name}> N={n}, K={k}, num_groups={num_groups} with all Ms."
-        f"{' It only takes a litte time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
+        f"{' It only takes a little time (typically 1 sec) if you have run `python3 -m sglang.compile_deep_gemm`. ' if not _IN_PRECOMPILE_STAGE else ''}"
     )

     # NOTE(alcanderian): get_num_sms should be change when 2-batch-overlap is introduced