Unverified Commit 2ce87935 authored by applesaucethebun, committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
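The pre-commit hook configuration itself is not part of this excerpt. As a rough, hypothetical sketch of what a typo checker does when wired into pre-commit, the Python script below scans the staged files for a handful of the misspellings corrected in this commit and fails the commit if any are found. The script, its TYPOS table, and its command-line contract are illustrative assumptions, not the actual hook added by this PR.

```python
# Hypothetical sketch of a typo pre-commit hook (illustrative only; the PR
# most likely wires in an off-the-shelf checker rather than this script).
import re
import subprocess
import sys

# A few of the misspellings corrected in this commit, mapped to their fixes.
TYPOS = {
    "accepeted": "accepted",
    "becasue": "because",
    "compling": "compiling",
    "recommonded": "recommended",
    "supportted": "supported",
}

PATTERN = re.compile("|".join(map(re.escape, TYPOS)), re.IGNORECASE)


def main() -> int:
    # pre-commit passes the staged file names as arguments; fall back to all
    # tracked files when the script is run by hand.
    files = sys.argv[1:] or subprocess.run(
        ["git", "ls-files"], capture_output=True, text=True, check=True
    ).stdout.splitlines()
    status = 0
    for path in files:
        try:
            with open(path, encoding="utf-8", errors="ignore") as f:
                lines = f.read().splitlines()
        except (IsADirectoryError, FileNotFoundError, PermissionError):
            continue
        for lineno, line in enumerate(lines, start=1):
            for match in PATTERN.finditer(line):
                word = match.group(0)
                print(f"{path}:{lineno}: {word} -> {TYPOS[word.lower()]}")
                status = 1  # non-zero exit makes pre-commit block the commit
    return status


if __name__ == "__main__":
    sys.exit(main())
```

In practice a repository would more likely pin an existing checker such as codespell or typos in .pre-commit-config.yaml; `pre-commit run --all-files` then applies the same check to every tracked file, which is how sweeping fixes like the ones in the diff below are typically generated.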
@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
 # Fully Connected
 hidden_states = self.feed_forward(hidden_states, forward_batch)
-# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
 # Scatter
 if self.dp_size != 1:
 # important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
 input_shape = input_ids.size()
 inputs_embeds = self.word_embeddings(input_ids)
-# adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
 pos_list = []
 token_list = []
@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
 --tensor-parallel-size 2 \
 --disable-cuda-graph
 ```
-We will eanble CUDA Graph support soon.
+We will enable CUDA Graph support soon.
 """
 import types
@@ -590,7 +590,7 @@ def v1_generate_response(
 echo = False
 if (not isinstance(request, list)) and request.echo:
-# TODO: handle the case propmt is token ids
+# TODO: handle the case prompt is token ids
 if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
 # for the case of multiple str prompts
 prompts = request.prompt
@@ -646,7 +646,7 @@ def v1_generate_response(
 finish_reason = ret_item["meta_info"]["finish_reason"]
 if to_file:
-# to make the choise data json serializable
+# to make the choice data json serializable
 choice_data = {
 "index": 0,
 "text": text,
@@ -147,7 +147,7 @@ class ReasoningParser:
 Args:
 model_type (str): Type of model to parse reasoning from
-stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
+stream_reasoning (bool): If False, accumulates reasoning content until complete.
 If True, streams reasoning content as it arrives.
 """
@@ -294,7 +294,7 @@ class SamplingBatchInfo:
 # Set the flag to True if any of the two has custom logit processor
 self.has_custom_logit_processor = True
-# Note: becasue the __len()__ operator is defined on the temperatures tensor,
+# Note: because the __len()__ operator is defined on the temperatures tensor,
 # please make sure any merge operation with len(self) or len(other) is done before
 # the merge operation of the temperatures tensor below.
 for item in [
@@ -825,7 +825,7 @@ class ServerArgs:
 # Multi-node distributed serving
 parser.add_argument(
 "--dist-init-addr",
-"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
+"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
 type=str,
 help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
 )
@@ -1096,7 +1096,7 @@ class ServerArgs:
 parser.add_argument(
 "--triton-attention-reduce-in-fp32",
 action="store_true",
-help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
 "This only affects Triton attention kernels.",
 )
 parser.add_argument(
@@ -1188,7 +1188,7 @@ class ServerArgs:
 type=int,
 default=0,
 help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-"set it to tp_size can get best optimized performace.",
+"set it to tp_size can get best optimized performance.",
 )
 parser.add_argument(
 "--disable-chunked-prefix-cache",
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
 self.capture()
 except RuntimeError as e:
 raise Exception(
-f"Capture cuda graph failed: {e}\n"
+f"Capture CUDA graph failed: {e}\n"
 "Possible solutions:\n"
 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
 "3. disable torch compile by not using --enable-torch-compile\n"
-"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
 )
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
 # Run and capture
 def run_once():
-# Backup two fileds, which will be modified in-place in `draft_forward`.
+# Backup two fields, which will be modified in-place in `draft_forward`.
 output_cache_loc_backup = forward_batch.out_cache_loc
 hidden_states_backup = forward_batch.spec_info.hidden_states
@@ -167,12 +167,12 @@ class EagleVerifyOutput:
 draft_input: EagleDraftInput
 # Logit outputs from target worker
 logits_output: LogitsProcessorOutput
-# Accepeted token ids including the bonus token
+# Accepted token ids including the bonus token
 verified_id: torch.Tensor
-# Accepeted token length per sequence in a batch in CPU.
+# Accepted token length per sequence in a batch in CPU.
 accept_length_per_req_cpu: List[int]
-# Accepeted indices from logits_output.next_token_logits
-accepeted_indices: torch.Tensor
+# Accepted indices from logits_output.next_token_logits
+accepted_indices: torch.Tensor
 @dataclass
@@ -316,7 +316,7 @@ class EagleVerifyInput:
 This API updates values inside logits_output based on the accepted
 tokens. I.e., logits_output.next_token_logits only contains
-accepeted token logits.
+accepted token logits.
 """
 bs = self.retrive_index.shape[0]
 candidates = self.draft_token.reshape(bs, self.draft_token_num)
@@ -493,7 +493,7 @@ class EagleVerifyInput:
 logits_output=logits_output,
 verified_id=verified_id,
 accept_length_per_req_cpu=accept_length_cpu,
-accepeted_indices=accept_index,
+accepted_indices=accept_index,
 )
 else:
 assign_req_to_token_pool[(bs,)](
@@ -539,7 +539,7 @@ class EagleVerifyInput:
 logits_output=logits_output,
 verified_id=verified_id,
 accept_length_per_req_cpu=accept_length_cpu,
-accepeted_indices=accept_index,
+accepted_indices=accept_index,
 )
@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
 self.has_prefill_wrapper_verify = False
 else:
 raise ValueError(
-f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
+f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
 )
 self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
 Args:
 batch: The batch to run forward. The state of the batch is modified as it runs.
 Returns:
-A tuple of the final logit output of the target model, next tokens accepeted,
-the batch id (used for overlap schedule), and number of accepeted tokens.
+A tuple of the final logit output of the target model, next tokens accepted,
+the batch id (used for overlap schedule), and number of accepted tokens.
 """
 if batch.forward_mode.is_decode():
 with self.draft_tp_context(self.draft_model_runner.tp_group):
@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
 )
 # Post process based on verified outputs.
-# Pick indices that we care (accepeted)
+# Pick indices that we care (accepted)
 logits_output.next_token_logits = logits_output.next_token_logits[
-res.accepeted_indices
+res.accepted_indices
 ]
-logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
+logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
 # Prepare the batch for the next draft forwards.
 batch.forward_mode = ForwardMode.DECODE
@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
 self.capture_for_decode(logits_output, forward_batch.spec_info)
 def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
-# Backup fileds that will be modified in-place
+# Backup fields that will be modified in-place
 seq_lens_backup = batch.seq_lens.clone()
 req_pool_indices_backup = batch.req_pool_indices
 accept_length_backup = batch.spec_info.accept_length
@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
 max_tokens=self.max_tokens,
 )
 return response.choices[0].message.content
-# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
+# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
 except openai.BadRequestError as e:
 print("Bad Request Error", e)
 return ""
@@ -121,7 +121,7 @@ class HumanEval(Eval):
 convo=convo,
 metrics={
 f"pass@{k}": estimate_pass_at_k([total], [correct], k)
-# this will be aggrated so no need of .mean()
+# this will be aggregated so no need of .mean()
 for k in self._ks_passes
 if total >= k
 },
@@ -370,7 +370,7 @@ def test_dtype_gen():
 @sgl.function
 def dtype_gen(s):
 s += "Q: What is the full name of DNS?\n"
-s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
+s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
 s += "Q: Which year was DNS invented?\n"
 s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
 s += "Q: What is the value of pi?\n"
@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
 f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
 )
 if signum == signal.SIGTERM:
-logger.info(f"{sub_module_name} recive sigterm")
+logger.info(f"{sub_module_name} receive sigterm")
 signal.signal(signal.SIGTERM, graceful_shutdown)
@@ -25,7 +25,7 @@ pip install -e "python[all]"
 pip install torch_memory_saver
 pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
-# For compling xgrammar kernels
+# For compiling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 # For lmms_evals evaluating MMMU
@@ -43,7 +43,7 @@ pip install -e "python[all]"
 pip install torch_memory_saver
 pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
-# For compling xgrammar kernels
+# For compiling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 # For lmms_evals evaluating MMMU
""" """
Convert Yi-VL config into a format useable with SGLang Convert Yi-VL config into a format usable with SGLang
Usage: python3 scripts/convert_yi_vl.py --model-path <path-to-model> Usage: python3 scripts/convert_yi_vl.py --model-path <path-to-model>
""" """
......
@@ -90,7 +90,7 @@ def export_nextn_layer_parameters(input_dir, output_dir, nextn_layer_id):
 if __name__ == "__main__":
 parser = argparse.ArgumentParser(
-description="Export NextN layer paramerters for DeepSeek-V3/R1"
+description="Export NextN layer parameters for DeepSeek-V3/R1"
 )
 parser.add_argument(
 "--input-dir",
@@ -114,7 +114,7 @@ set(SGL_KERNEL_CUDA_FLAGS
 "--expt-extended-lambda"
 "--threads=32"
-# Supress warnings
+# Suppress warnings
 "-Xcompiler=-Wconversion"
 "-Xcompiler=-fno-strict-aliasing"
@@ -87,7 +87,7 @@ Third-party libraries:
 The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
-And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. Thats mean if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
+And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
 ### Kernel Development