Unverified Commit 2ce87935 authored by applesaucethebun, committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
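
This excerpt shows only the resulting typo fixes; the pre-commit configuration change that registers the checker itself is not included. As a rough sketch, assuming the crate-ci/typos hook is the tool being wired in (the PR may use a different checker, release tag, or exclude list), the entry in `.pre-commit-config.yaml` would look something like this:

```yaml
# Sketch of a typo-checker hook entry; the rev and layout here are assumptions,
# not values taken from this PR.
repos:
  - repo: https://github.com/crate-ci/typos
    rev: v1.21.0  # assumed release tag; pin whatever the project actually uses
    hooks:
      - id: typos
```

With such a hook installed, `pre-commit run --all-files` flags misspellings like the ones corrected in the hunks below.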
......@@ -428,7 +428,7 @@ class Llama4DecoderLayer(nn.Module):
# Fully Connected
hidden_states = self.feed_forward(hidden_states, forward_batch)
-# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
# Scatter
if self.dp_size != 1:
# important: forward batch.gathered_buffer is used both after scatter and after gather.
......
......@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
input_shape = input_ids.size()
inputs_embeds = self.word_embeddings(input_ids)
-# adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+# Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
pos_list = []
token_list = []
......
......@@ -37,7 +37,7 @@ $ python3 -m sglang.bench_one_batch --correct \
--tensor-parallel-size 2 \
--disable-cuda-graph
```
-We will eanble CUDA Graph support soon.
+We will enable CUDA Graph support soon.
"""
import types
......
......@@ -590,7 +590,7 @@ def v1_generate_response(
echo = False
if (not isinstance(request, list)) and request.echo:
-# TODO: handle the case propmt is token ids
+# TODO: handle the case prompt is token ids
if isinstance(request.prompt, list) and isinstance(request.prompt[0], str):
# for the case of multiple str prompts
prompts = request.prompt
......@@ -646,7 +646,7 @@ def v1_generate_response(
finish_reason = ret_item["meta_info"]["finish_reason"]
if to_file:
-# to make the choise data json serializable
+# to make the choice data json serializable
choice_data = {
"index": 0,
"text": text,
......
......@@ -147,7 +147,7 @@ class ReasoningParser:
Args:
model_type (str): Type of model to parse reasoning from
-stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
+stream_reasoning (bool): If False, accumulates reasoning content until complete.
If True, streams reasoning content as it arrives.
"""
......
......@@ -294,7 +294,7 @@ class SamplingBatchInfo:
# Set the flag to True if any of the two has custom logit processor
self.has_custom_logit_processor = True
-# Note: becasue the __len()__ operator is defined on the temperatures tensor,
+# Note: because the __len()__ operator is defined on the temperatures tensor,
# please make sure any merge operation with len(self) or len(other) is done before
# the merge operation of the temperatures tensor below.
for item in [
......
......@@ -825,7 +825,7 @@ class ServerArgs:
# Multi-node distributed serving
parser.add_argument(
"--dist-init-addr",
"--nccl-init-addr", # For backward compatbility. This will be removed in the future.
"--nccl-init-addr", # For backward compatibility. This will be removed in the future.
type=str,
help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
)
......@@ -1096,7 +1096,7 @@ class ServerArgs:
parser.add_argument(
"--triton-attention-reduce-in-fp32",
action="store_true",
help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
"This only affects Triton attention kernels.",
)
parser.add_argument(
......@@ -1188,7 +1188,7 @@ class ServerArgs:
type=int,
default=0,
help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
"set it to tp_size can get best optimized performace.",
"set it to tp_size can get best optimized performance.",
)
parser.add_argument(
"--disable-chunked-prefix-cache",
......
......@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
self.capture()
except RuntimeError as e:
raise Exception(
f"Capture cuda graph failed: {e}\n"
f"Capture CUDA graph failed: {e}\n"
"Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
)
......@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
# Run and capture
def run_once():
-# Backup two fileds, which will be modified in-place in `draft_forward`.
+# Backup two fields, which will be modified in-place in `draft_forward`.
output_cache_loc_backup = forward_batch.out_cache_loc
hidden_states_backup = forward_batch.spec_info.hidden_states
......
......@@ -167,12 +167,12 @@ class EagleVerifyOutput:
draft_input: EagleDraftInput
# Logit outputs from target worker
logits_output: LogitsProcessorOutput
-# Accepeted token ids including the bonus token
+# Accepted token ids including the bonus token
verified_id: torch.Tensor
-# Accepeted token length per sequence in a batch in CPU.
+# Accepted token length per sequence in a batch in CPU.
accept_length_per_req_cpu: List[int]
-# Accepeted indices from logits_output.next_token_logits
-accepeted_indices: torch.Tensor
+# Accepted indices from logits_output.next_token_logits
+accepted_indices: torch.Tensor
@dataclass
......@@ -316,7 +316,7 @@ class EagleVerifyInput:
This API updates values inside logits_output based on the accepted
tokens. I.e., logits_output.next_token_logits only contains
-accepeted token logits.
+accepted token logits.
"""
bs = self.retrive_index.shape[0]
candidates = self.draft_token.reshape(bs, self.draft_token_num)
......@@ -493,7 +493,7 @@ class EagleVerifyInput:
logits_output=logits_output,
verified_id=verified_id,
accept_length_per_req_cpu=accept_length_cpu,
-accepeted_indices=accept_index,
+accepted_indices=accept_index,
)
else:
assign_req_to_token_pool[(bs,)](
......@@ -539,7 +539,7 @@ class EagleVerifyInput:
logits_output=logits_output,
verified_id=verified_id,
accept_length_per_req_cpu=accept_length_cpu,
-accepeted_indices=accept_index,
+accepted_indices=accept_index,
)
......
......@@ -201,7 +201,7 @@ class EAGLEWorker(TpModelWorker):
self.has_prefill_wrapper_verify = False
else:
raise ValueError(
f"EAGLE is not supportted in attention backend {self.server_args.attention_backend}"
f"EAGLE is not supported in attention backend {self.server_args.attention_backend}"
)
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
......@@ -245,8 +245,8 @@ class EAGLEWorker(TpModelWorker):
Args:
batch: The batch to run forward. The state of the batch is modified as it runs.
Returns:
-A tuple of the final logit output of the target model, next tokens accepeted,
-the batch id (used for overlap schedule), and number of accepeted tokens.
+A tuple of the final logit output of the target model, next tokens accepted,
+the batch id (used for overlap schedule), and number of accepted tokens.
"""
if batch.forward_mode.is_decode():
with self.draft_tp_context(self.draft_model_runner.tp_group):
......@@ -491,11 +491,11 @@ class EAGLEWorker(TpModelWorker):
)
# Post process based on verified outputs.
-# Pick indices that we care (accepeted)
+# Pick indices that we care (accepted)
logits_output.next_token_logits = logits_output.next_token_logits[
-res.accepeted_indices
+res.accepted_indices
]
-logits_output.hidden_states = logits_output.hidden_states[res.accepeted_indices]
+logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
# Prepare the batch for the next draft forwards.
batch.forward_mode = ForwardMode.DECODE
......@@ -597,7 +597,7 @@ class EAGLEWorker(TpModelWorker):
self.capture_for_decode(logits_output, forward_batch.spec_info)
def forward_draft_extend_after_decode(self, batch: ScheduleBatch):
-# Backup fileds that will be modified in-place
+# Backup fields that will be modified in-place
seq_lens_backup = batch.seq_lens.clone()
req_pool_indices_backup = batch.req_pool_indices
accept_length_backup = batch.spec_info.accept_length
......
......@@ -140,7 +140,7 @@ class ChatCompletionSampler(SamplerBase):
max_tokens=self.max_tokens,
)
return response.choices[0].message.content
-# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
+# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
except openai.BadRequestError as e:
print("Bad Request Error", e)
return ""
......
......@@ -121,7 +121,7 @@ class HumanEval(Eval):
convo=convo,
metrics={
f"pass@{k}": estimate_pass_at_k([total], [correct], k)
-# this will be aggrated so no need of .mean()
+# this will be aggregated so no need of .mean()
for k in self._ks_passes
if total >= k
},
......
......@@ -370,7 +370,7 @@ def test_dtype_gen():
@sgl.function
def dtype_gen(s):
s += "Q: What is the full name of DNS?\n"
s += "A: The full nams is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
s += "A: The full names is " + sgl.gen("str_res", dtype=str, stop="\n") + "\n"
s += "Q: Which year was DNS invented?\n"
s += "A: " + sgl.gen("int_res", dtype=int) + "\n"
s += "Q: What is the value of pi?\n"
......
......@@ -278,7 +278,7 @@ def graceful_registry(sub_module_name: str):
f"{sub_module_name} Received signal to shutdown. Performing graceful shutdown..."
)
if signum == signal.SIGTERM:
logger.info(f"{sub_module_name} recive sigterm")
logger.info(f"{sub_module_name} receive sigterm")
signal.signal(signal.SIGTERM, graceful_shutdown)
......
......@@ -25,7 +25,7 @@ pip install -e "python[all]"
pip install torch_memory_saver
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
-# For compling xgrammar kernels
+# For compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12
# For lmms_evals evaluating MMMU
......
......@@ -43,7 +43,7 @@ pip install -e "python[all]"
pip install torch_memory_saver
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
-# For compling xgrammar kernels
+# For compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12
# For lmms_evals evaluating MMMU
......
"""
-Convert Yi-VL config into a format useable with SGLang
+Convert Yi-VL config into a format usable with SGLang
Usage: python3 scripts/convert_yi_vl.py --model-path <path-to-model>
"""
......
......@@ -90,7 +90,7 @@ def export_nextn_layer_parameters(input_dir, output_dir, nextn_layer_id):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Export NextN layer paramerters for DeepSeek-V3/R1"
description="Export NextN layer parameters for DeepSeek-V3/R1"
)
parser.add_argument(
"--input-dir",
......
......@@ -114,7 +114,7 @@ set(SGL_KERNEL_CUDA_FLAGS
"--expt-extended-lambda"
"--threads=32"
-# Supress warnings
+# Suppress warnings
"-Xcompiler=-Wconversion"
"-Xcompiler=-fno-strict-aliasing"
......
......@@ -87,7 +87,7 @@ Third-party libraries:
The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
-And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. Thats mean if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
+And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
### Kernel Development
......