Unverified Commit e8e18dcd authored by Lianmin Zheng, committed by GitHub

Revert "fix some typos" (#6244)

parent bad7c26f
@@ -78,7 +78,7 @@ class EAGLEWorker(TpModelWorker):
# Override context length with target model's context length
server_args.context_length = target_worker.model_runner.model_config.context_len
-# Do not capture CUDA graph in `super().__init__()`
+# Do not capture cuda graph in `super().__init__()`
# It will be captured later.
backup_disable_cuda_graph = server_args.disable_cuda_graph
server_args.disable_cuda_graph = True
@@ -136,7 +136,7 @@ class EAGLEWorker(TpModelWorker):
# Share the embedding and lm_head
self.draft_model_runner.model.set_embed_and_head(embed, head)
-# Init attention backend and CUDA graphs
+# Init attention backend and cuda graphs
self.draft_model_runner.server_args.disable_cuda_graph = (
backup_disable_cuda_graph
)
@@ -148,7 +148,7 @@ class EAGLEWorker(TpModelWorker):
self.init_cuda_graphs()
def init_attention_backend(self):
-# Create multi-step attn backends and CUDA graph runners
+# Create multi-step attn backends and cuda graph runners
if self.server_args.attention_backend == "flashinfer":
if not global_server_args_dict["use_mla_backend"]:
from sglang.srt.layers.attention.flashinfer_backend import (
@@ -207,7 +207,7 @@ class EAGLEWorker(TpModelWorker):
self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
def init_cuda_graphs(self):
-"""Capture CUDA graphs."""
+"""Capture cuda graphs."""
self.cuda_graph_runner = None
self.cuda_graph_runner_for_draft_extend = None
@@ -218,12 +218,12 @@ class EAGLEWorker(TpModelWorker):
tic = time.time()
before_mem = get_available_gpu_memory(self.device, self.gpu_id)
logger.info(
-f"Capture draft CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
)
self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
after_mem = get_available_gpu_memory(self.device, self.gpu_id)
logger.info(
-f"Capture draft CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
+f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
)
# Capture extend
......
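The hunk above brackets CUDA graph capture with free-memory readings from sglang's get_available_gpu_memory helper. A minimal sketch of the same before/after bookkeeping using only public PyTorch APIs (the function names below are illustrative, not sglang's):

```python
import time

import torch


def available_gpu_mem_gb(device: int = 0) -> float:
    # torch.cuda.mem_get_info returns (free_bytes, total_bytes) for the device.
    torch.cuda.synchronize(device)
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device)
    return free_bytes / (1 << 30)


def capture_with_mem_logging(capture_fn, device: int = 0):
    # Mirror the logging pattern in the hunk: time the capture and report
    # how much free memory it consumed.
    tic = time.time()
    before = available_gpu_mem_gb(device)
    result = capture_fn()
    after = available_gpu_mem_gb(device)
    print(
        f"capture took {time.time() - tic:.2f} s, "
        f"avail mem={after:.2f} GB, mem usage={before - after:.2f} GB"
    )
    return result
```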
@@ -1117,7 +1117,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
def set_prometheus_multiproc_dir():
# Set prometheus multiprocess directory
-# SGLang uses prometheus multiprocess mode
+# sglang uses prometheus multiprocess mode
# we need to set this before importing prometheus_client
# https://prometheus.github.io/client_python/multiprocess/
global prometheus_multiproc_dir
......
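The hunk above is where sglang points prometheus_client at a multiprocess directory before the library is imported. A minimal, hedged sketch of that general pattern (the metric and directory handling below are illustrative, not sglang's actual code):

```python
import os
import tempfile

# Prometheus multiprocess mode reads this environment variable at import time,
# so it must be set before prometheus_client is imported.
prometheus_multiproc_dir = tempfile.TemporaryDirectory()
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name

from prometheus_client import CollectorRegistry, Counter, generate_latest, multiprocess

requests_total = Counter("requests_total", "Requests served across worker processes")
requests_total.inc()

# Aggregate the per-process metric files into a single registry for scraping.
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)
print(generate_latest(registry).decode())
```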
@@ -42,7 +42,7 @@ class MockModelRunner:
"TokenPool",
(),
{
-# A typical max_bs * max_context_len for CUDA graph decode
+# A typical max_bs * max_context_len for cuda graph decode
"size": max_batch_size,
# Add req_to_token attribute
"req_to_token": torch.zeros(
......
@@ -37,7 +37,7 @@ class MockModelRunner:
"TokenPool",
(),
{
-# A typical max_bs * max_context_len for CUDA graph decode
+# A typical max_bs * max_context_len for cuda graph decode
"size": batch_size,
# Add req_to_token attribute
"req_to_token": torch.zeros(
......
@@ -83,11 +83,11 @@ Third-party libraries:
### FlashAttention FYI
-FA3 can fail without a enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89.
+FA3 can fail without a enough shared memory for a some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
The main different Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
-And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3.
+And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
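As a rough companion to the paragraph above: the two buckets differ mainly in how much shared memory a kernel can opt into per SM, and the compute capability is enough to tell them apart. A hedged sketch (limits taken from the linked CUDA programming guide table; this is not the sgl-kernel check, which appears in is_fa3_supported further down this page):

```python
import torch

# Maximum shared-memory carveout per SM in KB, per the linked CUDA guide.
MAX_SMEM_CARVEOUT_KB = {
    (8, 0): 164,  # e.g. A100
    (8, 7): 164,  # e.g. Jetson Orin
    (8, 6): 100,  # e.g. 3090
    (8, 9): 100,  # e.g. L20 / L40 / L40s / 4090
    (9, 0): 228,  # e.g. H100 (sm90a)
}


def fa3_smem_budget_kb(device=None) -> int:
    # Returns 0 for architectures not listed above.
    return MAX_SMEM_CARVEOUT_KB.get(torch.cuda.get_device_capability(device), 0)
```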
### Kernel Development
@@ -164,7 +164,7 @@ template <>
struct pytorch_library_compatible_type<int> {
using type = int64_t;
static int convert_from_type(int64_t arg) {
TORCH_CHECK(arg <= std::numeric_limits<int>::max(), "int64_t value is too large to be converted to int");
TORCH_CHECK(arg >= std::numeric_limits<int>::min(), "int64_t value is too small to be converted to int");
return arg;
}
......
@@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
-# compare the performance of CUDA, triton and vllm implementation
+# compare the performance of cuda, triton and vllm implementation
sgl_moe_align_block_size(
topk_ids,
num_experts,
@@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
),
quantiles=quantiles,
)
-else: # vLLM
+else: # vllm
try:
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: ops.moe_align_block_size(
......
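Both branches of the benchmark above time a call with triton.testing.do_bench. A self-contained sketch of that usage with a stand-in operation (torch.add here, purely illustrative):

```python
import torch
import triton


def bench(fn):
    # do_bench repeatedly launches fn on the GPU and reports the requested
    # timing quantiles in milliseconds, matching the unpacking used above.
    quantiles = [0.5, 0.2, 0.8]
    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)
    return ms, min_ms, max_ms


if __name__ == "__main__":
    x = torch.randn(1 << 20, device="cuda")
    y = torch.randn(1 << 20, device="cuda")
    median, p20, p80 = bench(lambda: torch.add(x, y))
    print(f"median={median:.4f} ms, p20={p20:.4f} ms, p80={p80:.4f} ms")
```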
@@ -280,8 +280,8 @@ class CustomAllreduce {
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
-// Stores rank data from all ranks. This is mainly for CUDA graph purposes.
-// For CUDA graph to work, all kernel arguments must be fixed during graph
+// Stores rank data from all ranks. This is mainly for cuda graph purposes.
+// For cuda graph to work, all kernel arguments must be fixed during graph
// capture time. However, the peer pointers are not known during graph capture
// time. Therefore, during capture, we increment the rank data pointer and use
// that as the argument to the kernel. The kernel arguments are stored in
@@ -291,7 +291,7 @@ class CustomAllreduce {
//
// The overall process looks like this:
// 1. Graph capture.
-// 2. Each rank obtains the IPC handles for each addresses used during CUDA
+// 2. Each rank obtains the IPC handles for each addresses used during cuda
// graph capture using get_graph_buffer_ipc_meta.
// 3. (In Python) all gather the IPC handles.
// 4. Obtain the peer pointers by opening the IPC handles, and store them in
......
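Step 3 in the comment above ("(In Python) all gather the IPC handles") is an ordinary object all-gather across ranks. A hedged sketch of what that step can look like, assuming each rank has already serialized its handles to bytes (names here are illustrative, not the actual sglang binding):

```python
import torch.distributed as dist


def all_gather_ipc_handles(local_handles: bytes) -> list:
    # After this call every rank holds the serialized handles of all ranks
    # and can open the peer pointers (step 4 in the comment above).
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, local_handles)
    return gathered
```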
@@ -65,5 +65,5 @@ from sgl_kernel.speculative import (
from sgl_kernel.version import __version__
build_tree_kernel = (
-None # TODO(ying): remove this after updating the SGLang python code.
+None # TODO(ying): remove this after updating the sglang python code.
)
@@ -10,14 +10,14 @@ except:
def is_fa3_supported(device=None) -> bool:
-# There some FA3 FYI
+# There some fa3 FYI
# FA3 can fail without a enough shared memory for a some shapes, such as higher
# hidden_dim or some special cases.
-# Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main different
+# Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different
# Between sm80/sm87 and sm86/sm89 is the shared memory size. you can follow the link below for more information
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
-# And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a.
-# That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3.
+# And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
+# That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
return (
torch.cuda.get_device_capability(device)[0] == 9
or torch.cuda.get_device_capability(device)[0] == 8
......
@@ -197,7 +197,7 @@ def test_merge_attn_states(
if not torch.cuda.is_available():
pytest.skip(
"Currently only support compare triton merge_attn_states "
-"with custom CUDA merge_attn_states kernel"
+"with custom cuda merge_attn_states kernel"
)
NUM_TOKENS = num_tokens
......
@@ -47,8 +47,8 @@ TEST_CUDA_GRAPH_PADDING_PROMPTS = [
class TestLoRACudaGraph(CustomTestCase):
def _run_without_cuda_graph_on_model_cases(self, model_cases: List[LoRAModelCase]):
-# Since we have already enabled CUDA graph by default in other LoRA tests,
-# we only need to run LoRA tests without CUDA graph here.
+# Since we have already enabled CUDA graph by default in other lora tests,
+# we only need to run lora tests without CUDA graph here.
for model_case in model_cases:
# If skip_long_prompt is True, filter out prompts longer than 1000 characters
prompts = (
......
@@ -154,7 +154,7 @@ def run_lora_test_one_by_one(
model_case (LoRAModelCase): The model case to test.
torch_dtype (torch.dtype): The torch dtype to use.
max_new_tokens (int): The maximum number of new tokens to generate.
-backend (str): The LoRA backend to use.
+backend (str): The lora backend to use.
disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
@@ -289,7 +289,7 @@ def run_lora_test_by_batch(
test_tag: str = "",
):
"""
-Run LoRA tests as a batch.
+Run lora tests as a batch.
For prompt0, prompt1, ..., promptN,
we will use adaptor0, adaptor1, ..., adaptorN included in model case,
We will then compare the outputs of HF and SRT with LoRA.
@@ -301,7 +301,7 @@ def run_lora_test_by_batch(
model_case (LoRAModelCase): The model case to test.
torch_dtype (torch.dtype): The torch dtype to use.
max_new_tokens (int): The maximum number of new tokens to generate.
-backend (str): The LoRA backend to use.
+backend (str): The lora backend to use.
disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
@@ -372,8 +372,8 @@ def run_lora_test_by_batch(
print("ROUGE-L score:", rouge_score)
print("SRT output:", srt_output_str)
print("HF output:", hf_output_str)
-print("SRT no LoRA output:", srt_no_lora_outputs.output_strs[i].strip())
-print("HF no LoRA output:", hf_no_lora_outputs.output_strs[i].strip())
+print("SRT no lora output:", srt_no_lora_outputs.output_strs[i].strip())
+print("HF no lora output:", hf_no_lora_outputs.output_strs[i].strip())
assert srt_outputs.output_strs[i].strip(" ") == hf_outputs.output_strs[i].strip(
" "
), (
......
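The ROUGE-L score printed in the hunk above gives a softer similarity signal alongside the exact-match assert. A hedged sketch of computing ROUGE-L with the rouge-score package (not necessarily the scorer sglang's test utilities use):

```python
from rouge_score import rouge_scorer


def rouge_l_f1(reference: str, hypothesis: str) -> float:
    # Longest-common-subsequence based F1 between the two strings.
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    return scorer.score(reference, hypothesis)["rougeL"].fmeasure


print(rouge_l_f1("The cat sat on the mat.", "A cat sat on a mat."))
```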
@@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
def test_1_quantization_args(self):
-# we only test fp8 because other methods are currently dependent on vLLM. We can add other methods back to test after vLLM dependency is resolved.
+# we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved.
quantization_args_list = [
# "awq",
"fp8",
@@ -34,7 +34,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
def test_2_torchao_args(self):
-# we don't test int8dq because currently there is conflict between int8dq and capture CUDA graph
+# we don't test int8dq because currently there is conflict between int8dq and capture cuda graph
torchao_args_list = [
# "int8dq",
"int8wo",
......
@@ -277,7 +277,7 @@ class TestTritonAttention(CustomTestCase):
def test_decode_attention(self):
# Here we just to ensure there is no error
-# TODO: correctness test
+# TODO: correctnesss test
# Test configurations
configs = [
......
@@ -189,7 +189,7 @@ def init_process_hf(
print(f"[hf] {rank=} {broadcast_time=:.3f}s")
param_queue.put(("broadcast_time", broadcast_time))
-# Delete the HuggingFace models to free up memory.
+# Delete the huggingface models to free up memory.
del hf_instruct_model
del hf_base_model
gc.collect()
......
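The hunk above frees the HF models with del plus gc.collect(); a common, hedged extension of that teardown also returns cached CUDA blocks to the driver. A sketch with a placeholder model (tiny, just to make the pattern runnable):

```python
import gc

import torch
from transformers import AutoModelForCausalLM

# Placeholder model purely to illustrate the teardown pattern.
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2").to("cuda")

# Memory is only released once no references remain, so drop the name,
# collect, and then hand cached blocks back to the CUDA driver.
del model
gc.collect()
torch.cuda.empty_cache()
print(f"free GPU mem: {torch.cuda.mem_get_info()[0] / (1 << 30):.2f} GB")
```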