Unverified commit e8e18dcd authored by Lianmin Zheng, committed by GitHub

Revert "fix some typos" (#6244)

parent bad7c26f
@@ -16,7 +16,7 @@ from sglang.srt.lora.utils import (
class LoRAMemoryPool:
- """Class for memory pool management of LoRA modules"""
+ """Class for memory pool management of lora modules"""
def __init__(
self,
@@ -38,7 +38,7 @@ class LoRAMemoryPool:
self.tp_rank: int = tp_rank
self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules
- # Both A_buffer and B_buffer maps LoRA weight names to its buffer space.
+ # Both A_buffer and B_buffer maps lora weight names to its buffer space.
# A_buffer contains num_layer number of row-major tensors with shape
# (max_loras_per_batch, stacked_num * max_lora_dim, input_dim)
# B_buffer contains num_layer number of column-major tensors with shape
@@ -46,10 +46,10 @@ class LoRAMemoryPool:
self.A_buffer: Dict[str, List[torch.Tensor]] = {}
self.B_buffer: Dict[str, List[torch.Tensor]] = {}
- # LoRA uid -> buffer idx in memory pool
+ # Lora uid -> buffer idx in memory pool
self.uid_to_buffer_id: Dict[Optional[str], int] = {}
- # Buffer idx -> LoRA uid in memory pool
+ # Buffer idx -> lora uid in memory pool
# All uids are initialized as empty strings for empty buffer slots
# Here we don't initialize to None since None is a valid uid
self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
@@ -95,7 +95,7 @@ class LoRAMemoryPool:
base_model: torch.nn.Module,
):
- # lora_weight_names is a set of name pairs indicating each pair of LoRA modules to load
+ # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
# e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
device = next(base_model.parameters()).device
@@ -137,7 +137,7 @@ class LoRAMemoryPool:
return buffer_id, ""
for buffer_id in range(self.max_loras_per_batch):
- # Evict unneeded LoRA
+ # Evict unneeded lora
if self.buffer_id_to_uid[buffer_id] not in cur_uids:
return buffer_id, self.buffer_id_to_uid[buffer_id]
...
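The hunks above document LoRAMemoryPool's bookkeeping: adapter uids map to fixed buffer slots, empty slots are marked with "" (None is a valid uid), and a slot whose adapter is not needed by the current batch can be reclaimed. A minimal standalone sketch of that allocation/eviction idea; the class and method names below are made up for illustration and are not sglang's API:

```python
from typing import Dict, List, Optional, Set, Tuple


class ToyLoRASlotPool:
    """Toy uid <-> slot bookkeeping; illustrative only."""

    def __init__(self, max_loras_per_batch: int):
        # "" marks an empty slot; None is not used for that because None is a valid uid.
        self.buffer_id_to_uid: List[Optional[str]] = [""] * max_loras_per_batch
        self.uid_to_buffer_id: Dict[Optional[str], int] = {}

    def get_slot(
        self, uid: Optional[str], cur_uids: Set[Optional[str]]
    ) -> Tuple[int, Optional[str]]:
        """Return (slot_id, evicted_uid); evicted_uid is "" if no adapter was evicted."""
        if uid in self.uid_to_buffer_id:  # adapter already resident
            return self.uid_to_buffer_id[uid], ""
        for slot, owner in enumerate(self.buffer_id_to_uid):  # prefer an empty slot
            if owner == "":
                return self._claim(uid, slot), ""
        for slot, owner in enumerate(self.buffer_id_to_uid):  # evict an adapter unused by this batch
            if owner not in cur_uids:
                del self.uid_to_buffer_id[owner]
                return self._claim(uid, slot), owner
        raise RuntimeError("every slot is needed by the current batch")

    def _claim(self, uid: Optional[str], slot: int) -> int:
        self.buffer_id_to_uid[slot] = uid
        self.uid_to_buffer_id[uid] = slot
        return slot


pool = ToyLoRASlotPool(max_loras_per_batch=2)
print(pool.get_slot("adapter_a", {"adapter_a"}))               # (0, '')
print(pool.get_slot("adapter_b", {"adapter_a", "adapter_b"}))  # (1, '')
print(pool.get_slot("adapter_c", {"adapter_b", "adapter_c"}))  # (0, 'adapter_a')
```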
@@ -37,7 +37,7 @@ def _gate_up_lora_b_kernel(
):
# This kernel packs 2 sgemms (gate/up) into a single kernel.
- # x: (s, 2 * K), s is the sum of sequence lengths, K equals to LoRA rank
+ # x: (s, 2 * K), s is the sum of sequence lengths, K equals to lora rank
# weights: (num_lora, 2 * output_dim, K)
# output: (s, 2 * output_dim)
# output_dim >> K
...
@@ -39,7 +39,7 @@ def _qkv_lora_b_kernel(
):
# This kernel packs 3 sgemms (q/k/v) into a single kernel.
- # x: (s, 3 * K), s is the sum of sequence lengths, K equals to LoRA rank
+ # x: (s, 3 * K), s is the sum of sequence lengths, K equals to lora rank
# weights: (num_lora, N_Q + 2 * N_KV, K)
# output: (s, N_Q + 2 * N_KV)
# N_Q >> K, N_KV >> K
...
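The two Triton kernels above pack several small GEMMs into one launch; their comments give the packed shapes. As a shape reference only (not the Triton implementation), here is one plausible per-token reading of the gate/up layout, with made-up function and variable names:

```python
import torch


def gate_up_lora_b_reference(x, weights, weight_indices):
    # x:       (s, 2 * K)            per-token LoRA-A outputs for gate and up, concatenated
    # weights: (num_lora, 2 * output_dim, K)
    # output:  (s, 2 * output_dim)
    s, two_k = x.shape
    k = two_k // 2
    output_dim = weights.shape[1] // 2
    out = x.new_zeros(s, 2 * output_dim)
    for i in range(s):
        w = weights[weight_indices[i]]                   # (2 * output_dim, K) for this token's adapter
        out[i, :output_dim] = w[:output_dim] @ x[i, :k]  # gate half
        out[i, output_dim:] = w[output_dim:] @ x[i, k:]  # up half
    return out


# Shape check with toy sizes (rank K=16, output_dim=128, 3 adapters, 5 tokens).
x = torch.randn(5, 2 * 16)
weights = torch.randn(3, 2 * 128, 16)
idx = torch.randint(0, 3, (5,))
assert gate_up_lora_b_reference(x, weights, idx).shape == (5, 2 * 128)
```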
@@ -22,13 +22,13 @@ class LoRABatchInfo:
# Maximum sequence length of current batch
max_len: int
- # The index of LoRA adapter used by each sequence, in shape (bs,)
+ # The index of lora adapter used by each sequence, in shape (bs,)
weight_indices: torch.Tensor
- # ranks of each LoRA adapter, in shape (lora_num,)
+ # ranks of each lora adapter, in shape (lora_num,)
lora_ranks: torch.Tensor
- # scaling of each LoRA adapter, in shape (lora_num,)
+ # scaling of each lora adapter, in shape (lora_num,)
scalings: torch.Tensor
@@ -51,9 +51,9 @@ def get_customized_names_from_hf_names(
hf_module_names: Set[str], base_model: torch.nn.Module
) -> Set[str]:
"""
- This function takes in a set of HuggingFace style module names:
+ This function takes in a set of huggingface style module names:
e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
- and outputs a set of module names of customized SGLang layers:
+ and outputs a set of module names of customized sglang layers:
e.g., {"qkv_proj", "o_proj"}
"""
if hasattr(base_model, "get_module_name"):
@@ -87,7 +87,7 @@ def get_hidden_dim(
else:
"""
WARNING: get_hidden_dim() is not defined,
- which is used to get the hidden dim for different LoRA modules
+ which is used to get the hidden dim for different lora modules
Use the default one, but please check if it is correct for your model.
Please implement the function in the model class if it is not.
You can reference this function in llama.py.
@@ -108,7 +108,7 @@ def get_hidden_dim(
def get_stacked_name(name: str) -> Tuple[str]:
"""
- Mapping a target LoRA module name to (stacked name for LoRA A, stacked name for LoRA B)
+ Mapping a target module name to (stacked name for Lora A, stacked name for Lora B)
"""
params_mapping = {
"q_proj": ("qkv_proj", "q_proj"),
@@ -122,7 +122,7 @@ def get_stacked_name(name: str) -> Tuple[str]:
def get_stacked_multiply(module_name: str) -> int:
"""
- Mapping a module name to its magnification at output dimension
+ Mapping a lora module name to its magnification at output dimension
"""
stacked_rank = {
"qkv_proj": 3,
@@ -137,7 +137,7 @@ def get_weight_name(
) -> Optional[str]:
"""
target_name is name of a given module,
- lora_weight_names is a set of LoRA stacked name pairs (see get_stacked_name method above)
+ lora_weight_names is a set of lora stacked name pairs (see get_stacked_name method above)
If there is a weight name in lora_weight_names that can match target_name, return this name
Else raise ValueError.
"""
...
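The utils hunks above document two naming conventions: HuggingFace-style projection names are folded into stacked SGLang layer names, and a stacked name is matched against a module's full path. A self-contained sketch of that pattern; only the "q_proj" entry is taken from the diff, the other mapping entries are illustrative guesses, and the helper names are hypothetical:

```python
from typing import Set, Tuple

# Illustrative subset of a stacked-name table (the full table is params_mapping
# in get_stacked_name above; entries other than "q_proj" are assumed here).
PARAMS_MAPPING = {
    "q_proj": ("qkv_proj", "q_proj"),
    "k_proj": ("qkv_proj", "kv_proj"),
    "v_proj": ("qkv_proj", "kv_proj"),
    "gate_proj": ("gate_up_proj", "gate_up_proj"),
    "up_proj": ("gate_up_proj", "gate_up_proj"),
}


def stacked_name(name: str) -> Tuple[str, str]:
    # (stacked name for LoRA A, stacked name for LoRA B); unmapped names map to themselves.
    return PARAMS_MAPPING.get(name, (name, name))


def match_weight_name(target_name: str, lora_weight_names: Set[Tuple[str, str]]) -> str:
    # Return the first stacked name that occurs inside target_name, else raise,
    # mirroring the contract described in the get_weight_name docstring above.
    for pair in lora_weight_names:
        for name in pair:
            if name in target_name:
                return name
    raise ValueError(f"cannot find a weight name matching {target_name}")


print(stacked_name("k_proj"))  # ('qkv_proj', 'kv_proj')
print(match_weight_name("model.layers.0.self_attn.qkv_proj", {("qkv_proj", "q_proj")}))  # qkv_proj
```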
@@ -1667,7 +1667,7 @@ class Scheduler(
can_cuda_graph = 0
if not spec_algorithm.is_none():
- # TODO(sang): Support CUDA graph when idle batch is there.
+ # TODO(sang): Support cuda graph when idle batch is there.
if local_batch is None or local_batch.forward_mode.is_idle():
can_cuda_graph = 0
@@ -1704,7 +1704,7 @@ class Scheduler(
local_batch.global_num_tokens = global_num_tokens
local_batch.global_num_tokens_for_logprob = global_num_tokens_for_logprob
- # Check forward mode for CUDA graph
+ # Check forward mode for cuda graph
if not disable_cuda_graph:
local_batch.can_run_dp_cuda_graph = can_cuda_graph
...
@@ -238,7 +238,7 @@ class TokenizerManager:
self.metrics_collector = TokenizerMetricsCollector(
labels={
"model_name": self.server_args.served_model_name,
- # TODO: Add LoRA name/path in the future,
+ # TODO: Add lora name/path in the future,
},
)
...
@@ -213,7 +213,7 @@ class TpModelWorkerClient:
penalizer_orchestrator=None,
)
- # A CUDA stream sync here to avoid the CUDA illegal memory access error.
+ # A cuda stream sync here to avoid the cuda illegal memory access error.
self.scheduler_stream.synchronize()
# Push a new batch to the queue
...
@@ -11,7 +11,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
- """Run the model with CUDA graph and torch.compile."""
+ """Run the model with cuda graph and torch.compile."""
from __future__ import annotations
@@ -127,7 +127,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
else:
capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
else:
- # Since speculative decoding requires more CUDA graph memory, we
+ # Since speculative decoding requires more cuda graph memory, we
# capture less.
capture_bs = (
list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
@@ -161,7 +161,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
return capture_bs, compile_bs
- # Reuse this memory pool across all CUDA graph runners.
+ # Reuse this memory pool across all cuda graph runners.
global_graph_memory_pool = None
@@ -175,7 +175,7 @@ def set_global_graph_memory_pool(val):
class CudaGraphRunner:
- """A CudaGraphRunner runs the forward pass of a model with CUDA graph and torch.compile."""
+ """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
def __init__(self, model_runner: ModelRunner):
# Parse args
@@ -194,7 +194,7 @@ class CudaGraphRunner:
# Batch sizes to capture
self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
- rank0_log(f"Capture CUDA graph bs {self.capture_bs}")
+ rank0_log(f"Capture cuda graph bs {self.capture_bs}")
self.capture_forward_mode = ForwardMode.DECODE
self.capture_hidden_mode = CaptureHiddenMode.NULL
self.num_tokens_per_bs = 1
@@ -334,8 +334,8 @@ class CudaGraphRunner:
else forward_batch.batch_size <= self.max_bs
)
- # NOTE: CUDA graph cannot handle mixed batch (encoder_len = 0)
- # If mixed batch cannot be supported, then encoder_lens can be removed in CUDA graph
+ # NOTE: cuda graph cannot handle mixed batch (encoder_len = 0)
+ # If mixed batch cannot be supported, then encoder_lens can be removed in cuda graph
# because the full_text_row_masked_out_mask tensor will always be ones
is_encoder_lens_supported = (
torch.all(forward_batch.encoder_lens > 0)
@@ -350,7 +350,7 @@ class CudaGraphRunner:
avail_mem = get_available_gpu_memory(
self.model_runner.device, self.model_runner.gpu_id, empty_cache=False
)
- # Reverse the order to enable better memory sharing across CUDA graphs.
+ # Reverse the order to enable better memory sharing across cuda graphs.
capture_range = (
tqdm.tqdm(list(reversed(self.capture_bs)))
if get_tensor_model_parallel_rank() == 0
@@ -429,9 +429,9 @@ class CudaGraphRunner:
spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
)
if self.model_runner.server_args.lora_paths is not None:
- # Currently, if the lora_path in `lora_paths` is None, the LoRA backend will use a
- # different logic to handle LoRA, so we need to set `lora_paths` to a list of non-None
- # values if LoRA is enabled.
+ # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
+ # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
+ # values if lora is enabled.
lora_paths = [next(iter(self.model_runner.server_args.lora_paths))] * bs
else:
lora_paths = None
...
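The CUDA-graph runner hunks above revolve around a fixed list of capture batch sizes. A small sketch of the general scheme, capture a fixed set of sizes and pad a live batch up to the nearest captured one; the capture list copies one branch of get_batch_sizes_to_capture above, while the padding helper is my own illustration rather than sglang's code:

```python
import bisect

# Batch sizes to capture, copied from the non-speculative branch shown above.
capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))


def padded_batch_size(bs: int) -> int:
    """Smallest captured batch size that can hold a live batch of size `bs`."""
    i = bisect.bisect_left(capture_bs, bs)
    if i == len(capture_bs):
        raise ValueError(f"bs={bs} exceeds the largest captured size {capture_bs[-1]}")
    return capture_bs[i]


print(padded_batch_size(3))    # 4  -> replay the graph captured for bs=4, padding one slot
print(padded_batch_size(17))   # 24
print(padded_batch_size(160))  # 160
```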
@@ -229,7 +229,7 @@ class ForwardBatch:
# For DP attention
global_num_tokens_cpu: Optional[List[int]] = None
global_num_tokens_gpu: Optional[torch.Tensor] = None
- # Has to be None when CUDA graph is captured.
+ # Has to be None when cuda graph is captured.
global_num_tokens_for_logprob_cpu: Optional[List[int]] = None
global_num_tokens_for_logprob_gpu: Optional[torch.Tensor] = None
# for extend, local start pos and num tokens is different in logits processor
@@ -356,7 +356,7 @@ class ForwardBatch:
if model_runner.model_is_mrope:
ret._compute_mrope_positions(model_runner, batch)
- # Init LoRA information
+ # Init lora information
if model_runner.server_args.lora_paths is not None:
model_runner.lora_manager.prepare_lora_batch(ret)
...
@@ -225,7 +225,7 @@ class ModelRunner:
if self.tp_size > 1 and supports_torch_tp:
self.apply_torch_tp()
- # Init LoRA
+ # Init lora
if server_args.lora_paths is not None:
self.init_lora_manager()
@@ -1009,11 +1009,11 @@ class ModelRunner:
)
def init_cuda_graphs(self):
- """Capture CUDA graphs."""
+ """Capture cuda graphs."""
self.cuda_graph_runner = None
if not self.is_generation:
- # TODO: Currently, CUDA graph only captures decode steps, which only exists for generation models
+ # TODO: Currently, cuda graph only captures decode steps, which only exists for generation models
return
if self.server_args.disable_cuda_graph:
@@ -1022,12 +1022,12 @@ class ModelRunner:
tic = time.time()
before_mem = get_available_gpu_memory(self.device, self.gpu_id)
logger.info(
- f"Capture CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+ f"Capture cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
)
self.cuda_graph_runner = CudaGraphRunner(self)
after_mem = get_available_gpu_memory(self.device, self.gpu_id)
logger.info(
- f"Capture CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. "
+ f"Capture cuda graph end. Time elapsed: {time.time() - tic:.2f} s. "
f"mem usage={(before_mem - after_mem):.2f} GB. avail mem={after_mem:.2f} GB."
)
...
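init_cuda_graphs in the hunk above brackets graph capture with wall-clock and free-memory measurements. A generic sketch of that logging pattern using only public torch APIs; free_memory_gb and capture_with_logging are stand-in helpers, not sglang's get_available_gpu_memory:

```python
import time

import torch


def free_memory_gb(device: int = 0) -> float:
    # torch.cuda.mem_get_info returns (free_bytes, total_bytes) for the device.
    free_bytes, _total = torch.cuda.mem_get_info(device)
    return free_bytes / (1 << 30)


def capture_with_logging(capture_fn, device: int = 0):
    tic = time.time()
    before = free_memory_gb(device)
    result = capture_fn()  # e.g. construct the graph runner, which captures the graphs
    after = free_memory_gb(device)
    print(
        f"Capture end. Time elapsed: {time.time() - tic:.2f} s. "
        f"mem usage={before - after:.2f} GB. avail mem={after:.2f} GB."
    )
    return result
```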
@@ -393,7 +393,7 @@ class CohereForCausalLM(nn.Module):
weight_loader(param, loaded_weight, shard_id)
break
else:
- # lm_head is not used in vLLM as it is tied with embed_token.
+ # lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
...
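This loader and the Gemma loaders below share a fallback: when a checkpoint entry does not hit the stacked-parameter mapping, lm_head.weight is skipped because it is tied to the input embedding. A toy version of that fallback, where `model` and `named_weights` are placeholders for a module and a checkpoint iterator:

```python
def load_weights(model, named_weights):
    params = dict(model.named_parameters())
    for name, loaded_weight in named_weights:
        if "lm_head.weight" in name:
            # lm_head is tied with the embedding, so the checkpoint entry is skipped.
            continue
        params[name].data.copy_(loaded_weight)
```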
@@ -1190,7 +1190,7 @@ class CLIPVisionTower(nn.Module):
# vision_tower = create_sam_vit(**vision_tower_params)
forward_kwargs = dict()
- else:  # HuggingFace
+ else:  # huggingface
from transformers import CLIPVisionModel
vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params)
...
@@ -342,7 +342,7 @@ class GemmaForCausalLM(nn.Module):
weight_loader(param, loaded_weight, shard_id)
break
else:
- # lm_head is not used in vLLM as it is tied with embed_token.
+ # lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
...
@@ -441,7 +441,7 @@ class Gemma2ForCausalLM(nn.Module):
weight_loader(param, loaded_weight, shard_id)
break
else:
- # lm_head is not used in vLLM as it is tied with embed_token.
+ # lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
...
@@ -174,7 +174,7 @@ class Gemma3Attention(nn.Module):
# Local attention. Override the values in config.json.
self.rope_theta = config.rope_local_base_freq
self.rope_scaling = {"rope_type": "default"}
- # FIXME(mick): idk why vLLM does this
+ # FIXME(mick): idk why vllm does this
# self.sliding_window = config.interleaved_sliding_window
self.sliding_window = get_attention_sliding_window_size(config)
else:
@@ -667,7 +667,7 @@ class Gemma3ForCausalLM(PreTrainedModel):
weight_loader(param, loaded_weight, shard_id)
break
else:
- # lm_head is not used in vLLM as it is tied with embed_token.
+ # lm_head is not used in vllm as it is tied with embed_token.
# To prevent errors, skip loading lm_head.weight.
if "lm_head.weight" in name:
continue
...
@@ -418,7 +418,7 @@ class MoonVitEncoderLayer(nn.Module):
hidden_dim: int,
mlp_dim: int,
*,
- attn_implementation: str = "flash_attention_2",  # use fa2 in SGLang by default
+ attn_implementation: str = "flash_attention_2",  # use fa2 in sglang by default
activation=F.gelu,
attn_bias: bool = False,
):
...
@@ -537,8 +537,8 @@ class MllamaTextCrossAttention(nn.Module):
quant_config=quant_config,
prefix=add_prefix("o_proj", prefix),
)
- # vllm.model_executor.layers.layernorm.RMSNorm has a precision issue,
- # use HuggingFace's instead
+ # vllm.model_executor.layers.layernorm.RMSNorm has precision issue,
+ # use huggingface's instead
self.q_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
self.k_norm = MllamaTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)
self.scaling = self.head_dim**-0.5
@@ -979,8 +979,8 @@ class MllamaForConditionalGeneration(nn.Module):
cross_attention_states = None
if self.capture_mode:
- # NOTE: when doing CUDA graph capture, we do not want to skip cross attention
- # Make is a constant value to avoid CUDA graph capture issue
+ # NOTE: when doing cuda graph capture, we do not want to skip cross attention
+ # Make is a constant value to avoid cuda graph capture issue
skip_cross_attention = False
else:
# NOTE: we do not need image_inputs when prefill
...
@@ -57,7 +57,7 @@ class RobertaEmbedding(nn.Module):
input_shape = input_ids.size()
inputs_embeds = self.word_embeddings(input_ids)
- # Adapted from vLLM: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+ # Adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
pos_list = []
token_list = []
...
@@ -67,7 +67,7 @@ class Platform:
# Real device name of current platform.
device_name: str
- # For specifying torch device for CUDA alike platform's capability.
+ # For specifying torch device for cuda alike platform's capability.
device_type: str
# The torch.distributed backend on current platform
@@ -254,7 +254,7 @@ class Platform:
@classmethod
def check_and_update_lora_backend(cls, backend: str) -> str:
"""
- Check and update the LoRA backend for the current platform.
+ Check and update the lora backend for the current platform.
"""
raise NotImplementedError
...
@@ -246,7 +246,7 @@ class ServerArgs:
self.mem_fraction_static = min(
mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
(gpu_mem - 1024 * 18)
- / gpu_mem,  # 15 GB + additional 3GB for CUDA graph
+ / gpu_mem,  # 15 GB + additional 3GB for cuda graph
)
# Set chunked prefill size, which depends on the gpu memory capacity
@@ -276,9 +276,9 @@ class ServerArgs:
)
self.page_size = 128
- # Set CUDA graph max batch size
+ # Set cuda graph max batch size
if self.cuda_graph_max_bs is None:
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable CUDA graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating CUDA graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable CUDA graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating CUDA graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating CUDA graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
if gpu_mem is not None and gpu_mem < 25_000:
if self.tp_size < 4:
self.cuda_graph_max_bs = 8
@@ -729,7 +729,7 @@ class ServerArgs:
"--download-dir",
type=str,
default=ServerArgs.download_dir,
- help="Model download directory for HuggingFace.",
+ help="Model download directory for huggingface.",
)
parser.add_argument(
"--base-gpu-id",
@@ -1024,12 +1024,12 @@ class ServerArgs:
parser.add_argument(
"--disable-cuda-graph",
action="store_true",
- help="Disable CUDA graph.",
+ help="Disable cuda graph.",
)
parser.add_argument(
"--disable-cuda-graph-padding",
action="store_true",
- help="Disable CUDA graph when padding is needed. Still uses CUDA graph when padding is not needed.",
+ help="Disable cuda graph when padding is needed. Still uses cuda graph when padding is not needed.",
)
parser.add_argument(
"--enable-nccl-nvls",
@@ -1075,7 +1075,7 @@ class ServerArgs:
parser.add_argument(
"--enable-ep-moe",
action="store_true",
- help="Enabling expert parallelism for MoE. The ep size is equal to the tp size.",
+ help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
)
parser.add_argument(
"--enable-torch-compile",
@@ -1092,13 +1092,13 @@ class ServerArgs:
"--cuda-graph-max-bs",
type=int,
default=ServerArgs.cuda_graph_max_bs,
- help="Set the maximum batch size for CUDA graph. It will extend the CUDA graph capture batch size to this value.",
+ help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
)
parser.add_argument(
"--cuda-graph-bs",
type=int,
nargs="+",
- help="Set the list of batch sizes for CUDA graph.",
+ help="Set the list of batch sizes for cuda graph.",
)
parser.add_argument(
"--torchao-config",
@@ -1334,7 +1334,7 @@ class ServerArgs:
self.max_loras_per_batch > 0
# FIXME
and (self.lora_paths is None or self.disable_radix_cache)
- ), "compatibility of LoRA and CUDA graph and RadixAttention is in progress"
+ ), "compatibility of lora and cuda graph and radix attention is in progress"
assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
...
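For the mem_fraction_static hunk at the top of this ServerArgs section, a worked example of the formula may help. The GPU size and incoming fraction below are assumed values; the only facts taken from the diff are the formula itself and the 1024 * 18 MB reservation (15 GB plus 3 GB for CUDA graphs):

```python
gpu_mem = 81920       # assumed: an 80 GB GPU, with gpu_mem measured in MB
mem_fraction = 0.88   # assumed incoming value

grown = mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem  # 0.88 + 0.072 = 0.952
cap = (gpu_mem - 1024 * 18) / gpu_mem                            # reserve 18 GB -> 0.775
mem_fraction_static = min(grown, cap)
print(round(mem_fraction_static, 3))  # 0.775, i.e. about 62 GB available to the static pool
```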