Unverified Commit a17e70f5 authored by Lianmin Zheng, committed by GitHub

Use more general heuristics to set the default value of --mem-fraction-static (#10975)


Co-authored-by: sglang-bot <sglangbot@gmail.com>
parent 816b3a43
@@ -99,8 +99,6 @@ jobs:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - uses: actions/checkout@v4
@@ -233,8 +231,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    strategy:
      fail-fast: false
      matrix:
@@ -266,8 +262,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    strategy:
      fail-fast: false
      matrix:
@@ -299,8 +293,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    strategy:
      fail-fast: false
      matrix:
@@ -332,8 +324,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    strategy:
      fail-fast: false
      matrix:
@@ -365,8 +355,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -426,8 +414,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -479,8 +465,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -538,8 +522,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -570,8 +552,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 2-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -602,8 +582,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 4-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -631,8 +609,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 8-gpu-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -660,8 +636,6 @@ jobs:
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 4-b200-runner
-    env:
-      HF_TOKEN: ${{ secrets.HF_TOKEN }}
    strategy:
      fail-fast: false
    steps:
...
@@ -35,6 +35,7 @@ else:
    Image = Any


+# Parameters for a session
@dataclass
class SessionParams:
    id: Optional[str] = None
@@ -84,8 +85,6 @@ class GenerateReqInput:
    sampling_params: Optional[Union[List[Dict], Dict]] = None
    # The request id.
    rid: Optional[Union[List[str], str]] = None
-    # Extra key for classifying the request (e.g. cache_salt)
-    extra_key: Optional[Union[List[str], str]] = None
    # Whether to return logprobs.
    return_logprob: Optional[Union[List[bool], bool]] = None
    # If return logprobs, the start location in the prompt for returning logprobs.
@@ -134,18 +133,23 @@ class GenerateReqInput:
    # Conversation id used for tracking requests
    conversation_id: Optional[str] = None
-    # (Deprecated, please use custom_labels) Label for the request
-    label: Optional[str] = None
    # Priority for the request
    priority: Optional[int] = None
-    # Image gen grpc migration
-    return_bytes: bool = False
+    # Extra key for classifying the request (e.g. cache_salt)
+    extra_key: Optional[Union[List[str], str]] = None
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
    # For custom metric labels
    custom_labels: Optional[Dict[str, str]] = None
+    # (Deprecated, please use custom_labels) Label for the request
+    label: Optional[str] = None
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False

    def contains_mm_input(self) -> bool:
        return (
            has_valid_data(self.image_data)
@@ -544,8 +548,11 @@ class GenerateReqInput:
                self.data_parallel_rank if self.data_parallel_rank is not None else None
            ),
            conversation_id=self.conversation_id,
-            label=self.label,
            priority=self.priority,
+            extra_key=self.extra_key,
+            no_logs=self.no_logs,
+            custom_labels=self.custom_labels,
+            label=self.label,
            return_bytes=self.return_bytes,
        )
@@ -602,21 +609,23 @@ class TokenizedGenerateReqInput:
    # For dp balance
    dp_balance_id: int = -1
-    # Label for the request
-    label: Optional[str] = None
    # Priority for the request
    priority: Optional[int] = None
    # Extra key for classifying the request (e.g. cache_salt)
    extra_key: Optional[str] = None
-    # Image gen grpc migration
-    return_bytes: bool = False
+    # Whether to disallow logging for this request (e.g. due to ZDR)
+    no_logs: bool = False
    # tracing context
    trace_context: Optional[Dict] = None
+    # (Deprecated, please use custom_labels) Label for the request
+    label: Optional[str] = None
+    # (Internal) Whether to return bytes for image generation
+    return_bytes: bool = False


@dataclass
class BatchTokenizedGenerateReqInput:
...
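As a side note on the hunk above: the reordering groups the request-classification fields (`extra_key`, `no_logs`), the metric-label fields (`custom_labels`, the deprecated `label`), and the internal `return_bytes` flag together, and forwards them all when the batched input is split into per-request objects. The sketch below is a hypothetical, trimmed-down stand-in (not the real `GenerateReqInput`, which carries many more members) just to illustrate that grouping:

```python
from dataclasses import dataclass
from typing import Dict, Optional


# Hypothetical mini version of GenerateReqInput, showing only the fields
# reordered above; the real class lives in sglang/srt/managers/io_struct.py.
@dataclass
class MiniGenerateReqInput:
    rid: Optional[str] = None
    # Extra key for classifying the request (e.g. cache_salt)
    extra_key: Optional[str] = None
    # Whether to disallow logging for this request (e.g. due to ZDR)
    no_logs: bool = False
    # For custom metric labels
    custom_labels: Optional[Dict[str, str]] = None
    # (Deprecated, please use custom_labels) Label for the request
    label: Optional[str] = None
    # (Internal) Whether to return bytes for image generation
    return_bytes: bool = False


req = MiniGenerateReqInput(
    rid="req-42",
    extra_key="tenant-a",          # e.g. a cache salt per tenant
    custom_labels={"team": "ml"},  # exported as metric labels
)
print(req)
```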
@@ -242,11 +242,8 @@ def find_local_hf_snapshot_dir(
    allow_patterns: List[str],
    revision: Optional[str] = None,
) -> Optional[str]:
-    """If the weights are already local, skip downloading and returns the path
-
-    Only applied in ci
-    """
-    if not is_in_ci() or os.path.isdir(model_name_or_path):
+    """If the weights are already local, skip downloading and returns the path."""
+    if os.path.isdir(model_name_or_path):
        return None

    found_local_snapshot_dir = None
@@ -347,11 +344,14 @@ def download_weights_from_hf(
        str: The path to the downloaded model weights.
    """
-    path = find_local_hf_snapshot_dir(
-        model_name_or_path, cache_dir, allow_patterns, revision
-    )
-    if path is not None:
-        return path
+    if is_in_ci():
+        # If the weights are already local, skip downloading and returns the path.
+        # This is used to skip too-many Huggingface API calls in CI.
+        path = find_local_hf_snapshot_dir(
+            model_name_or_path, cache_dir, allow_patterns, revision
+        )
+        if path is not None:
+            return path

    if not huggingface_hub.constants.HF_HUB_OFFLINE:
        # Before we download we look at that is available:
...
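For readers unfamiliar with what the CI-only shortcut above relies on, here is a rough, assumption-level sketch that looks for an already-downloaded snapshot in the Hugging Face cache. It is not the real `find_local_hf_snapshot_dir` (which also honors `allow_patterns` and `revision`), and it assumes a recent `huggingface_hub` that exposes `constants.HF_HUB_CACHE`:

```python
import os
from typing import Optional

from huggingface_hub import constants  # provides HF_HUB_CACHE


def guess_local_snapshot_dir(repo_id: str) -> Optional[str]:
    """Return a cached snapshot directory for `repo_id`, if one exists.

    Simplified sketch: the real helper also filters by allow_patterns and a
    specific revision before deciding the local snapshot is usable.
    """
    # Hub cache layout: <cache>/models--{org}--{name}/snapshots/<commit-hash>/
    snapshots_dir = os.path.join(
        constants.HF_HUB_CACHE, "models--" + repo_id.replace("/", "--"), "snapshots"
    )
    if not os.path.isdir(snapshots_dir):
        return None
    candidates = sorted(os.listdir(snapshots_dir))
    return os.path.join(snapshots_dir, candidates[-1]) if candidates else None


if __name__ == "__main__":
    print(guess_local_snapshot_dir("Qwen/Qwen2.5-0.5B-Instruct"))
```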
# Adapted from qwen2.py
import logging
from functools import partial
from typing import Any, Dict, Iterable, List, Optional, Tuple

import torch
...
@@ -523,108 +523,142 @@ class ServerArgs:
    def _handle_gpu_memory_settings(self, gpu_mem):
        """
-        Configure GPU memory-dependent settings including mem_fraction_static,
-        chunked_prefill_size, cuda_graph_max_bs, and cuda_graph_bs.
-        """
-        # Set mem fraction static
-        if self.mem_fraction_static is None:
-            if gpu_mem is not None:
-                # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
-                # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
-                # We want mem_fraction_static to be as large as possible but still has enough room
-                # for activations and cuda graph buffers. We use the following heuristic to
-                # compute the needed size for activations and cuda graph buffers:
-                # - The size of the activation depends on the chunked_prefill_size and model size.
-                # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
-                # For GPUs with more memory, we use a larger chunked_prefill_size and
-                # capture more cuda graphs, so they need to reserve more memory.
-                parallel_size = self.tp_size * self.pp_size
-                if gpu_mem < 20 * 1024:
-                    # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 50 * 1024:
-                    # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
-                    reserved_mem = (2.8 + parallel_size / 10) * 1024
-                elif gpu_mem < 90 * 1024:
-                    # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
-                    reserved_mem = (12 + parallel_size / 2) * 1024
-                elif gpu_mem < 100 * 1024:
-                    # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
-                    reserved_mem = (15 + parallel_size / 2) * 1024
-                elif gpu_mem < 160 * 1024:
-                    # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 512)
-                    reserved_mem = (15 + parallel_size / 2) * 1024
-                else:
-                    # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
-                    reserved_mem = 32 * 1024
-                # draft model and larger cuda graph buffers
-                if self.speculative_algorithm is not None:
-                    if self.speculative_algorithm == "STANDALONE":
-                        # Standalone speculative decoding needs more memory than other speculative
-                        # decoding algorithms since the draft model is typically larger.
-                        reserved_mem += 6 * 1024
-                    elif self.speculative_algorithm != "NGRAM":
-                        reserved_mem += 2 * 1024
-                if self.enable_dp_attention:
-                    reserved_mem += 4 * 1024
-                self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
-            else:
-                self.mem_fraction_static = 0.88
-        # Lazy init to avoid circular import
-        # Multimodal models need more memory for the image processor
-        from sglang.srt.configs.model_config import ModelConfig
-
-        model_config = ModelConfig.from_server_args(self)
-        if model_config.is_multimodal:
-            self.adjust_mem_fraction_for_vlm(model_config)
-        # Set chunked prefill size, which depends on the gpu memory capacity
-        if self.chunked_prefill_size is None:
-            if gpu_mem is not None:
-                if gpu_mem < 50 * 1024:  # T4, 4080, A10, L40, 4090, 5090
-                    self.chunked_prefill_size = 2048
-                elif gpu_mem < 160 * 1024:  # H100, H200, A100, H20
-                    self.chunked_prefill_size = 8192
-                else:  # B200, MI300
-                    self.chunked_prefill_size = 16384
-            else:
-                self.chunked_prefill_size = 4096
-        # Set cuda graph max batch size and cuda graph batch sizes
-        if self.cuda_graph_max_bs is None:
-            if gpu_mem is not None:
-                if gpu_mem < 20 * 1024:
-                    # T4, 4080
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+        The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
                    self.cuda_graph_max_bs = 8
-                elif gpu_mem < 50 * 1024:
-                    # A10, L40, 4090, 5090
-                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
                    if self.tp_size < 4:
                        self.cuda_graph_max_bs = 16
                    else:
                        self.cuda_graph_max_bs = 80
-                elif gpu_mem < 90 * 1024:
-                    # H100, A100
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40,
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
+                    else:
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
                    if self.tp_size < 4:
                        self.cuda_graph_max_bs = 256
                    else:
                        self.cuda_graph_max_bs = 512
-                else:
-                    # H20, H200, B200, MI300
-                    self.cuda_graph_max_bs = 512
            else:
-                # Default fallback
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
                self.cuda_graph_max_bs = 160

+        # Set cuda graph batch sizes
        if self.cuda_graph_bs is None:
            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)
+
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 1024
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
+            else:
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 4 * 1024
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+                if gpu_mem > 60 * 1024:
+                    reserved_mem = max(reserved_mem, 10 * 1024)
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalone draft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
+                    reserved_mem += 2 * 1024
+            self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
+
+        # Lazy init to avoid circular import
+        # Multimodal models need more memory for the image processor
+        from sglang.srt.configs.model_config import ModelConfig
+
+        model_config = ModelConfig.from_server_args(self)
+        if model_config.is_multimodal:
+            self.adjust_mem_fraction_for_vlm(model_config)

    def _generate_cuda_graph_batch_sizes(self):
        """
...
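To make the new heuristic above concrete, here is a standalone sketch of the core mem_fraction_static formula from the hunk: a constant reservation for metadata, roughly 1.5 MiB per chunked-prefill token, 2 MiB per captured cuda-graph batch slot, and a parallelism term (values in MiB, which the `gpu_mem < 20 * 1024`-style thresholds suggest). It mirrors only the basic path; the DP-attention, speculative-decoding, and multimodal adjustments of the full method are left out, and the constants are heuristics, not guarantees.

```python
def estimate_mem_fraction_static(
    gpu_mem_mib: float,
    chunked_prefill_size: int,
    cuda_graph_max_bs: int,
    tp_size: int = 1,
    pp_size: int = 1,
) -> float:
    """Sketch of the new default heuristic for --mem-fraction-static."""
    # Constant metadata (e.g., from the attention backend)
    reserved = 1024
    # Activations during a large prefill: ~1.5 MiB per token in the chunk
    reserved += max(chunked_prefill_size, 2048) * 1.5
    # CUDA graph buffers: ~2 MiB per captured batch slot
    reserved += cuda_graph_max_bs * 2
    # Adjustment for large parallel sizes
    reserved += tp_size * pp_size / 4 * 1024
    return round((gpu_mem_mib - reserved) / gpu_mem_mib, 3)


# Example: an 80 GB H100 with the defaults chosen above for that range
# (chunked_prefill_size=8192, cuda_graph_max_bs=256, TP=1).
print(estimate_mem_fraction_static(80 * 1024, 8192, 256))  # -> 0.828
```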
@@ -38,7 +38,7 @@ class TestLlama4LoRA(CustomTestCase):
                "--tp-size",
                str(model.tp_size),
                "--context-length",
-                "1048576",
+                "262144",
                "--attention-backend",
                "fa3",
            ],
...
@@ -13,6 +13,7 @@ class TestFile:

suites = {
    "per-commit": [
+        TestFile("function_call/test_json_schema_constraint.py", 30),
        TestFile("hicache/test_hicache.py", 116),
        TestFile("hicache/test_hicache_mla.py", 127),
        TestFile("hicache/test_hicache_storage.py", 127),
@@ -20,11 +21,9 @@ suites = {
        TestFile("lora/test_lora_eviction.py", 200),
        TestFile("lora/test_lora_backend.py", 99),
        TestFile("lora/test_multi_lora_backend.py", 60),
-        TestFile("lora/test_lora_cuda_graph.py", 250),
        TestFile("lora/test_lora_update.py", 400),
        TestFile("lora/test_lora_qwen3.py", 97),
        TestFile("lora/test_lora_radix_cache.py", 100),
-        TestFile("lora/test_chunked_sgmv_backend.py", 30),
        TestFile("models/test_embedding_models.py", 73),
        # TestFile("models/test_clip_models.py", 52),
        TestFile("models/test_encoder_embedding_models.py", 100),
@@ -51,7 +50,6 @@ suites = {
        TestFile("openai_server/features/test_reasoning_content.py", 89),
        TestFile("openai_server/function_call/test_openai_function_calling.py", 60),
        TestFile("openai_server/function_call/test_tool_choice.py", 226),
-        TestFile("function_call/test_json_schema_constraint.py", 30),
        TestFile("openai_server/validation/test_large_max_new_tokens.py", 41),
        TestFile("openai_server/validation/test_matched_stop.py", 60),
        TestFile("openai_server/validation/test_openai_server_ignore_eos.py", 85),
@@ -144,8 +142,6 @@ suites = {
        TestFile("test_multi_instance_release_memory_occupation.py", 64),
    ],
    "per-commit-8-gpu": [
-        # Disabled because it hangs on the CI.
-        # TestFile("ep/test_moe_ep.py", 181),
        TestFile("hicache/test_hicache_storage_mooncake_backend.py", 800),
        TestFile("lora/test_lora_llama4.py", 600),
        TestFile("test_disaggregation.py", 499),
...
@@ -3,7 +3,6 @@ import unittest
from types import SimpleNamespace

import requests
-import torch

from sglang.srt.utils import is_cuda, is_hip, kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
@@ -11,6 +10,7 @@ from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
+    is_in_ci,
    popen_launch_server,
)
@@ -50,6 +50,7 @@ class TestMLADeepseekV3(CustomTestCase):
        self.assertGreater(metrics["accuracy"], 0.62)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
class TestMLADeepseekV3DisableFusedFunc(CustomTestCase):
    @classmethod
    def setUpClass(cls):
...
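The `@unittest.skipIf(is_in_ci(), ...)` guard added above is the usual pattern for keeping a slow test class out of CI while still running it locally. A minimal sketch of such a helper, assuming the CI flag is carried by a conventional environment variable (the real `is_in_ci` in `sglang.test.test_utils` may check different variables):

```python
import os
import unittest


def is_in_ci() -> bool:
    # Assumption: CI is signalled by an environment variable such as
    # SGLANG_IS_IN_CI or CI; the real helper may look at other flags.
    value = os.environ.get("SGLANG_IS_IN_CI", os.environ.get("CI", ""))
    return value.lower() in ("1", "true")


@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
class SlowOptionalTest(unittest.TestCase):
    def test_something_expensive(self):
        self.assertEqual(2 + 2, 4)


if __name__ == "__main__":
    unittest.main()
```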
import multiprocessing
import os
-import subprocess
+import time
import traceback
import unittest
from multiprocessing import Process
@@ -21,7 +21,7 @@ from sglang.test.test_utils import (
TEST_SUITE = dict(
    model_path=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-    mem_fraction_static=0.85,
+    mem_fraction_static=0.83,
    dp_size=2,
    tp_size=2,
)
@@ -214,6 +214,9 @@ def _run_sglang_subprocess(
        _mem_usage = get_gpu_memory_gb(rank)
        print(f"GPU{rank} Memory usage after resuming Sgl weights: {_mem_usage}")
        del hf_model
+        hf_model = None
+        torch.cuda.empty_cache()
+        time.sleep(5)
        torch.cuda.empty_cache()
        _curr_usage = get_gpu_memory_gb(rank)
        assert (
...
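The `del hf_model` / `empty_cache()` / `sleep` sequence added above is the common way to make freed GPU memory visible to a subsequent measurement. A small self-contained sketch of that pattern, using a dummy tensor instead of a Hugging Face model:

```python
import gc
import time

import torch

if torch.cuda.is_available():
    buf = torch.empty(512, 1024, 1024, dtype=torch.uint8, device="cuda")  # ~0.5 GiB
    print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    # Drop the last reference, then return cached blocks to the driver so that
    # external tools (nvidia-smi, or get_gpu_memory_gb in the test above) see the drop.
    del buf
    gc.collect()
    torch.cuda.empty_cache()
    time.sleep(1)  # give the allocator/driver a moment before re-measuring
    print(f"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
```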