change / sglang / Commits / ed2e313e
"docs/vscode:/vscode.git/clone" did not exist on "be0bfcec4dacf5f98de9b12967472a3a5b5fcde2"
Commit ed2e313e: Clean up server_args, triton cache manager (#8332)

Authored Jul 25, 2025 by Lianmin Zheng; committed by GitHub on Jul 25, 2025 (unverified).
Parent: f8260f25
Showing 12 changed files with 128 additions and 204 deletions (+128, -204).
python/sglang/srt/entrypoints/engine.py                          +0   -6
python/sglang/srt/entrypoints/http_server.py                     +20  -32
python/sglang/srt/layers/moe/topk.py                             +3   -4
python/sglang/srt/managers/scheduler.py                          +5   -6
python/sglang/srt/model_executor/forward_batch_info.py           +8   -7
python/sglang/srt/model_executor/model_runner.py                 +0   -1
python/sglang/srt/server_args.py                                 +59  -49
python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py   +0   -1
python/sglang/srt/utils.py                                       +0   -65
test/srt/test_deepep_large.py                                    +1   -1
test/srt/test_deepep_small.py                                    +2   -2
test/srt/test_hybrid_dp_ep_tp_mtp.py                             +30  -30
python/sglang/srt/entrypoints/engine.py

@@ -71,7 +71,6 @@ from sglang.srt.utils import (
     is_cuda,
     kill_process_tree,
     launch_dummy_health_check_server,
-    maybe_set_triton_cache_manager,
     prepare_model_and_tokenizer,
     set_prometheus_multiproc_dir,
     set_ulimit,
@@ -637,11 +636,6 @@ def _set_envs_and_config(server_args: ServerArgs):
     # Set ulimit
     set_ulimit()
 
-    # Fix triton bugs
-    if server_args.tp_size * server_args.dp_size > 1:
-        # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
-        maybe_set_triton_cache_manager()
-
     # Check flashinfer version
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
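The deleted block was a multi-process workaround: when tp_size * dp_size > 1, each worker pointed Triton at a process-specific cache manager to avoid cache races. A minimal, hedged sketch of a version gate a caller could use before relying on the upstream Triton fix instead (the helper name and the 3.0.0 threshold are illustrative assumptions, not taken from the commit):

```python
import importlib.metadata

from packaging.version import Version


def triton_cache_fix_available(min_version: str = "3.0.0") -> bool:
    # Hypothetical gate: assumes the upstream Triton cache fix referenced in the
    # removed FIXME shipped by `min_version`; adjust the threshold for your setup.
    try:
        installed = Version(importlib.metadata.version("triton"))
    except importlib.metadata.PackageNotFoundError:
        return False
    return installed >= Version(min_version)
```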
python/sglang/srt/entrypoints/http_server.py

@@ -107,6 +107,8 @@ from sglang.version import __version__
 logger = logging.getLogger(__name__)
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
 # Store global states
 @dataclasses.dataclass
@@ -212,9 +214,6 @@ async def validate_json_request(raw_request: Request):
     )
 
-HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
-
-
 ##### Native API endpoints #####
@@ -807,6 +806,24 @@ async def retrieve_model(model: str):
     )
 
+@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
+async def v1_score_request(request: ScoringRequest, raw_request: Request):
+    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
+    return await raw_request.app.state.openai_serving_score.handle_request(
+        request, raw_request
+    )
+
+
+@app.api_route(
+    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
+)
+async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
+    """Endpoint for reranking documents based on query relevance."""
+    return await raw_request.app.state.openai_serving_rerank.handle_request(
+        request, raw_request
+    )
+
+
 ## SageMaker API
 @app.get("/ping")
 async def sagemaker_health() -> Response:
@@ -852,24 +869,6 @@ async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
     return ORJSONResponse({"predictions": ret})
 
-@app.post("/v1/score", dependencies=[Depends(validate_json_request)])
-async def v1_score_request(request: ScoringRequest, raw_request: Request):
-    """Endpoint for the decoder-only scoring API. See Engine.score() for detailed documentation."""
-    return await raw_request.app.state.openai_serving_score.handle_request(
-        request, raw_request
-    )
-
-
-@app.api_route(
-    "/v1/rerank", methods=["POST", "PUT"], dependencies=[Depends(validate_json_request)]
-)
-async def v1_rerank_request(request: V1RerankReqInput, raw_request: Request):
-    """Endpoint for reranking documents based on query relevance."""
-    return await raw_request.app.state.openai_serving_rerank.handle_request(
-        request, raw_request
-    )
-
-
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -916,15 +915,6 @@ def launch_server(
         add_prometheus_middleware(app)
         enable_func_timer()
 
-    image_token_text = None
-    if (
-        tokenizer_manager.image_token_id is not None
-        and not server_args.skip_tokenizer_init
-    ):
-        image_token_text = tokenizer_manager.tokenizer.decode(
-            [tokenizer_manager.image_token_id]
-        )
-
     # Send a warmup request - we will create the thread launch it
     # in the lifespan after all other warmups have fired.
     warmup_thread = threading.Thread(
@@ -932,7 +922,6 @@ def launch_server(
         args=(
             server_args,
             pipe_finish_writer,
-            image_token_text,
             launch_callback,
         ),
     )
@@ -1066,7 +1055,6 @@ def _execute_server_warmup(
 def _wait_and_warmup(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection],
-    image_token_text: str,
     launch_callback: Optional[Callable[[], None]] = None,
 ):
     if not server_args.skip_server_warmup:
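The moved /v1/score and /v1/rerank routes keep the same guard pattern: a FastAPI dependency rejects non-JSON bodies before the handler runs. A standalone, hedged sketch of that pattern (the `require_json` dependency below is a simplified stand-in for sglang's `validate_json_request`, and the handler body is illustrative only):

```python
from fastapi import Depends, FastAPI, HTTPException, Request

app = FastAPI()


async def require_json(raw_request: Request) -> None:
    # Simplified stand-in for validate_json_request: reject non-JSON payloads early.
    content_type = raw_request.headers.get("content-type", "")
    if content_type.split(";")[0].strip() != "application/json":
        raise HTTPException(status_code=415, detail="Content-Type must be application/json")


@app.post("/v1/score", dependencies=[Depends(require_json)])
async def score(raw_request: Request):
    payload = await raw_request.json()
    # A real server would forward `payload` to its scoring backend here.
    return {"received_fields": sorted(payload)}
```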
python/sglang/srt/layers/moe/topk.py

@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import math
-from typing import TYPE_CHECKING, Callable, NamedTuple, Optional
+from typing import Callable, NamedTuple, Optional
 
 import torch
 import torch.nn.functional as F
@@ -39,10 +39,10 @@ from sglang.srt.utils import (
 _is_cuda = is_cuda()
 _is_hip = is_hip()
-_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
-_is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
 _is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_cuda:
     from sgl_kernel import moe_fused_gate
@@ -54,7 +54,6 @@ if _use_aiter:
         from aiter import biased_grouped_topk as aiter_biased_grouped_topk
     except ImportError:
         raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
-
 if _is_npu:
     import torch_npu
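The reordered module flags all follow the same gate-by-environment pattern: probe the platform once at import time, then only pull in optional backends when the matching flag is set. A self-contained, hedged sketch of that pattern (the `get_bool_env_var` helper below is a simplified stand-in for the sglang utility of the same name):

```python
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Simplified stand-in for sglang.srt.utils.get_bool_env_var.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")


_is_hip = False  # placeholder for the real is_hip() platform probe
_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

if _use_aiter:
    try:
        from aiter import biased_grouped_topk  # optional ROCm-only kernel package
    except ImportError as e:
        raise ImportError("aiter is required when SGLANG_USE_AITER is set to True") from e
```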
python/sglang/srt/managers/scheduler.py

@@ -653,6 +653,9 @@ class Scheduler(
             )
         )
 
+        embedding_cache_size = int(os.environ.get("SGLANG_VLM_CACHE_SIZE_MB", "100"))
+        init_embedding_cache(embedding_cache_size * 1024 * 1024)
+
     def init_profier(self):
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
@@ -2895,9 +2898,9 @@ def run_scheduler_process(
         prefix += f" PP{pp_rank}"
 
     # Config the process
-    kill_itself_when_parent_died()
     setproctitle.setproctitle(f"sglang::scheduler{prefix.replace(' ', '_')}")
     faulthandler.enable()
+    kill_itself_when_parent_died()
     parent_process = psutil.Process().parent()
 
     # [For Router] if env var "SGLANG_DP_RANK" exist, set dp_rank to the value of the env var
@@ -2912,10 +2915,6 @@
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)
 
-    embedding_cache_size = 100
-    if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
-        embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
-    init_embedding_cache(embedding_cache_size * 1024 * 1024)
     # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
@@ -2926,8 +2925,8 @@
             "max_req_input_len": scheduler.max_req_input_len,
         }
     )
-    disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
 
+    disaggregation_mode: DisaggregationMode = scheduler.disaggregation_mode
     if disaggregation_mode == DisaggregationMode.NULL:
         if server_args.pp_size > 1:
             scheduler.event_loop_pp()
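The first hunk collapses the old three-line env-var lookup into a single `os.environ.get` call with a string default. A hedged one-function sketch of the consolidated pattern (the helper name is illustrative):

```python
import os


def read_cache_size_bytes(name: str = "SGLANG_VLM_CACHE_SIZE_MB", default_mb: int = 100) -> int:
    # Same shape as the new code path: the env var is expressed in MiB and
    # converted to bytes; an unset variable falls back to the default.
    return int(os.environ.get(name, str(default_mb))) * 1024 * 1024


print(read_cache_size_bytes())  # 104857600 when SGLANG_VLM_CACHE_SIZE_MB is unset
```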
python/sglang/srt/model_executor/forward_batch_info.py

@@ -74,8 +74,6 @@ class ForwardMode(IntEnum):
     MIXED = auto()
     # No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
     IDLE = auto()
-    # Split Prefill for PD multiplexing
-    SPLIT_PREFILL = auto()
 
     # Used in speculative decoding: verify a batch in the target model.
     TARGET_VERIFY = auto()
@@ -86,6 +84,9 @@
     # It is now used for triggering the sampling_info_done event for the first prefill batch.
     DUMMY_FIRST = auto()
 
+    # Split Prefill for PD multiplexing
+    SPLIT_PREFILL = auto()
+
     def is_prefill(self):
         return self.is_extend()
@@ -103,12 +104,12 @@
     def is_mixed(self):
         return self == ForwardMode.MIXED
 
-    def is_split_prefill(self):
-        return self == ForwardMode.SPLIT_PREFILL
-
     def is_idle(self):
         return self == ForwardMode.IDLE
 
+    def is_decode_or_idle(self):
+        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+
     def is_target_verify(self):
         return self == ForwardMode.TARGET_VERIFY
@@ -132,8 +133,8 @@
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST
 
-    def is_decode_or_idle(self):
-        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+    def is_split_prefill(self):
+        return self == ForwardMode.SPLIT_PREFILL
 
 @total_ordering
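Because ForwardMode is an IntEnum populated with auto(), member values depend on declaration order, so moving SPLIT_PREFILL to the end of the enum keeps the existing members' integer values stable. A small, hedged illustration with a toy enum (not the real ForwardMode):

```python
from enum import IntEnum, auto


class Mode(IntEnum):
    # auto() assigns 1, 2, 3, ... in declaration order, so appending a member at
    # the end never renumbers the ones that already exist.
    DECODE = auto()
    IDLE = auto()
    SPLIT_PREFILL = auto()  # appended last, as in the hunk above

    def is_decode_or_idle(self) -> bool:
        return self in (Mode.DECODE, Mode.IDLE)


assert Mode.DECODE == 1 and Mode.IDLE == 2 and Mode.SPLIT_PREFILL == 3
assert Mode.IDLE.is_decode_or_idle() and not Mode.SPLIT_PREFILL.is_decode_or_idle()
```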
python/sglang/srt/model_executor/model_runner.py

@@ -109,7 +109,6 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
python/sglang/srt/server_args.py

@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -266,31 +266,20 @@
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
 
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-        if self.device is None:
-            self.device = get_device()
         if self.served_model_name is None:
             self.served_model_name = self.model_path
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
@@ -359,7 +348,6 @@
             self.chunked_prefill_size = 16384
         else:
             self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -410,6 +398,14 @@
             )
             self.page_size = 128
 
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
@@ -431,6 +427,17 @@
             self.enable_dp_attention
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
+            )
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -502,14 +509,6 @@
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
-            elif "Llama4" in model_arch:
-                # TODO: remove this after Llama4 supports in other backends
-                if self.attention_backend != "fa3":
-                    self.attention_backend = "fa3"
-                    logger.warning(
-                        "Llama4 requires using fa3 attention backend. "
-                        "Attention backend is automatically set to fa3."
-                    )
 
             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -542,12 +541,11 @@
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
         # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-        # AMD-specific Triton attention KV splits default number
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
 
         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -572,6 +570,7 @@
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -580,9 +579,6 @@
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -1227,6 +1223,13 @@
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1276,13 +1279,6 @@
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1530,11 +1526,6 @@
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1792,11 +1783,11 @@
         return hf_config
 
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
-
         # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1807,11 +1798,7 @@
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
@@ -1820,9 +1807,32 @@
             None,
         }, "moe_dense_tp_size only support 1 and None currently"
 
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
+        self.check_lora_server_args()
+
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
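The page_size change switches to a deferred default: the field starts as None and is resolved in __post_init__ only after backend-specific overrides (such as the 128-token page size above) have had a chance to run, while the divisibility assert moves into check_server_args. A compact, hedged sketch of that pattern with a toy dataclass (not the real ServerArgs):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyArgs:
    page_size: Optional[int] = None  # resolved later, like ServerArgs.page_size
    chunked_prefill_size: int = 4096

    def __post_init__(self):
        # Backend-specific code may have set page_size already; only fill the
        # default if nothing else claimed it.
        if self.page_size is None:
            self.page_size = 1

    def check(self):
        assert (
            self.chunked_prefill_size % self.page_size == 0
        ), "chunked_prefill_size must be divisible by page_size"


ToyArgs().check()               # page_size defaults to 1
ToyArgs(page_size=128).check()  # 4096 % 128 == 0
```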
python/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py

@@ -336,7 +336,6 @@ class EAGLEDraftCudaGraphRunner:
         forward_batch.req_pool_indices = self.req_pool_indices[:bs]
         forward_batch.positions = self.positions[:num_tokens]
 
-        # Special handle for seq_len_cpu used when flashinfer mla is used
         if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(self.seq_len_fill_value)
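The surrounding code shows the usual CUDA-graph replay trick: buffers are allocated once at the maximum capture size and the current batch only takes views into them. A hedged, framework-free illustration of that slicing (toy sizes, not taken from the runner):

```python
import torch

# Preallocate once at the maximum capture size.
max_bs, max_tokens = 64, 512
req_pool_indices = torch.zeros(max_bs, dtype=torch.int64)
positions = torch.zeros(max_tokens, dtype=torch.int64)

# Per step, slice to the live batch; these are views, so no copies are made and
# a captured graph keeps pointing at the same storage.
bs, num_tokens = 8, 32
cur_req_pool_indices = req_pool_indices[:bs]
cur_positions = positions[:num_tokens]
assert cur_req_pool_indices.data_ptr() == req_pool_indices.data_ptr()
```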
python/sglang/srt/utils.py

@@ -937,71 +937,6 @@ def monkey_patch_vllm_gguf_config():
     setattr(GGUFConfig, "get_quant_method", get_quant_method_with_embedding_replaced)
 
-def maybe_set_triton_cache_manager() -> None:
-    """Set environment variable to tell Triton to use a
-    custom cache manager"""
-    cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
-    if cache_manger is None:
-        manager = "sglang.srt.utils:CustomCacheManager"
-        logger.debug("Setting Triton cache manager to: %s", manager)
-        os.environ["TRITON_CACHE_MANAGER"] = manager
-
-
-class CustomCacheManager(FileCacheManager):
-    # Adapted from: https://github.com/tdoublep/vllm/blob/3307522289fdfefe323b6c00d0db696651989a2f/vllm/triton_utils/custom_cache_manager.py
-    def __init__(self, key, override=False, dump=False):
-        from sglang.srt.distributed.parallel_state import get_tp_group
-
-        self.key = key
-        self.lock_path = None
-
-        try:
-            module_path = "triton.runtime.cache"
-            cache_module = importlib.import_module(module_path)
-
-            default_cache_dir = getattr(cache_module, "default_cache_dir", None)
-            default_dump_dir = getattr(cache_module, "default_dump_dir", None)
-            default_override_dir = getattr(cache_module, "default_override_dir", None)
-        except (ModuleNotFoundError, AttributeError) as e:
-            default_cache_dir = None
-            default_dump_dir = None
-            default_override_dir = None
-
-        if dump:
-            self.cache_dir = (
-                default_dump_dir()
-                if default_dump_dir is not None
-                else os.path.join(Path.home(), ".triton", "dump")
-            )
-            self.cache_dir = os.path.join(self.cache_dir, self.key)
-            self.lock_path = os.path.join(self.cache_dir, "lock")
-            os.makedirs(self.cache_dir, exist_ok=True)
-        elif override:
-            self.cache_dir = (
-                default_override_dir()
-                if default_override_dir is not None
-                else os.path.join(Path.home(), ".triton", "override")
-            )
-            self.cache_dir = os.path.join(self.cache_dir, self.key)
-        else:
-            # create cache directory if it doesn't exist
-            self.cache_dir = os.getenv("TRITON_CACHE_DIR", "").strip() or (
-                default_cache_dir()
-                if default_cache_dir is not None
-                else os.path.join(Path.home(), ".triton", "cache")
-            )
-            if self.cache_dir:
-                try:
-                    self.cache_dir = f"{self.cache_dir}_{get_tp_group().local_rank}"
-                except:
-                    self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
-                self.cache_dir = os.path.join(self.cache_dir, self.key)
-                self.lock_path = os.path.join(self.cache_dir, "lock")
-                os.makedirs(self.cache_dir, exist_ok=True)
-            else:
-                raise RuntimeError("Could not create or locate cache dir")
-
 def set_ulimit(target_soft_limit=65535):
     # number of open files
     resource_type = resource.RLIMIT_NOFILE
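With CustomCacheManager gone, the usual lighter-weight way to avoid cache collisions between worker processes is to give each process its own TRITON_CACHE_DIR before any kernel is compiled. A hedged sketch (the directory naming scheme is an assumption, not something this commit adds):

```python
import os
import tempfile


def use_private_triton_cache() -> str:
    # Triton consults TRITON_CACHE_DIR when it builds its file cache manager, so
    # this must run before the first kernel compilation in the process.
    cache_dir = os.path.join(tempfile.gettempdir(), f"triton-cache-{os.getpid()}")
    os.makedirs(cache_dir, exist_ok=True)
    os.environ["TRITON_CACHE_DIR"] = cache_dir
    return cache_dir
```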
test/srt/test_deepep_large.py

@@ -101,7 +101,7 @@ class TestDeepseekMTP(CustomTestCase):
             "--max-running-requests",
             "512",
             "--speculative-algorithm",
-            "NEXTN",
+            "EAGLE",
             "--speculative-num-steps",
             "1",
             "--speculative-eagle-topk",
test/srt/test_deepep_small.py

@@ -261,7 +261,7 @@ class TestMTP(CustomTestCase):
             "--enable-dp-lm-head",
             "--enable-deepep-moe",
             "--speculative-algo",
-            "NEXTN",
+            "EAGLE",
             "--speculative-draft",
             DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN,
             "--speculative-num-steps",
@@ -329,7 +329,7 @@ class TestMTPWithTBO(CustomTestCase):
             "--enable-deepep-moe",
             "--trust-remote-code",
             "--speculative-algorithm",
-            "NEXTN",
+            "EAGLE",
             "--speculative-num-steps",
             "2",
             "--speculative-eagle-topk",
test/srt/test_hybrid_dp_ep_tp_mtp.py

All 30 hunks in this file make the same one-line substitution for Test30 through Test59: the value passed to --speculative-algo changes from "NEXTN" to "EAGLE" (hunks at lines 1224, 1271, 1318, 1364, 1413, 1462, 1510, 1558, 1608, 1658, 1709, 1763, 1817, 1870, 1926, 1982, 2037, 2092, 2149, 2206, 2251, 2299, 2347, 2394, 2444, 2494, 2543, 2592, 2643, and 2694). A representative hunk:

@@ -1224,7 +1224,7 @@ class Test30(CustomTestCase):
             "--tp",
             "8",
             "--speculative-algo",
-            "NEXTN",
+            "EAGLE",
             "--speculative-draft",
             "lmsys/DeepSeek-V3-0324-NextN",
             "--speculative-num-steps",
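All of the test updates above amount to passing EAGLE instead of NEXTN as the speculative algorithm when the test servers are launched. A hedged sketch of an equivalent standalone launch command built in Python (the model path is a placeholder and the remaining flags are abbreviated):

```python
import subprocess

cmd = [
    "python", "-m", "sglang.launch_server",
    "--model-path", "deepseek-ai/DeepSeek-V3-0324",   # placeholder model path
    "--speculative-algorithm", "EAGLE",               # was "NEXTN" before this commit
    "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN",
    "--speculative-num-steps", "1",
]

if __name__ == "__main__":
    # Requires the model weights and suitable GPUs to actually start a server.
    subprocess.run(cmd, check=True)
```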