Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
7349717e
"docs/source/vscode:/vscode.git/clone" did not exist on "4ca706e1b4e15d6cb8b0df574e44b9d2dd92996e"
Unverified
Commit
7349717e
authored
Jul 01, 2025
by
ybyang
Committed by
GitHub
Jul 01, 2025
Browse files
[doc] update lws doc for pd (#7318)
parent
392e441a
Changes
7
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1464 additions
and
0 deletions
+1464
-0
docs/references/advanced_deploy.rst
docs/references/advanced_deploy.rst
+1
-0
docs/references/disaggregation/lws-examples/d-svc.yaml
docs/references/disaggregation/lws-examples/d-svc.yaml
+12
-0
docs/references/disaggregation/lws-examples/d.yaml
docs/references/disaggregation/lws-examples/d.yaml
+292
-0
docs/references/disaggregation/lws-examples/lb.yaml
docs/references/disaggregation/lws-examples/lb.yaml
+55
-0
docs/references/disaggregation/lws-examples/p-svc.yaml
docs/references/disaggregation/lws-examples/p-svc.yaml
+12
-0
docs/references/disaggregation/lws-examples/p.yaml
docs/references/disaggregation/lws-examples/p.yaml
+306
-0
docs/references/disaggregation/lws_pd_deploy.md
docs/references/disaggregation/lws_pd_deploy.md
+786
-0
No files found.
docs/references/advanced_deploy.rst
View file @
7349717e
...
...
@@ -5,3 +5,4 @@ Multi-Node Deployment
multi_node.md
deploy_on_k8s.md
disaggregation/lws_pd_deploy.md
docs/references/disaggregation/lws-examples/d-svc.yaml
0 → 100644
View file @
7349717e
---
# Service in front of the decode LeaderWorkerSet.
# The load balancer manifest addresses the decode side as
# http://deepseekr10528-decode-main:30000, i.e. this Service's name and port.
apiVersion: v1
kind: Service
metadata:
  name: deepseekr10528-decode-main
spec:
  ports:
    # Same port number on the Service and on the selected pod.
    - protocol: TCP
      port: 30000
      targetPort: 30000
  selector:
    # Only pods carrying both labels are selected. `role: leader` is set in
    # the LeaderWorkerSet's leaderTemplate; the name label is presumably
    # added by the LWS controller - TODO confirm.
    leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
    role: leader
docs/references/disaggregation/lws-examples/d.yaml
0 → 100644
View file @
7349717e
---
# Decode side of the prefill/decode (PD) disaggregated deployment.
#
# One group (replicas: 1) of two pods (size: 2): a leader and one worker.
# Each pod requests 8 GPUs and runs `sglang.launch_server` with
# `--disaggregation-mode decode`, `--tp-size 16` and `--dp-size 16`.
#
# $(LWS_LEADER_ADDRESS) and $(LWS_GROUP_SIZE) are not defined in this
# manifest; they are presumably injected by the LWS controller - TODO confirm.
# $(LWS_WORKER_INDEX) is read from the pod's worker-index label (see env).
#
# The leader and worker env lists are intentionally identical: both pods
# belong to the same distributed job, so NVSHMEM/NCCL/Mooncake settings
# should not differ between them.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: deepseekr10528-decode-main
spec:
  leaderWorkerTemplate:
    leaderTemplate:
      metadata:
        labels:
          # Lets a Service select the leader pod only.
          role: leader
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              # Only the leader passes --port/--host; the worker does not.
              - --port
              - "30000"
              - --host
              - "0.0.0.0"
              - --model-path
              - /work/models
              - --chunked-prefill-size
              - "262144"
              - --page-size
              - "64"
              - --enable-dp-attention
              - --enable-dp-lm-head
              - --dp-size
              - "16"
              - --enable-deepep-moe
              - --deepep-mode
              - low_latency
              - --disaggregation-mode
              - decode
              - --mem-fraction-static
              - "0.849"
              - --context-length
              - "32768"
              # should be modified according to your RDMA env
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --cuda-graph-max-bs
              - "64"
              - --max-running-requests
              - "2048"
              # Size of tensor parallelism: 2 pods x 8 GPUs.
              - --tp-size
              - "16"
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
            env:
              - name: CUDA_LAUNCH_BLOCKING
                value: "0"
              # Previously set on the worker only; added here so both pods
              # of the group use the same value. Should be modified
              # according to your RDMA env.
              - name: NVSHMEM_IB_TRAFFIC_CLASS
                value: "16"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              # should be modified according to your RDMA env
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              - name: NCCL_IB_SL
                value: "5"
              - name: MC_TE_METRIC
                value: "true"
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "16"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              # Quoted so the leading `^=` is always read as a plain string.
              # should be modified according to your RDMA env
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              # Exposes the pod's worker-index label as $(LWS_WORKER_INDEX),
              # which the command passes to --node-rank.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            image: lmsysorg/sglang:latest
            name: sglang-leader
            ports:
              - containerPort: 30000
                protocol: TCP
            # The pod is Ready once the server port accepts TCP connections.
            readinessProbe:
              periodSeconds: 30
              tcpSocket:
                port: 30000
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /root/.cache
                name: sgl-cache
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # should be modified according to your deployment env
        nodeSelector:
          pd: "yes"
        # should be modified according to your deployment env
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - hostPath:
              path: /data1/sgl_cache1
              type: DirectoryOrCreate
            name: sgl-cache
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          - hostPath:
              path: /dev/infiniband
            name: ib
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
    restartPolicy: RecreateGroupOnPodRestart
    # Pods per group: 1 leader + 1 worker.
    size: 2
    workerTemplate:
      metadata: {}
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              - --model-path
              - /work/models
              - --chunked-prefill-size
              - "262144"
              - --page-size
              - "64"
              - --enable-dp-attention
              - --enable-dp-lm-head
              - --dp-size
              - "16"
              - --enable-deepep-moe
              - --deepep-mode
              - low_latency
              - --disaggregation-mode
              - decode
              - --mem-fraction-static
              - "0.849"
              - --context-length
              - "32768"
              # should be modified according to your RDMA env
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --cuda-graph-max-bs
              - "64"
              - --max-running-requests
              - "2048"
              # Size of tensor parallelism: 2 pods x 8 GPUs.
              - --tp-size
              - "16"
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
            env:
              # Previously set on the leader only; added here so both pods
              # of the group share one env list.
              - name: CUDA_LAUNCH_BLOCKING
                value: "0"
              # should be modified according to your RDMA env
              - name: NVSHMEM_IB_TRAFFIC_CLASS
                value: "16"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              # should be modified according to your RDMA env
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              - name: NCCL_IB_SL
                value: "5"
              - name: MC_TE_METRIC
                value: "true"
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "16"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              # Quoted so the leading `^=` is always read as a plain string.
              # should be modified according to your RDMA env
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              # Exposes the pod's worker-index label as $(LWS_WORKER_INDEX),
              # which the command passes to --node-rank.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            image: lmsysorg/sglang:latest
            name: sglang-worker
            ports:
              - containerPort: 30001
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /root/.cache
                name: sgl-cache
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # should be modified according to your deployment env
        nodeSelector:
          pd: "yes"
        # should be modified according to your deployment env
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - hostPath:
              path: /data1/sgl_cache1
              type: DirectoryOrCreate
            name: sgl-cache
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              path: /dev/infiniband
            name: ib
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
  networkConfig:
    subdomainPolicy: Shared
  # Number of leader+worker groups.
  replicas: 1
  rolloutStrategy:
    rollingUpdateConfiguration:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  startupPolicy: LeaderCreated
docs/references/disaggregation/lws-examples/lb.yaml
0 → 100644
View file @
7349717e
---
# Single-replica Deployment running sglang's mini load balancer
# (`sglang.srt.disaggregation.mini_lb`). It is pointed at the prefill and
# decode Services by name and listens on port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: deepseekr10528-lb-main
  labels:
    app: deepseekr10528-lb
spec:
  replicas: 1
  selector:
    matchLabels:
      app: deepseekr10528-lb
  template:
    metadata:
      labels:
        app: deepseekr10528-lb
    spec:
      # NOTE(review): `bo`, `bopd` and `node-role` look site-specific, like
      # the nodeSelector/tolerations of the prefill/decode manifests -
      # adjust to your deployment env.
      nodeSelector:
        bo: "yes"
      tolerations:
        - key: bopd
          operator: Exists
        - key: node-role
          operator: Exists
      containers:
        - name: sgl-minilb
          image: lmsysorg/sglang:latest
          command:
            # `python3` rather than `python`, matching how the prefill and
            # decode manifests invoke the interpreter in the same image.
            - python3
            - -m
            - sglang.srt.disaggregation.mini_lb
            # Backend URLs use the prefill/decode Service names and port.
            - --prefill
            - http://deepseekr10528-prefill-main:30000
            - --decode
            - http://deepseekr10528-decode-main:30000
            - --host
            - "0.0.0.0"
            - --port
            - "8000"
          ports:
            - containerPort: 8000
              protocol: TCP
          # Keep the pod out of Service endpoints until the load balancer
          # is actually accepting connections on its listen port.
          readinessProbe:
            tcpSocket:
              port: 8000
---
# Exposes the mini load balancer pods (label app=deepseekr10528-lb).
apiVersion: v1
kind: Service
metadata:
  name: deepseekr10528-lb-service
spec:
  # NodePort is convenient for testing; `ClusterIP` can be used instead.
  type: NodePort
  ports:
    - protocol: TCP
      # Service port (in-cluster).
      port: 8000
      # Port exposed by the container.
      targetPort: 8000
      # Port opened on the nodes.
      nodePort: 30800
  selector:
    app: deepseekr10528-lb
docs/references/disaggregation/lws-examples/p-svc.yaml
0 → 100644
View file @
7349717e
---
# Service in front of the prefill LeaderWorkerSet.
# The load balancer manifest addresses the prefill side as
# http://deepseekr10528-prefill-main:30000, i.e. this Service's name and port.
apiVersion: v1
kind: Service
metadata:
  name: deepseekr10528-prefill-main
spec:
  ports:
    # Same port number on the Service and on the selected pod.
    - protocol: TCP
      port: 30000
      targetPort: 30000
  selector:
    # Only pods carrying both labels are selected. `role: leader` is set in
    # the LeaderWorkerSet's leaderTemplate; the name label is presumably
    # added by the LWS controller - TODO confirm.
    leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
    role: leader
docs/references/disaggregation/lws-examples/p.yaml
0 → 100644
View file @
7349717e
---
# Prefill side of the prefill/decode (PD) disaggregated deployment.
#
# Each group has two pods (size: 2): a leader and one worker. Each pod
# requests 8 GPUs and runs `sglang.launch_server` with
# `--disaggregation-mode prefill`, `--tp 16` and `--dp-size 16`.
#
# $(LWS_LEADER_ADDRESS) and $(LWS_GROUP_SIZE) are not defined in this
# manifest; they are presumably injected by the LWS controller - TODO confirm.
# $(LWS_WORKER_INDEX) is read from the pod's worker-index label (see env).
#
# The leader and worker env lists are kept identical, with one exception
# (MC_TE_METRIC, see the note on the leader), because both pods belong to
# the same distributed job.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: deepseekr10528-prefill-main
spec:
  leaderWorkerTemplate:
    leaderTemplate:
      metadata:
        labels:
          # Lets a Service select the leader pod only.
          role: leader
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              # Only the leader passes --port/--host; the worker does not.
              - --port
              - "30000"
              - --host
              - "0.0.0.0"
              - --model-path
              - /work/models
              # should be modified according to your RDMA env
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --chunked-prefill-size
              - "524288"
              - --max-prefill-tokens
              - "32768"
              - --page-size
              - "64"
              - --ep-dispatch-algorithm
              - dynamic
              - --eplb-algorithm
              - deepseek
              - --enable-dp-lm-head
              - --enable-dp-attention
              - --dp-size
              - "16"
              - --disable-radix-cache
              - --enable-deepep-moe
              - --deepep-mode
              - normal
              - --disaggregation-mode
              - prefill
              - --mem-fraction-static
              - "0.7"
              - --context-length
              - "32768"
              # Size of tensor parallelism: 2 pods x 8 GPUs.
              - --tp
              - "16"
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
              - --max-running-requests
              - "1024"
            env:
              - name: SGLANG_SET_CPU_AFFINITY
                value: "true"
              # Previously set on the worker only; added so both pods of
              # the group share the same settings.
              - name: CUDA_LAUNCH_BLOCKING
                value: "0"
              # should be modified according to your RDMA env
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              # Previously set on the worker only. Should be modified
              # according to your RDMA env.
              - name: NVSHMEM_IB_TRAFFIC_CLASS
                value: "16"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              # Quoted so the leading `^=` is always read as a plain string.
              # should be modified according to your RDMA env
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_IB_SL
                value: "5"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              # NOTE(review): the worker sets this to "true". Left unchanged
              # in case the difference is deliberate - confirm.
              - name: MC_TE_METRIC
                value: "false"
              # Previously set on the worker only.
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "8"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              # Previously set on the worker only.
              - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
                value: "0"
              # Exposes the pod's worker-index label as $(LWS_WORKER_INDEX),
              # which the command passes to --node-rank.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            image: lmsysorg/sglang:latest
            name: sglang-leader
            ports:
              - containerPort: 30000
                protocol: TCP
            # The pod is Ready once the server port accepts TCP connections.
            readinessProbe:
              periodSeconds: 30
              tcpSocket:
                port: 30000
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
              - mountPath: /root/.cache
                name: sgl-cache
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # should be modified according to your deployment env
        nodeSelector:
          pd: "yes"
        # should be modified according to your deployment env
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          - hostPath:
              path: /dev/infiniband
            name: ib
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
          - hostPath:
              # modify according to your deployment env
              path: /data1/sgl_cache
              type: DirectoryOrCreate
            name: sgl-cache
    restartPolicy: RecreateGroupOnPodRestart
    # Pods per group: 1 leader + 1 worker.
    size: 2
    workerTemplate:
      metadata: {}
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              - --model-path
              - /work/models
              # should be modified according to your RDMA env
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --chunked-prefill-size
              - "524288"
              - --max-prefill-tokens
              - "32768"
              - --page-size
              - "64"
              - --ep-dispatch-algorithm
              - dynamic
              - --eplb-algorithm
              - deepseek
              # Optional DeepEP config; can be tuned using the DeepEP test
              # scripts.
              # - --deepep-config
              # - /home/aiges/tuned/tuned_8sms.json
              - --enable-dp-lm-head
              - --enable-dp-attention
              - --dp-size
              - "16"
              - --disable-radix-cache
              - --enable-deepep-moe
              - --deepep-mode
              - normal
              - --disaggregation-mode
              - prefill
              - --mem-fraction-static
              - "0.7"
              - --context-length
              - "32768"
              # Size of tensor parallelism: 2 pods x 8 GPUs.
              - --tp
              - "16"
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
              - --max-running-requests
              - "1024"
            env:
              - name: SGLANG_SET_CPU_AFFINITY
                value: "true"
              - name: CUDA_LAUNCH_BLOCKING
                value: "0"
              # should be modified according to your RDMA env
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              # should be modified according to your RDMA env
              - name: NVSHMEM_IB_TRAFFIC_CLASS
                value: "16"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              # Quoted so the leading `^=` is always read as a plain string.
              # should be modified according to your RDMA env
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_IB_SL
                value: "5"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              # NOTE(review): the leader sets this to "false" - confirm
              # whether the difference is deliberate.
              - name: MC_TE_METRIC
                value: "true"
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "8"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
                value: "0"
              # Exposes the pod's worker-index label as $(LWS_WORKER_INDEX),
              # which the command passes to --node-rank.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            image: lmsysorg/sglang:latest
            name: sglang-worker
            ports:
              - containerPort: 30001
                protocol: TCP
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /root/.cache
                name: sgl-cache
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # should be modified according to your deployment env
        nodeSelector:
          pd: "yes"
        # should be modified according to your deployment env
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              path: /dev/infiniband
            name: ib
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          - hostPath:
              # modify according to your deployment env
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
          - hostPath:
              # modify according to your deployment env
              path: /data1/sgl_cache
              type: DirectoryOrCreate
            name: sgl-cache
docs/references/disaggregation/lws_pd_deploy.md
0 → 100644
View file @
7349717e
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment