---
# LeaderWorkerSet that runs `sglang.launch_server` with
# `--disaggregation-mode decode`, loading the model mounted at /work/models.
#
# Layout declared below:
#   * replicas: 1 group, size: 2 pods per group (one leader + one worker).
#   * every pod requests 8 GPUs (nvidia.com/gpu: "8"); 2 x 8 = 16, which
#     matches the --tp-size / --dp-size value of "16" passed to the server.
#   * the leader passes --port 30000 / --host 0.0.0.0 and is the only
#     container with a readiness probe.
#
# $(LWS_LEADER_ADDRESS) and $(LWS_GROUP_SIZE) are referenced in the commands
# but are not defined anywhere in this manifest; presumably they are injected
# by the LeaderWorkerSet controller -- verify against the installed version.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: deepseekr10528-decode-main
spec:
  leaderWorkerTemplate:
    # ------------------------------------------------------------------
    # Leader pod
    # ------------------------------------------------------------------
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              # Listening endpoint; only the leader passes --port/--host.
              - --port
              - "30000"
              - --host
              - "0.0.0.0"
              - --model-path
              - /work/models
              - --chunked-prefill-size
              - "262144"
              - --page-size
              - "64"
              - --enable-dp-attention
              - --enable-dp-lm-head
              - --dp-size
              - "16"
              - --enable-deepep-moe
              - --deepep-mode
              - low_latency
              - --disaggregation-mode
              - decode
              - --mem-fraction-static
              - "0.849"
              - --context-length
              - "32768"
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --cuda-graph-max-bs
              - "64"
              - --max-running-requests
              - "2048"
              # Size of tensor parallelism.
              - --tp-size
              - "16"
              # Rendezvous address, node count and node rank are filled in
              # from LWS_* environment variables at container start.
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
            # NOTE(review): the worker template below also sets
            # NVSHMEM_IB_TRAFFIC_CLASS="16" and does not set
            # CUDA_LAUNCH_BLOCKING. Confirm whether this leader/worker
            # difference is intentional.
            env:
              - name: CUDA_LAUNCH_BLOCKING
                value: "0"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              - name: NCCL_IB_SL
                value: "5"
              - name: MC_TE_METRIC
                value: "true"
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "16"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              # Per the NCCL docs, a leading "^" excludes the listed HCAs and
              # "=" requests exact-name matching.
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              # Exposes the pod's worker-index label as the value used for
              # --node-rank above.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            # NOTE(review): ":latest" is a mutable tag; consider pinning a
            # specific image version for reproducible rollouts.
            image: lmsysorg/sglang:latest
            name: sglang-leader
            ports:
              - containerPort: 30000
                protocol: TCP
            # TCP check against the serving port, every 30 seconds.
            readinessProbe:
              periodSeconds: 30
              tcpSocket:
                port: 30000
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /root/.cache
                name: sgl-cache
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # Adjust the node selector to match your deployment environment.
        nodeSelector:
          pd: "yes"
        # Adjust the tolerations to match your deployment environment.
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - hostPath:
              path: /data1/sgl_cache1
              type: DirectoryOrCreate
            name: sgl-cache
          # Memory-backed emptyDir mounted at /dev/shm.
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          - hostPath:
              path: /dev/infiniband
            name: ib
          - hostPath:
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
    restartPolicy: RecreateGroupOnPodRestart
    # Pods per group, leader included.
    size: 2
    # ------------------------------------------------------------------
    # Worker pod
    # ------------------------------------------------------------------
    workerTemplate:
      metadata: {}
      spec:
        containers:
          - command:
              - python3
              - -m
              - sglang.launch_server
              # Unlike the leader, no --port/--host is passed here.
              - --model-path
              - /work/models
              - --chunked-prefill-size
              - "262144"
              - --page-size
              - "64"
              - --enable-dp-attention
              - --enable-dp-lm-head
              - --dp-size
              - "16"
              - --enable-deepep-moe
              - --deepep-mode
              - low_latency
              - --disaggregation-mode
              - decode
              - --mem-fraction-static
              - "0.849"
              - --context-length
              - "32768"
              - --disaggregation-ib-device
              - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
              - --cuda-graph-max-bs
              - "64"
              - --max-running-requests
              - "2048"
              # Size of tensor parallelism.
              - --tp-size
              - "16"
              # Rendezvous address, node count and node rank are filled in
              # from LWS_* environment variables at container start.
              - --dist-init-addr
              - $(LWS_LEADER_ADDRESS):20102
              - --nnodes
              - $(LWS_GROUP_SIZE)
              - --node-rank
              - $(LWS_WORKER_INDEX)
              - --trust-remote-code
              - --ep-num-redundant-experts
              - "32"
              - --moe-dense-tp-size
              - "1"
            # NOTE(review): NVSHMEM_IB_TRAFFIC_CLASS is set here but not on
            # the leader, and the leader's CUDA_LAUNCH_BLOCKING is absent
            # here. Confirm whether this difference is intentional.
            env:
              - name: NVSHMEM_IB_TRAFFIC_CLASS
                value: "16"
              - name: NVSHMEM_IB_GID_INDEX
                value: "3"
              - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
                value: "1"
              - name: NVSHMEM_HCA_PE_MAPPING
                value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
              - name: NCCL_IB_QPS_PER_CONNECTION
                value: "8"
              - name: NCCL_IB_SPLIT_DATA_ON_QPS
                value: "1"
              - name: NCCL_NET_PLUGIN
                value: "none"
              - name: NCCL_IB_TC
                value: "136"
              - name: NCCL_MIN_NCHANNELS
                value: "4"
              - name: MC_TE_METRIC
                value: "true"
              - name: NCCL_IB_SL
                value: "5"
              - name: SGLANG_MOONCAKE_TRANS_THREAD
                value: "16"
              - name: SGL_ENABLE_JIT_DEEPGEMM
                value: "1"
              # Per the NCCL docs, a leading "^" excludes the listed HCAs and
              # "=" requests exact-name matching.
              - name: NCCL_IB_HCA
                value: "^=mlx5_0,mlx5_5,mlx5_6"
              # Exposes the pod's worker-index label as the value used for
              # --node-rank above.
              - name: LWS_WORKER_INDEX
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
            # NOTE(review): ":latest" is a mutable tag; consider pinning a
            # specific image version for reproducible rollouts.
            image: lmsysorg/sglang:latest
            name: sglang-worker
            ports:
              - containerPort: 30001
            resources:
              limits:
                nvidia.com/gpu: "8"
            securityContext:
              capabilities:
                add:
                  - IPC_LOCK
              privileged: true
            volumeMounts:
              - mountPath: /root/.cache
                name: sgl-cache
              - mountPath: /dev/shm
                name: dshm
              - mountPath: /work/models
                name: model
              - mountPath: /dev/infiniband
                name: ib
              - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
                name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        # Adjust the node selector to match your deployment environment.
        nodeSelector:
          pd: "yes"
        # Adjust the tolerations to match your deployment environment.
        tolerations:
          - key: bopd
            operator: Exists
          - key: node-role
            operator: Exists
        volumes:
          - hostPath:
              path: /data1/sgl_cache1
              type: DirectoryOrCreate
            name: sgl-cache
          # Memory-backed emptyDir mounted at /dev/shm.
          - emptyDir:
              medium: Memory
            name: dshm
          - hostPath:
              path: /dev/infiniband
            name: ib
          # Adjust the model path to match your deployment environment.
          - hostPath:
              path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
            name: model
          # Adjust the config path to match your deployment environment.
          - hostPath:
              path: /data1/maas_hosted_models/models/fused_moe_triton/configs
            name: cf
  networkConfig:
    subdomainPolicy: Shared
  replicas: 1
  rolloutStrategy:
    rollingUpdateConfiguration:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  startupPolicy: LeaderCreated