d.yaml

apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: deepseekr10528-decode-main
spec:
  leaderWorkerTemplate:
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
        - command:
          - python3
          - -m
          - sglang.launch_server
          - --port
          - "30000"
          - --host
          - "0.0.0.0"
          - --model-path
          - /work/models
          - --chunked-prefill-size
          - "262144"
          - --page-size
          - "64"
          - --enable-dp-attention
          - --enable-dp-lm-head
          - --dp-size
          - "16"
          - --enable-deepep-moe
          - --deepep-mode
          - low_latency
          - --disaggregation-mode
          - decode
          - --mem-fraction-static
          - "0.849"
          - --context-length
          - "32768"
          - --disaggregation-ib-device
          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
          - --cuda-graph-max-bs
          - "64"
          - --max-running-requests
          - "2048"
          - --tp-size
          - "16" # Size of Tensor Parallelism
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):20102
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --trust-remote-code
          - --ep-num-redundant-experts
          - "32"
          - --moe-dense-tp-size
          - "1"
          env:
          - name: CUDA_LAUNCH_BLOCKING
            value: "0"
          - name: NVSHMEM_IB_GID_INDEX
            value: "3"
          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
            value: "1"
          - name: NVSHMEM_HCA_PE_MAPPING
            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
          - name:  NCCL_IB_QPS_PER_CONNECTION
            value: "8"
          - name: NCCL_IB_SPLIT_DATA_ON_QPS
            value: "1"
          - name: NCCL_NET_PLUGIN
            value: "none"
          - name: NCCL_IB_TC
            value: "136"
          - name: NCCL_MIN_NCHANNELS
            value: "4"
          - name: NCCL_IB_SL
            value: "5"
          - name: MC_TE_METRIC
            value: "true"
          - name: SGLANG_MOONCAKE_TRANS_THREAD
            value: "16"
          - name: SGL_ENABLE_JIT_DEEPGEMM
            value: "1"
          - name: NCCL_IB_HCA
            value: ^=mlx5_0,mlx5_5,mlx5_6
          - name: LWS_WORKER_INDEX
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
          image: lmsysorg/sglang:latest
          name: sglang-leader
          ports:
          - containerPort: 30000
            protocol: TCP
          readinessProbe:
            periodSeconds: 30
            tcpSocket:
              port: 30000
          resources:
            limits:
              nvidia.com/gpu: "8"
          securityContext:
            capabilities:
              add:
              - IPC_LOCK
            privileged: true
          volumeMounts:
          - mountPath: /root/.cache
            name: sgl-cache
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /work/models
            name: model
          - mountPath: /dev/infiniband
            name: ib
          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
            name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        nodeSelector:
        # should modify according your deployment env
          pd: "yes"
        tolerations:
        # should modify according your deployment env
        - key: bopd
          operator: Exists
        - key: node-role
          operator: Exists
        volumes:
        - hostPath:
            path: /data1/sgl_cache1
            type: DirectoryOrCreate
          name: sgl-cache
        - emptyDir:
            medium: Memory
          name: dshm
        - hostPath:
            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
          name: model
        - hostPath:
            path: /dev/infiniband
          name: ib
        - hostPath:
            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
          name: cf
    restartPolicy: RecreateGroupOnPodRestart
    size:  2
    workerTemplate:
      metadata: {}
      spec:
        containers:
        - command:
          - python3
          - -m
          - sglang.launch_server
          - --model-path
          - /work/models
          - --chunked-prefill-size
          - "262144"
          - --page-size
          - "64"
          - --enable-dp-attention
          - --enable-dp-lm-head
          - --dp-size
          - "16"
          - --enable-deepep-moe
          - --deepep-mode
          - low_latency
          - --disaggregation-mode
          - decode
          - --mem-fraction-static
          - "0.849"
          - --context-length
          - "32768"
          - --disaggregation-ib-device
          - "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
          - --cuda-graph-max-bs
          - "64"
          - --max-running-requests
          - "2048"
          - --tp-size
          - "16" # Size of Tensor Parallelism
          - --dist-init-addr
          - $(LWS_LEADER_ADDRESS):20102
          - --nnodes
          - $(LWS_GROUP_SIZE)
          - --node-rank
          - $(LWS_WORKER_INDEX)
          - --trust-remote-code
          - --ep-num-redundant-experts
          - "32"
          - --moe-dense-tp-size
          - "1"
          env:
          - name: NVSHMEM_IB_TRAFFIC_CLASS
            value: "16"
          - name: NVSHMEM_IB_GID_INDEX
            value: "3"
          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
            value: "1"
          - name: NVSHMEM_HCA_PE_MAPPING
            value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
          - name:  NCCL_IB_QPS_PER_CONNECTION
            value: "8"
          - name: NCCL_IB_SPLIT_DATA_ON_QPS
            value: "1"
          - name: NCCL_NET_PLUGIN
            value: "none"
          - name: NCCL_IB_TC
            value: "136"
          - name: NCCL_MIN_NCHANNELS
            value: "4"
          - name: MC_TE_METRIC
            value: "true"
          - name: NCCL_IB_SL
            value: "5"
          - name: SGLANG_MOONCAKE_TRANS_THREAD
            value: "16"
          - name: SGL_ENABLE_JIT_DEEPGEMM
            value: "1"
          - name: NCCL_IB_HCA
            value: ^=mlx5_0,mlx5_5,mlx5_6
          - name: LWS_WORKER_INDEX
            valueFrom:
              fieldRef:
                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
          image: lmsysorg/sglang:latest
          name: sglang-worker
          ports:
          - containerPort: 30001
          resources:
            limits:
              nvidia.com/gpu: "8"
          securityContext:
            capabilities:
              add:
              - IPC_LOCK
            privileged: true
          volumeMounts:
          - mountPath: /root/.cache
            name: sgl-cache
          - mountPath: /dev/shm
            name: dshm
          - mountPath: /work/models
            name: model
          - mountPath: /dev/infiniband
            name: ib
          - mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
            name: cf
        dnsPolicy: ClusterFirstWithHostNet
        hostIPC: true
        hostNetwork: true
        nodeSelector:
        # should modify according your deployment env
          pd: "yes"
        tolerations:
        # should modify according your deployment env
        - key: bopd
          operator: Exists
        - key: node-role
          operator: Exists
        volumes:
        - hostPath:
            path: /data1/sgl_cache1
            type: DirectoryOrCreate
          name: sgl-cache
        - emptyDir:
            medium: Memory
          name: dshm
        - hostPath:
            path: /dev/infiniband
          name: ib
        - hostPath:
            # modify according to you deployment env
            path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
          name: model
        - hostPath:
            # modify according to you deployment env
            path: /data1/maas_hosted_models/models/fused_moe_triton/configs
          name: cf
  networkConfig:
    subdomainPolicy: Shared
  replicas: 1
  rolloutStrategy:
    rollingUpdateConfiguration:
      maxSurge: 0
      maxUnavailable: 1
    type: RollingUpdate
  startupPolicy: LeaderCreated