---
# Two Nodes Sglang example
#
# Runs SGLang distributed serving across 2 pods (one per node), 8 GPUs each,
# for a total tensor-parallel size of 16. Rank discovery uses the built-in
# StatefulSet pod-index label; rank 0 is reachable via the sglang-master-pod
# Service defined below in this file.

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: distributed-sglang
spec:
  replicas: 2  # number of nodes/pods to run distributed sglang
  selector:
    matchLabels:
      app: distributed-sglang
  serviceName: ""  # no headless service; the sglang-master-pod ClusterIP Service handles rank-0 discovery
  template:
    metadata:
      labels:
        app: distributed-sglang
    spec:
      containers:
      - name: sglang-container
        image: docker.io/lmsysorg/sglang:latest
        imagePullPolicy: Always  # image may be replaced by official CI versioned image
        command:
        - /bin/bash
        - -c
        # please modify the sglang serving arguments below, as necessary.
        # NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1
        args:
        - |
          python3 -m sglang.launch_server \
          --model /llm-folder \
          --dist-init-addr sglang-master-pod:5000 \
          --tensor-parallel-size 16 \
          --nnodes 2 \
          --node-rank $POD_INDEX \
          --trust-remote-code \
          --host 0.0.0.0 \
          --port 8000 \
          --enable-metrics \
          --enable-ep-moe \
          --expert-parallel-size 16
        env:
        - name: POD_INDEX  # reflects the node-rank
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
        - name: NCCL_DEBUG
          value: INFO
        resources:
          limits:
            nvidia.com/gpu: "8"
          requests:
            nvidia.com/gpu: "8"  # was an empty mapping (parses as null); GPU requests must equal limits
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - mountPath: /llm-folder
          name: llm
        securityContext:
          privileged: true   # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true
      hostNetwork: true
      volumes:
      # in-memory shared memory segment for NCCL / torch dataloaders
      - emptyDir:
          medium: Memory
          sizeLimit: 10Gi
        name: dshm
      - hostPath:
          path: /llm-folder # replace with PVC or hostPath with your model weights
          type: DirectoryOrCreate
        name: llm
      # Alternative: mount model weights from a PVC instead of hostPath
      #- persistentVolumeClaim:
      #    claimName: llm-pvc
      #  name: llm
---
# Rendezvous service: gives the rank-0 pod (pod-index "0") a stable DNS name,
# matching the --dist-init-addr sglang-master-pod:5000 argument above.
apiVersion: v1
kind: Service
metadata:
  name: sglang-master-pod
spec:
  type: ClusterIP
  selector:
    app: distributed-sglang
    # built-in StatefulSet pod-index label; selects only the rank-0 pod
    apps.kubernetes.io/pod-index: "0"
  ports:
  - name: dist-port
    port: 5000
    targetPort: 5000
---
# the serving service
# Exposes the rank-0 pod's HTTP endpoints outside the cluster via NodePort.
apiVersion: v1
kind: Service
metadata:
  name: sglang-serving-on-master
spec:
  type: NodePort
  selector:
    app: distributed-sglang
    # only rank 0 serves requests; select it via the pod-index label
    apps.kubernetes.io/pod-index: "0"
  ports:
  - name: serving
    port: 8000     # matches --port 8000 in the launch args
    targetPort: 8000
  - name: metrics
    # NOTE(review): the container args only show --enable-metrics with --port 8000;
    # confirm something actually listens on 8080, otherwise this port is dead.
    port: 8080
    targetPort: 8080