# k8s-sglang-service.yaml
---
# Claim backing the Deployment's `hf-cache` volume, which the sglang container
# mounts at /root/.cache/huggingface.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-31-8b-sglang
spec:
  # Replace `default` with a storage class that exists in your cluster.
  storageClassName: default
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 30Gi
---
# RuntimeClass referenced by the Deployment's `runtimeClassName: nvidia`.
# `handler` names the container-runtime handler to use; a handler called
# `nvidia` is assumed to be configured on the GPU nodes.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
handler: nvidia
metadata:
  name: nvidia
---
# Single-replica sglang server for meta-llama/Llama-3.1-8B-Instruct.
# Requests one NVIDIA GPU and listens on container port 30000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  replicas: 1
  strategy:
    # Recreate stops the old pod before starting its replacement
    # (presumably so that a rollout does not need a second GPU).
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      restartPolicy: Always
      # Must match the name of the RuntimeClass object in the cluster.
      runtimeClassName: nvidia
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          # NOTE(review): `latest` is a moving tag; consider pinning a
          # specific release for reproducible rollouts.
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always # IfNotPresent or Never
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          # The --port value must stay in sync with containerPort and the
          # probe ports below.
          args:
            - "--model-path"
            - "meta-llama/Llama-3.1-8B-Instruct"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "30000"
          env:
            - name: HF_TOKEN
              # Placeholder: replace with your Hugging Face token.
              # NOTE(review): prefer `valueFrom.secretKeyRef` pointing at a
              # Kubernetes Secret over committing the token inline.
              value: <secret>
          resources:
            limits:
              nvidia.com/gpu: 1
              cpu: 8
              memory: 40Gi
            requests:
              cpu: 2
              memory: 16Gi
              nvidia.com/gpu: 1
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /root/.cache/huggingface
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
          # Liveness and readiness checks are held back until this probe
          # succeeds, so a slow first start (model download and load) is not
          # killed by the liveness probe. Allows up to 80 * 15s = 20 minutes;
          # raise failureThreshold for slower networks or disks.
          startupProbe:
            httpGet:
              path: /health
              port: 30000
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 80
          # Restarts the container after 3 consecutive failed /health checks.
          livenessProbe:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 120
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 3
          # Removes the pod from Service endpoints while /health_generate
          # is failing.
          readinessProbe:
            httpGet:
              path: /health_generate
              port: 30000
            initialDelaySeconds: 120
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 3
            successThreshold: 1
      volumes:
        # RAM-backed /dev/shm, capped at 10Gi.
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10Gi
        # Hugging Face cache stored on the `llama-31-8b-sglang` claim.
        - name: hf-cache
          persistentVolumeClaim:
            claimName: llama-31-8b-sglang
        # Host's /etc/localtime, mounted read-only into the container.
        - name: localtime
          hostPath:
            path: /etc/localtime
            type: File
---
# Exposes the sglang pods (selected by their `app` label) on Service port 80,
# forwarding to container port 30000.
apiVersion: v1
kind: Service
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      port: 80 # port exposed by the Service
      targetPort: 30000 # port the sglang container listens on
  type: LoadBalancer # change to ClusterIP if needed