Unverified Commit 2e1d2d7e authored by TimWang's avatar TimWang Committed by GitHub
Browse files

Add PVC and update resource limits in k8s config (#8489)

parent fb16fbaf
# PersistentVolumeClaim backing the Hugging Face model cache
# (mounted at /root/.cache/huggingface by the sglang Deployment below),
# so downloaded Llama-3.1-8B weights persist across pod restarts.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-31-8b-sglang
spec:
  accessModes:
    # NOTE(review): ReadWriteMany requires an RWX-capable provisioner
    # (NFS, CephFS, Azure Files, ...); switch to ReadWriteOnce for
    # single-node / single-replica setups — confirm against your cluster.
    - ReadWriteMany
  resources:
    requests:
      storage: 30Gi
  storageClassName: default # change this to your preferred storage class
  volumeMode: Filesystem
---
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
...@@ -27,23 +40,36 @@ spec:
      containers:
        - name: meta-llama-31-8b-instruct-sglang
          image: docker.io/lmsysorg/sglang:latest
          imagePullPolicy: Always # IfNotPresent or Never
          ports:
            - containerPort: 30000
          command: ["python3", "-m", "sglang.launch_server"]
          args:
            [
              "--model-path",
              "meta-llama/Llama-3.1-8B-Instruct",
              "--host",
              "0.0.0.0",
              "--port",
              "30000",
            ]
          env:
            - name: HF_TOKEN
              value: <secret>
          resources:
            limits:
              nvidia.com/gpu: 1
              cpu: 8
              memory: 40Gi
            requests:
              cpu: 2
              memory: 16Gi
              nvidia.com/gpu: 1
          volumeMounts:
            - name: shm
              mountPath: /dev/shm
            - name: hf-cache
              mountPath: /root/.cache/huggingface
              readOnly: true
            - name: localtime
              mountPath: /etc/localtime
              readOnly: true
...@@ -51,17 +77,27 @@ spec:
            httpGet:
              path: /health
              port: 30000
            initialDelaySeconds: 120
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health_generate
              port: 30000
            initialDelaySeconds: 120
            periodSeconds: 15
            timeoutSeconds: 10
            failureThreshold: 3
            successThreshold: 1
      volumes:
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 10Gi
        - name: hf-cache
          persistentVolumeClaim:
            claimName: llama-31-8b-sglang
        - name: localtime
          hostPath:
            path: /etc/localtime
...@@ -76,6 +112,6 @@ spec:
      app: meta-llama-31-8b-instruct-sglang
  ports:
    - protocol: TCP
      port: 80 # port on host
      targetPort: 30000 # port in container
  type: LoadBalancer # change to ClusterIP if needed
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment