apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: deepseekr10528-decode-main
spec:
leaderWorkerTemplate:
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --port
- "30000"
- --host
- "0.0.0.0"
- --model-path
- /work/models
- --chunked-prefill-size
- "262144"
- --page-size
- "64"
- --enable-dp-attention
- --enable-dp-lm-head
- --dp-size
- "16"
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- decode
- --mem-fraction-static
- "0.849"
- --context-length
- "32768"
- --disaggregation-ib-device
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
- --cuda-graph-max-bs
- "64"
- --max-running-requests
- "2048"
- --tp-size
- "16" # Size of Tensor Parallelism
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
env:
- name: CUDA_LAUNCH_BLOCKING
value: "0"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: NVSHMEM_HCA_PE_MAPPING
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: "none"
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: NCCL_IB_SL
value: "5"
- name: MC_TE_METRIC
value: "true"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "16"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:latest
name: sglang-leader
ports:
- containerPort: 30000
protocol: TCP
readinessProbe:
periodSeconds: 30
tcpSocket:
port: 30000
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
# modify according to your deployment environment
pd: "yes"
tolerations:
# modify according to your deployment environment
- key: bopd
operator: Exists
- key: node-role
operator: Exists
volumes:
- hostPath:
path: /data1/sgl_cache1
type: DirectoryOrCreate
name: sgl-cache
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
restartPolicy: RecreateGroupOnPodRestart
size: 2
workerTemplate:
metadata: {}
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --model-path
- /work/models
- --chunked-prefill-size
- "262144"
- --page-size
- "64"
- --enable-dp-attention
- --enable-dp-lm-head
- --dp-size
- "16"
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- decode
- --mem-fraction-static
- "0.849"
- --context-length
- "32768"
- --disaggregation-ib-device
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
- --cuda-graph-max-bs
- "64"
- --max-running-requests
- "2048"
- --tp-size
- "16" # Size of Tensor Parallelism
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
env:
- name: NVSHMEM_IB_TRAFFIC_CLASS
value: "16"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: NVSHMEM_HCA_PE_MAPPING
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: "none"
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "true"
- name: NCCL_IB_SL
value: "5"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "16"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:latest
name: sglang-worker
ports:
- containerPort: 30001
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
# modify according to your deployment environment
pd: "yes"
tolerations:
# modify according to your deployment environment
- key: bopd
operator: Exists
- key: node-role
operator: Exists
volumes:
- hostPath:
path: /data1/sgl_cache1
type: DirectoryOrCreate
name: sgl-cache
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
networkConfig:
subdomainPolicy: Shared
replicas: 1
rolloutStrategy:
rollingUpdateConfiguration:
maxSurge: 0
maxUnavailable: 1
type: RollingUpdate
startupPolicy: LeaderCreated
apiVersion: apps/v1
kind: Deployment
metadata:
name: deepseekr10528-lb-main
labels:
app: deepseekr10528-lb
spec:
replicas: 1
selector:
matchLabels:
app: deepseekr10528-lb
template:
metadata:
labels:
app: deepseekr10528-lb
spec:
nodeSelector:
bo: "yes"
tolerations:
- key: bopd
operator: Exists
- key: node-role
operator: Exists
containers:
- name: sgl-minilb
image: lmsysorg/sglang:latest
command:
- python
- -m
- sglang_router.launch_router
- --pd-disaggregation
- --prefill
- http://deepseekr10528-prefill-main:30000
- --decode
- http://deepseekr10528-decode-main:30000
- --host
- 0.0.0.0
- --port
- "8000"
ports:
- containerPort: 8000
---
apiVersion: v1
kind: Service
metadata:
name: deepseekr10528-lb-service
spec:
type: NodePort # NodePort is easy to test, you can also specify `ClusterIP`
selector:
app: deepseekr10528-lb
ports:
- protocol: TCP
port: 8000 # Service Port(In-Cluster)
targetPort: 8000 # Exposed Container
nodePort: 30800
apiVersion: v1
kind: Service
metadata:
name: deepseekr10528-prefill-main
spec:
selector:
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
role: leader
ports:
- protocol: TCP
port: 30000
targetPort: 30000
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: deepseekr10528-prefill-main
spec:
leaderWorkerTemplate:
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --port
- "30000"
- --host
- "0.0.0.0"
- --model-path
- /work/models
- --disaggregation-ib-device
# modify according to your RDMA environment
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
- --chunked-prefill-size
- "524288"
- --max-prefill-tokens
- "32768"
- --page-size
- "64"
- --ep-dispatch-algorithm
- dynamic
- --eplb-algorithm
- deepseek
- --enable-dp-lm-head
- --enable-dp-attention
- --dp-size
- "16"
- --disable-radix-cache
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- prefill
- --mem-fraction-static
- "0.7"
- --context-length
- "32768"
- --tp
- "16"
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
- --max-running-requests
- "1024"
env:
- name: NVSHMEM_HCA_PE_MAPPING
# modify according to your RDMA environment
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: SGLANG_SET_CPU_AFFINITY
value: "true"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: none
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "false"
- name: NCCL_IB_SL
value: "5"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:latest
name: sglang-leader
ports:
- containerPort: 30000
protocol: TCP
readinessProbe:
periodSeconds: 30
tcpSocket:
port: 30000
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
- mountPath: /root/.cache
name: sgl-cache
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
# modify according to your deployment environment
pd: "yes"
tolerations:
# modify according to your deployment environment
- key: bopd
operator: Exists
- key: node-role
operator: Exists
volumes:
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
- hostPath:
path: /data1/sgl_cache
type: DirectoryOrCreate
name: sgl-cache
restartPolicy: RecreateGroupOnPodRestart
size: 2
workerTemplate:
metadata: {}
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --model-path
- /work/models
- --disaggregation-ib-device
# modify according to your RDMA environment
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
- --chunked-prefill-size
- "524288"
- --max-prefill-tokens
- "32768"
- --page-size
- "64"
- --ep-dispatch-algorithm
- dynamic
- --eplb-algorithm
- deepseek
# - --deepep-config
# - /home/aiges/tuned/tuned_8sms.json
# can be tuned using deepep test scripts
- --enable-dp-lm-head
- --enable-dp-attention
- --dp-size
- "16"
- --disable-radix-cache
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- prefill
- --mem-fraction-static
- "0.7"
- --context-length
- "32768"
- --tp
- "16"
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
- --max-running-requests
- "1024"
env:
- name: SGLANG_SET_CPU_AFFINITY
value: "true"
- name: NVSHMEM_HCA_PE_MAPPING
# modify according to your RDMA environment
value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: NVSHMEM_IB_TRAFFIC_CLASS
value: "16"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: CUDA_LAUNCH_BLOCKING
value: "0"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "8"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
value: "0"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: none
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "true"
- name: NCCL_IB_SL
value: "5"
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:latest
name: sglang-worker
ports:
- containerPort: 30001
protocol: TCP
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
# modify according to your deployment environment
pd: "yes"
tolerations:
# modify according to your deployment environment
- key: bopd
operator: Exists
- key: node-role
operator: Exists
volumes:
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
- hostPath:
# modify according to your deployment environment
path: /data1/sgl_cache
type: DirectoryOrCreate
name: sgl-cache
# LWS-Based PD Deployment
## 0. Prerequisites
1. Kubernetes >= 1.26
2. The LeaderWorkerSet (LWS) controller installed on the cluster (see the sketch below).
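A minimal sketch for checking these prerequisites, assuming the upstream LWS release manifest (the URL pattern is an assumption; pin a concrete release in practice):
```bash
# Check the Kubernetes server version (must be >= 1.26)
kubectl version

# Install the LeaderWorkerSet (LWS) controller from its release manifest
# (URL is an assumption; substitute the pinned release you use)
kubectl apply --server-side -f https://github.com/kubernetes-sigs/lws/releases/latest/download/manifests.yaml

# Verify the LWS CRD is registered before applying the manifests below
kubectl get crd leaderworkersets.leaderworkerset.x-k8s.io
```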
## 1. Image Preparation
`lmsysorg/sglang:deepep`
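Because the image is large, it can help to pre-pull it on every GPU node before applying the manifests (a sketch; use the pull tool that matches your node's container runtime):
```bash
# containerd-based nodes
crictl pull lmsysorg/sglang:deepep

# Docker-based nodes
docker pull lmsysorg/sglang:deepep
```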
## 2. Deployment Manifest Files
***Notice: We will package all deployment files as a Helm chart in the near future. Interested community members are welcome to contact us and contribute.***
### Prefill
The prefill manifest file is [p.yaml](lws-examples/p.yaml).
*Note: The nodeSelector, model location, and taint toleration sections should be adjusted to your actual deployment environment.*
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: deepseekr10528-prefill-main
spec:
leaderWorkerTemplate:
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --port
- "30000"
- --host
- "0.0.0.0"
- --model-path
- /work/models
- --disaggregation-ib-device
# modify according to your RDMA environment
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
- --chunked-prefill-size
- "524288"
- --max-prefill-tokens
- "32768"
- --page-size
- "64"
# - --init-expert-location
# - /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
- --ep-dispatch-algorithm
- dynamic
- --eplb-algorithm
- deepseek
# - --deepep-config
# - /home/aiges/tuned/tuned_8sms.json
- --enable-dp-lm-head
- --enable-dp-attention
- --dp-size
- "16"
- --disable-radix-cache
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- prefill
- --mem-fraction-static
- "0.7"
- --context-length
- "32768"
- --tp
- "16"
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
- --max-running-requests
- "1024"
env:
# - name: NVSHMEM_HCA_PE_MAPPING
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
# - name: NVSHMEM_HCA_LIST
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: SGLANG_SET_CPU_AFFINITY
value: "true"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: none
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "false"
- name: NCCL_IB_SL
value: "5"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:deepep
name: sglang-leader
ports:
- containerPort: 30000
protocol: TCP
readinessProbe:
periodSeconds: 30
tcpSocket:
port: 30000
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
- mountPath: /root/.cache
name: sgl-cache
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
pd: "yes"
tolerations:
- key: pd
operator: Exists
- key: node-role
operator: Exists
volumes:
- emptyDir:
medium: Memory
name: dshm
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
- hostPath:
# modify according to your deployment environment
path: /data1/sgl_cache
type: DirectoryOrCreate
name: sgl-cache
restartPolicy: RecreateGroupOnPodRestart
size: 2
workerTemplate:
metadata: {}
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --model-path
- /work/models
- --disaggregation-ib-device
- mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3
- --chunked-prefill-size
- "524288"
- --max-prefill-tokens
- "32768"
- --page-size
- "64"
#- --init-expert-location
#- /home/aiges/tuned/attachment_ep_statistics/prefill_in1024.json
- --ep-dispatch-algorithm
- dynamic
- --eplb-algorithm
- deepseek
# - --deepep-config
# - /home/aiges/tuned/tuned_8sms.json
- --enable-dp-lm-head
- --enable-dp-attention
- --dp-size
- "16"
- --disable-radix-cache
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- prefill
- --mem-fraction-static
- "0.7"
- --context-length
- "32768"
- --tp
- "16"
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
- --max-running-requests
- "1024"
env:
- name: SGLANG_SET_CPU_AFFINITY
value: "true"
- name: SGLANG_HACK_DEEPEP_NUM_SMS
value: "8"
- name: SGLANG_HACK_DEEPEP_NEW_MODE
value: "0"
# - name: NVSHMEM_HCA_PE_MAPPING
# value: "mlx5_bond_0:1:2,mlx5_bond_1:1:2,mlx5_bond_2:1:2,mlx5_bond_3:1:2"
# - name: NVSHMEM_HCA_LIST
# value: "mlx5_bond_0:1,mlx5_bond_1:1,mlx5_bond_2:1,mlx5_bond_3:1"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: NVSHMEM_IB_TRAFFIC_CLASS
value: "16"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: CUDA_LAUNCH_BLOCKING
value: "0"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "8"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
value: "0"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: none
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "true"
- name: NCCL_IB_SL
value: "5"
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:deepep
name: sglang-worker
ports:
- containerPort: 30001
protocol: TCP
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
pd: "yes"
tolerations:
- key: pd
operator: Exists
- key: node-role
operator: Exists
volumes:
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
- hostPath:
path: /data1/sgl_cache
type: DirectoryOrCreate
name: sgl-cache
```
### Decode
The decode manifest file is [d.yaml](lws-examples/d.yaml).
*Note: The nodeSelector, model location, and taint toleration sections should be adjusted to your actual deployment environment.*
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: deepseekr10528-decode-main
spec:
leaderWorkerTemplate:
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --port
- "30000"
- --host
- "0.0.0.0"
- --model-path
- /work/models
- --chunked-prefill-size
- "262144"
- --page-size
- "64"
- --enable-dp-attention
- --enable-dp-lm-head
- --dp-size
- "16"
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- decode
- --mem-fraction-static
- "0.849"
- --context-length
- "32768"
- --disaggregation-ib-device
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
- --cuda-graph-max-bs
- "64"
- --max-running-requests
- "2048"
- --tp-size
- "16" # Size of Tensor Parallelism
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
env:
- name: CUDA_LAUNCH_BLOCKING
value: "0"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: "none"
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: NCCL_IB_SL
value: "5"
- name: MC_TE_METRIC
value: "true"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "16"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:deepep
name: sglang-leader
ports:
- containerPort: 30000
protocol: TCP
readinessProbe:
periodSeconds: 30
tcpSocket:
port: 30000
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
pd: "yes"
tolerations:
- key: pd
operator: Exists
- key: node-role
operator: Exists
volumes:
- hostPath:
path: /data1/sgl_cache1
type: DirectoryOrCreate
name: sgl-cache
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
restartPolicy: RecreateGroupOnPodRestart
size: 2
workerTemplate:
metadata: {}
spec:
containers:
- command:
- python3
- -m
- sglang.launch_server
- --model-path
- /work/models
- --chunked-prefill-size
- "262144"
- --page-size
- "64"
- --enable-dp-attention
- --enable-dp-lm-head
#- --enable-two-batch-overlap
- --dp-size
- "16"
- --moe-a2a-backend
- deepep
- --disaggregation-mode
- decode
- --mem-fraction-static
- "0.849"
- --context-length
- "32768"
- --disaggregation-ib-device
# modify according to your RDMA environment
- "mlx5_bond_0,mlx5_bond_1,mlx5_bond_2,mlx5_bond_3"
- --cuda-graph-max-bs
- "64"
- --max-running-requests
- "2048"
- --tp-size
- "16" # Size of Tensor Parallelism
- --dist-init-addr
- $(LWS_LEADER_ADDRESS):20102
- --nnodes
- $(LWS_GROUP_SIZE)
- --node-rank
- $(LWS_WORKER_INDEX)
- --trust-remote-code
- --ep-num-redundant-experts
- "32"
- --moe-dense-tp-size
- "1"
env:
- name: SGLANG_HACK_DEEPEP_NUM_SMS
value: "24"
- name: SGLANG_HACK_DEEPEP_NEW_MODE
value: "0"
- name: NVSHMEM_IB_TRAFFIC_CLASS
value: "16"
- name: NVSHMEM_IB_GID_INDEX
value: "3"
- name: NVSHMEM_ENABLE_NIC_PE_MAPPING
value: "1"
- name: NCCL_IB_QPS_PER_CONNECTION
value: "8"
- name: NCCL_IB_SPLIT_DATA_ON_QPS
value: "1"
- name: NCCL_NET_PLUGIN
value: "none"
- name: NCCL_IB_TC
value: "136"
- name: NCCL_MIN_NCHANNELS
value: "4"
- name: MC_TE_METRIC
value: "true"
- name: NCCL_IB_SL
value: "5"
- name: SGLANG_MOONCAKE_TRANS_THREAD
value: "16"
- name: SGL_ENABLE_JIT_DEEPGEMM
value: "1"
- name: NCCL_IB_HCA
value: ^=mlx5_0,mlx5_5,mlx5_6
- name: LWS_WORKER_INDEX
valueFrom:
fieldRef:
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
image: lmsysorg/sglang:deepep
name: sglang-worker
ports:
- containerPort: 30001
resources:
limits:
nvidia.com/gpu: "8"
securityContext:
capabilities:
add:
- IPC_LOCK
privileged: true
volumeMounts:
- mountPath: /root/.cache
name: sgl-cache
- mountPath: /dev/shm
name: dshm
- mountPath: /work/models
name: model
- mountPath: /dev/infiniband
name: ib
- mountPath: /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs
name: cf
dnsPolicy: ClusterFirstWithHostNet
hostIPC: true
hostNetwork: true
nodeSelector:
pd: "yes"
tolerations:
- key: pd
operator: Exists
- key: node-role
operator: Exists
volumes:
- hostPath:
path: /data1/sgl_cache1
type: DirectoryOrCreate
name: sgl-cache
- emptyDir:
medium: Memory
name: dshm
- hostPath:
path: /dev/infiniband
name: ib
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/DeepSeek-R1-0528/deepseek_r1_0528
name: model
- hostPath:
# modify according to your deployment environment
path: /data1/maas_hosted_models/models/fused_moe_triton/configs
name: cf
networkConfig:
subdomainPolicy: Shared
replicas: 1
rolloutStrategy:
rollingUpdateConfiguration:
maxSurge: 0
maxUnavailable: 1
type: RollingUpdate
startupPolicy: LeaderCreated
```
Execute separately:
```bash
kubectl apply -f p.yaml
kubectl apply -f d.yaml
```
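To verify that both LeaderWorkerSet groups come up, you can list their pods and follow the leader logs (a sketch; the label key is the standard LWS label also used by the Services below):
```bash
# Each LeaderWorkerSet creates one leader pod and (size - 1) worker pods per replica
kubectl get pods -l leaderworkerset.sigs.k8s.io/name=deepseekr10528-prefill-main
kubectl get pods -l leaderworkerset.sigs.k8s.io/name=deepseekr10528-decode-main

# Follow the prefill leader until the server reports it is listening on port 30000
kubectl logs -f deepseekr10528-prefill-main-0
```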
At this point, the 1P1D SGLang engine deployment is complete.
To let users call the model API directly, we still need a load balancer that routes each request through prefill and then decode. Different companies implement load balancers differently, and the community will also officially release a new LB component written in Rust in the near future.
For now, we use static Kubernetes Services plus the mini load balancer (mini-LB) to serve model API calls.
### Creating Service for Prefill and Decode
#### Create prefill k8s service
[p-svc.yaml](lws-examples/p-svc.yaml)
```yaml
apiVersion: v1
kind: Service
metadata:
name: deepseekr10528-prefill-main
spec:
selector:
leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill-main
role: leader
ports:
- protocol: TCP
port: 30000
targetPort: 30000
```
Execute `kubectl apply -f p-svc.yaml`
#### Create decode k8s service
[d-svc.yaml](lws-examples/d-svc.yaml)
```yaml
apiVersion: v1
kind: Service
metadata:
name: deepseekr10528-decode-main
spec:
selector:
leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode-main
role: leader
ports:
- protocol: TCP
port: 30000
targetPort: 30000
```
Execute `kubectl apply -f d-svc.yaml`
#### Deploy minilb and lb service
[lb.yaml](lws-examples/lb.yaml)
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: deepseekr10528-lb-main
labels:
app: deepseekr10528-lb
spec:
replicas: 1
selector:
matchLabels:
app: deepseekr10528-lb
template:
metadata:
labels:
app: deepseekr10528-lb
spec:
nodeSelector:
pd: "yes"
tolerations:
- key: pd
operator: Exists
- key: node-role
operator: Exists
containers:
- name: sgl-minilb
image: lmsysorg/sglang:deepep
command:
- python
- -m
- sglang_router.launch_router
- --pd-disaggregation
- --prefill
- http://deepseekr10528-prefill-main:30000
- --decode
- http://deepseekr10528-decode-main:30000
- --host
- 0.0.0.0
- --port
- "8000"
ports:
- containerPort: 8000
---
apiVersion: v1
kind: Service
metadata:
name: deepseekr10528-lb-service
spec:
type: NodePort
selector:
app: deepseekr10528-lb
ports:
- protocol: TCP
port: 8000 # Service Port(In-Cluster)
targetPort: 8000 # Exposed Container
nodePort: 30800
```
Execute `kubectl apply -f lb.yaml`
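To confirm the router started and can reach both backends, check the deployment rollout and its logs (a sketch):
```bash
kubectl rollout status deployment/deepseekr10528-lb-main
kubectl logs deployment/deepseekr10528-lb-main
```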
After all deployments have come up successfully, you should see output similar to the following:
```bash
[root@ecs-001]# kubectl get po
deepseekr10528-decode-main-0 1/1 Running 0 74m
deepseekr10528-decode-main-0-1 1/1 Running 0 74m
deepseekr10528-lb-main-9c5dbfc57-6lcbd 1/1 Running 0 22m
deepseekr10528-prefill-main-0 1/1 Running 0 74m
deepseekr10528-prefill-main-0-1 1/1 Running 0 74m
[root@ecs-cbm-x1-pd-cpu-001 main_doc]# kubectl get svc |grep dee
deepseekr10528-decode-main ClusterIP None <none> <none> 97m
deepseekr10528-lb-service NodePort 172.16.242.169 <none> 8000:30800/TCP 22m
deepseekr10528-prefill-main ClusterIP None <none> <none> 97m
```
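The NodePort is exposed on every node, so any node address works; you can list them with:
```bash
# Use any INTERNAL-IP (or EXTERNAL-IP, if available) together with port 30800
kubectl get nodes -o wide
```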
At this point, pick any node IP and send a request to NodePort 30800:
```bash
[root@ecs-001]# curl -X POST "http://{node_ip}:30800/v1/chat/completions" \
> -H "Content-Type: application/json" \
> -H "Authorization: Bearer None" \
> -d '{
> "rid":"ccccdd",
> "model": "r1",
> "messages": [
> {"role": "system", "content": "0: You are a helpful AI assistant"},
> {"role": "user", "content": "你是谁?."}
> ],
> "max_tokens":221
> }'
{"id":"ccccdd","object":"chat.completion","created":1750252498,"model":"qwen2","choices":[{"index":0,"message":{"role":"assistant","content":"<think>\n嗯,用户问了一个很基础的自我介绍问题"你是谁?"。这可能是第一次互动时的常规开场白,也可能是想确认我的身份和功能范围。\n\n用户没有提供任何背景信息,语气简洁中性。这种场景下新用户的可能性较高,需要给出清晰友好的自我介绍,同时突出实用价值来降低陌生感。\n\n考虑到中文用户,应该用简体中文回复。重点要说明三点:身份归属(深度求索)、功能定位(AI助手)、服务范围(学习/工作/生活)。结尾用开放性问题引导对话很关键——既能了解需求,又能避免让用户面对空白输入框时不知所措。\n\n用波浪线结尾可以软化语气,那个笑脸表情😊刚好能中和AI的机械感。不过要控制表情符号数量,避免显得轻浮。\n</think>\n你好呀!我是你的AI助手,由深度求索公司(DeepSeek)开发的语言模型,名字叫 **DeepSeek-R1**。你可以把我当成一个知识丰富、随叫随到的小帮手~😊\n\n我的任务就是陪你聊天、解答问题、","reasoning_content":null,"tool_calls":null},"logprobs":null,"finish_reason":"length","matched_stop":null}],"usage":{"prompt_tokens":14,"total_tokens":235,"completion_tokens":221,"prompt_tokens_details":null}}
```
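The endpoint is OpenAI-compatible, so streaming also works; a minimal sketch using the same node-IP placeholder as above:
```bash
curl -N -X POST "http://{node_ip}:30800/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "r1",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 64,
    "stream": true
  }'
```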
## FAQ
1. The startup parameters used in this deployment may not be compatible with every RDMA setup; different NCCL/RDMA-related environment variables may be required in different network environments.
2. Some preset, optimized configurations for EPLB are not used here. You can adjust them according to [6017](https://github.com/sgl-project/sglang/issues/6017) as needed.
# Multi-Node Deployment
## Llama 3.1 405B
**Run 405B (fp16) on Two Nodes**
```bash
# replace 172.16.4.52:20000 with the IP address of the first node and an available port
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --dist-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1
```
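Once both ranks are up, the HTTP server runs on the rank-0 node and listens on SGLang's default port 30000; a quick sanity check (reusing the example IP above):
```bash
curl -X POST "http://172.16.4.52:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 16
  }'
```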
Note that Llama 405B (fp8) can also be launched on a single node.
```bash
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
```
## DeepSeek V3/R1
Please refer to the [DeepSeek documentation](https://docs.sglang.ai/basic_usage/deepseek.html#running-examples-on-multi-node).
## Multi-Node Inference on SLURM
This example shows how to serve an SGLang server across multiple nodes with SLURM. Submit the following job script to the SLURM cluster.
```
#!/bin/bash -l
#SBATCH -o SLURM_Logs/%x_%j_master.out
#SBATCH -e SLURM_Logs/%x_%j_master.err
#SBATCH -D ./
#SBATCH -J Llama-405B-Online-Inference-TP16-SGL
#SBATCH --nodes=2
#SBATCH --ntasks=2
#SBATCH --ntasks-per-node=1 # Ensure 1 task per node
#SBATCH --cpus-per-task=18
#SBATCH --mem=224GB
#SBATCH --partition="lmsys.org"
#SBATCH --gres=gpu:8
#SBATCH --time=12:00:00
echo "[INFO] Activating environment on node $SLURM_PROCID"
if ! source ENV_FOLDER/bin/activate; then
echo "[ERROR] Failed to activate environment" >&2
exit 1
fi
# Define parameters
model=MODEL_PATH
tp_size=16
echo "[INFO] Running inference"
echo "[INFO] Model: $model"
echo "[INFO] TP Size: $tp_size"
# Set NCCL initialization address using the hostname of the head node
HEAD_NODE=$(scontrol show hostname "$SLURM_NODELIST" | head -n 1)
NCCL_INIT_ADDR="${HEAD_NODE}:8000"
echo "[INFO] NCCL_INIT_ADDR: $NCCL_INIT_ADDR"
# Launch the model server on each node using SLURM
srun --ntasks=2 --nodes=2 --output="SLURM_Logs/%x_%j_node$SLURM_NODEID.out" \
--error="SLURM_Logs/%x_%j_node$SLURM_NODEID.err" \
python3 -m sglang.launch_server \
--model-path "$model" \
--grammar-backend "xgrammar" \
--tp "$tp_size" \
--dist-init-addr "$NCCL_INIT_ADDR" \
--nnodes 2 \
--node-rank "$SLURM_NODEID" &
# Wait for the SGLang HTTP server on the head node to accept connections on port 30000
while ! nc -z "$HEAD_NODE" 30000; do
sleep 1
echo "[INFO] Waiting for $HEAD_NODE:30000 to accept connections"
done
echo "[INFO] $HEAD_NODE:30000 is ready to accept connections"
# Keep the script running until the SLURM job times out
wait
```
Then, you can test the server by sending requests following other [documents](https://docs.sglang.ai/backend/openai_api_completions.html).
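For a quick check from the head node, here is a minimal sketch against the native `/generate` endpoint (port 30000 is the default used by the launch command above):
```bash
curl -X POST "http://${HEAD_NODE}:30000/generate" \
  -H "Content-Type: application/json" \
  -d '{
    "text": "The capital of France is",
    "sampling_params": {"max_new_tokens": 16, "temperature": 0}
  }'
```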
Thanks to [aflah02](https://github.com/aflah02) for providing the example, based on his [blog post](https://aflah02.substack.com/p/multi-node-llm-inference-with-sglang).
Multi-Node Deployment
=====================
.. toctree::
   :maxdepth: 1
   :caption: Multi-Node Deployment

   multi_node.md
   deploy_on_k8s.md
   lws_pd/lws_pd_deploy.md

- `Deploying DeepSeek with PD Disaggregation and Large-Scale Expert Parallelism on 96 H100 GPUs <https://lmsys.org/blog/2025-05-05-large-scale-ep/>`_
- `Deploying Kimi K2 with PD Disaggregation and Large-Scale Expert Parallelism on 128 H200 GPUs <https://lmsys.org/blog/2025-07-20-k2-large-scale-ep/>`_
# Production Metrics
SGLang exposes the following metrics via Prometheus. You can enable them by adding `--enable-metrics` when launching the server.
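For example, a minimal launch with metrics enabled (the model path is illustrative):
```bash
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.1-8B-Instruct \
  --port 30000 \
  --enable-metrics
```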
An example of the monitoring dashboard is available in [examples/monitoring/grafana.json](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json).
Here is an example of the metrics:
```
$ curl http://localhost:30000/metrics
# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8.128902e+06
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.557572e+06
# HELP sglang:token_usage The token usage
# TYPE sglang:token_usage gauge
sglang:token_usage{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.28
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.007507552643049313
# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE sglang:time_to_first_token_seconds histogram
sglang:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2.3518979474117756e+06
sglang:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:time_to_first_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_to_first_token_seconds_bucket{le="0.06",model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.0
sglang:time_to_first_token_seconds_bucket{le="0.08",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:time_to_first_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:time_to_first_token_seconds_bucket{le="0.25",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:time_to_first_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:time_to_first_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:time_to_first_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 27.0
sglang:time_to_first_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 140.0
sglang:time_to_first_token_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 314.0
sglang:time_to_first_token_seconds_bucket{le="7.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 941.0
sglang:time_to_first_token_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1330.0
sglang:time_to_first_token_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1970.0
sglang:time_to_first_token_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2326.0
sglang:time_to_first_token_seconds_bucket{le="25.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2417.0
sglang:time_to_first_token_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 2513.0
sglang:time_to_first_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
sglang:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11008.0
# HELP sglang:e2e_request_latency_seconds Histogram of End-to-end request latency in seconds
# TYPE sglang:e2e_request_latency_seconds histogram
sglang:e2e_request_latency_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 3.116093850019932e+06
sglang:e2e_request_latency_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0
sglang:e2e_request_latency_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="0.8",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="1.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="2.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 6.0
sglang:e2e_request_latency_seconds_bucket{le="5.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.0
sglang:e2e_request_latency_seconds_bucket{le="10.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 10.0
sglang:e2e_request_latency_seconds_bucket{le="15.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11.0
sglang:e2e_request_latency_seconds_bucket{le="20.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 14.0
sglang:e2e_request_latency_seconds_bucket{le="30.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 247.0
sglang:e2e_request_latency_seconds_bucket{le="40.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 486.0
sglang:e2e_request_latency_seconds_bucket{le="50.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 845.0
sglang:e2e_request_latency_seconds_bucket{le="60.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1513.0
sglang:e2e_request_latency_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
sglang:e2e_request_latency_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 11228.0
# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE sglang:time_per_output_token_seconds histogram
sglang:time_per_output_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B-Instruct"} 866964.5791549598
sglang:time_per_output_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1.0
sglang:time_per_output_token_seconds_bucket{le="0.01",model_name="meta-llama/Llama-3.1-8B-Instruct"} 73.0
sglang:time_per_output_token_seconds_bucket{le="0.015",model_name="meta-llama/Llama-3.1-8B-Instruct"} 382.0
sglang:time_per_output_token_seconds_bucket{le="0.02",model_name="meta-llama/Llama-3.1-8B-Instruct"} 593.0
sglang:time_per_output_token_seconds_bucket{le="0.025",model_name="meta-llama/Llama-3.1-8B-Instruct"} 855.0
sglang:time_per_output_token_seconds_bucket{le="0.03",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1035.0
sglang:time_per_output_token_seconds_bucket{le="0.04",model_name="meta-llama/Llama-3.1-8B-Instruct"} 1815.0
sglang:time_per_output_token_seconds_bucket{le="0.05",model_name="meta-llama/Llama-3.1-8B-Instruct"} 11685.0
sglang:time_per_output_token_seconds_bucket{le="0.075",model_name="meta-llama/Llama-3.1-8B-Instruct"} 433413.0
sglang:time_per_output_token_seconds_bucket{le="0.1",model_name="meta-llama/Llama-3.1-8B-Instruct"} 4.950195e+06
sglang:time_per_output_token_seconds_bucket{le="0.15",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.039435e+06
sglang:time_per_output_token_seconds_bucket{le="0.2",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.171662e+06
sglang:time_per_output_token_seconds_bucket{le="0.3",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.266055e+06
sglang:time_per_output_token_seconds_bucket{le="0.4",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.296752e+06
sglang:time_per_output_token_seconds_bucket{le="0.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.312226e+06
sglang:time_per_output_token_seconds_bucket{le="0.75",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.339675e+06
sglang:time_per_output_token_seconds_bucket{le="1.0",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.357747e+06
sglang:time_per_output_token_seconds_bucket{le="2.5",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.389414e+06
sglang:time_per_output_token_seconds_bucket{le="+Inf",model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
sglang:time_per_output_token_seconds_count{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7.400757e+06
# HELP sglang:func_latency_seconds Function latency in seconds
# TYPE sglang:func_latency_seconds histogram
sglang:func_latency_seconds_sum{name="generate_request"} 4.514771912145079
sglang:func_latency_seconds_bucket{le="0.05",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.07500000000000001",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.1125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.16875",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.253125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.3796875",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.56953125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="0.8542968750000001",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="1.2814453125",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="1.9221679687500002",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="2.8832519531250003",name="generate_request"} 14006.0
sglang:func_latency_seconds_bucket{le="4.3248779296875",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="6.487316894531251",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="9.730975341796876",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="14.596463012695313",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="21.89469451904297",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="32.84204177856446",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="49.26306266784668",name="generate_request"} 14007.0
sglang:func_latency_seconds_bucket{le="+Inf",name="generate_request"} 14007.0
sglang:func_latency_seconds_count{name="generate_request"} 14007.0
# HELP sglang:num_running_reqs The number of running requests
# TYPE sglang:num_running_reqs gauge
sglang:num_running_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 162.0
# HELP sglang:num_used_tokens The number of used tokens
# TYPE sglang:num_used_tokens gauge
sglang:num_used_tokens{model_name="meta-llama/Llama-3.1-8B-Instruct"} 123859.0
# HELP sglang:gen_throughput The generate throughput (token/s)
# TYPE sglang:gen_throughput gauge
sglang:gen_throughput{model_name="meta-llama/Llama-3.1-8B-Instruct"} 86.50814177726902
# HELP sglang:num_queue_reqs The number of requests in the waiting queue
# TYPE sglang:num_queue_reqs gauge
sglang:num_queue_reqs{model_name="meta-llama/Llama-3.1-8B-Instruct"} 2826.0
```
## Setup Guide
This section describes how to set up the monitoring stack (Prometheus + Grafana) provided in the `examples/monitoring` directory.
### Prerequisites
- Docker and Docker Compose installed
- SGLang server running with metrics enabled
### Usage
1. **Start your SGLang server with metrics enabled:**
```bash
python -m sglang.launch_server --model-path <your_model_path> --port 30000 --enable-metrics
```
Replace `<your_model_path>` with the actual path to your model (e.g., `meta-llama/Meta-Llama-3.1-8B-Instruct`). Ensure the server is accessible from the monitoring stack (you might need `--host 0.0.0.0` if running in Docker). By default, the metrics endpoint will be available at `http://<sglang_server_host>:30000/metrics`.
2. **Navigate to the monitoring example directory:**
```bash
cd examples/monitoring
```
3. **Start the monitoring stack:**
```bash
docker compose up -d
```
This command will start Prometheus and Grafana in the background.
4. **Access the monitoring interfaces:**
* **Grafana:** Open your web browser and go to [http://localhost:3000](http://localhost:3000).
* **Prometheus:** Open your web browser and go to [http://localhost:9090](http://localhost:9090).
5. **Log in to Grafana:**
* Default Username: `admin`
* Default Password: `admin`
You will be prompted to change the password upon your first login.
6. **View the Dashboard:**
The SGLang dashboard is pre-configured and should be available automatically. Navigate to `Dashboards` -> `Browse` -> `SGLang Monitoring` folder -> `SGLang Dashboard`.
### Troubleshooting
* **Port Conflicts:** If you encounter errors like "port is already allocated," check if other services (including previous instances of Prometheus/Grafana) are using ports `9090` or `3000`. Use `docker ps` to find running containers and `docker stop <container_id>` to stop them, or use `lsof -i :<port>` to find other processes using the ports. You might need to adjust the ports in the `docker-compose.yaml` file if they permanently conflict with other essential services on your system.
To change Grafana's port to a different one (e.g., 3090) in your Docker Compose file, explicitly specify the port under the `grafana` service.
Option 1: Add GF_SERVER_HTTP_PORT to the environment section:
```
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_SERVER_HTTP_PORT=3090 # <-- Add this line
```
Option 2: Use port mapping:
```
grafana:
image: grafana/grafana:latest
container_name: grafana
ports:
- "3090:3000" # <-- Host:Container port mapping
```
* **Connection Issues:**
* Ensure both Prometheus and Grafana containers are running (`docker ps`).
* Verify the Prometheus data source configuration in Grafana (usually auto-configured via `grafana/datasources/datasource.yaml`). Go to `Connections` -> `Data sources` -> `Prometheus`. The URL should point to the Prometheus service (e.g., `http://prometheus:9090`).
* Confirm that your SGLang server is running and the metrics endpoint (`http://<sglang_server_host>:30000/metrics`) is accessible *from the Prometheus container*. If SGLang is running on your host machine and Prometheus is in Docker, use `host.docker.internal` (on Docker Desktop) or your machine's network IP instead of `localhost` in the `prometheus.yaml` scrape configuration.
* **No Data on Dashboard:**
* Generate some traffic to your SGLang server to produce metrics. For example, run a benchmark:
```bash
python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 100 --random-input 128 --random-output 128
```
* Check the Prometheus UI (`http://localhost:9090`) under `Status` -> `Targets` to see if the SGLang endpoint is being scraped successfully.
* Verify the `model_name` and `instance` labels in your Prometheus metrics match the variables used in the Grafana dashboard. You might need to adjust the Grafana dashboard variables or the labels in your Prometheus configuration.
### Configuration Files
The monitoring setup is defined by the following files within the `examples/monitoring` directory:
* `docker-compose.yaml`: Defines the Prometheus and Grafana services.
* `prometheus.yaml`: Prometheus configuration, including scrape targets.
* `grafana/datasources/datasource.yaml`: Configures the Prometheus data source for Grafana.
* `grafana/dashboards/config/dashboard.yaml`: Tells Grafana to load dashboards from the specified path.
* `grafana/dashboards/json/sglang-dashboard.json`: The actual Grafana dashboard definition in JSON format.
You can customize the setup by modifying these files. For instance, you might need to update the `static_configs` target in `prometheus.yaml` if your SGLang server runs on a different host or port.
#### Check if the metrics are being collected
Run `python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5` to generate some requests.
Then you should be able to see the metrics in the Grafana dashboard.
# Enabling cache for torch.compile
SGLang uses `max-autotune-no-cudagraphs` mode of torch.compile. The auto-tuning can be slow.
If you want to deploy a model on many different machines, you can ship the torch.compile cache to these machines and skip the compilation steps.
This is based on https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html
1. Generate the cache by setting TORCHINDUCTOR_CACHE_DIR and running the model once.
```
TORCHINDUCTOR_CACHE_DIR=/root/inductor_root_cache python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile
```
2. Copy the cache folder to other machines and launch the server with `TORCHINDUCTOR_CACHE_DIR`.
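For example, a minimal sketch (the target host name is a placeholder; any copy tool works):
```bash
# On the machine that already compiled: copy the cache to another machine
rsync -a /root/inductor_root_cache/ other-machine:/root/inductor_root_cache/

# On the other machine: point TORCHINDUCTOR_CACHE_DIR at the copied cache to skip recompilation
TORCHINDUCTOR_CACHE_DIR=/root/inductor_root_cache \
  python3 -m sglang.launch_server --model meta-llama/Llama-3.1-8B-Instruct --enable-torch-compile
```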
ipykernel
ipywidgets
jupyter_client
markdown>=3.4.0
matplotlib
myst-parser
nbconvert
nbsphinx
pandoc
pillow
pydantic
sphinx
sphinx-book-theme
sphinx-copybutton
sphinx-tabs
nbstripout
sphinxcontrib-mermaid
urllib3<2.0.0
gguf>=0.10.0
sphinx-autobuild
# Clean and serve documentation with auto-build
make clean
make serve
# Embedding Models
SGLang supports embedding models with the same efficient serving stack used for text generation and exposes them through an OpenAI-compatible `/v1/embeddings` endpoint. This makes it straightforward to serve embeddings for retrieval and semantic search workloads with high throughput and low latency.
```{important}
Embedding models are served with the `--is-embedding` flag, and some may require `--trust-remote-code`
```
## Quick Start
### Launch Server
```shell
python3 -m sglang.launch_server \
--model-path Qwen/Qwen3-Embedding-4B \
--is-embedding \
--host 0.0.0.0 \
--port 30000
```
### Client Request
```python
import requests
url = "http://127.0.0.1:30000"
payload = {
"model": "Qwen/Qwen3-Embedding-4B",
"input": "What is the capital of France?",
"encoding_format": "float"
}
response = requests.post(url + "/v1/embeddings", json=payload).json()
print("Embedding:", response["data"][0]["embedding"])
```
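The same request can be sent with curl, mirroring the Python snippet above:
```shell
curl -X POST "http://127.0.0.1:30000/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen3-Embedding-4B",
    "input": "What is the capital of France?",
    "encoding_format": "float"
  }'
```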
## Multimodal Embedding Example
For multimodal models like GME that support both text and images:
```shell
python3 -m sglang.launch_server \
--model-path Alibaba-NLP/gme-Qwen2-VL-2B-Instruct \
--is-embedding \
--chat-template gme-qwen2-vl \
--host 0.0.0.0 \
--port 30000
```
```python
import requests
url = "http://127.0.0.1:30000"
text_input = "Represent this image in embedding space."
image_path = "https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild/resolve/main/images/023.jpg"
payload = {
"model": "gme-qwen2-vl",
"input": [
{
"text": text_input
},
{
"image": image_path
}
],
}
response = requests.post(url + "/v1/embeddings", json=payload).json()
print("Embeddings:", [x.get("embedding") for x in response.get("data", [])])
```
## Supported Models
| Model Family | Example Model | Chat Template | Description |
| ------------------------------------------ | -------------------------------------- | ------------- | --------------------------------------------------------------------------- |
| **E5 (Llama/Mistral based)** | `intfloat/e5-mistral-7b-instruct` | N/A | High-quality text embeddings based on Mistral/Llama architectures |
| **GTE-Qwen2** | `Alibaba-NLP/gte-Qwen2-7B-instruct` | N/A | Alibaba's text embedding model with multilingual support |
| **Qwen3-Embedding** | `Qwen/Qwen3-Embedding-4B` | N/A | Latest Qwen3-based text embedding model for semantic representation |
| **BGE** | `BAAI/bge-large-en-v1.5` | N/A | BAAI's text embeddings (requires `attention-backend` triton/torch_native) |
| **GME (Multimodal)** | `Alibaba-NLP/gme-Qwen2-VL-2B-Instruct`| `gme-qwen2-vl`| Multimodal embedding for text and image cross-modal tasks |
| **CLIP** | `openai/clip-vit-large-patch14-336` | N/A | OpenAI's CLIP for image and text embeddings |
# Large Language Models
These models accept text input and produce text output (e.g., chat completions). They are primarily large language models (LLMs), some with mixture-of-experts (MoE) architectures for scaling.
## Example Launch Command
```shell
# --model-path accepts a Hugging Face model ID or a local path
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.2-1B-Instruct \
  --host 0.0.0.0 \
  --port 30000
```
## Supported models
The supported models are summarized in the table below.
If you are unsure whether a specific architecture is implemented, you can search for it on GitHub. For example, to search for `Qwen3ForCausalLM`, use the expression:
```
repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen3ForCausalLM
```
in the GitHub search bar.
| Model Family (Variants) | Example HuggingFace Identifier | Description |
|-------------------------------------|--------------------------------------------------|----------------------------------------------------------------------------------------|
| **DeepSeek** (v1, v2, v3/R1) | `deepseek-ai/DeepSeek-R1` | Series of advanced reasoning-optimized models (including a 671B MoE) trained with reinforcement learning; top performance on complex reasoning, math, and code tasks. [SGLang provides Deepseek v3/R1 model-specific optimizations](../basic_usage/deepseek.md) and [Reasoning Parser](../advanced_features/separate_reasoning.ipynb)|
| **GPT-OSS** | `openai/gpt-oss-20b`, `openai/gpt-oss-120b` | OpenAI’s latest GPT-OSS series for complex reasoning, agentic tasks, and versatile developer use cases.|
| **Qwen** (3, 3MoE, 3Next, 2.5, 2 series) | `Qwen/Qwen3-0.6B`, `Qwen/Qwen3-30B-A3B`, `Qwen/Qwen3-Next-80B-A3B-Instruct` | Alibaba’s latest Qwen3 series for complex reasoning, language understanding, and generation tasks; supports MoE variants along with previous generations (2.5, 2, etc.). [SGLang provides a Qwen3-specific reasoning parser](../advanced_features/separate_reasoning.ipynb)|
| **Llama** (2, 3.x, 4 series) | `meta-llama/Llama-4-Scout-17B-16E-Instruct` | Meta's open LLM series, spanning 7B to 400B parameters (Llama 2, 3, and new Llama 4) with well-recognized performance. [SGLang provides Llama-4 model-specific optimizations](../basic_usage/llama4.md) |
| **Mistral** (Mixtral, NeMo, Small3) | `mistralai/Mistral-7B-Instruct-v0.2` | Open 7B LLM by Mistral AI with strong performance; extended into MoE (“Mixtral”) and NeMo Megatron variants for larger scale. |
| **Gemma** (v1, v2, v3) | `google/gemma-3-1b-it` | Google’s family of efficient multilingual models (1B–27B); Gemma 3 offers a 128K context window, and its larger (4B+) variants support vision input. |
| **Phi** (Phi-1.5, Phi-2, Phi-3, Phi-4, Phi-MoE series) | `microsoft/Phi-4-multimodal-instruct`, `microsoft/Phi-3.5-MoE-instruct` | Microsoft’s Phi family of small models (1.3B–5.6B); Phi-4-multimodal (5.6B) processes text, images, and speech, Phi-4-mini is a high-accuracy text model and Phi-3.5-MoE is a mixture-of-experts model. |
| **MiniCPM** (v3, 4B) | `openbmb/MiniCPM3-4B` | OpenBMB’s series of compact LLMs for edge devices; MiniCPM 3 (4B) achieves GPT-3.5-level results in text tasks. |
| **OLMoE** (Open MoE) | `allenai/OLMoE-1B-7B-0924` | Allen AI’s open Mixture-of-Experts model (7B total, 1B active parameters) delivering state-of-the-art results with sparse expert activation. |
| **StableLM** (3B, 7B) | `stabilityai/stablelm-tuned-alpha-7b` | StabilityAI’s early open-source LLM (3B & 7B) for general text generation; a demonstration model with basic instruction-following ability. |
| **Command-R** (Cohere) | `CohereForAI/c4ai-command-r-v01` | Cohere’s open conversational LLM (Command series) optimized for long context, retrieval-augmented generation, and tool use. |
| **DBRX** (Databricks) | `databricks/dbrx-instruct` | Databricks’ 132B-parameter MoE model (36B active) trained on 12T tokens; competes with GPT-3.5 quality as a fully open foundation model. |
| **Grok** (xAI) | `xai-org/grok-1` | xAI’s Grok-1 model, known for its vast size (314B parameters) and high quality; integrated in SGLang for high-performance inference. |
| **ChatGLM** (GLM-130B family) | `THUDM/chatglm2-6b` | Zhipu AI’s bilingual chat model (6B) excelling at Chinese-English dialogue; fine-tuned for conversational quality and alignment. |
| **InternLM 2** (7B, 20B) | `internlm/internlm2-7b` | Next-gen InternLM (7B and 20B) from SenseTime, offering strong reasoning and ultra-long context support (up to 200K tokens). |
| **ExaONE 3** (Korean-English) | `LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct` | LG AI Research’s Korean-English model (7.8B) trained on 8T tokens; provides high-quality bilingual understanding and generation. |
| **Baichuan 2** (7B, 13B) | `baichuan-inc/Baichuan2-13B-Chat` | BaichuanAI’s second-generation Chinese-English LLM (7B/13B) with improved performance and an open commercial license. |
| **XVERSE** (MoE) | `xverse/XVERSE-MoE-A36B` | Yuanxiang’s open MoE LLM (XVERSE-MoE-A36B: 255B total, 36B active) supporting ~40 languages; delivers 100B+ dense-level performance via expert routing. |
| **SmolLM** (135M–1.7B) | `HuggingFaceTB/SmolLM-1.7B` | Hugging Face’s ultra-small LLM series (135M–1.7B params) offering surprisingly strong results, enabling advanced AI on mobile/edge devices. |
| **GLM-4** (Multilingual 9B) | `ZhipuAI/glm-4-9b-chat` | Zhipu’s GLM-4 series (up to 9B parameters) – open multilingual models with support for a 1M-token context and a multimodal variant (GLM-4V). |
| **MiMo** (7B series) | `XiaomiMiMo/MiMo-7B-RL` | Xiaomi's reasoning-optimized model series, leverages Multiple-Token Prediction for faster inference. |
| **ERNIE-4.5** (4.5, 4.5MoE series) | `baidu/ERNIE-4.5-21B-A3B-PT` | Baidu's ERNIE-4.5 series, consisting of MoE models with 47B and 3B active parameters (the largest having 424B total parameters) as well as a 0.3B dense model. |
| **Arcee AFM-4.5B** | `arcee-ai/AFM-4.5B-Base` | Arcee's foundation model series designed for real-world reliability and edge deployments. |
| **Persimmon** (8B) | `adept/persimmon-8b-chat` | Adept’s open 8B model with a 16K context window and fast inference; trained for broad usability and licensed under Apache 2.0. |
| **Ling** (16.8B–290B) | `inclusionAI/Ling-lite`, `inclusionAI/Ling-plus` | InclusionAI’s open MoE models. Ling-Lite has 16.8B total / 2.75B active parameters, and Ling-Plus has 290B total / 28.8B active parameters. They are designed for high performance on NLP and complex reasoning tasks. |
| **Granite 3.0, 3.1** (IBM) | `ibm-granite/granite-3.1-8b-instruct` | IBM's open dense foundation models optimized for reasoning, code, and business AI use cases. Integrated with Red Hat and watsonx systems. |
| **Granite 3.0 MoE** (IBM) | `ibm-granite/granite-3.0-3b-a800m-instruct` | IBM’s Mixture-of-Experts models offering strong performance with cost-efficiency. MoE expert routing designed for enterprise deployment at scale. |
| **Llama Nemotron Super** (v1, v1.5, NVIDIA) | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, `nvidia/Llama-3_3-Nemotron-Super-49B-v1_5` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
| **Llama Nemotron Ultra** (v1, NVIDIA) | `nvidia/Llama-3_1-Nemotron-Ultra-253B-v1` | The [NVIDIA Nemotron](https://www.nvidia.com/en-us/ai-data-science/foundation-models/nemotron/) family builds on the strongest open models in the ecosystem by enhancing them with greater accuracy, efficiency, and transparency using NVIDIA open synthetic datasets, advanced techniques, and tools. This enables the creation of practical, right-sized, and high-performing AI agents. |
# Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable `SGLANG_USE_MODELSCOPE`.
```bash
export SGLANG_USE_MODELSCOPE=true
```
We take [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) as an example.
Launch the Server:
```bash
python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```
Or start it with Docker:
```bash
docker run --gpus all \
-p 30000:30000 \
-v ~/.cache/modelscope:/root/.cache/modelscope \
--env "SGLANG_USE_MODELSCOPE=true" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --host 0.0.0.0 --port 30000
```
Note that ModelScope uses a different cache directory than Hugging Face. You may need to set it manually to avoid running out of disk space.
# Multimodal Language Models
These models accept multi-modal inputs (e.g., images and text) and generate text output. They augment language models with multimodal encoders.
## Example Launch Command
```shell
# Example HF/local model path; replace with your own model.
python3 -m sglang.launch_server \
  --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \
  --host 0.0.0.0 \
  --port 30000
```
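Once the server is up, you can query it through SGLang's OpenAI-compatible chat API, which accepts image inputs for the vision models listed below. A minimal sketch (the image URL is a placeholder; replace it with any reachable image):
```python
import requests

url = "http://127.0.0.1:30000/v1/chat/completions"
payload = {
    "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "messages": [
        {
            "role": "user",
            "content": [
                # Placeholder image URL; substitute your own image.
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image in one sentence."},
            ],
        }
    ],
    "max_tokens": 64,
}

response = requests.post(url, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```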
## Supported models
The supported models are summarized in the table below.
If you are unsure if a specific architecture is implemented, you can search for it via GitHub. For example, to search for `Qwen2_5_VLForConditionalGeneration`, use the expression:
```
repo:sgl-project/sglang path:/^python\/sglang\/srt\/models\// Qwen2_5_VLForConditionalGeneration
```
in the GitHub search bar.
| Model Family (Variants) | Example HuggingFace Identifier | Chat Template | Description |
|----------------------------|--------------------------------------------|------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Qwen-VL** (Qwen2 series) | `Qwen/Qwen2.5-VL-7B-Instruct` | `qwen2-vl` | Alibaba’s vision-language extension of Qwen; for example, Qwen2.5-VL (7B and larger variants) can analyze and converse about image content. |
| **DeepSeek-VL2** | `deepseek-ai/deepseek-vl2` | `deepseek-vl2` | Vision-language variant of DeepSeek (with a dedicated image processor), enabling advanced multimodal reasoning on image and text inputs. |
| **Janus-Pro** (1B, 7B) | `deepseek-ai/Janus-Pro-7B` | `janus-pro` | DeepSeek’s open-source multimodal model capable of both image understanding and generation. Janus-Pro employs a decoupled architecture for separate visual encoding paths, enhancing performance in both tasks. |
| **MiniCPM-V / MiniCPM-o** | `openbmb/MiniCPM-V-2_6` | `minicpmv` | MiniCPM-V (2.6, ~8B) supports image inputs, and MiniCPM-o adds audio/video; these multimodal LLMs are optimized for end-side deployment on mobile/edge devices. |
| **Llama 3.2 Vision** (11B) | `meta-llama/Llama-3.2-11B-Vision-Instruct` | `llama_3_vision` | Vision-enabled variant of Llama 3 (11B) that accepts image inputs for visual question answering and other multimodal tasks. |
| **LLaVA** (v1.5 & v1.6) | *e.g.* `liuhaotian/llava-v1.5-13b` | `vicuna_v1.1` | Open vision-chat models that add an image encoder to LLaMA/Vicuna (e.g. LLaMA2 13B) for following multimodal instruction prompts. |
| **LLaVA-NeXT** (8B, 72B) | `lmms-lab/llava-next-72b` | `chatml-llava` | Improved LLaVA models (with an 8B Llama3 version and a 72B version) offering enhanced visual instruction-following and accuracy on multimodal benchmarks. |
| **LLaVA-OneVision** | `lmms-lab/llava-onevision-qwen2-7b-ov` | `chatml-llava` | Enhanced LLaVA variant integrating Qwen as the backbone; supports multiple images (and even video frames) as inputs via an OpenAI Vision API-compatible format. |
| **Gemma 3 (Multimodal)** | `google/gemma-3-4b-it` | `gemma-it` | Gemma 3's larger models (4B, 12B, 27B) accept images (each image encoded as 256 tokens) alongside text in a combined 128K-token context. |
| **Kimi-VL** (A3B) | `moonshotai/Kimi-VL-A3B-Instruct` | `kimi-vl` | Kimi-VL is a multimodal model that can understand and generate text from images. |
| **Mistral-Small-3.1-24B** | `mistralai/Mistral-Small-3.1-24B-Instruct-2503` | `mistral` | Mistral 3.1 is a multimodal model that can generate text from text or images input. It also supports tool calling and structured output. |
| **Phi-4-multimodal-instruct** | `microsoft/Phi-4-multimodal-instruct` | `phi-4-mm` | Phi-4-multimodal-instruct is the multimodal variant of the Phi-4-mini model, enhanced with LoRA for improved multimodal capabilities. It supports text, vision and audio modalities in SGLang. |
| **MiMo-VL** (7B) | `XiaomiMiMo/MiMo-VL-7B-RL` | `mimo-vl` | Xiaomi's compact yet powerful vision-language model featuring a native resolution ViT encoder for fine-grained visual details, an MLP projector for cross-modal alignment, and the MiMo-7B language model optimized for complex reasoning tasks. |
| **GLM-4.5V** (106B) / **GLM-4.1V**(9B) | `zai-org/GLM-4.5V` | `glm-4v` | GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning |
# Rerank Models
SGLang offers comprehensive support for rerank models by incorporating optimized serving frameworks with a flexible programming interface. This setup enables efficient processing of cross-encoder reranking tasks, improving the accuracy and relevance of search result ordering. SGLang’s design ensures high throughput and low latency during reranker model deployment, making it ideal for semantic-based result refinement in large-scale retrieval systems.
```{important}
They are executed with `--is-embedding`, and some may require `--trust-remote-code`.
```
## Example Launch Command
```shell
python3 -m sglang.launch_server \
--model-path BAAI/bge-reranker-v2-m3 \
--host 0.0.0.0 \
--disable-radix-cache \
--chunked-prefill-size -1 \
--attention-backend triton \
--is-embedding \
--port 30000
```
## Example Client Request
```python
import requests
url = "http://127.0.0.1:30000/v1/rerank"
payload = {
"model": "BAAI/bge-reranker-v2-m3",
"query": "what is panda?",
"documents": [
"hi",
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
]
}
response = requests.post(url, json=payload)
response_json = response.json()
for item in response_json:
print(f"Score: {item['score']:.2f} - Document: '{item['document']}'")
```
## Supported rerank models
| Model Family (Rerank) | Example HuggingFace Identifier | Chat Template | Description |
|------------------------------------------------|--------------------------------------|---------------|----------------------------------------------------------------------------------------------------------------------------------|
| **BGE-Reranker (BgeRerankModel)** | `BAAI/bge-reranker-v2-m3` | N/A | Currently supports only the `triton` and `torch_native` attention backends. A high-performance cross-encoder reranker model from BAAI, suitable for reranking search results based on semantic relevance. |
# Reward Models
These models output a scalar reward score or classification result, often used in reinforcement learning or content moderation tasks.
```{important}
They are executed with `--is-embedding` and some may require `--trust-remote-code`.
```
## Example Launch Command
```shell
# --model-path accepts an HF model ID or a local path;
# --tp-size sets the tensor parallelism degree.
python3 -m sglang.launch_server \
  --model-path Qwen/Qwen2.5-Math-RM-72B \
  --is-embedding \
  --host 0.0.0.0 \
  --tp-size 4 \
  --port 30000
```
## Supported models
| Model Family (Reward) | Example HuggingFace Identifier | Description |
|---------------------------------------------------------------------------|-----------------------------------------------------|---------------------------------------------------------------------------------|
| **Llama (3.1 Reward / `LlamaForSequenceClassification`)** | `Skywork/Skywork-Reward-Llama-3.1-8B-v0.2` | Reward model (preference classifier) based on Llama 3.1 (8B) for scoring and ranking responses for RLHF. |
| **Gemma 2 (27B Reward / `Gemma2ForSequenceClassification`)** | `Skywork/Skywork-Reward-Gemma-2-27B-v0.2` | Derived from Gemma‑2 (27B), this model provides human preference scoring for RLHF and multilingual tasks. |
| **InternLM 2 (Reward / `InternLM2ForRewardModel`)** | `internlm/internlm2-7b-reward` | InternLM 2 (7B)–based reward model used in alignment pipelines to guide outputs toward preferred behavior. |
| **Qwen2.5 (Reward - Math / `Qwen2ForRewardModel`)** | `Qwen/Qwen2.5-Math-RM-72B` | A 72B math-specialized RLHF reward model from the Qwen2.5 series, tuned for evaluating and refining responses. |
| **Qwen2.5 (Reward - Sequence / `Qwen2ForSequenceClassification`)** | `jason9693/Qwen2.5-1.5B-apeach` | A smaller Qwen2.5 variant used for sequence classification, offering an alternative RLHF scoring mechanism. |
# How to Support New Models
This document explains how to add support for new language models and multimodal large language models (MLLMs) in
SGLang. It also covers how to test new models and register external implementations.
## How to Support a New Language Model
To support a new model in SGLang, you only need to add a single file under
the [SGLang Models Directory](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/models). You can learn
from existing model implementations and create a new file for your model. For most models, you should be able to find a
similar model to start with (e.g., starting from Llama). Also refer to [porting a model from vLLM to SGLang](#port-a-model-from-vllm-to-sglang) below.
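As a rough orientation, a new model file typically defines a `torch.nn.Module` with an SGLang-style `forward` and exposes it through a module-level `EntryClass`. The sketch below is schematic only (placeholder names, elided layers); copy the real structure and signatures from an existing implementation such as `llama.py`.
```python
# Schematic sketch of python/sglang/srt/models/my_new_model.py (placeholder
# names; mirror a real model such as llama.py for the actual details).
from typing import Iterable, Tuple

import torch
from torch import nn

from sglang.srt.model_executor.forward_batch_info import ForwardBatch


class MyNewModelForCausalLM(nn.Module):
    def __init__(self, config, quant_config=None, prefix: str = "") -> None:
        super().__init__()
        self.config = config
        # Build embeddings, decoder layers (using RadixAttention), and the
        # LogitsProcessor here, mirroring an existing implementation.

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        forward_batch: ForwardBatch,
    ):
        # Run the decoder stack and return the processed logits.
        ...

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # Map checkpoint parameter names onto this module's parameters.
        ...


# SGLang discovers the implementation via this module-level variable.
EntryClass = MyNewModelForCausalLM
```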
## How to Support a New Multimodal Large Language Model
To support a new multimodal large language model (MLLM) in SGLang, there are several key components in addition to the
standard LLM support:
1. **Register your new model as multimodal**:
Extend `is_multimodal_model`
in [model_config.py](https://github.com/sgl-project/sglang/blob/0ab3f437aba729b348a683ab32b35b214456efc7/python/sglang/srt/configs/model_config.py#L561)
to return `True` for your model.
2. **Register a new chat-template**:
Only when your default chat-template is unable to accept images as input: Register a new chat template in [conversation.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/conversation.py) and the corresponding matching function.
3. **Multimodal Data Processor**:
Define a new `Processor` class that inherits from `BaseMultimodalProcessor` and register this processor as your
model’s dedicated processor.
See [multimodal_processor.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/multimodal/processors)
for more details.
4. **Handle Multimodal Tokens**:
Implement a `pad_input_ids` function for your new model. In this function, multimodal tokens in the prompt should be
expanded (if necessary) and padded with multimodal-data-hashes so that SGLang can recognize different multimodal data
with `RadixAttention`.
5. **Handle Image Feature Extraction**:
Implement a `get_image_feature` function for your new model, which extracts image features from raw image data and converts them into the embeddings used by the language model.
6. **Adapt to Vision Attention**:
Adapt the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.
You can refer to [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or
other mllm implementations. These models demonstrate how to correctly handle both multimodal and textual inputs.
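For orientation only, the stubs below sketch where the pieces from steps 4 and 5 live on the model class; the exact base classes and signatures should be taken from a real implementation such as `qwen2_vl.py`.
```python
# Schematic stubs only; the method names follow the steps above, but the real
# signatures and base classes come from existing models such as qwen2_vl.py.
import torch
from torch import nn


class MyVLMForConditionalGeneration(nn.Module):
    def pad_input_ids(self, input_ids, mm_inputs):
        # Expand multimodal placeholder tokens if necessary and pad them with
        # the multimodal data hashes so RadixAttention can distinguish
        # different images or videos in the prefix cache.
        ...

    def get_image_feature(self, image_input) -> torch.Tensor:
        # Run the vision encoder (adapted to SGLang's VisionAttention) and
        # project the raw image features into the LM's embedding space.
        ...
```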
## Testing and Debugging
Please include all of your testing and benchmarking results in the PR description.
### Interactive Debugging
For interactive debugging, compare the outputs of Hugging Face/Transformers and SGLang. The following two commands
should give the same text output and very similar prefill logits:
- Get the reference output:
```bash
python3 scripts/playground/reference_hf.py --model-path [new model] --model-type {text,mllm}
```
- Get the SGLang output:
```bash
python3 -m sglang.bench_one_batch --correct --model [new model]
```
### Add the Model to the Test Suite
To ensure the new model is well maintained, add it to the test suite by including it in the `ALL_OTHER_MODELS` list in
the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py)
file. Then test the new model on your local machine and report the results on demonstrative benchmarks (GSM8K, MMLU, MMMU, MMMU-Pro, etc.) in your PR.
For VLMs, also include a test in `test_vision_openai_server_{x}.py` (e.g. [test_vision_openai_server_a.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_a.py), [test_vision_openai_server_b.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_b.py)).
This is an example command to run to test a new model on your local machine:
```bash
ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
```
### Benchmark
- **(Required) MMMU**: follow the MMMU benchmark [README.md](https://github.com/sgl-project/sglang/blob/main/benchmark/mmmu/README.md) to get an SGLang vs. HF Transformers accuracy comparison. The accuracy score from the SGLang run should not be much lower than that from the HF Transformers run. Similarly, follow https://docs.sglang.ai/developer_guide/benchmark_and_profiling.html to get a performance comparison: TTFT and throughput must meet or exceed the baselines (e.g., HF Transformers).
- **(Optional) Other evals**: If you ran other evals, please note the results in the PR description.
## Port a Model from vLLM to SGLang
The [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) is a valuable
resource, as vLLM covers many models. SGLang reuses vLLM’s interface and some layers, making it easier to port models
from vLLM to SGLang.
To port a model from vLLM to SGLang:
- Compare these two files for guidance:
- [SGLang Llama Implementation](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/llama.py)
- [vLLM Llama Implementation](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py)
- The major differences include:
- **Replace vLLM’s `Attention` with `RadixAttention`** (ensure you pass `layer_id` to `RadixAttention`).
- **Replace vLLM’s `LogitsProcessor` with SGLang’s `LogitsProcessor`.**
- **Replace the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.**
- **Replace other vLLM layers** (such as `RMSNorm`, `SiluAndMul`) with SGLang layers.
- **Remove `Sample`.**
- **Change the `forward()` functions** and add a `forward_batch()` method.
- **Add `EntryClass`** at the end.
- **Ensure that the new implementation uses only SGLang components** and does not rely on any vLLM components.
Note: make sure you add your new model to the supported models list in the supported models documentation.
## Registering an External Model Implementation
In addition to the methods above, you can register your new model with the `ModelRegistry` before launching the server.
This allows you to integrate your model without modifying the source code.
For example:
```python
from sglang.srt.models.registry import ModelRegistry
from sglang.srt.entrypoints.http_server import launch_server
# For a single model, add it to the registry:
ModelRegistry.models[model_name] = model_class
# For multiple models, you can imitate the import_model_classes() function:
from functools import lru_cache
@lru_cache()
def import_new_model_classes():
model_arch_name_to_cls = {}
# Populate model_arch_name_to_cls with your new model classes.
...
return model_arch_name_to_cls
ModelRegistry.models.update(import_new_model_classes())
# Launch the server with your server arguments:
launch_server(server_args)
```
## Example: Implementing and Serving a Llama Wrapper Model
Below is an introductory, step-by-step walkthrough on how to implement a new model end-to-end in SGLang and then run it via the [Offline Engine](https://github.com/sgl-project/sglang/blob/main/docs/basic_usage/offline_engine_api.ipynb).
### Implementing Our Model
To keep things simple, this new model will be a simple wrapper around [Llama 3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), and our goal will be just to bias the output logits for each `forward` call by taking the square root of each individual logit.
Let's start by defining our model in a file called `llama_wrapper.py`.
The first step is to import the necessary libraries from SRT, which is SGLang's internal backend.
```python
# In the file `llama_wrapper.py`
import torch
from transformers import LlamaConfig
from typing import Optional
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
from sglang.srt.layers.quantization.base_config import QuantizationConfig
from sglang.srt.model_executor.forward_batch_info import ForwardBatch, PPProxyTensors
from sglang.srt.models.llama import LlamaForCausalLM
```
Next, we declare a new `class` for our model and have it inherit from `LlamaForCausalLM`, which allows our model to access `LlamaForCausalLM`'s predefined modules and layers, such as `LlamaAttention` and `LlamaMLP`.
Note that almost all model implementations take in `config` and `quant_config` as arguments for their `__init__` method; `config` and `quant_config` are passed in via [`model_loader/loader.py`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_loader/loader.py#L219).
Because we have inherited from `LlamaForCausalLM`, we can pass our parameters directly to its constructor, which will set the member variables for us.
```python
class LlamaWrapper(LlamaForCausalLM):
def __init__(
self,
config: LlamaConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
super().__init__(config=config, quant_config=quant_config, prefix=prefix)
```
Now, we want to define the `forward` method, which is what will be called at inference time.
Note that the signature for `forward` is essentially the same for any model; you can take a look at the other models defined in the [`models` directory](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/) for references.
To see where exactly `forward` is called in the SGLang runtime's internals, take a look at [`forward_decode`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1705) and [`forward_extend`](https://github.com/sgl-project/sglang/blob/bf72b80122fd888bf619d17b96fa3e323ab809fc/python/sglang/srt/model_executor/model_runner.py#L1724) in the [`ModelRunner` class](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/model_executor/model_runner.py).
```python
@torch.no_grad()
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
forward_batch: ForwardBatch,
pp_proxy_tensors: Optional[PPProxyTensors] = None,
input_embeds: Optional[torch.Tensor] = None,
get_embedding: bool = False,
) -> LogitsProcessorOutput:
```
We now call the `__call__` method for `self.model` (which is a member variable that `LlamaForCausalLM` defines in its `__init__` method), which eventually calls `LlamaForCausalLM`'s `forward` method.
After that, we feed the `hidden_states` into our model's `LogitsProcessor` (again defined in `LlamaForCausalLM`).
```python
hidden_states = self.model(
input_ids,
positions,
forward_batch,
input_embeds,
pp_proxy_tensors=pp_proxy_tensors,
)
res: LogitsProcessorOutput = self.logits_processor(
input_ids,
hidden_states,
self.lm_head,
forward_batch,
)
```
After receiving the logits for the next token, we can finally perform our biasing step.
```python
orig_logits = res.next_token_logits
res.next_token_logits = torch.where(
orig_logits > 0,
orig_logits.sqrt(),
orig_logits
)
return res
```
Now, our `LlamaWrapper` model is created and ready to be served!
### Serving Our Model Via SGLang's Offline Engine
The next step of this walkthrough involves hosting our new model offline, so that it can be served locally and without an HTTP server.
First, create a new file called `run.py`.
Now, we must ensure that SGLang's `ModelRegistry` can find our model.
To do this, we first download the model's configuration and weights from Huggingface.
```python
# In the file `run.py`
import asyncio
from functools import lru_cache
from huggingface_hub import snapshot_download
from llama_wrapper import LlamaWrapper # Make sure to import our new model!
import sglang as sgl
from sglang.srt.models.registry import ModelRegistry
# Make sure to request access to this model on Huggingface, then export your
# `HF_TOKEN` to download the model snapshot
llama_dir = snapshot_download(
repo_id="meta-llama/Llama-3.1-8B-Instruct",
local_dir="./llama_ckpt",
)
```
Now that we have our model on disk, we want to point it to `LlamaWrapper` by changing the `architectures` field in `./llama_ckpt/config.json` to be `LlamaWrapper`.
That way, when we pass in the path of our model checkpoint to SGLang, it will know that we want to use "LlamaWrapper" instead of "LlamaForCausalLM" as our model.
```python
{
"architectures": [
# "LlamaForCausalLM"
"LlamaWrapper"
],
...
}
```
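If you prefer to make this edit programmatically rather than by hand, here is a small sketch (assuming the checkpoint was downloaded to `./llama_ckpt` as above):
```python
import json

config_path = "./llama_ckpt/config.json"
with open(config_path) as f:
    config = json.load(f)

# Point the checkpoint at our wrapper class instead of LlamaForCausalLM.
config["architectures"] = ["LlamaWrapper"]

with open(config_path, "w") as f:
    json.dump(config, f, indent=2)
```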
However, if we don't link our `LlamaWrapper` class to the "LlamaWrapper" registry keyword, then SGLang won't be able to find our model.
Thus, to register our `LlamaWrapper`, we want to follow the steps in the above section titled "Registering an External Model Implementation".
```python
@lru_cache()
def import_new_model_classes():
model_arch_name_to_cls = {"LlamaWrapper": LlamaWrapper}
return model_arch_name_to_cls
ModelRegistry.models.update(import_new_model_classes())
```
Lastly, when we create our `Engine`, we just pass in the path to the local model directory.
Then, our `LlamaWrapper` is ready to be served; for this walkthrough, we will use SGLang `Engine`'s non-streaming asynchronous generation endpoint.
```python
def main():
llm = sgl.Engine(model_path="./llama_ckpt")
sampling_params = {"temperature": 0.2, "top_k": 5}
prompts = [
"Write a short, neutral self-introduction for a fictional character. Hello, my name is",
"Provide a concise factual statement about France’s capital city. The capital of France is",
"Explain possible future trends in artificial intelligence. The future of AI is",
]
asyncio.run(run_llm(llm, sampling_params, prompts))
llm.shutdown()
async def run_llm(
llm,
sampling_params,
prompts,
) -> None:
outputs = await llm.async_generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
print(f"\nPrompt: {prompt}")
print(f"Generated text: {output['text']}")
if __name__ == "__main__":
main()
```
Now, when we call `python run.py`, we will get the outputs of our newly created model!
## Documentation
Add your model to the supported models table in [generative_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/generative_models.md) or [multimodal_language_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/multimodal_language_models.md).
---
By following these guidelines, you can add support for new language models and multimodal large language models in
SGLang and ensure they are thoroughly tested and easily integrated into the system.
# Transformers fallback in SGLang
`sglang` can fall back to using models that are available in `transformers`. This works for most decoder-style language models, and support for vision-language models is coming soon.
## Example Launch Command
By default, SGLang uses its native model implementation when one is available; otherwise, it falls back to the Transformers implementation. You can also force the Transformers implementation by setting `--model-impl` to `transformers`.
```shell
python3 -m sglang.launch_server \
--model-path meta-llama/Llama-3.2-1B-Instruct \
--host 0.0.0.0 \
--port 30000 \
--model-impl transformers
```
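Once the server is running, it can be queried like any other SGLang deployment. A minimal sketch against the native `/generate` endpoint (endpoint name and response fields assumed to match the standard SGLang server API):
```python
import requests

response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 16},
    },
)
# The generated continuation is returned in the "text" field.
print(response.json()["text"])
```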
## Supported features
### Quantization
The Transformers fallback supports most of the quantization methods available in SGLang (except GGUF). See the [Quantization page](../advanced_features/quantization.md) for more information about supported quantization in SGLang.
### Remote code
This fallback also means that any model on the Hub that can be used in `transformers` with `trust_remote_code=True` and that correctly implements attention can be used in production.
A model just needs the following two things:
```python
from transformers import PreTrainedModel
# ALL_ATTENTION_FUNCTIONS is provided by transformers (modeling_utils) in
# recent versions; it maps attention-implementation names to kernels.
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from torch import nn
class MyAttention(nn.Module):
def forward(self, hidden_states, **kwargs): # <- kwargs are required
...
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
**kwargs,
)
...
class MyModel(PreTrainedModel):
_supports_attention_backend = True
```
Here is what happens in the background:
1. The config is loaded.
2. The `MyModel` Python class is loaded from the `auto_map`, and we check that the model `_supports_attention_backend`.
3. The `TransformersModel` backend is used. See `/srt/models/transformers`, which sets `self.config._attn_implementation = "sglang"`, hence the need for `ALL_ATTENTION_FUNCTIONS`.
That's it!
import os
import re
def insert_runllm_widget(html_content):
# RunLLM Widget script to be inserted
widget_script = """
<!-- RunLLM Widget Script -->
<script type="module" id="runllm-widget-script" src="https://widget.runllm.com" crossorigin="true" version="stable" runllm-keyboard-shortcut="Mod+j" runllm-name="SGLang Chatbot" runllm-position="BOTTOM_RIGHT" runllm-assistant-id="629" async></script>
"""
# Find the closing body tag and insert the widget script before it
return re.sub(r"</body>", f"{widget_script}\n</body>", html_content)
def process_html_files(build_dir):
for root, dirs, files in os.walk(build_dir):
for file in files:
if file.endswith(".html"):
file_path = os.path.join(root, file)
# Read the HTML file
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
# Insert the RunLLM widget
modified_content = insert_runllm_widget(content)
# Write back the modified content
with open(file_path, "w", encoding="utf-8") as f:
f.write(modified_content)
def main():
# Get the build directory path
build_dir = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "_build", "html"
)
# Process all HTML files
if os.path.exists(build_dir):
process_html_files(build_dir)
else:
print(f"Build directory not found: {build_dir}")
if __name__ == "__main__":
main()