Unverified Commit 157714aa authored by Hongkuan Zhou, committed by GitHub
Browse files

chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)

parent 8248a116
...@@ -14,11 +14,8 @@ spec: ...@@ -14,11 +14,8 @@ spec:
image: ${DOCKER_IMAGE} image: ${DOCKER_IMAGE}
resources: resources:
requests: requests:
cpu: "1" cpu: "16"
memory: "2Gi" memory: "10Gi"
limits:
cpu: "2"
memory: "4Gi"
env: env:
- name: HUGGING_FACE_HUB_TOKEN - name: HUGGING_FACE_HUB_TOKEN
valueFrom: valueFrom:
...@@ -37,6 +34,18 @@ spec: ...@@ -37,6 +34,18 @@ spec:
- /workspace/profiling_results - /workspace/profiling_results
- --namespace - --namespace
- ${NAMESPACE} - ${NAMESPACE}
- --min-num-gpus-per-engine
- "1"
- --max-num-gpus-per-engine
- "8"
- --isl
- "3000"
- --osl
- "150"
- --ttft
- "200"
- --itl
- "20"
volumeMounts: volumeMounts:
- name: output-volume - name: output-volume
mountPath: /workspace/profiling_results mountPath: /workspace/profiling_results
......
...@@ -89,16 +89,16 @@ class VllmV1ConfigModifier: ...@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
if target == "prefill": if target == "prefill":
# convert prefill worker into decode worker # convert prefill worker into decode worker
config["spec"]["services"][ config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
] = config["spec"]["services"][ ] = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
] ]
del config["spec"]["services"][ del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
] ]
args = config["spec"]["services"][ args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"] ]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args) args = break_arguments(args)
...@@ -112,18 +112,18 @@ class VllmV1ConfigModifier: ...@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
if "--no-enable-prefix-caching" not in args: if "--no-enable-prefix-caching" not in args:
args = append_argument(args, "--no-enable-prefix-caching") args = append_argument(args, "--no-enable-prefix-caching")
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ config["spec"]["services"][
"extraPodSpec" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["mainContainer"]["args"] = join_arguments(args) ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
elif target == "decode": elif target == "decode":
# delete prefill worker # delete prefill worker
del config["spec"]["services"][ del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
] ]
args = config["spec"]["services"][ args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"] ]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args) args = break_arguments(args)
...@@ -134,13 +134,13 @@ class VllmV1ConfigModifier: ...@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
if "--no-enable-prefix-caching" in args: if "--no-enable-prefix-caching" in args:
args.remove("--no-enable-prefix-caching") args.remove("--no-enable-prefix-caching")
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ config["spec"]["services"][
"extraPodSpec" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["mainContainer"]["args"] = join_arguments(args) ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
# set num workers to 1 # set num workers to 1
decode_worker_config = config["spec"]["services"][ decode_worker_config = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
] ]
decode_worker_config["replicas"] = 1 decode_worker_config["replicas"] = 1
...@@ -150,16 +150,16 @@ class VllmV1ConfigModifier: ...@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
def set_config_tp_size(cls, config: dict, tp_size: int): def set_config_tp_size(cls, config: dict, tp_size: int):
config = deepcopy(config) config = deepcopy(config)
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ config["spec"]["services"][
"resources" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["requests"]["gpu"] = str(tp_size) ]["resources"]["requests"]["gpu"] = str(tp_size)
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ config["spec"]["services"][
"resources" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["limits"]["gpu"] = str(tp_size) ]["resources"]["limits"]["gpu"] = str(tp_size)
args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ args = config["spec"]["services"][
"extraPodSpec" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["mainContainer"]["args"] ]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args) args = break_arguments(args)
...@@ -169,15 +169,15 @@ class VllmV1ConfigModifier: ...@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
except ValueError: except ValueError:
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)]) args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ config["spec"]["services"][
"extraPodSpec" WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["mainContainer"]["args"] = join_arguments(args) ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
return config return config
@classmethod @classmethod
def get_model_name(cls, config: dict) -> str: def get_model_name(cls, config: dict) -> str:
worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][ args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
"args" "args"
] ]
......
...@@ -141,7 +141,7 @@ spec: ...@@ -141,7 +141,7 @@ spec:
- -c - -c
args: args:
- "python3 -m dynamo.planner.prometheus" - "python3 -m dynamo.planner.prometheus"
backend: VllmDecodeWorker:
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
...@@ -191,7 +191,7 @@ spec: ...@@ -191,7 +191,7 @@ spec:
- -c - -c
args: args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log" - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
prefill: VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret envFromSecret: hf-token-secret
componentType: worker componentType: worker
......
...@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults): ...@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class VllmComponentName: class VllmComponentName:
prefill_worker = "prefill" prefill_worker_k8s_name = "VllmPrefillWorker"
prefill_worker_component_name = "prefill"
prefill_worker_endpoint = "generate" prefill_worker_endpoint = "generate"
decode_worker = "backend" decode_worker_k8s_name = "VllmDecodeWorker"
decode_worker_component_name = "backend"
decode_worker_endpoint = "generate" decode_worker_endpoint = "generate"
......
...@@ -106,7 +106,11 @@ class Planner: ...@@ -106,7 +106,11 @@ class Planner:
if self.prefill_client is None: if self.prefill_client is None:
self.prefill_client = ( self.prefill_client = (
await self.runtime.namespace(self.namespace) await self.runtime.namespace(self.namespace)
.component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker) .component(
WORKER_COMPONENT_NAMES[
self.args.backend
].prefill_worker_component_name
)
.endpoint( .endpoint(
WORKER_COMPONENT_NAMES[ WORKER_COMPONENT_NAMES[
self.args.backend self.args.backend
...@@ -127,7 +131,11 @@ class Planner: ...@@ -127,7 +131,11 @@ class Planner:
if self.workers_client is None: if self.workers_client is None:
self.workers_client = ( self.workers_client = (
await self.runtime.namespace(self.namespace) await self.runtime.namespace(self.namespace)
.component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker) .component(
WORKER_COMPONENT_NAMES[
self.args.backend
].decode_worker_component_name
)
.endpoint( .endpoint(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
) )
...@@ -300,8 +308,12 @@ class Planner: ...@@ -300,8 +308,12 @@ class Planner:
if not self.args.no_operation: if not self.args.no_operation:
target_replicas = { target_replicas = {
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p, WORKER_COMPONENT_NAMES[
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d, self.args.backend
].prefill_worker_k8s_name: next_num_p,
WORKER_COMPONENT_NAMES[
self.args.backend
].decode_worker_k8s_name: next_num_d,
} }
await self.connector.set_component_replicas(target_replicas, blocking=False) await self.connector.set_component_replicas(target_replicas, blocking=False)
......
...@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \ ...@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
# in the project's root folder # in the project's root folder
./container/build.sh --framework VLLM ./container/build.sh --framework VLLM
# Tag and push to your container registry # Tag and push to your container registry
export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own Dynamo image
# NOTE: DGD_CONFIG_FILE is the path of the config file inside DOCKER_IMAGE, not a path on your local machine
# Modify this YAML file to profile a different model
export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
``` ```
Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed. Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.
**Step 2: Run profiling (required)** **Step 2: Set SLA target**
Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL (input sequence length), OSL (output sequence length), TTFT (time to first token), and ITL (inter-token latency).
```yaml
spec:
template:
spec:
containers:
- name: profile-sla
args:
- --isl
- "3000" # average ISL is 3000 tokens
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms
- --itl
- "20" # target ITL is 20ms
```
**Step 3: Run profiling (required)**
```bash ```bash
cd $DYNAMO_HOME/benchmarks/profiler/deploy cd $DYNAMO_HOME/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply -f - envsubst < profiling_pvc.yaml | kubectl apply -f -
envsubst < profile_sla_sa.yaml | kubectl apply -f - envsubst < profile_sla_sa.yaml | kubectl apply -f -
envsubst < profile_sla_rbac.yaml | kubectl apply -f - envsubst < profile_sla_rbac.yaml | kubectl apply -f -
envsubst < profile_sla_binding.yaml | kubectl apply -f - envsubst < profile_sla_binding.yaml | kubectl apply -f -
export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
envsubst < profile_sla_job.yaml | kubectl apply -f - envsubst < profile_sla_job.yaml | kubectl apply -f -
``` ```
**Step 3: Wait for profiling to complete** **Step 4: Wait for profiling to complete**
```bash ```bash
kubectl get jobs -n $NAMESPACE kubectl get jobs -n $NAMESPACE
kubectl logs job/profile-sla -n $NAMESPACE kubectl logs job/profile-sla -n $NAMESPACE
...@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r ...@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
1. **Create a temporary pod to access the PVC:** 1. **Create a temporary pod to access the PVC:**
```bash ```bash
kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \ kubectl run temp-access --image=alpine:latest --restart=Never \
--overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \ --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
-n $NAMESPACE -n $NAMESPACE
``` ```
2. **Inside the temporary pod, navigate to the results directory:** 2. **Inside the temporary pod, navigate to the results directory:**
```bash ```bash
kubectl exec -it temp-access -n $NAMESPACE -- sh
cd /workspace/profiling_results cd /workspace/profiling_results
ls -la ls -la
``` ```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment