Unverified commit 157714aa, authored by Hongkuan Zhou and committed by GitHub
Browse files

chore: add instructions to modify SLA to profile_sla doc; update component name (#2167)

parent 8248a116
......@@ -14,11 +14,8 @@ spec:
image: ${DOCKER_IMAGE}
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "2"
memory: "4Gi"
cpu: "16"
memory: "10Gi"
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
......@@ -37,6 +34,18 @@ spec:
- /workspace/profiling_results
- --namespace
- ${NAMESPACE}
- --min-num-gpus-per-engine
- "1"
- --max-num-gpus-per-engine
- "8"
- --isl
- "3000"
- --osl
- "150"
- --ttft
- "200"
- --itl
- "20"
volumeMounts:
- name: output-volume
mountPath: /workspace/profiling_results
......
......@@ -89,16 +89,16 @@ class VllmV1ConfigModifier:
if target == "prefill":
# convert prefill worker into decode worker
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
] = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
]
del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
]
args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args)
......@@ -112,18 +112,18 @@ class VllmV1ConfigModifier:
if "--no-enable-prefix-caching" not in args:
args = append_argument(args, "--no-enable-prefix-caching")
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
elif target == "decode":
# delete prefill worker
del config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].prefill_worker
WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
]
args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args)
......@@ -134,13 +134,13 @@ class VllmV1ConfigModifier:
if "--no-enable-prefix-caching" in args:
args.remove("--no-enable-prefix-caching")
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
# set num workers to 1
decode_worker_config = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]
decode_worker_config["replicas"] = 1
......@@ -150,16 +150,16 @@ class VllmV1ConfigModifier:
def set_config_tp_size(cls, config: dict, tp_size: int):
config = deepcopy(config)
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"resources"
]["requests"]["gpu"] = str(tp_size)
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"resources"
]["limits"]["gpu"] = str(tp_size)
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["resources"]["requests"]["gpu"] = str(tp_size)
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["resources"]["limits"]["gpu"] = str(tp_size)
args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"]
args = config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"]
args = break_arguments(args)
......@@ -169,15 +169,15 @@ class VllmV1ConfigModifier:
except ValueError:
args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
"extraPodSpec"
]["mainContainer"]["args"] = join_arguments(args)
config["spec"]["services"][
WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)
return config
@classmethod
def get_model_name(cls, config: dict) -> str:
worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
"args"
]
......
......@@ -141,7 +141,7 @@ spec:
- -c
args:
- "python3 -m dynamo.planner.prometheus"
backend:
VllmDecodeWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
......@@ -191,7 +191,7 @@ spec:
- -c
args:
- "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
prefill:
VllmPrefillWorker:
dynamoNamespace: vllm-disagg-planner
envFromSecret: hf-token-secret
componentType: worker
......
......@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):
class VllmComponentName:
prefill_worker = "prefill"
prefill_worker_k8s_name = "VllmPrefillWorker"
prefill_worker_component_name = "prefill"
prefill_worker_endpoint = "generate"
decode_worker = "backend"
decode_worker_k8s_name = "VllmDecodeWorker"
decode_worker_component_name = "backend"
decode_worker_endpoint = "generate"
......
......@@ -106,7 +106,11 @@ class Planner:
if self.prefill_client is None:
self.prefill_client = (
await self.runtime.namespace(self.namespace)
.component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
.component(
WORKER_COMPONENT_NAMES[
self.args.backend
].prefill_worker_component_name
)
.endpoint(
WORKER_COMPONENT_NAMES[
self.args.backend
......@@ -127,7 +131,11 @@ class Planner:
if self.workers_client is None:
self.workers_client = (
await self.runtime.namespace(self.namespace)
.component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
.component(
WORKER_COMPONENT_NAMES[
self.args.backend
].decode_worker_component_name
)
.endpoint(
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
)
......@@ -300,8 +308,12 @@ class Planner:
if not self.args.no_operation:
target_replicas = {
WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
WORKER_COMPONENT_NAMES[
self.args.backend
].prefill_worker_k8s_name: next_num_p,
WORKER_COMPONENT_NAMES[
self.args.backend
].decode_worker_k8s_name: next_num_d,
}
await self.connector.set_component_replicas(target_replicas, blocking=False)
......
......@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
# in the project's root folder
./container/build.sh --framework VLLM
# Tag and push to your container registry
export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own Dynamo image
# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
# Modify this yaml to profile different models
export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
```
Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.
**Step 2: Run profiling (required)**
**Step 2: Set SLA target**
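Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the expected average input sequence length (ISL) and output sequence length (OSL), in tokens, and the target time to first token (TTFT) and inter-token latency (ITL), in milliseconds.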
```yaml
spec:
template:
spec:
containers:
- name: profile-sla
args:
- --isl
- "3000" # average ISL is 3000 tokens
- --osl
- "150" # average OSL is 150 tokens
- --ttft
- "200" # target TTFT is 200ms
- --itl
- "20" # target ITL is 20ms
```
**Step 3: Run profiling (required)**
```bash
cd $DYNAMO_HOME/benchmarks/profiler/deploy
envsubst < profiling_pvc.yaml | kubectl apply -f -
envsubst < profile_sla_sa.yaml | kubectl apply -f -
envsubst < profile_sla_rbac.yaml | kubectl apply -f -
envsubst < profile_sla_binding.yaml | kubectl apply -f -
export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
# NOTE: DGD_CONFIG_FILE points to the path of the config file inside DOCKER_IMAGE
export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
envsubst < profile_sla_job.yaml | kubectl apply -f -
```
**Step 3: Wait for profiling to complete**
**Step 4: Wait for profiling to complete**
```bash
kubectl get jobs -n $NAMESPACE
kubectl logs job/profile-sla -n $NAMESPACE
......@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
1. **Create a temporary pod to access the PVC:**
```bash
kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
--overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
kubectl run temp-access --image=alpine:latest --restart=Never \
--overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
-n $NAMESPACE
```
2. **Open a shell in the temporary pod and navigate to the results directory:**
```bash
kubectl exec -it temp-access -n $NAMESPACE -- sh
cd /workspace/profiling_results
ls -la
```
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment