fix: update inference gateway deployment instructions (#1940)

1c03404f · Biswa Panda · GitHub · 5ca570f9 · 1c03404f · 1c03404f
Unverified Commit 1c03404f authored Jul 15, 2025 by Biswa Panda Committed by GitHub Jul 15, 2025
3 changed files
--- a/deploy/inference-gateway/example/README.md
+++ b/deploy/inference-gateway/example/README.md
@@ -19,24 +19,35 @@ Follow the instructions in [deploy/cloud/README.md](../../deploy/cloud/README.md

 Deploy 2 Dynamo aggregated graphs following the instructions in [examples/llm/README.md](../../examples/llm/README.md):

-### Build Dynamo Graph
-```bash
-export DYNAMO_IMAGE=<your-registry>/<your-image-name>:<your-tag>
+### Deploy Dynamo Graphs

-# Build the service
-cd $PROJECT_ROOT/examples/llm
-export DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" |  awk '{ print $NF }' | sed 's/\.$//')
-```
+Run the following commands to deploy two Dynamo graphs:

-### Deploy Dynamo Graphs
 ```bash
+# Set the pre-built Dynamo vLLM base container image
+export VLLM_RUNTIME_IMAGE=<dynamo-vllm-base-image>
+# for example:
+# export VLLM_RUNTIME_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+
+# Run the following commands from the root of the Dynamo repository
+
 # Deploy first graph
 export DEPLOYMENT_NAME=llm-agg1
-# TODO: Deploy your service using a DynamoGraphDeployment CR.
+yq eval '
+  .metadata.name = env(DEPLOYMENT_NAME) |
+  .spec.services[].extraPodSpec.mainContainer.image = env(VLLM_RUNTIME_IMAGE)
+' examples/vllm_v0/deploy/agg.yaml > examples/vllm_v0/deploy/agg1.yaml
+
+kubectl apply -f examples/vllm_v0/deploy/agg1.yaml

 # Deploy second graph
 export DEPLOYMENT_NAME=llm-agg2
-# TODO: Deploy your service using a DynamoGraphDeployment CR.
+yq eval '
+  .metadata.name = env(DEPLOYMENT_NAME) |
+  .spec.services[].extraPodSpec.mainContainer.image = env(VLLM_RUNTIME_IMAGE)
+' examples/vllm_v0/deploy/agg.yaml > examples/vllm_v0/deploy/agg2.yaml
+
+kubectl apply -f examples/vllm_v0/deploy/agg2.yaml
 ```

 3. **Deploy Inference Gateway**

--- a/deploy/inference-gateway/example/resources/dynamo-epp.yaml
+++ b/deploy/inference-gateway/example/resources/dynamo-epp.yaml
@@ -33,8 +33,8 @@ spec:
      terminationGracePeriodSeconds: 130
      containers:
      - name: epp
-        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main
-        imagePullPolicy: Always
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:v0.4.0
+        imagePullPolicy: IfNotPresent
        args:
        - -poolName
        - "dynamo-deepseek"

--- a/deploy/inference-gateway/example/resources/inference-pool.yaml
+++ b/deploy/inference-gateway/example/resources/inference-pool.yaml
@@ -18,11 +18,11 @@ metadata:
  name: dynamo-deepseek
  namespace: default
 spec:
-  targetPortNumber: 3000
+  targetPortNumber: 8000
  selector:
-    nvidia.com/dynamo-component-type: Frontend
+    nvidia.com/dynamo-component: Frontend
  extensionRef:
-    failureMode: FailClose
+    failureMode: FailOpen
    group: ""
    kind: Service
    name: dynamo-deepseek-epp
\ No newline at end of file