Commit 32cd0048 authored by xuxz

Adapt Dynamo for DCU

parent b950ec54
@@ -148,7 +148,7 @@ class LocalConnector(PlannerConnector):
         if not available_gpus:
             raise ValueError("No GPUs available for allocation")
         gpu_id = available_gpus[0]
-        watcher_env["CUDA_VISIBLE_DEVICES"] = gpu_id
+        watcher_env["HIP_VISIBLE_DEVICES"] = gpu_id
         watcher_env["DYNAMO_SERVICE_CONFIG"] = service_config
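On DCU the device stack is ROCm/HIP, which selects visible devices through `HIP_VISIBLE_DEVICES` rather than `CUDA_VISIBLE_DEVICES`, hence the swap above. Hardcoding the HIP name does drop the NVIDIA path; if both stacks need to keep working, a minimal sketch of a runtime switch could look like this (my illustration, assuming a PyTorch build, where `torch.version.hip` is a version string on ROCm/DCU and `None` on CUDA):

```python
import torch

def visible_devices_key() -> str:
    """Return the device-visibility env var for the current runtime.

    torch.version.hip is a version string on ROCm/HIP (DCU) builds
    and None on CUDA builds, which distinguishes the two stacks.
    """
    return "HIP_VISIBLE_DEVICES" if torch.version.hip else "CUDA_VISIBLE_DEVICES"

# Hypothetical usage at the call site changed above:
#   watcher_env[visible_devices_key()] = gpu_id
```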
@@ -207,7 +207,7 @@ class ResourceAllocator:
             # Generate environment variables for each worker
             for _ in range(num_workers):
-                env_vars = {"CUDA_VISIBLE_DEVICES": ",".join(map(str, assigned))}
+                env_vars = {"HIP_VISIBLE_DEVICES": ",".join(map(str, assigned))}
                 resource_envs.append(env_vars)
         else:
             logger.info(
@@ -221,7 +221,7 @@ class ResourceAllocator:
             )
             # Generate environment variables for this worker
-            env_vars = {"CUDA_VISIBLE_DEVICES": ",".join(map(str, assigned))}
+            env_vars = {"HIP_VISIBLE_DEVICES": ",".join(map(str, assigned))}
             # If we have comprehensive GPU stats, log them
             try:
@@ -242,7 +242,8 @@ class ResourceAllocator:
                 logger.debug(f"Failed to get GPU stats: {e}")
             resource_envs.append(env_vars)
         # else:
         #     resource_envs = config["envs"]
+        logger.info(
+            f"Final resource allocation - workers: {num_workers}, envs: {resource_envs}"
+        )
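All three hunks follow the same pattern: the allocator builds one environment dict per worker and pins that worker to its assigned DCU indices via `HIP_VISIBLE_DEVICES`. A self-contained sketch of that per-worker pinning (round-robin assignment; the function name and signature are illustrative, not the actual `ResourceAllocator` API):

```python
from typing import Dict, List

def allocate_worker_envs(
    num_workers: int, gpu_ids: List[int], gpus_per_worker: int = 1
) -> List[Dict[str, str]]:
    """Assign GPUs round-robin and emit one env dict per worker."""
    if num_workers * gpus_per_worker > len(gpu_ids):
        raise ValueError("Not enough GPUs for the requested workers")
    envs = []
    for w in range(num_workers):
        assigned = gpu_ids[w * gpus_per_worker : (w + 1) * gpus_per_worker]
        envs.append({"HIP_VISIBLE_DEVICES": ",".join(map(str, assigned))})
    return envs

# allocate_worker_envs(2, [0, 1, 2, 3], gpus_per_worker=2)
# -> [{'HIP_VISIBLE_DEVICES': '0,1'}, {'HIP_VISIBLE_DEVICES': '2,3'}]
```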
@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
+  model: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
+  block-size: 16
   max-model-len: 16384
 Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
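The config changes are uniform across the example files: `model` and `served_model_name` now point at a local directory instead of a Hugging Face repo id (the Rust change at the end of this commit relaxes the name check accordingly), and `block-size` drops from 64 to 16, presumably the KV-cache block size the DCU attention kernels support (the commit message does not say). Once a deployment is up, the frontend on port 8000 can be smoke-tested; a minimal client, assuming it exposes the OpenAI-compatible `/v1/chat/completions` route:

```python
import json
import urllib.request

# Illustrative smoke test; "model" must match served_model_name.
payload = {
    "model": "/models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["message"]["content"])
```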
@@ -13,13 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
+  model: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
+  block-size: 16
   max-model-len: 16384
   kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
 Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -29,21 +29,26 @@ Processor:
 VllmWorker:
   remote-prefill: true
-  conditional-disagg: true
+  conditional-disagg: false
   max-local-prefill-length: 10
-  max-prefill-queue-size: 2
+  max-prefill-queue-size: 64
   tensor-parallel-size: 1
   enable-prefix-caching: false
   ServiceArgs:
     workers: 1
     resources:
       gpu: 1
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
 PrefillWorker:
   max-num-batched-tokens: 16384
   tensor-parallel-size: 1
   ServiceArgs:
     workers: 1
     resources:
       gpu: 1
+  common-configs: [model, block-size, max-model-len, kv-transfer-config]
 Planner:
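The added `common-configs` entries let each worker inherit the listed keys from the `Common` block instead of repeating them per service. A sketch of that merge semantics (hypothetical helper; I assume inherited keys never override values a service sets explicitly):

```python
from typing import Any, Dict

def apply_common_configs(common: Dict[str, Any], service: Dict[str, Any]) -> Dict[str, Any]:
    """Copy each key named in service['common-configs'] from Common,
    keeping any value the service already defines for that key."""
    merged = dict(service)
    for key in service.get("common-configs", []):
        merged.setdefault(key, common[key])
    return merged

common = {
    "model": "/models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B",
    "block-size": 16,
    "max-model-len": 16384,
    "kv-transfer-config": '{"kv_connector":"DynamoNixlConnector"}',
}
worker = {"tensor-parallel-size": 1,
          "common-configs": ["model", "block-size", "max-model-len"]}
print(apply_common_configs(common, worker)["block-size"])  # -> 16
```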
@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-  block-size: 64
+  model: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
+  block-size: 16
   max-model-len: 16384
   router: kv
   kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
 Frontend:
-  served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  served_model_name: /models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B
   endpoint: dynamo.Processor.chat/completions
   port: 8000
@@ -34,9 +34,9 @@ Router:
 VllmWorker:
   max-num-batched-tokens: 16384
   remote-prefill: true
-  conditional-disagg: true
+  conditional-disagg: false
   max-local-prefill-length: 10
-  max-prefill-queue-size: 2
+  max-prefill-queue-size: 64
   tensor-parallel-size: 1
   enable-prefix-caching: true
   ServiceArgs:
@@ -14,7 +14,7 @@
 # limitations under the License.
 Common:
   model: deepseek-ai/DeepSeek-R1
-  block-size: 64
+  block-size: 16
   max-model-len: 16384
   kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
   tensor-parallel-size: 16
@@ -228,9 +228,9 @@ async fn add_model(
         endpoint_name
     );
-    if model_name.starts_with('/') {
-        raise!("Model name '{}' cannot start with a slash", model_name);
-    }
+    // if model_name.starts_with('/') {
+    //     raise!("Model name '{}' cannot start with a slash", model_name);
+    // }
     let parts: Vec<&str> = endpoint_name.split('.').collect();
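Disabling this guard in `add_model` is what lets the absolute `/models/...` paths above register as model names. If some validation is still wanted, one option is to accept a leading slash only when it names an existing directory; a sketch of that relaxed check (hypothetical helper, written in Python for consistency with the examples above, not the repository's actual Rust API):

```python
import os

def validate_model_name(model_name: str) -> None:
    """Accept repo-style ids, and absolute paths only if they exist on disk."""
    if model_name.startswith("/") and not os.path.isdir(model_name):
        raise ValueError(
            f"Model name '{model_name}' looks like a path but is not a directory"
        )

# validate_model_name("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")  # always ok
# validate_model_name("/models/deepseek-r1/DeepSeek-R1-Distill-Llama-8B")
#   -> ok only where that directory is mounted
```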