[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs (#9279)

Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>

[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs (#9279)
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
5208dc7a · hissu-hyvarinen · GitHub · 1c45f4c3 · 5208dc7a · 5208dc7a
Unverified commit 5208dc7a, authored Nov 04, 2024 by hissu-hyvarinen; committed by GitHub on Nov 04, 2024
Show whitespace changes
Inline Side-by-side

Showing with 12 additions and 6 deletions

.buildkite/run-amd-test.sh (+6 -5)

tests/lora/test_minicpmv.py (+6 -1)

No files found.
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -107,11 +107,12 @@ fi
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 if [[ $commands == *"--shard-id="* ]]; then
-  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+  # assign job count as the number of shards used   
-    #replace shard arguments
-    commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
-    echo "Shard ${GPU} commands:$commands"
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+    echo "Shard ${GPU} commands:$commands_gpu"
    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
        -e HF_HOME=${HF_MOUNT} \
        --name ${container_name}_${GPU}  \
        ${image_name} \
-        /bin/bash -c "${commands}" \
+        /bin/bash -c "${commands_gpu}" \
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done

--- a/tests/lora/test_minicpmv.py
+++ b/tests/lora/test_minicpmv.py
 from typing import List
+import pytest
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts
+@pytest.mark.xfail(
+    current_platform.is_rocm(),
+    reason="MiniCPM-V dependency xformers incompatible with ROCm")
 def test_minicpmv_lora(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
        trust_remote_code=True,
        gpu_memory_utilization=0.97  # This model is pretty big for CI gpus
    )
    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
    for i in range(len(EXPECTED_OUTPUT)):
        assert EXPECTED_OUTPUT[i].startswith(output1[i])