"vscode:/vscode.git/clone" did not exist on "4137c5dfa7c0de6c0ff74ad3774224b6b3280349"
Unverified commit 5208dc7a, authored by hissu-hyvarinen and committed by GitHub
Browse files

[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests running parallel jobs (#9279)


Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
parent 1c45f4c3
...@@ -107,11 +107,12 @@ fi ...@@ -107,11 +107,12 @@ fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
# Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs. # Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do # assign job count as the number of shards used
#replace shard arguments
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands" for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
docker run \ docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
...@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then ...@@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then
-e HF_HOME=${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \
--name ${container_name}_${GPU} \ --name ${container_name}_${GPU} \
${image_name} \ ${image_name} \
/bin/bash -c "${commands}" \ /bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done & |& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!) PIDS+=($!)
done done
......
from typing import List from typing import List
import pytest
import vllm import vllm
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
...@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts return generated_texts
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
def test_minicpmv_lora(minicpmv_lora_files): def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
...@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files): ...@@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
trust_remote_code=True, trust_remote_code=True,
gpu_memory_utilization=0.97 # This model is pretty big for CI gpus gpu_memory_utilization=0.97 # This model is pretty big for CI gpus
) )
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)): for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output1[i]) assert EXPECTED_OUTPUT[i].startswith(output1[i])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment