[CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369)

Co-authored-by: Simon Mo <simon.mo@hey.com>

[CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369)
Co-authored-by: Simon Mo <simon.mo@hey.com>
d1dec642 · alexeykondrat · GitHub · 2ad2e560 · d1dec642 · d1dec642
Unverified Commit d1dec642 authored Sep 04, 2024 by alexeykondrat Committed by GitHub Sep 04, 2024
4 changed files
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
 # This script runs test inside the corresponding ROCm docker container.
-set -ex
+set -o pipefail
 # Print ROCm version
 echo "--- Confirming Clean Initial State"
@@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p ${HF_CACHE}
 HF_MOUNT="/root/.cache/huggingface"
-docker run \
+commands=$@
+PARALLEL_JOB_COUNT=8
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
+if [[ $commands == *"--shard-id="* ]]; then
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    #replace shard arguments
+    commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
+    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+    docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
        --shm-size=16gb \
        --rm \
-        -e HIP_VISIBLE_DEVICES=0 \
+        -e HIP_VISIBLE_DEVICES=${GPU} \
        -e HF_TOKEN \
        -v ${HF_CACHE}:${HF_MOUNT} \
        -e HF_HOME=${HF_MOUNT} \
-        --name ${container_name} \
+        --name ${container_name}_${GPU}  \
        ${image_name} \
-        /bin/bash -c "${@}"
+        /bin/bash -c "${commands}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in ${PIDS[@]}; do
+    wait ${pid}
+    STATUS+=($?)
+  done
+  for st in ${STATUS[@]}; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit ${st}
+    fi
+  done
+else
+  docker run \
+          --device /dev/kfd --device /dev/dri \
+          --network host \
+          --shm-size=16gb \
+          --rm \
+          -e HIP_VISIBLE_DEVICES=0 \
+          -e HF_TOKEN \
+          -v ${HF_CACHE}:${HF_MOUNT} \
+          -e HF_HOME=${HF_MOUNT} \
+          --name ${container_name} \
+          ${image_name} \
+          /bin/bash -c "${commands}"
+fi
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -218,9 +218,9 @@ steps:
    - pytest -v -s spec_decode
 - label: LoRA Test %N # 30min each
+  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
-  - csrc/punica
  - tests/lora
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4
@@ -360,7 +360,6 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
-  - csrc/punica
  - tests/lora/test_long_context
  commands:
    # FIXIT: find out which code initialize cuda before running the test

--- a/tests/lora/test_gemma.py
+++ b/tests/lora/test_gemma.py
 from typing import List
+import pytest
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip
 MODEL_PATH = "google/gemma-7b"
@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    return generated_texts
+@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
 def test_gemma_lora(gemma_lora_files):
    llm = vllm.LLM(MODEL_PATH,
                   max_model_len=1024,

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -7,6 +7,7 @@ import pytest
 import vllm
 from vllm.lora.request import LoRARequest
+from vllm.utils import is_hip
 from .conftest import cleanup
@@ -17,12 +18,23 @@ class ModelWithQuantization:
    quantization: str
-MODELS: List[ModelWithQuantization] = [
+MODELS: List[ModelWithQuantization]
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+#AWQ quantization is currently not supported in ROCm.
-                          quantization="AWQ"),
+if is_hip():
-    ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+    MODELS = [
-                          quantization="GPTQ"),
+        ModelWithQuantization(
-]
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]
+else:
+    MODELS = [
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+            quantization="AWQ"),
+        ModelWithQuantization(
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            quantization="GPTQ"),
+    ]
 def do_sample(llm: vllm.LLM,