Unverified commit d1dec642, authored by alexeykondrat and committed by GitHub
Browse files

[CI/Build][ROCm] Enabling LoRA tests on ROCm (#7369)


Co-authored-by: Simon Mo <simon.mo@hey.com>
parent 2ad2e560
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -ex set -o pipefail
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
...@@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface" ...@@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE} mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
docker run \ commands=$@
PARALLEL_JOB_COUNT=8
# Check whether the command contains the shard flag; if so, run all shards in parallel, because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments
commands=${@//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=0 \ -e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \ -e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \ -v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \
--name ${container_name} \ --name ${container_name}_${GPU} \
${image_name} \ ${image_name} \
/bin/bash -c "${@}" /bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}"
fi
...@@ -218,9 +218,9 @@ steps: ...@@ -218,9 +218,9 @@ steps:
- pytest -v -s spec_decode - pytest -v -s spec_decode
- label: LoRA Test %N # 30min each - label: LoRA Test %N # 30min each
mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- csrc/punica
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4 parallelism: 4
...@@ -360,7 +360,6 @@ steps: ...@@ -360,7 +360,6 @@ steps:
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- csrc/punica
- tests/lora/test_long_context - tests/lora/test_long_context
commands: commands:
# FIXIT: find out which code initializes CUDA before running the test # FIXIT: find out which code initializes CUDA before running the test
......
from typing import List from typing import List
import pytest
import vllm import vllm
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
MODEL_PATH = "google/gemma-7b" MODEL_PATH = "google/gemma-7b"
...@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts return generated_texts
@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files): def test_gemma_lora(gemma_lora_files):
llm = vllm.LLM(MODEL_PATH, llm = vllm.LLM(MODEL_PATH,
max_model_len=1024, max_model_len=1024,
......
...@@ -7,6 +7,7 @@ import pytest ...@@ -7,6 +7,7 @@ import pytest
import vllm import vllm
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
from .conftest import cleanup from .conftest import cleanup
...@@ -17,12 +18,23 @@ class ModelWithQuantization: ...@@ -17,12 +18,23 @@ class ModelWithQuantization:
quantization: str quantization: str
MODELS: List[ModelWithQuantization] = [ MODELS: List[ModelWithQuantization]
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", #AWQ quantization is currently not supported in ROCm.
quantization="AWQ"), if is_hip():
ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", MODELS = [
quantization="GPTQ"), ModelWithQuantization(
] model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
else:
MODELS = [
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="AWQ"),
ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"),
]
def do_sample(llm: vllm.LLM, def do_sample(llm: vllm.LLM,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment