Unverified commit 9be3df8f, authored by Pavithra Vijayakrishnan, committed by GitHub
Browse files

test: update lora tests (#4996)


Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com>
parent 00b38ded
...@@ -28,6 +28,10 @@ inputs: ...@@ -28,6 +28,10 @@ inputs:
description: 'Run pytest in dry-run mode (collect tests only, do not execute)' description: 'Run pytest in dry-run mode (collect tests only, do not execute)'
required: false required: false
default: 'false' default: 'false'
start_minio:
description: 'Start MinIO service for LoRA tests (true/false)'
required: false
default: 'true'
runs: runs:
...@@ -47,6 +51,47 @@ runs: ...@@ -47,6 +51,47 @@ runs:
echo "PLATFORM_ARCH=${PLATFORM_ARCH}" >> $GITHUB_ENV echo "PLATFORM_ARCH=${PLATFORM_ARCH}" >> $GITHUB_ENV
echo "🏗️ Platform architecture: ${PLATFORM_ARCH}" echo "🏗️ Platform architecture: ${PLATFORM_ARCH}"
# Starts a throwaway MinIO container (S3-compatible storage for the LoRA tests)
# and blocks until its liveness endpoint responds.
# Skipped unless the `start_minio` input is the string 'true'.
- name: Start MinIO Service
if: inputs.start_minio == 'true'
shell: bash
# The access/secret key pair below is handed to the container as
# MINIO_ROOT_USER / MINIO_ROOT_PASSWORD by the `docker run` command.
env:
MINIO_CONTAINER_NAME: dynamo-minio-test
MINIO_ACCESS_KEY: minioadmin
MINIO_SECRET_KEY: minioadmin
# Script outline:
#   1. Force-remove any leftover container with the same name (errors ignored).
#   2. Run MinIO detached, publishing ports 9000 (API) and 9001 (console).
#   3. Poll http://localhost:9000/minio/health/live up to 30 times, sleeping 1s
#      between attempts; on exhaustion print the container logs and exit 1.
run: |
# Start MinIO for S3-compatible object storage (used by LoRA tests)
echo "🗄️ Starting MinIO service..."
# Remove any existing container
docker rm -f "${MINIO_CONTAINER_NAME}" 2>/dev/null || true
docker run -d \
--name "${MINIO_CONTAINER_NAME}" \
-p 9000:9000 \
-p 9001:9001 \
-e "MINIO_ROOT_USER=${MINIO_ACCESS_KEY}" \
-e "MINIO_ROOT_PASSWORD=${MINIO_SECRET_KEY}" \
quay.io/minio/minio server /data --console-address ':9001'
# Wait for MinIO to be ready
echo "⏳ Waiting for MinIO to be ready..."
MAX_ATTEMPTS=30
ATTEMPT=0
while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
if curl -sf http://localhost:9000/minio/health/live > /dev/null 2>&1; then
echo "✅ MinIO is ready (attempt $((ATTEMPT + 1)))"
break
fi
ATTEMPT=$((ATTEMPT + 1))
if [ $ATTEMPT -eq $MAX_ATTEMPTS ]; then
echo "❌ MinIO failed to start within ${MAX_ATTEMPTS}s"
echo "📋 Container logs:"
docker logs "${MINIO_CONTAINER_NAME}" 2>&1 || true
exit 1
fi
sleep 1
done
- name: Run tests - name: Run tests
shell: bash shell: bash
env: env:
...@@ -140,6 +185,13 @@ runs: ...@@ -140,6 +185,13 @@ runs:
# Exit with original test result to maintain workflow behavior # Exit with original test result to maintain workflow behavior
exit ${TEST_EXIT_CODE} exit ${TEST_EXIT_CODE}
# Force-removes the MinIO test container. `always()` makes this step run even
# when an earlier step failed; it is still skipped unless `start_minio` is 'true'.
# The container name is hard-coded here and matches MINIO_CONTAINER_NAME in the
# "Start MinIO Service" step; removal errors are ignored.
- name: Cleanup MinIO Service
if: always() && inputs.start_minio == 'true'
shell: bash
run: |
echo "🧹 Cleaning up MinIO container..."
docker rm -f dynamo-minio-test 2>/dev/null || true
- name: Upload Test Results - name: Upload Test Results
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
if: always() # Always upload test results, even if tests failed if: always() # Always upload test results, even if tests failed
......
...@@ -8,6 +8,9 @@ ...@@ -8,6 +8,9 @@
# create non-reproducible builds, and cause dependency conflicts. Every installed version # create non-reproducible builds, and cause dependency conflicts. Every installed version
# should be explicitly tested, not an unknown future release. # should be explicitly tested, not an unknown future release.
# For MinIO/S3 operations in LoRA tests (replaces AWS CLI dependency)
boto3==1.42.4
boto3-stubs[s3]==1.42.9 # Type stubs for boto3 S3 client
# For IFEval dataset loading in kvbm tests # For IFEval dataset loading in kvbm tests
datasets==4.4.1 datasets==4.4.1
# For NATS object store verification in router tests # For NATS object store verification in router tests
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
import shutil
from dataclasses import dataclass from dataclasses import dataclass
from typing import Generator from typing import Generator
...@@ -95,12 +94,12 @@ def minio_lora_service(): ...@@ -95,12 +94,12 @@ def minio_lora_service():
Provide a MinIO service with a pre-uploaded LoRA adapter for testing. Provide a MinIO service with a pre-uploaded LoRA adapter for testing.
This fixture: This fixture:
1. Starts a MinIO Docker container 1. Connects to existing MinIO or starts a Docker container
2. Creates the required S3 bucket 2. Creates the required S3 bucket
3. Downloads the LoRA adapter from Hugging Face Hub 3. Downloads the LoRA adapter from Hugging Face Hub
4. Uploads it to MinIO 4. Uploads it to MinIO
5. Yields the MinioLoraConfig with connection details 5. Yields the MinioLoraConfig with connection details
6. Cleans up after the test 6. Cleans up after the test (only stops container if we started it)
Usage: Usage:
def test_lora(minio_lora_service): def test_lora(minio_lora_service):
...@@ -108,23 +107,15 @@ def minio_lora_service(): ...@@ -108,23 +107,15 @@ def minio_lora_service():
# Use config.get_env_vars() for environment setup # Use config.get_env_vars() for environment setup
# Use config.get_s3_uri() to get the S3 URI for loading LoRA # Use config.get_s3_uri() to get the S3 URI for loading LoRA
""" """
# LoRA serve tests spin up a local MinIO via Docker. Some environments are
# intentionally minimal (e.g. vLLM-only containers) and do not include the
# docker CLI, in which case we skip the LoRA tests.
if shutil.which("docker") is None:
pytest.skip("LoRA serve tests require the docker CLI (MinIO container).")
config = MinioLoraConfig() config = MinioLoraConfig()
service = MinioService(config) service = MinioService(config)
try: try:
# Start MinIO # Start or connect to MinIO
service.start() service.start()
# Create bucket # Create bucket and upload LoRA
service.create_bucket() service.create_bucket()
# Download and upload LoRA
local_path = service.download_lora() local_path = service.download_lora()
service.upload_lora(local_path) service.upload_lora(local_path)
...@@ -134,6 +125,6 @@ def minio_lora_service(): ...@@ -134,6 +125,6 @@ def minio_lora_service():
yield config yield config
finally: finally:
# Stop MinIO and clean up # Stop MinIO only if we started it, clean up temp dirs
service.stop() service.stop()
service.cleanup_temp() service.cleanup_temp()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""
MinIO Service and LoRA Test Utilities.
Provides infrastructure for LoRA adapter testing with S3-compatible storage.
Works in both CI (pre-started MinIO) and local development (auto-starts Docker).
"""
import logging import logging
import os import os
...@@ -9,9 +15,13 @@ import subprocess ...@@ -9,9 +15,13 @@ import subprocess
import tempfile import tempfile
import time import time
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path
from typing import Optional from typing import Optional
import boto3
import requests import requests
from botocore.client import Config
from botocore.exceptions import ClientError
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -26,7 +36,7 @@ DEFAULT_LORA_NAME = "codelion/Qwen3-0.6B-accuracy-recovery-lora" ...@@ -26,7 +36,7 @@ DEFAULT_LORA_NAME = "codelion/Qwen3-0.6B-accuracy-recovery-lora"
@dataclass @dataclass
class MinioLoraConfig: class MinioLoraConfig:
"""Configuration for MinIO and LoRA setup""" """Configuration for MinIO and LoRA setup."""
endpoint: str = MINIO_ENDPOINT endpoint: str = MINIO_ENDPOINT
access_key: str = MINIO_ACCESS_KEY access_key: str = MINIO_ACCESS_KEY
...@@ -37,11 +47,11 @@ class MinioLoraConfig: ...@@ -37,11 +47,11 @@ class MinioLoraConfig:
data_dir: Optional[str] = None data_dir: Optional[str] = None
def get_s3_uri(self) -> str: def get_s3_uri(self) -> str:
"""Get the S3 URI for the LoRA adapter""" """Get the S3 URI for the LoRA adapter."""
return f"s3://{self.bucket}/{self.lora_name}" return f"s3://{self.bucket}/{self.lora_name}"
def get_env_vars(self) -> dict: def get_env_vars(self) -> dict:
"""Get environment variables for AWS/MinIO access""" """Get environment variables for AWS/MinIO access."""
return { return {
"AWS_ENDPOINT": self.endpoint, "AWS_ENDPOINT": self.endpoint,
"AWS_ACCESS_KEY_ID": self.access_key, "AWS_ACCESS_KEY_ID": self.access_key,
...@@ -54,7 +64,14 @@ class MinioLoraConfig: ...@@ -54,7 +64,14 @@ class MinioLoraConfig:
class MinioService: class MinioService:
"""Manages MinIO Docker container lifecycle for tests""" """
Manages MinIO service lifecycle for tests.
Follows a "connect or create" pattern:
- First checks if MinIO is already running (CI or manual)
- If not, starts a Docker container (local development)
- Only cleans up containers it created
"""
CONTAINER_NAME = "dynamo-minio-test" CONTAINER_NAME = "dynamo-minio-test"
...@@ -62,22 +79,81 @@ class MinioService: ...@@ -62,22 +79,81 @@ class MinioService:
self.config = config self.config = config
self._logger = logging.getLogger(self.__class__.__name__) self._logger = logging.getLogger(self.__class__.__name__)
self._temp_download_dir: Optional[str] = None self._temp_download_dir: Optional[str] = None
self._s3_client = None
self._owns_container: bool = False
def _get_s3_client(self):
"""Get or create boto3 S3 client for MinIO."""
if self._s3_client is None:
self._s3_client = boto3.client(
"s3",
endpoint_url=self.config.endpoint,
aws_access_key_id=self.config.access_key,
aws_secret_access_key=self.config.secret_key,
config=Config(signature_version="s3v4"),
region_name="us-east-1",
)
return self._s3_client
def _is_healthy(self) -> bool:
    """Report whether MinIO answers its liveness probe with HTTP 200.

    Issues a GET to ``<endpoint>/minio/health/live`` with a 2-second timeout.
    Any ``requests.RequestException`` is treated as "not healthy" rather than
    propagated.
    """
    probe_url = f"{self.config.endpoint}/minio/health/live"
    try:
        reply = requests.get(probe_url, timeout=2)
    except requests.RequestException:
        return False
    return reply.status_code == 200
def _is_docker_available(self) -> bool:
"""Check if Docker daemon is accessible."""
try:
result = subprocess.run(["docker", "info"], capture_output=True, timeout=5)
return result.returncode == 0
except (subprocess.SubprocessError, FileNotFoundError):
return False
def start(self) -> None: def start(self) -> None:
"""Start MinIO container""" """
self._logger.info("Starting MinIO container...") Connect to MinIO service, starting a container if necessary.
Raises:
RuntimeError: If MinIO cannot be started or connected to.
"""
self._logger.info("Connecting to MinIO...")
# Check if MinIO is already running
if self._is_healthy():
self._logger.info("Connected to existing MinIO instance")
self._owns_container = False
return
# Try to start Docker container
if not self._is_docker_available():
raise RuntimeError(
"MinIO is not available and Docker is not accessible.\n"
"Start MinIO manually:\n"
" docker run -d -p 9000:9000 -p 9001:9001 "
"-e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin "
f"--name {self.CONTAINER_NAME} "
"quay.io/minio/minio server /data --console-address ':9001'"
)
# Create data directory self._start_container()
if self.config.data_dir: self._owns_container = True
data_dir = self.config.data_dir self._logger.info("MinIO container started successfully")
else:
data_dir = tempfile.mkdtemp(prefix="minio_test_")
self.config.data_dir = data_dir
# Stop existing container if running def _start_container(self) -> None:
self.stop() """Start MinIO Docker container."""
# Clean up any existing container
subprocess.run(
["docker", "rm", "-f", self.CONTAINER_NAME],
capture_output=True,
)
# Create data directory
if not self.config.data_dir:
self.config.data_dir = tempfile.mkdtemp(prefix="minio_test_")
# Start MinIO container
cmd = [ cmd = [
"docker", "docker",
"run", "run",
...@@ -88,8 +164,12 @@ class MinioService: ...@@ -88,8 +164,12 @@ class MinioService:
"9000:9000", "9000:9000",
"-p", "-p",
"9001:9001", "9001:9001",
"-e",
f"MINIO_ROOT_USER={self.config.access_key}",
"-e",
f"MINIO_ROOT_PASSWORD={self.config.secret_key}",
"-v", "-v",
f"{data_dir}:/data", f"{self.config.data_dir}:/data",
"quay.io/minio/minio", "quay.io/minio/minio",
"server", "server",
"/data", "/data",
...@@ -101,88 +181,54 @@ class MinioService: ...@@ -101,88 +181,54 @@ class MinioService:
if result.returncode != 0: if result.returncode != 0:
raise RuntimeError(f"Failed to start MinIO: {result.stderr}") raise RuntimeError(f"Failed to start MinIO: {result.stderr}")
# Wait for MinIO to be ready
self._wait_for_ready() self._wait_for_ready()
self._logger.info("MinIO started successfully")
def _wait_for_ready(self, timeout: int = 30) -> None: def _wait_for_ready(self, timeout: int = 30) -> None:
"""Wait for MinIO to be ready""" """Wait for MinIO to be ready."""
health_url = f"{self.config.endpoint}/minio/health/live"
start_time = time.time() start_time = time.time()
while time.time() - start_time < timeout: while time.time() - start_time < timeout:
try: if self._is_healthy():
response = requests.get(health_url, timeout=2) return
if response.status_code == 200:
return
except requests.RequestException:
pass
time.sleep(1) time.sleep(1)
raise RuntimeError(f"MinIO did not become ready within {timeout}s") raise RuntimeError(f"MinIO did not become ready within {timeout}s")
def stop(self) -> None: def stop(self) -> None:
"""Stop and remove MinIO container""" """Stop MinIO container if this instance started it."""
self._logger.info("Stopping MinIO container...") if not self._owns_container:
self._logger.debug("Not stopping MinIO (not owned by this instance)")
return
# Stop container self._logger.info("Stopping MinIO container...")
subprocess.run(
["docker", "stop", self.CONTAINER_NAME],
capture_output=True,
)
# Remove container
subprocess.run( subprocess.run(
["docker", "rm", self.CONTAINER_NAME], ["docker", "rm", "-f", self.CONTAINER_NAME],
capture_output=True, capture_output=True,
) )
self._owns_container = False
def create_bucket(self) -> None: def create_bucket(self) -> None:
"""Create the S3 bucket using AWS CLI""" """Create the S3 bucket if it doesn't exist."""
env = os.environ.copy() s3_client = self._get_s3_client()
env.update(
{ try:
"AWS_ACCESS_KEY_ID": self.config.access_key, s3_client.head_bucket(Bucket=self.config.bucket)
"AWS_SECRET_ACCESS_KEY": self.config.secret_key, self._logger.info(f"Bucket already exists: {self.config.bucket}")
} except ClientError as e:
) error_code = e.response.get("Error", {}).get("Code", "")
if error_code in ("404", "NoSuchBucket"):
# Check if bucket exists self._logger.info(f"Creating bucket: {self.config.bucket}")
result = subprocess.run( try:
[ s3_client.create_bucket(Bucket=self.config.bucket)
"aws", except ClientError as create_error:
"--endpoint-url", raise RuntimeError(
self.config.endpoint, f"Failed to create bucket: {create_error}"
"s3", ) from create_error
"ls", else:
f"s3://{self.config.bucket}", raise RuntimeError(f"Failed to check bucket: {e}") from e
],
capture_output=True,
text=True,
env=env,
)
if result.returncode != 0:
# Create bucket
self._logger.info(f"Creating bucket: {self.config.bucket}")
result = subprocess.run(
[
"aws",
"--endpoint-url",
self.config.endpoint,
"s3",
"mb",
f"s3://{self.config.bucket}",
],
capture_output=True,
text=True,
env=env,
)
if result.returncode != 0:
raise RuntimeError(f"Failed to create bucket: {result.stderr}")
def download_lora(self) -> str: def download_lora(self) -> str:
"""Download LoRA from Hugging Face Hub, returns temp directory path""" """Download LoRA from Hugging Face Hub, returns temp directory path."""
self._temp_download_dir = tempfile.mkdtemp(prefix="lora_download_") self._temp_download_dir = tempfile.mkdtemp(prefix="lora_download_")
self._logger.info( self._logger.info(
f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}" f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
...@@ -213,47 +259,38 @@ class MinioService: ...@@ -213,47 +259,38 @@ class MinioService:
return self._temp_download_dir return self._temp_download_dir
def upload_lora(self, local_path: str) -> None: def upload_lora(self, local_path: str) -> None:
"""Upload LoRA to MinIO""" """Upload LoRA to MinIO using boto3."""
self._logger.info( self._logger.info(
f"Uploading LoRA to s3://{self.config.bucket}/{self.config.lora_name}" f"Uploading LoRA to s3://{self.config.bucket}/{self.config.lora_name}"
) )
env = os.environ.copy() s3_client = self._get_s3_client()
env.update( local_path = Path(local_path)
{
"AWS_ACCESS_KEY_ID": self.config.access_key,
"AWS_SECRET_ACCESS_KEY": self.config.secret_key,
}
)
result = subprocess.run( for file_path in local_path.rglob("*"):
[ if not file_path.is_file():
"aws", continue
"--endpoint-url", if ".git" in file_path.parts:
self.config.endpoint, continue
"s3",
"sync",
local_path,
f"s3://{self.config.bucket}/{self.config.lora_name}",
"--exclude",
"*.git*",
],
capture_output=True,
text=True,
env=env,
)
if result.returncode != 0: relative_path = file_path.relative_to(local_path).as_posix()
raise RuntimeError(f"Failed to upload LoRA: {result.stderr}") s3_key = f"{self.config.lora_name}/{relative_path}"
try:
s3_client.upload_file(str(file_path), self.config.bucket, s3_key)
except ClientError as e:
raise RuntimeError(f"Failed to upload {file_path}: {e}") from e
self._logger.info("LoRA upload completed")
def cleanup_download(self) -> None: def cleanup_download(self) -> None:
"""Clean up temporary download directory only""" """Clean up temporary download directory only."""
if self._temp_download_dir and os.path.exists(self._temp_download_dir): if self._temp_download_dir and os.path.exists(self._temp_download_dir):
shutil.rmtree(self._temp_download_dir) shutil.rmtree(self._temp_download_dir)
self._temp_download_dir = None self._temp_download_dir = None
def cleanup_temp(self) -> None: def cleanup_temp(self) -> None:
"""Clean up all temporary directories including MinIO data dir""" """Clean up all temporary directories including MinIO data dir."""
self.cleanup_download() self.cleanup_download()
if self.config.data_dir and os.path.exists(self.config.data_dir): if self.config.data_dir and os.path.exists(self.config.data_dir):
...@@ -263,7 +300,7 @@ class MinioService: ...@@ -263,7 +300,7 @@ class MinioService:
def load_lora_adapter( def load_lora_adapter(
system_port: int, lora_name: str, s3_uri: str, timeout: int = 60 system_port: int, lora_name: str, s3_uri: str, timeout: int = 60
) -> None: ) -> None:
"""Load a LoRA adapter via the system API""" """Load a LoRA adapter via the system API."""
url = f"http://localhost:{system_port}/v1/loras" url = f"http://localhost:{system_port}/v1/loras"
payload = {"lora_name": lora_name, "source": {"uri": s3_uri}} payload = {"lora_name": lora_name, "source": {"uri": s3_uri}}
......
...@@ -708,7 +708,7 @@ def lora_chat_payload( ...@@ -708,7 +708,7 @@ def lora_chat_payload(
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B") @pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600) @pytest.mark.timeout(600)
@pytest.mark.nightly @pytest.mark.post_merge
def test_lora_aggregated( def test_lora_aggregated(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -763,7 +763,7 @@ def test_lora_aggregated( ...@@ -763,7 +763,7 @@ def test_lora_aggregated(
@pytest.mark.gpu_2 @pytest.mark.gpu_2
@pytest.mark.model("Qwen/Qwen3-0.6B") @pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600) @pytest.mark.timeout(600)
@pytest.mark.nightly @pytest.mark.post_merge
def test_lora_aggregated_router( def test_lora_aggregated_router(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment