Unverified Commit f9839161 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat(lora): add LoRA support for SGLang (#4769)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 21b44473
...@@ -80,6 +80,15 @@ async def init_decode( ...@@ -80,6 +80,15 @@ async def init_decode(
generate_endpoint = runtime.endpoint( generate_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}" f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
) )
load_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
)
unload_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
)
list_loras_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
)
shutdown_endpoints[:] = [generate_endpoint] shutdown_endpoints[:] = [generate_endpoint]
...@@ -132,6 +141,18 @@ async def init_decode( ...@@ -132,6 +141,18 @@ async def init_decode(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
load_lora_endpoint.serve_endpoint(
handler.load_lora,
metrics_labels=metrics_labels,
),
unload_lora_endpoint.serve_endpoint(
handler.unload_lora,
metrics_labels=metrics_labels,
),
list_loras_endpoint.serve_endpoint(
handler.list_loras,
metrics_labels=metrics_labels,
),
register_model_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
...@@ -187,6 +208,15 @@ async def init_prefill( ...@@ -187,6 +208,15 @@ async def init_prefill(
generate_endpoint = runtime.endpoint( generate_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}" f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
) )
load_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
)
unload_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
)
list_loras_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
)
shutdown_endpoints[:] = [generate_endpoint] shutdown_endpoints[:] = [generate_endpoint]
...@@ -228,6 +258,18 @@ async def init_prefill( ...@@ -228,6 +258,18 @@ async def init_prefill(
metrics_labels=metrics_labels, metrics_labels=metrics_labels,
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
load_lora_endpoint.serve_endpoint(
handler.load_lora,
metrics_labels=metrics_labels,
),
unload_lora_endpoint.serve_endpoint(
handler.unload_lora,
metrics_labels=metrics_labels,
),
list_loras_endpoint.serve_endpoint(
handler.list_loras,
metrics_labels=metrics_labels,
),
register_model_with_readiness_gate( register_model_with_readiness_gate(
engine, engine,
generate_endpoint, generate_endpoint,
......
...@@ -272,6 +272,10 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -272,6 +272,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
priority = (request.get("routing") or {}).get("priority") priority = (request.get("routing") or {}).get("priority")
logprob_kwargs = self._build_logprob_kwargs(request) logprob_kwargs = self._build_logprob_kwargs(request)
lora_path = self._resolve_lora(request)
if lora_path:
logging.debug(f"Request {context.id()} will use LoRA adapter: {lora_path}")
if self.serving_mode == DisaggregationMode.DECODE: if self.serving_mode == DisaggregationMode.DECODE:
# Check if bootstrap_info is pre-computed in the request (from frontend) # Check if bootstrap_info is pre-computed in the request (from frontend)
bootstrap_info = request.get("bootstrap_info") bootstrap_info = request.get("bootstrap_info")
...@@ -306,6 +310,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -306,6 +310,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
rid=trace_id, rid=trace_id,
data_parallel_rank=dp_rank, data_parallel_rank=dp_rank,
**self._session_kwargs(request), **self._session_kwargs(request),
lora_path=lora_path,
**logprob_kwargs, **logprob_kwargs,
**self._priority_kwargs(priority), **self._priority_kwargs(priority),
) )
...@@ -340,6 +345,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -340,6 +345,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
rid=trace_id, rid=trace_id,
data_parallel_rank=dp_rank, data_parallel_rank=dp_rank,
**self._session_kwargs(request), **self._session_kwargs(request),
lora_path=lora_path,
**logprob_kwargs, **logprob_kwargs,
**self._priority_kwargs(priority), **self._priority_kwargs(priority),
) )
......
...@@ -147,6 +147,12 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -147,6 +147,12 @@ class PrefillWorkerHandler(BaseWorkerHandler):
trace_header = build_trace_headers(context) if self.enable_trace else None trace_header = build_trace_headers(context) if self.enable_trace else None
lora_path = self._resolve_lora(inner_request)
if lora_path:
logging.debug(
f"Prefill request {context.id()} will use LoRA adapter: {lora_path}"
)
results = await self.engine.async_generate( results = await self.engine.async_generate(
**input_param, **input_param,
sampling_params=sampling_params, sampling_params=sampling_params,
...@@ -158,6 +164,7 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -158,6 +164,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
rid=trace_id, rid=trace_id,
data_parallel_rank=dp_rank, data_parallel_rank=dp_rank,
**self._session_kwargs(inner_request), **self._session_kwargs(inner_request),
lora_path=lora_path,
**self._priority_kwargs(priority), **self._priority_kwargs(priority),
) )
......
# LoRA with SGLang Backend
For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).
## Quick Start
```bash
./setup_minio.sh # Start MinIO, download & upload LoRA
./agg_lora.sh # Launch SGLang frontend + worker with LoRA
```
## SGLang-Specific Notes
- The launch script uses `--lora-target-modules all` and `--max-lora-rank 64` by default
- Override with environment variables: `MODEL`, `LORA_NAME`, `DYN_SYSTEM_PORT`, `DYN_HTTP_PORT`
- SGLang LoRA loading goes through `engine.tokenizer_manager.load_lora_adapter()`
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving with LoRA support (SGLang backend).
# GPUs: 1
# Prerequisites: ./setup_minio.sh (starts MinIO, uploads LoRA)
# Abort on the first command that exits non-zero.
set -e
# On any exit, signal every process in this script's process group so the
# background frontend and worker launched below do not outlive the script.
trap 'echo Cleaning up...; kill 0' EXIT
# Resolve the real location of this script (following symlinks) so the shared
# helper files can be sourced regardless of the caller's working directory.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# S3/MinIO credentials
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
# Make sure the LoRA directory exists before the worker starts.
mkdir -p "$DYN_LORA_PATH"
# Tunables: each falls back to the default after ":-" when the corresponding
# environment variable is unset or empty.
MODEL="${MODEL:-Qwen/Qwen3-0.6B}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
# build_sglang_gpu_mem_args is not defined in this script; presumably it comes
# from one of the sourced helper files -- verify there.
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
# Default to profiled KV token cap when not overridden by the test scheduler
: "${GPU_MEM_ARGS:=--max-total-tokens 2848}"
# print_launch_banner is likewise not defined here (presumably a sourced helper).
print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
# Print copy-pasteable example requests: first load the adapter through the
# worker's system port, then run a chat completion through the HTTP port.
echo ""
echo "Once running, test with:"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://my-loras/${LORA_NAME}\"}}' | jq ."
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}], \"max_tokens\": 300}' | jq ."
echo "=========================================="
# Frontend
python3 -m dynamo.frontend &
# Worker
# GPU_MEM_ARGS is expanded unquoted below, so a value such as
# "--max-total-tokens 2848" is word-split into separate arguments.
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python3 -m dynamo.sglang \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
--skip-tokenizer-init \
--enable-lora \
--max-lora-rank 64 \
--lora-target-modules all \
$GPU_MEM_ARGS &
# wait_any_exit is not defined in this script (presumably a sourced helper);
# by its name it blocks until one of the background processes exits -- verify.
wait_any_exit
../../../../common/setup_minio.sh
\ No newline at end of file
# S3-compatible storage backend LoRA Integration Guide # LoRA with vLLM Backend
This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using S3-compatible storage backend (e.g. MinIO, AWS S3, GCS, etc.). For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).
## Overview
This example demonstrates how to:
1. Set up MinIO as a local S3-compatible storage
2. Download LoRA adapters from Hugging Face Hub
3. Upload LoRA adapters to MinIO
4. Load and use LoRA adapters with Dynamo
5. Run inference with LoRA-adapted models
6. Manage (load/unload) LoRA adapters
## Prerequisites
### Required Software
- Docker (for running MinIO)
- Python 3.8+
- AWS CLI: `pip install awscli`
- Hugging Face CLI: `pip install huggingface-hub`
- jq (optional, for pretty JSON output): `sudo apt install jq`
### Python Dependencies
Make sure you have Dynamo installed with vLLM support:
```bash
pip install dynamo vllm
```
## Quick Start ## Quick Start
### Step 1: Setup MinIO and Upload LoRA
Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
```bash
./setup_minio.sh
```
This script will:
- Start MinIO in a Docker container
- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
#### Script Options
The setup script supports different modes:
```bash
# Full setup (default) - start MinIO, download & upload LoRA
./setup_minio.sh
# Start MinIO only (without downloading/uploading)
./setup_minio.sh --start
# Stop MinIO
./setup_minio.sh --stop
# Show help
./setup_minio.sh --help
```
#### Customize the LoRA to Download
You can specify a different LoRA repository and name:
```bash
HF_LORA_REPO="username/lora-repo" \
LORA_NAME="my-lora" \
./setup_minio.sh
```
### Step 2: Launch Dynamo with LoRA Support
Start the Dynamo frontend and worker with LoRA support enabled:
```bash
./agg_lora.sh
```
This will:
- Set up AWS credentials for MinIO
- Start the Dynamo frontend on port 8000
- Start the Dynamo worker (vLLM) on port 8081 with LoRA support
Wait for the services to start (check the logs for "Application startup complete").
## Working with LoRAs
### 1. Check Available Models
List all available models (base model only at first):
```bash
curl http://localhost:8000/v1/models | jq .
```
### 2. Load a LoRA Adapter
Load a LoRA from S3-compatible storage backend (e.g. MinIO):
```bash
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {
"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
}
}' | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"lora_id": 1207343256
}
```
### 3. List Loaded LoRAs
Check which LoRAs are currently loaded:
```bash
curl http://localhost:8081/v1/loras | jq .
```
### 4. Verify LoRA in Models List
After loading, the LoRA should appear in the models list:
```bash
curl http://localhost:8000/v1/models | jq .
```
You should see both the base model and the LoRA adapter listed.
### 5. Run Inference with LoRA
#### Using the LoRA-adapted model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300,
"temperature": 0.1
}' | jq .
```
#### For comparison, using the base model:
```bash ```bash
curl -X POST http://localhost:8000/v1/chat/completions \ ./setup_minio.sh # Start MinIO, download & upload LoRA
-H "Content-Type: application/json" \ ./agg_lora.sh # Launch vLLM frontend + worker with LoRA
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300
}' | jq .
```
### 6. Unload a LoRA
When you no longer need a LoRA, unload it to free up resources:
```bash
curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA unloaded successfully"
}
``` ```
After unloading, the LoRA will be removed from both `/v1/loras` and `/v1/models` endpoints. ## vLLM-Specific Notes
## Configuration
### Environment Variables - Default `--max-lora-rank 64` (same as SGLang)
- Override with environment variables: `MODEL`, `LORA_NAME`, `MAX_MODEL_LEN`, `MAX_CONCURRENT_SEQS`
The following environment variables can be configured: ### KV-Aware Routing (2 GPUs)
```bash ```bash
# S3-compatible storage backend Configuration ./agg_lora_router.sh
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
``` ```
### MinIO Console Launches two vLLM workers behind a KV-aware router. Load the LoRA to both workers (ports 8081 and 8082), then requests are routed with KV cache affinity for better cache hit rates.
Access the MinIO web console at http://localhost:9001
- Username: `minioadmin`
- Password: `minioadmin`
## Troubleshooting
### MinIO won't start
- Check if ports 9000 and 9001 are already in use
- Ensure Docker is running
- Check Docker logs: `docker logs dynamo-minio`
- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
- Restart MinIO: `./setup_minio.sh --start`
### LoRA fails to load
- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
- Check AWS credentials are set correctly
- Ensure the LoRA files are compatible with the base model
- Check vLLM logs for detailed error messages
### Inference fails
- Verify the model name matches exactly (case-sensitive)
- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
- Ensure the base model supports the LoRA rank
- Check that max_lora_rank in the worker config is >= the LoRA rank
### Cache issues
- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
- Ensure the cache directory is writable
## Advanced Usage
### Loading Multiple LoRAs
You can load multiple LoRA adapters simultaneously:
```bash
# Load first LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
# Load second LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
```
### Using Different Base Models
To use a different base model, modify the `--model` parameter in `agg_lora.sh`:
```bash
python -m dynamo.vllm --model meta-llama/Llama-2-7b-hf --enable-lora --max-lora-rank 64
```
Ensure your LoRAs are compatible with the chosen base model.
## Cleanup
### Stop Services
Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
### Stop MinIO
```bash
# Using the setup script (recommended)
./setup_minio.sh --stop
# Or manually with Docker
docker stop dynamo-minio
docker rm dynamo-minio
```
### Clean Up Data
```bash
# Remove MinIO data
rm -rf ~/dynamo_minio_data
# Remove LoRA cache
rm -rf /tmp/dynamo_loras_minio
```
## API Reference
### Load LoRA
- **Endpoint**: `POST /v1/loras`
- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
- **Response**: `{"status": "success", "lora_id": int}`
### List LoRAs
- **Endpoint**: `GET /v1/loras`
- **Response**: Array of loaded LoRAs
### Unload LoRA
- **Endpoint**: `DELETE /v1/loras/{lora_name}`
- **Response**: `{"status": "success", "message": "string"}`
### List Models
- **Endpoint**: `GET /v1/models`
- **Response**: OpenAI-compatible models list
### Chat Completions
- **Endpoint**: `POST /v1/chat/completions`
- **Body**: OpenAI-compatible chat completion request
- **Response**: OpenAI-compatible chat completion response
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Script to setup MinIO and upload LoRA adapters from Hugging Face Hub
# Abort on the first command that exits non-zero.
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Colors for output
# ANSI escape sequences kept as literal text; the print_* helpers emit them
# through `echo -e`, which is what turns them into terminal colors.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color
# Configuration
# Host directory bind-mounted into the MinIO container as /data.
MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
# MinIO S3 API endpoint used for the health probe and every aws CLI call.
MINIO_ENDPOINT="http://localhost:9000"
# Credentials exported for the aws CLI and printed after MinIO starts.
MINIO_ACCESS_KEY="minioadmin"
MINIO_SECRET_KEY="minioadmin"
# Bucket that receives the uploaded LoRA files.
BUCKET_NAME="my-loras"
# Default LoRA to download (can be overridden)
HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
# TEMP_DIR will be created using mktemp when needed
# It is a global so that cleanup() can find and remove the directory later.
TEMP_DIR=""
# HF_CLI_CMD will be set to either "hf" or "huggingface-cli" based on huggingface-hub python package version
# Starting from HF v0.34.0, the `huggingface-cli` command is deprecated in favor of `hf`.
# Please refer to https://huggingface.co/blog/hf-cli for more details.
HF_CLI_CMD=""
# Parse command line arguments.
# Only the first argument is inspected; it selects the MODE dispatched at the
# bottom of the script. With no argument the full setup runs.
MODE="full"
case "${1:-}" in
    "")
        ;;
    --start)
        MODE="start"
        ;;
    --stop)
        MODE="stop"
        ;;
    --help|-h)
        MODE="help"
        ;;
    *)
        # An unrecognized option is a usage error: report it on stderr and
        # exit non-zero so callers and CI do not mistake it for success.
        echo -e "${RED}Error: Unknown option '$1'${NC}" >&2
        echo "Run '$0 --help' for usage." >&2
        exit 1
        ;;
esac
# Print an informational message in yellow, followed by a color reset.
print_info() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${YELLOW}$1${NC}"
}
# Print a success message in green, followed by a color reset.
print_success() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${GREEN}$1${NC}"
}
# Print an error message in red, followed by a color reset.
print_error() {
    # %b expands backslash escapes in the argument the same way `echo -e` does.
    printf '%b\n' "${RED}$1${NC}"
}
# Print the usage text: supported options, the environment variables that
# select the LoRA (with their effective defaults), and example invocations.
show_help() {
    # Unquoted heredoc delimiter so $0 and the ${VAR:-default} expansions are
    # substituted exactly as they were in the individual echo statements.
    cat <<EOF
Usage: $0 [OPTIONS]

Setup MinIO and upload LoRA adapters from Hugging Face Hub

Options:
 (no options) Run full setup: start MinIO, download and upload LoRA
 --start Only start MinIO container
 --stop Stop and remove MinIO container
 --help, -h Show this help message

Environment Variables:
 HF_LORA_REPO Hugging Face repository (default: ${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora})
 LORA_NAME Local name for the LoRA (default: ${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora})

Examples:
 $0 # Full setup
 $0 --start # Start MinIO only
 $0 --stop # Stop MinIO
 HF_LORA_REPO=user/repo $0 # Use custom LoRA

EOF
}
# Check if required tools are installed.
# Verifies that docker, curl, the AWS CLI and a Hugging Face CLI are on PATH.
# Sets the global HF_CLI_CMD to "hf" (preferred) or "huggingface-cli".
# Exits the script with status 1, with a message on stderr, when a tool is missing.
check_dependencies() {
    print_info "Checking dependencies..."
    if ! command -v docker &> /dev/null; then
        echo "Error: docker is not installed" >&2
        exit 1
    fi
    # curl drives the MinIO readiness probe; without it the wait loop would
    # time out with a misleading "MinIO did not start in time" error.
    if ! command -v curl &> /dev/null; then
        echo "Error: curl is not installed" >&2
        exit 1
    fi
    if ! command -v aws &> /dev/null; then
        echo "Error: aws-cli is not installed. Install with: pip install awscli" >&2
        exit 1
    fi
    # Check for either hf or huggingface-cli
    if command -v hf &> /dev/null; then
        HF_CLI_CMD="hf"
        print_success "Found Hugging Face CLI: hf ($(hf version))"
    elif command -v huggingface-cli &> /dev/null; then
        HF_CLI_CMD="huggingface-cli"
        print_success "Found Hugging Face CLI: huggingface-cli ($(huggingface-cli version))"
    else
        echo "Error: Neither 'hf' nor 'huggingface-cli' is installed. Install with: pip install huggingface-hub[cli]" >&2
        exit 1
    fi
    print_success "All dependencies are installed"
}
# Start MinIO using Docker.
# Recreates the "dynamo-minio" container with MINIO_DATA_DIR mounted at /data,
# publishes the S3 API on port 9000 and the console on port 9001, then polls the
# liveness endpoint for up to 30 attempts (one second apart).
# Exits the script with status 1 if MinIO never reports healthy.
start_minio() {
    print_info "Setting up MinIO..."
    # Create data directory
    mkdir -p "${MINIO_DATA_DIR}"
    # Stop and remove existing container if it exists; failures are expected
    # (and ignored) when no such container is present.
    docker stop dynamo-minio 2>/dev/null || true
    docker rm dynamo-minio 2>/dev/null || true
    # Start MinIO
    print_info "Starting MinIO container..."
    # Pass the root credentials explicitly so the container always accepts the
    # same access/secret key this script exports for the aws CLI and prints.
    docker run -d \
        --name dynamo-minio \
        -p 9000:9000 \
        -p 9001:9001 \
        -e "MINIO_ROOT_USER=${MINIO_ACCESS_KEY}" \
        -e "MINIO_ROOT_PASSWORD=${MINIO_SECRET_KEY}" \
        -v "${MINIO_DATA_DIR}:/data" \
        quay.io/minio/minio server /data \
        --console-address ":9001"
    # Wait for MinIO to be ready
    print_info "Waiting for MinIO to be ready..."
    local ready=false
    local attempt
    for attempt in {1..30}; do
        # -f makes curl fail on HTTP error statuses; with plain -s any HTTP
        # response (for example a 503) would be treated as "ready".
        if curl -sf "${MINIO_ENDPOINT}/minio/health/live" > /dev/null 2>&1; then
            ready=true
            break
        fi
        sleep 1
    done
    if [ "${ready}" != true ]; then
        echo "Error: MinIO did not start in time" >&2
        exit 1
    fi
    print_success "MinIO is ready"
    print_success "MinIO started successfully"
    echo " - MinIO API: ${MINIO_ENDPOINT}"
    echo " - MinIO Console: http://localhost:9001"
    echo " - Username: ${MINIO_ACCESS_KEY}"
    echo " - Password: ${MINIO_SECRET_KEY}"
}
# Configure AWS CLI for MinIO.
# Exports the MinIO credentials and endpoint for subsequent aws invocations and
# makes sure the target bucket exists, creating it when the listing probe fails.
configure_aws_cli() {
    print_info "Configuring AWS CLI for MinIO..."
    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"
    # Probe for the bucket. Both stdout and stderr are discarded so an existing
    # bucket's object listing is not dumped into the setup output.
    if aws --endpoint-url="${MINIO_ENDPOINT}" s3 ls "s3://${BUCKET_NAME}" > /dev/null 2>&1; then
        print_success "Bucket already exists: ${BUCKET_NAME}"
    else
        # Create bucket if it doesn't exist
        print_info "Creating bucket: ${BUCKET_NAME}"
        aws --endpoint-url="${MINIO_ENDPOINT}" s3 mb "s3://${BUCKET_NAME}"
        print_success "Bucket created"
    fi
}
# Download LoRA from Hugging Face Hub.
# Fetches HF_LORA_REPO into a fresh temporary directory whose path is stored in
# the global TEMP_DIR (so cleanup can remove it), using the CLI recorded in
# HF_CLI_CMD, and lists the downloaded files.
download_lora_from_hf() {
    print_info "Downloading LoRA from Hugging Face Hub..."
    printf ' - Repository: %s\n' "${HF_LORA_REPO}"
    printf ' - Local name: %s\n' "${LORA_NAME}"
    # Create temporary directory using mktemp (global variable for cleanup)
    TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX)
    # Download LoRA adapter files using the detected CLI
    print_info "Downloading adapter files using ${HF_CLI_CMD}..."
    case "${HF_CLI_CMD}" in
        huggingface-cli)
            # The legacy CLI is additionally told not to use symlinks in the
            # local directory.
            huggingface-cli download "${HF_LORA_REPO}" \
                --local-dir "${TEMP_DIR}" \
                --local-dir-use-symlinks False
            ;;
        *)
            hf download "${HF_LORA_REPO}" \
                --local-dir "${TEMP_DIR}"
            ;;
    esac
    print_success "LoRA downloaded to ${TEMP_DIR}"
    # Remove the .cache directory from the download directory before it is listed.
    rm -rf "${TEMP_DIR}/.cache"
    # List downloaded files
    echo "Downloaded files:"
    ls -lh "${TEMP_DIR}"
}
# Upload LoRA to MinIO.
# Syncs the directory in TEMP_DIR to s3://BUCKET_NAME/LORA_NAME (skipping paths
# matching "*.git*") and lists what was uploaded.
# Returns 1 when TEMP_DIR is unset or is not a directory.
upload_lora_to_minio() {
    print_info "Uploading LoRA to MinIO..."
    # Fail early with a clear message instead of handing an empty source path
    # to `aws s3 sync` when the download step has not run.
    if [ -z "${TEMP_DIR}" ] || [ ! -d "${TEMP_DIR}" ]; then
        echo "Error: no downloaded LoRA directory to upload (run the download step first)" >&2
        return 1
    fi
    local destination="s3://${BUCKET_NAME}/${LORA_NAME}"
    # Upload all files to S3
    aws --endpoint-url="${MINIO_ENDPOINT}" s3 sync \
        "${TEMP_DIR}" \
        "${destination}" \
        --exclude "*.git*"
    print_success "LoRA uploaded to ${destination}"
    # List uploaded files
    echo "Uploaded files:"
    aws --endpoint-url="${MINIO_ENDPOINT}" s3 ls "${destination}/" --recursive
}
# Cleanup temp files.
# Removes the download directory recorded in TEMP_DIR; does nothing when no
# directory was created or it has already been removed.
cleanup() {
    # Guard clause: nothing to remove.
    if [ -z "${TEMP_DIR}" ] || [ ! -d "${TEMP_DIR}" ]; then
        return 0
    fi
    print_info "Cleaning up temporary files..."
    rm -rf "${TEMP_DIR}"
    print_success "Cleanup complete"
}
# Stop MinIO.
# Stops the "dynamo-minio" container if it is running and removes it if it
# exists, then reminds the user that the data directory is preserved.
stop_minio() {
    print_info "Stopping MinIO..."
    # Compare container names exactly (-x, fixed string). A substring grep over
    # the full `docker ps` table would also match e.g. "dynamo-minio-2" and then
    # try to stop a container that does not exist, aborting under `set -e`.
    if docker ps --format '{{.Names}}' | grep -qxF 'dynamo-minio'; then
        docker stop dynamo-minio 2>/dev/null
        print_success "MinIO container stopped"
    else
        print_info "MinIO container is not running"
    fi
    if docker ps -a --format '{{.Names}}' | grep -qxF 'dynamo-minio'; then
        docker rm dynamo-minio 2>/dev/null
        print_success "MinIO container removed"
    fi
    echo ""
    echo "MinIO has been stopped."
    echo "Data is preserved in: ${MINIO_DATA_DIR}"
    echo ""
    echo "To start MinIO again:"
    echo " $0 --start"
    echo ""
}
# Start MinIO only (without downloading/uploading LoRA).
# Prints a banner, starts the container via start_minio, then prints follow-up
# instructions for uploading a LoRA manually and for stopping MinIO.
start_only() {
    cat <<EOF
========================================
Starting MinIO
========================================

EOF
    start_minio
    # Unquoted heredoc delimiter so $0 and the MinIO settings are substituted.
    cat <<EOF

========================================
MinIO Started!
========================================

MinIO is now running.

To upload a LoRA, run the full setup:
 $0

Or manually upload using AWS CLI:
 export AWS_ACCESS_KEY_ID=${MINIO_ACCESS_KEY}
 export AWS_SECRET_ACCESS_KEY=${MINIO_SECRET_KEY}
 aws --endpoint-url=${MINIO_ENDPOINT} s3 cp your-lora/ s3://${BUCKET_NAME}/your-lora/ --recursive

To stop MinIO:
 $0 --stop

EOF
}
# Full setup (start MinIO + download/upload LoRA).
# Runs, in order: dependency check, MinIO start, aws CLI configuration, LoRA
# download, LoRA upload and temp-file cleanup, then prints next-step commands.
full_setup() {
    # Any step below may abort the script under `set -e`. Register cleanup as an
    # EXIT trap so the mktemp download directory is removed on failure too.
    # cleanup is a no-op once the directory is gone, so the explicit call
    # further down and the trap do not conflict.
    trap cleanup EXIT
    echo "========================================"
    echo "MinIO Setup & LoRA Upload Script"
    echo "========================================"
    echo ""
    check_dependencies
    echo ""
    start_minio
    echo ""
    configure_aws_cli
    echo ""
    download_lora_from_hf
    echo ""
    upload_lora_to_minio
    echo ""
    cleanup
    echo ""
    echo "========================================"
    echo "Setup Complete!"
    echo "========================================"
    echo ""
    echo "MinIO is running and LoRA has been uploaded."
    echo ""
    echo "Next steps:"
    echo " 1. Run the Dynamo service with LoRA support:"
    echo " ${SCRIPT_DIR}/agg_lora.sh"
    echo ""
    echo " 2. Load the LoRA adapter:"
    echo " curl -X POST http://localhost:8081/v1/loras \\"
    echo " -H \"Content-Type: application/json\" \\"
    echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
    echo ""
    echo " 3. Run inference with the LoRA:"
    echo " curl -X POST http://localhost:8000/v1/chat/completions \\"
    echo " -H \"Content-Type: application/json\" \\"
    echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"your prompt here\"}]}'"
    echo ""
    echo "To stop MinIO:"
    echo " $0 --stop"
    echo ""
}
# Main execution
# Dispatch on the MODE chosen during argument parsing. All functions are
# defined above this point, so they can be called here.
case "$MODE" in
start)
start_only
;;
stop)
stop_minio
;;
help)
show_help
exit 0
;;
full)
full_setup
;;
# Defensive fallback for a MODE value that none of the branches above handle.
*)
echo "Error: Unknown mode '$MODE'"
show_help
exit 1
;;
esac
../../../../common/setup_minio.sh
\ No newline at end of file
# S3-compatible Storage Backend LoRA Integration Guide
This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using an S3-compatible storage backend (e.g., MinIO, AWS S3, or GCS).
## Overview
This example demonstrates how to:
1. Set up MinIO as a local S3-compatible storage backend
2. Download LoRA adapters from Hugging Face Hub
3. Upload LoRA adapters to MinIO
4. Load and use LoRA adapters with Dynamo
5. Run inference with LoRA-adapted models
6. Manage (load/unload) LoRA adapters
## Prerequisites
### Required Software
- Docker (for running MinIO)
- Python 3.10+
- AWS CLI: `pip install awscli`
- Hugging Face CLI: `pip install huggingface-hub[cli]`
- jq (optional, for pretty JSON output): `sudo apt install jq`
### Python Dependencies
Make sure you have Dynamo installed with your chosen backend. See the
[Dynamo quickstart guide](https://docs.nvidia.com/dynamo/getting-started/quickstart)
for setup instructions.
## Quick Start
### Step 1: Setup MinIO and Upload LoRA
Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
```bash
./setup_minio.sh
```
This script will:
- Start MinIO in a Docker container
- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
#### Script Options
The setup script supports different modes:
```bash
# Full setup (default) - start MinIO, download & upload LoRA
./setup_minio.sh
# Start MinIO only (without downloading/uploading)
./setup_minio.sh --start
# Stop MinIO
./setup_minio.sh --stop
# Show help
./setup_minio.sh --help
```
#### Customize the LoRA to Download
You can specify a different LoRA repository and name:
```bash
HF_LORA_REPO="username/lora-repo" \
LORA_NAME="my-lora" \
./setup_minio.sh
```
### Step 2: Launch Dynamo with LoRA Support
Start the Dynamo frontend and worker with LoRA support enabled:
```bash
./agg_lora.sh
```
This will:
- Set up AWS credentials for MinIO
- Start the Dynamo frontend on port 8000
- Start the Dynamo worker on port 8081 with LoRA support
Wait for the services to start (check the logs for "Application startup complete").
## Working with LoRAs
### 1. Check Available Models
List all available models (base model only at first):
```bash
curl http://localhost:8000/v1/models | jq .
```
### 2. Load a LoRA Adapter
Load a LoRA from an S3-compatible storage backend (e.g., MinIO):
```bash
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {
"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
}
}' | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"lora_id": 1207343256
}
```
### 3. List Loaded LoRAs
Check which LoRAs are currently loaded:
```bash
curl http://localhost:8081/v1/loras | jq .
```
### 4. Verify LoRA in Models List
After loading, the LoRA should appear in the models list:
```bash
curl http://localhost:8000/v1/models | jq .
```
You should see both the base model and the LoRA adapter listed.
### 5. Run Inference with LoRA
#### Using the LoRA-adapted model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300,
"temperature": 0.1
}' | jq .
```
#### For comparison, using the base model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300
}' | jq .
```
### 6. Unload a LoRA
When you no longer need a LoRA, unload it to free up resources:
```bash
curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA unloaded successfully"
}
```
After unloading, the LoRA will be removed from both `/v1/loras` and `/v1/models` endpoints.
## Configuration
### Environment Variables
The following environment variables can be configured:
```bash
# S3-compatible storage backend Configuration
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
```
### MinIO Console
Access the MinIO web console at `http://localhost:9001`
- Username: `minioadmin`
- Password: `minioadmin`
## Troubleshooting
### MinIO won't start
- Check if ports 9000 and 9001 are already in use
- Ensure Docker is running
- Check Docker logs: `docker logs dynamo-minio`
- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
- Restart MinIO: `./setup_minio.sh --start`
### LoRA fails to load
- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
- Check AWS credentials are set correctly
- Ensure the LoRA files are compatible with the base model
- Check worker logs for detailed error messages
### Inference fails
- Verify the model name matches exactly (case-sensitive)
- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
- Ensure the base model supports the LoRA rank
- Check that max_lora_rank in the worker config is >= the LoRA rank
### Cache issues
- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
- Ensure the cache directory is writable
## Advanced Usage
### Loading Multiple LoRAs
You can load multiple LoRA adapters simultaneously:
```bash
# Load first LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
# Load second LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
```
### Using Different Base Models
To use a different base model, modify the `MODEL` environment variable:
```bash
MODEL=meta-llama/Llama-2-7b-hf ./agg_lora.sh
```
Ensure your LoRAs are compatible with the chosen base model.
## Cleanup
### Stop Services
Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
### Stop MinIO
```bash
# Using the setup script (recommended)
./setup_minio.sh --stop
# Or manually with Docker
docker stop dynamo-minio
docker rm dynamo-minio
```
### Clean Up Data
```bash
# Remove MinIO data
rm -rf ~/dynamo_minio_data
# Remove LoRA cache
rm -rf /tmp/dynamo_loras_minio
```
## API Reference
### Load LoRA
- **Endpoint**: `POST /v1/loras`
- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
- **Response**: `{"status": "success", "message": "string", "lora_name": "string", "lora_id": int}`
### List LoRAs
- **Endpoint**: `GET /v1/loras`
- **Response**: Array of loaded LoRAs
### Unload LoRA
- **Endpoint**: `DELETE /v1/loras/{lora_name}`
- **Response**: `{"status": "success", "message": "string"}`
### List Models
- **Endpoint**: `GET /v1/models`
- **Response**: OpenAI-compatible models list
### Chat Completions
- **Endpoint**: `POST /v1/chat/completions`
- **Body**: OpenAI-compatible chat completion request
- **Response**: OpenAI-compatible chat completion response
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Shared script to set up MinIO and upload LoRA adapters from Hugging Face Hub.
# Backend-agnostic: symlink from any backend's lora/ directory.
# SCRIPT_DIR resolves to the directory of the symlink, not this file's location,
# so "Next steps" messages correctly reference the backend's launch script.
# Abort the script as soon as any command exits with a non-zero status.
set -e
# Directory through which this script was invoked (see the symlink note above).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Colors: ANSI escape sequences, stored unexpanded and expanded at print time.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'
# Configuration
# Host directory mounted as /data inside the MinIO container.
MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
# S3 API endpoint used for the health check and for every aws CLI call.
MINIO_ENDPOINT="http://localhost:9000"
# Credentials exported as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
MINIO_ACCESS_KEY="minioadmin"
MINIO_SECRET_KEY="minioadmin"
# Bucket that receives the uploaded LoRA files.
BUCKET_NAME="my-loras"
# Default LoRA (override via env vars)
HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
# Temporary download directory; set by download_lora_from_hf, removed by cleanup.
TEMP_DIR=""
# HF CLI: "hf" (v0.34.0+) or "huggingface-cli" (legacy); set by check_dependencies.
HF_CLI_CMD=""
# Parse args: only the first argument is inspected; no argument means a full setup.
MODE="full"
case "${1:-}" in
    --start)   MODE="start" ;;
    --stop)    MODE="stop" ;;
    -h|--help) MODE="help" ;;
    "")        MODE="full" ;;
    *)
        # Invalid usage must fail: report on stderr and exit non-zero so that
        # callers and CI do not mistake a mistyped option for a successful run.
        echo -e "${RED}Error: Unknown option '$1'${NC}" >&2
        echo "Run '$0 --help' for usage." >&2
        exit 1
        ;;
esac
# Print a progress message in yellow, prefixed with "->".
info() {
    printf '%b\n' "${YELLOW}-> $1${NC}"
}
# Print a completion message in green, prefixed with "ok".
success() {
    printf '%b\n' "${GREEN}ok $1${NC}"
}
# Print usage information to stdout.
# The heredoc delimiter is unquoted, so $0, $HF_LORA_REPO and $LORA_NAME are
# expanded and the help text shows the values currently in effect.
show_help() {
cat <<EOF
Usage: $0 [OPTIONS]
Setup MinIO and upload LoRA adapters from Hugging Face Hub.
Options:
(none) Full setup: start MinIO, download and upload LoRA
--start Start MinIO container only
--stop Stop and remove MinIO container
-h, --help Show this help
Environment Variables:
HF_LORA_REPO HF repository (default: $HF_LORA_REPO)
LORA_NAME Name for the LoRA (default: $LORA_NAME)
Examples:
$0 # Full setup
$0 --start # Start MinIO only
$0 --stop # Stop MinIO
HF_LORA_REPO=user/repo $0 # Custom LoRA
EOF
}
# Verify that docker, the AWS CLI and a Hugging Face CLI are available on PATH.
# Sets HF_CLI_CMD to "hf" when present, otherwise to "huggingface-cli".
# On the first missing tool, prints an error to stderr and exits with status 1.
check_dependencies() {
    info "Checking dependencies..."
    if ! command -v docker &>/dev/null; then
        echo "Error: docker not installed" >&2
        exit 1
    fi
    if ! command -v aws &>/dev/null; then
        echo "Error: aws-cli not installed (pip install awscli)" >&2
        exit 1
    fi
    # Prefer the newer "hf" entry point; fall back to the legacy CLI name.
    if command -v hf &>/dev/null; then
        HF_CLI_CMD="hf"
    elif command -v huggingface-cli &>/dev/null; then
        HF_CLI_CMD="huggingface-cli"
    else
        echo "Error: Neither 'hf' nor 'huggingface-cli' installed (pip install huggingface-hub[cli])" >&2
        exit 1
    fi
    success "Dependencies OK (HF CLI: ${HF_CLI_CMD})"
}
# Start a fresh MinIO container named "dynamo-minio" and wait until it is live.
# Data is persisted in MINIO_DATA_DIR on the host. Exits with status 1 if the
# liveness endpoint has not answered successfully after 30 attempts.
start_minio() {
    info "Setting up MinIO..."
    mkdir -p "${MINIO_DATA_DIR}"
    # Remove any previous container so the name and the ports are free.
    docker stop dynamo-minio 2>/dev/null || true
    docker rm dynamo-minio 2>/dev/null || true
    # Pass the configured credentials explicitly so that the server and the
    # AWS CLI settings in this script cannot drift apart.
    docker run -d --name dynamo-minio \
        -p 9000:9000 -p 9001:9001 \
        -e MINIO_ROOT_USER="${MINIO_ACCESS_KEY}" \
        -e MINIO_ROOT_PASSWORD="${MINIO_SECRET_KEY}" \
        -v "${MINIO_DATA_DIR}:/data" \
        quay.io/minio/minio server /data --console-address ":9001"
    info "Waiting for MinIO..."
    local attempt
    for attempt in {1..30}; do
        # -f makes curl fail on HTTP error statuses, so an error reply from a
        # half-started server is not mistaken for "ready".
        if curl -sf "${MINIO_ENDPOINT}/minio/health/live" >/dev/null 2>&1; then
            break
        fi
        if [ "${attempt}" -eq 30 ]; then
            echo "Error: MinIO did not start in time" >&2
            exit 1
        fi
        sleep 1
    done
    success "MinIO ready (API: ${MINIO_ENDPOINT}, Console: http://localhost:9001)"
}
# Export the MinIO credentials and endpoint for the AWS CLI, then make sure
# the target bucket exists, creating it when the listing probe fails.
configure_aws_cli() {
    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"
    # The listing is only an existence probe: discard its output entirely so
    # bucket contents are not dumped into the setup log.
    if aws --endpoint-url="${MINIO_ENDPOINT}" s3 ls "s3://${BUCKET_NAME}" >/dev/null 2>&1; then
        success "Bucket exists: ${BUCKET_NAME}"
    else
        aws --endpoint-url="${MINIO_ENDPOINT}" s3 mb "s3://${BUCKET_NAME}"
        success "Bucket created: ${BUCKET_NAME}"
    fi
}
# Download HF_LORA_REPO into a new temporary directory and record that
# directory in TEMP_DIR. The CLI selected in HF_CLI_CMD decides which command
# line is used; the ".cache" subdirectory is dropped afterwards.
download_lora_from_hf() {
    info "Downloading LoRA: ${HF_LORA_REPO}..."
    TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX)
    case "${HF_CLI_CMD}" in
        huggingface-cli)
            # Legacy CLI: explicitly request real files instead of symlinks.
            huggingface-cli download "${HF_LORA_REPO}" \
                --local-dir "${TEMP_DIR}" --local-dir-use-symlinks False
            ;;
        *)
            hf download "${HF_LORA_REPO}" --local-dir "${TEMP_DIR}"
            ;;
    esac
    rm -rf "${TEMP_DIR}/.cache"
    success "Downloaded to ${TEMP_DIR}"
}
# Sync the downloaded LoRA files from TEMP_DIR into the bucket under
# LORA_NAME, skipping paths that match "*.git*".
upload_lora_to_minio() {
    local destination="s3://${BUCKET_NAME}/${LORA_NAME}"
    info "Uploading to ${destination}..."
    aws --endpoint-url="${MINIO_ENDPOINT}" s3 sync "${TEMP_DIR}" "${destination}" \
        --exclude "*.git*"
    success "Upload complete"
}
# Remove the temporary download directory, if one was created and still exists.
# Written as an `if` so the function returns 0 when there is nothing to remove;
# a bare `&&` chain would return non-zero and abort the script under `set -e`.
cleanup() {
    if [ -n "${TEMP_DIR}" ] && [ -d "${TEMP_DIR}" ]; then
        rm -rf "${TEMP_DIR}"
    fi
}
# Stop and remove the "dynamo-minio" container. A container that is not
# running or does not exist is reported, not treated as an error. The host
# data directory is left in place.
stop_minio() {
    info "Stopping MinIO..."
    # Explicit if/else instead of `A && B || C`, which would also run the
    # fallback branch if the success message itself failed.
    if docker stop dynamo-minio 2>/dev/null; then
        success "Stopped"
    else
        info "Not running"
    fi
    if docker rm dynamo-minio 2>/dev/null; then
        success "Removed"
    fi
    echo "Data preserved in: ${MINIO_DATA_DIR}"
}
# --- Main ---
# Dispatch on the mode selected during argument parsing.
case "$MODE" in
    help)
        show_help; exit 0 ;;
    stop)
        stop_minio ;;
    start)
        start_minio ;;
    full)
        # Remove the temporary download directory even when a step fails and
        # `set -e` aborts the script. `|| true` keeps the trap from changing
        # the script's exit status.
        trap 'cleanup || true' EXIT
        check_dependencies
        start_minio
        configure_aws_cli
        download_lora_from_hf
        upload_lora_to_minio
        cleanup
        echo ""
        echo "Setup complete. Next steps:"
        echo " 1. Launch: ${SCRIPT_DIR}/agg_lora.sh"
        echo " 2. Load: curl -X POST http://localhost:8081/v1/loras \\"
        echo " -H 'Content-Type: application/json' \\"
        echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
        echo " 3. Infer: curl http://localhost:8000/v1/chat/completions \\"
        echo " -H 'Content-Type: application/json' \\"
        echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'"
        echo " 4. Stop: $0 --stop"
        ;;
esac
...@@ -22,6 +22,7 @@ import boto3 ...@@ -22,6 +22,7 @@ import boto3
import requests import requests
from botocore.client import Config from botocore.client import Config
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from huggingface_hub import snapshot_download
if TYPE_CHECKING: if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client from mypy_boto3_s3.client import S3Client
...@@ -237,29 +238,17 @@ class MinioService: ...@@ -237,29 +238,17 @@ class MinioService:
f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}" f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
) )
# Run with HF_HUB_OFFLINE unset so the download works even when # Temporarily unset HF_HUB_OFFLINE so the download works even when
# the predownload_models fixture has already enabled offline mode. # the predownload_models fixture has already enabled offline mode.
# This only affects the subprocess env; the parent process is unchanged. old_offline = os.environ.pop("HF_HUB_OFFLINE", None)
env = os.environ.copy() try:
env.pop("HF_HUB_OFFLINE", None) snapshot_download(
result = subprocess.run(
[
"huggingface-cli",
"download",
self.config.lora_repo, self.config.lora_repo,
"--local-dir", local_dir=self._temp_download_dir,
self._temp_download_dir, )
"--local-dir-use-symlinks", finally:
"False", if old_offline is not None:
], os.environ["HF_HUB_OFFLINE"] = old_offline
capture_output=True,
text=True,
env=env,
)
if result.returncode != 0:
raise RuntimeError(f"Failed to download LoRA: {result.stderr}")
# Clean up cache directory # Clean up cache directory
cache_dir = os.path.join(self._temp_download_dir, ".cache") cache_dir = os.path.join(self._temp_download_dir, ".cache")
......
...@@ -5,6 +5,7 @@ import dataclasses ...@@ -5,6 +5,7 @@ import dataclasses
import logging import logging
import os import os
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Optional
import pytest import pytest
...@@ -14,6 +15,7 @@ from tests.serve.common import ( ...@@ -14,6 +15,7 @@ from tests.serve.common import (
params_with_model_mark, params_with_model_mark,
run_serve_deployment, run_serve_deployment,
) )
from tests.serve.lora_utils import MinioLoraConfig
from tests.utils.constants import DefaultPort from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import ( from tests.utils.payload_builder import (
...@@ -28,6 +30,7 @@ from tests.utils.payload_builder import ( ...@@ -28,6 +30,7 @@ from tests.utils.payload_builder import (
responses_payload_default, responses_payload_default,
responses_stream_payload_default, responses_stream_payload_default,
) )
from tests.utils.payloads import LoraTestChatPayload
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -498,3 +501,97 @@ def test_sglang_disagg_dp_attention( ...@@ -498,3 +501,97 @@ def test_sglang_disagg_dp_attention(
"""Test sglang disaggregated with DP attention (requires 4 GPUs)""" """Test sglang disaggregated with DP attention (requires 4 GPUs)"""
# Kept for reference; this test uses a different launch path and is skipped # Kept for reference; this test uses a different launch path and is skipped
# ── LoRA Tests ──────────────────────────────────────────────────────────────
lora_dir = os.path.join(sglang_dir, "launch/lora")
def lora_chat_payload(
    lora_name: str,
    s3_uri: str,
    system_port: int = DefaultPort.SYSTEM1.value,
    repeat_count: int = 2,
    expected_response: Optional[list] = None,
    expected_log: Optional[list] = None,
    max_tokens: int = 100,
    temperature: float = 0.0,
) -> LoraTestChatPayload:
    """Build a non-streaming chat request that targets a LoRA adapter.

    The request body names ``lora_name`` as the model and asks a fixed
    one-sentence question about deep learning.

    Args:
        lora_name: Adapter name; used as the request's ``model`` and passed
            through to the payload.
        s3_uri: URI of the adapter, passed through to the payload.
        system_port: Port passed through to the payload.
        repeat_count: Passed through to the payload.
        expected_response: Keywords expected in the response. A falsy value
            (``None`` or an empty list) selects the built-in keyword list.
        expected_log: Expected log entries. A falsy value becomes ``[]``.
        max_tokens: ``max_tokens`` field of the request body.
        temperature: ``temperature`` field of the request body.

    Returns:
        The assembled ``LoraTestChatPayload``.
    """
    user_message = {
        "role": "user",
        "content": "What is deep learning? Answer in one sentence.",
    }
    request_body = {
        "model": lora_name,
        "messages": [user_message],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }

    if expected_response:
        response_keywords = expected_response
    else:
        response_keywords = ["learning", "neural", "network", "AI", "model"]
    log_entries = expected_log if expected_log else []

    return LoraTestChatPayload(
        body=request_body,
        lora_name=lora_name,
        s3_uri=s3_uri,
        system_port=system_port,
        repeat_count=repeat_count,
        expected_response=response_keywords,
        expected_log=log_entries,
    )
@pytest.mark.sglang
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.profiled_vram_gib(4.7)
@pytest.mark.requested_sglang_kv_tokens(2848)
@pytest.mark.timeout(158)
@pytest.mark.pre_merge
def test_sglang_lora_aggregated(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    minio_lora_service,
    dynamo_dynamic_ports,
):
    """
    Run LoRA inference against an aggregated SGLang deployment.

    Steps:
    1. Take the LoRA name, S3 URI and environment variables from the
       ``minio_lora_service`` fixture.
    2. Build a LoRA chat payload from them.
    3. Configure a deployment launched through ``lora/agg_lora.sh`` on the
       dynamically allocated frontend port.
    4. Hand the deployment and payload to ``run_serve_deployment``.
    """
    minio: MinioLoraConfig = minio_lora_service

    payload = lora_chat_payload(
        lora_name=minio.lora_name,
        s3_uri=minio.get_s3_uri(),
        system_port=DefaultPort.SYSTEM1.value,
        repeat_count=2,
    )

    base_config = SGLangConfig(
        name="test_sglang_lora_aggregated",
        directory=sglang_dir,
        script_name="lora/agg_lora.sh",
        marks=[],
        model="Qwen/Qwen3-0.6B",
        timeout=158,
        env=minio.get_env_vars(),
        request_payloads=[payload],
    )
    # Point the deployment at the frontend port allocated for this test run.
    deployment_config = dataclasses.replace(
        base_config, frontend_port=dynamo_dynamic_ports.frontend_port
    )

    run_serve_deployment(
        deployment_config,
        request,
        ports=dynamo_dynamic_ports,
        extra_env=minio.get_env_vars(),
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment