Unverified Commit f9839161 authored by Biswa Panda's avatar Biswa Panda Committed by GitHub
Browse files

feat(lora): add LoRA support for SGLang (#4769)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent 21b44473
......@@ -80,6 +80,15 @@ async def init_decode(
generate_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
)
load_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
)
unload_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
)
list_loras_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
)
shutdown_endpoints[:] = [generate_endpoint]
......@@ -132,6 +141,18 @@ async def init_decode(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
load_lora_endpoint.serve_endpoint(
handler.load_lora,
metrics_labels=metrics_labels,
),
unload_lora_endpoint.serve_endpoint(
handler.unload_lora,
metrics_labels=metrics_labels,
),
list_loras_endpoint.serve_endpoint(
handler.list_loras,
metrics_labels=metrics_labels,
),
register_model_with_readiness_gate(
engine,
generate_endpoint,
......@@ -187,6 +208,15 @@ async def init_prefill(
generate_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.{dynamo_args.endpoint}"
)
load_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.load_lora"
)
unload_lora_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.unload_lora"
)
list_loras_endpoint = runtime.endpoint(
f"{dynamo_args.namespace}.{dynamo_args.component}.list_loras"
)
shutdown_endpoints[:] = [generate_endpoint]
......@@ -228,6 +258,18 @@ async def init_prefill(
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
load_lora_endpoint.serve_endpoint(
handler.load_lora,
metrics_labels=metrics_labels,
),
unload_lora_endpoint.serve_endpoint(
handler.unload_lora,
metrics_labels=metrics_labels,
),
list_loras_endpoint.serve_endpoint(
handler.list_loras,
metrics_labels=metrics_labels,
),
register_model_with_readiness_gate(
engine,
generate_endpoint,
......
......@@ -272,6 +272,10 @@ class DecodeWorkerHandler(BaseWorkerHandler):
priority = (request.get("routing") or {}).get("priority")
logprob_kwargs = self._build_logprob_kwargs(request)
lora_path = self._resolve_lora(request)
if lora_path:
logging.debug(f"Request {context.id()} will use LoRA adapter: {lora_path}")
if self.serving_mode == DisaggregationMode.DECODE:
# Check if bootstrap_info is pre-computed in the request (from frontend)
bootstrap_info = request.get("bootstrap_info")
......@@ -306,6 +310,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
rid=trace_id,
data_parallel_rank=dp_rank,
**self._session_kwargs(request),
lora_path=lora_path,
**logprob_kwargs,
**self._priority_kwargs(priority),
)
......@@ -340,6 +345,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
rid=trace_id,
data_parallel_rank=dp_rank,
**self._session_kwargs(request),
lora_path=lora_path,
**logprob_kwargs,
**self._priority_kwargs(priority),
)
......
......@@ -147,6 +147,12 @@ class PrefillWorkerHandler(BaseWorkerHandler):
trace_header = build_trace_headers(context) if self.enable_trace else None
lora_path = self._resolve_lora(inner_request)
if lora_path:
logging.debug(
f"Prefill request {context.id()} will use LoRA adapter: {lora_path}"
)
results = await self.engine.async_generate(
**input_param,
sampling_params=sampling_params,
......@@ -158,6 +164,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
rid=trace_id,
data_parallel_rank=dp_rank,
**self._session_kwargs(inner_request),
lora_path=lora_path,
**self._priority_kwargs(priority),
)
......
# LoRA with SGLang Backend
For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).
## Quick Start
```bash
./setup_minio.sh # Start MinIO, download & upload LoRA
./agg_lora.sh # Launch SGLang frontend + worker with LoRA
```
## SGLang-Specific Notes
- The launch script uses `--lora-target-modules all` and `--max-lora-rank 64` by default
- Override with environment variables: `MODEL`, `LORA_NAME`, `DYN_SYSTEM_PORT`, `DYN_HTTP_PORT`
- SGLang LoRA loading goes through `engine.tokenizer_manager.load_lora_adapter()`
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated serving with LoRA support (SGLang backend).
# GPUs: 1
# Prerequisites: ./setup_minio.sh (starts MinIO, uploads LoRA)
#
# Starts the Dynamo frontend and one SGLang worker as background jobs, prints
# example curl commands for loading and querying a LoRA, then waits.
# Overridable through the environment: MODEL, LORA_NAME, DYN_SYSTEM_PORT,
# DYN_HTTP_PORT.

# Abort on the first failing command.
set -e
# On any exit, signal the whole process group (kill 0) so the backgrounded
# frontend and worker do not outlive this script.
trap 'echo Cleaning up...; kill 0' EXIT
# Directory of this script with symlinks resolved (readlink -f).
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Shared helpers. NOTE(review): build_sglang_gpu_mem_args, print_launch_banner
# and wait_any_exit are presumably defined in these two files - confirm.
source "$SCRIPT_DIR/../../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../../common/launch_utils.sh"
# S3/MinIO credentials
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
export AWS_ALLOW_HTTP=true
# Dynamo LoRA configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
mkdir -p "$DYN_LORA_PATH"
# Each setting falls back to its default only when the variable is unset or empty.
MODEL="${MODEL:-Qwen/Qwen3-0.6B}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
SYSTEM_PORT="${DYN_SYSTEM_PORT:-8081}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
GPU_MEM_ARGS=$(build_sglang_gpu_mem_args)
# Default to profiled KV token cap when not overridden by the test scheduler
# (":=" assigns the default only if the helper produced an empty string).
: "${GPU_MEM_ARGS:=--max-total-tokens 2848}"
print_launch_banner --no-curl "Launching Aggregated Serving + LoRA (1 GPU)" "$MODEL" "$HTTP_PORT"
# Print ready-to-paste examples: load the LoRA through the worker's system
# port, then run a chat completion through the frontend's HTTP port.
echo ""
echo "Once running, test with:"
echo " curl -s -X POST http://localhost:${SYSTEM_PORT}/v1/loras \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://my-loras/${LORA_NAME}\"}}' | jq ."
echo ""
echo " curl http://localhost:${HTTP_PORT}/v1/chat/completions \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"What is deep learning?\"}], \"max_tokens\": 300}' | jq ."
echo "=========================================="
# Frontend
python3 -m dynamo.frontend &
# Worker
# $GPU_MEM_ARGS is deliberately unquoted so that a value such as
# "--max-total-tokens 2848" splits into separate flag and value arguments.
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python3 -m dynamo.sglang \
--model-path "$MODEL" \
--served-model-name "$MODEL" \
--page-size 16 \
--tp 1 \
--trust-remote-code \
--skip-tokenizer-init \
--enable-lora \
--max-lora-rank 64 \
--lora-target-modules all \
$GPU_MEM_ARGS &
# NOTE(review): presumably blocks until one of the background jobs exits - confirm in launch_utils.sh.
wait_any_exit
../../../../common/setup_minio.sh
\ No newline at end of file
# S3-compatible storage backend LoRA Integration Guide
# LoRA with vLLM Backend
This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using S3-compatible storage backend (e.g. MinIO, AWS S3, GCS, etc.).
## Overview
This example demonstrates how to:
1. Set up MinIO as a local S3-compatible storage
2. Download LoRA adapters from Hugging Face Hub
3. Upload LoRA adapters to MinIO
4. Load and use LoRA adapters with Dynamo
5. Run inference with LoRA-adapted models
6. Manage (load/unload) LoRA adapters
## Prerequisites
### Required Software
- Docker (for running MinIO)
- Python 3.8+
- AWS CLI: `pip install awscli`
- Hugging Face CLI: `pip install huggingface-hub`
- jq (optional, for pretty JSON output): `sudo apt install jq`
### Python Dependencies
Make sure you have Dynamo installed with vLLM support:
```bash
pip install dynamo vllm
```
For the full LoRA integration guide (setup, usage, API reference, troubleshooting), see [the shared LoRA guide](../../../../common/lora.md).
## Quick Start
### Step 1: Setup MinIO and Upload LoRA
Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
```bash
./setup_minio.sh
```
This script will:
- Start MinIO in a Docker container
- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
#### Script Options
The setup script supports different modes:
```bash
# Full setup (default) - start MinIO, download & upload LoRA
./setup_minio.sh
# Start MinIO only (without downloading/uploading)
./setup_minio.sh --start
# Stop MinIO
./setup_minio.sh --stop
# Show help
./setup_minio.sh --help
```
#### Customize the LoRA to Download
You can specify a different LoRA repository and name:
```bash
HF_LORA_REPO="username/lora-repo" \
LORA_NAME="my-lora" \
./setup_minio.sh
```
### Step 2: Launch Dynamo with LoRA Support
Start the Dynamo frontend and worker with LoRA support enabled:
```bash
./agg_lora.sh
```
This will:
- Set up AWS credentials for MinIO
- Start the Dynamo frontend on port 8000
- Start the Dynamo worker (vLLM) on port 8081 with LoRA support
Wait for the services to start (check the logs for "Application startup complete").
## Working with LoRAs
### 1. Check Available Models
List all available models (base model only at first):
```bash
curl http://localhost:8000/v1/models | jq .
```
### 2. Load a LoRA Adapter
Load a LoRA from S3-compatible storage backend (e.g. MinIO):
```bash
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {
"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
}
}' | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"lora_id": 1207343256
}
```
### 3. List Loaded LoRAs
Check which LoRAs are currently loaded:
```bash
curl http://localhost:8081/v1/loras | jq .
```
### 4. Verify LoRA in Models List
After loading, the LoRA should appear in the models list:
```bash
curl http://localhost:8000/v1/models | jq .
```
You should see both the base model and the LoRA adapter listed.
### 5. Run Inference with LoRA
#### Using the LoRA-adapted model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300,
"temperature": 0.1
}' | jq .
```
#### For comparison, using the base model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300
}' | jq .
```
### 6. Unload a LoRA
When you no longer need a LoRA, unload it to free up resources:
```bash
curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA unloaded successfully"
}
./setup_minio.sh # Start MinIO, download & upload LoRA
./agg_lora.sh # Launch vLLM frontend + worker with LoRA
```
After unloading, the LoRA will be removed from both `/v1/loras` and `/v1/models` endpoints.
## Configuration
## vLLM-Specific Notes
### Environment Variables
- Default `--max-lora-rank 64` (same as SGLang)
- Override with environment variables: `MODEL`, `LORA_NAME`, `MAX_MODEL_LEN`, `MAX_CONCURRENT_SEQS`
The following environment variables can be configured:
### KV-Aware Routing (2 GPUs)
```bash
# S3-compatible storage backend Configuration
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
./agg_lora_router.sh
```
### MinIO Console
Access the MinIO web console at http://localhost:9001
- Username: `minioadmin`
- Password: `minioadmin`
## Troubleshooting
### MinIO won't start
- Check if ports 9000 and 9001 are already in use
- Ensure Docker is running
- Check Docker logs: `docker logs dynamo-minio`
- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
- Restart MinIO: `./setup_minio.sh --start`
### LoRA fails to load
- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
- Check AWS credentials are set correctly
- Ensure the LoRA files are compatible with the base model
- Check vLLM logs for detailed error messages
### Inference fails
- Verify the model name matches exactly (case-sensitive)
- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
- Ensure the base model supports the LoRA rank
- Check that max_lora_rank in the worker config is >= the LoRA rank
### Cache issues
- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
- Ensure the cache directory is writable
## Advanced Usage
### Loading Multiple LoRAs
You can load multiple LoRA adapters simultaneously:
```bash
# Load first LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
# Load second LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
```
### Using Different Base Models
To use a different base model, modify the `--model` parameter in `agg_lora.sh`:
```bash
python -m dynamo.vllm --model meta-llama/Llama-2-7b-hf --enable-lora --max-lora-rank 64
```
Ensure your LoRAs are compatible with the chosen base model.
## Cleanup
### Stop Services
Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
### Stop MinIO
```bash
# Using the setup script (recommended)
./setup_minio.sh --stop
# Or manually with Docker
docker stop dynamo-minio
docker rm dynamo-minio
```
### Clean Up Data
```bash
# Remove MinIO data
rm -rf ~/dynamo_minio_data
# Remove LoRA cache
rm -rf /tmp/dynamo_loras_minio
```
## API Reference
### Load LoRA
- **Endpoint**: `POST /v1/loras`
- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
- **Response**: `{"status": "success", "lora_id": int}`
### List LoRAs
- **Endpoint**: `GET /v1/loras`
- **Response**: Array of loaded LoRAs
### Unload LoRA
- **Endpoint**: `DELETE /v1/loras/{lora_name}`
- **Response**: `{"status": "success", "message": "string"}`
### List Models
- **Endpoint**: `GET /v1/models`
- **Response**: OpenAI-compatible models list
### Chat Completions
- **Endpoint**: `POST /v1/chat/completions`
- **Body**: OpenAI-compatible chat completion request
- **Response**: OpenAI-compatible chat completion response
Launches two vLLM workers behind a KV-aware router. Load the LoRA on both workers (ports 8081 and 8082); requests are then routed with KV-cache affinity for better cache hit rates.
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Script to setup MinIO and upload LoRA adapters from Hugging Face Hub
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Colors for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color
# Configuration
MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
MINIO_ENDPOINT="http://localhost:9000"
MINIO_ACCESS_KEY="minioadmin"
MINIO_SECRET_KEY="minioadmin"
BUCKET_NAME="my-loras"
# Default LoRA to download (can be overridden)
HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
# TEMP_DIR will be created using mktemp when needed
TEMP_DIR=""
# HF_CLI_CMD will be set to either "hf" or "huggingface-cli" based on huggingface-hub python package version
# Starting from HF v0.34.0, the `huggingface-cli` command is deprecated in favor of `hf`.
# Please refer to https://huggingface.co/blog/hf-cli for more details.
HF_CLI_CMD=""
# Map the first command-line argument onto a run mode.
# No argument means a full setup.
MODE="full"
case "${1:-}" in
    "")
        ;;
    --start)
        MODE="start"
        ;;
    --stop)
        MODE="stop"
        ;;
    --help|-h)
        MODE="help"
        ;;
    *)
        # Anything else is reported, then handled as a request for help.
        echo -e "${RED}Error: Unknown option '$1'${NC}"
        MODE="help"
        ;;
esac
print_info() {
    # Print the message ($1) wrapped in the YELLOW / NC colour codes.
    local message="$1"
    echo -e "${YELLOW}${message}${NC}"
}
print_success() {
    # Print the message ($1) wrapped in the GREEN / NC colour codes.
    local message="$1"
    echo -e "${GREEN}${message}${NC}"
}
print_error() {
    # Print the message ($1) wrapped in the RED / NC colour codes.
    local message="$1"
    echo -e "${RED}${message}${NC}"
}
# Print usage, options, environment variables and examples to stdout.
show_help() {
    # The delimiter is unquoted, so $0 and the ${VAR:-default} expressions are
    # expanded exactly as they would be inside double-quoted echo arguments.
    cat <<EOF
Usage: $0 [OPTIONS]

Setup MinIO and upload LoRA adapters from Hugging Face Hub

Options:
 (no options) Run full setup: start MinIO, download and upload LoRA
 --start Only start MinIO container
 --stop Stop and remove MinIO container
 --help, -h Show this help message

Environment Variables:
 HF_LORA_REPO Hugging Face repository (default: ${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora})
 LORA_NAME Local name for the LoRA (default: ${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora})

Examples:
 $0 # Full setup
 $0 --start # Start MinIO only
 $0 --stop # Stop MinIO
 HF_LORA_REPO=user/repo $0 # Use custom LoRA

EOF
}
# Verify that docker, the AWS CLI and a Hugging Face CLI are on PATH.
# Exits with status 1 at the first missing tool; on success HF_CLI_CMD holds
# the name of the Hugging Face CLI that was found.
check_dependencies() {
    print_info "Checking dependencies..."

    command -v docker &> /dev/null || {
        echo "Error: docker is not installed"
        exit 1
    }

    command -v aws &> /dev/null || {
        echo "Error: aws-cli is not installed. Install with: pip install awscli"
        exit 1
    }

    # Probe the newer `hf` entry point first, then the legacy name.
    local candidate detected=""
    for candidate in hf huggingface-cli; do
        if command -v "${candidate}" &> /dev/null; then
            detected="${candidate}"
            break
        fi
    done

    if [ -z "${detected}" ]; then
        echo "Error: Neither 'hf' nor 'huggingface-cli' is installed. Install with: pip install huggingface-hub[cli]"
        exit 1
    fi

    HF_CLI_CMD="${detected}"
    print_success "Found Hugging Face CLI: ${detected} ($("${detected}" version))"

    print_success "All dependencies are installed"
}
# Start (or restart) the dynamo-minio Docker container and wait until it is live.
#
# Reads:  MINIO_DATA_DIR (host directory mounted at /data), MINIO_ENDPOINT,
#         MINIO_ACCESS_KEY, MINIO_SECRET_KEY (only echoed in the summary).
# Exits:  status 1 if the health endpoint has not answered successfully
#         after 30 probes, one second apart.
start_minio() {
    print_info "Setting up MinIO..."

    # Create data directory
    mkdir -p "${MINIO_DATA_DIR}"

    # Stop and remove existing container if it exists; "|| true" keeps
    # set -e from aborting when there is nothing to stop or remove.
    docker stop dynamo-minio 2>/dev/null || true
    docker rm dynamo-minio 2>/dev/null || true

    # Start MinIO (API on 9000, web console on 9001)
    print_info "Starting MinIO container..."
    docker run -d \
        --name dynamo-minio \
        -p 9000:9000 \
        -p 9001:9001 \
        -v "${MINIO_DATA_DIR}:/data" \
        quay.io/minio/minio server /data \
        --console-address ":9001"

    # Wait for MinIO to be ready
    print_info "Waiting for MinIO to be ready..."
    local i
    for i in {1..30}; do
        # -f makes curl return non-zero on HTTP error statuses; without it any
        # HTTP response at all (even an error page) would count as "ready".
        if curl -sf "${MINIO_ENDPOINT}/minio/health/live" > /dev/null 2>&1; then
            print_success "MinIO is ready"
            break
        fi
        if [ "$i" -eq 30 ]; then
            echo "Error: MinIO did not start in time"
            exit 1
        fi
        sleep 1
    done

    print_success "MinIO started successfully"
    echo " - MinIO API: ${MINIO_ENDPOINT}"
    echo " - MinIO Console: http://localhost:9001"
    echo " - Username: ${MINIO_ACCESS_KEY}"
    echo " - Password: ${MINIO_SECRET_KEY}"
}
# Export MinIO credentials for the AWS CLI and make sure the bucket exists.
configure_aws_cli() {
    print_info "Configuring AWS CLI for MinIO..."

    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"

    # A successful listing means the bucket is already there.
    if aws --endpoint-url=${MINIO_ENDPOINT} s3 ls s3://${BUCKET_NAME} 2>/dev/null; then
        print_success "Bucket already exists: ${BUCKET_NAME}"
        return
    fi

    # Create bucket if it doesn't exist
    print_info "Creating bucket: ${BUCKET_NAME}"
    aws --endpoint-url=${MINIO_ENDPOINT} s3 mb s3://${BUCKET_NAME}
    print_success "Bucket created"
}
# Download the HF_LORA_REPO repository into a fresh temporary directory.
# The directory path is stored in the global TEMP_DIR so that later steps
# (upload, cleanup) can find it.
download_lora_from_hf() {
    print_info "Downloading LoRA from Hugging Face Hub..."
    echo " - Repository: ${HF_LORA_REPO}"
    echo " - Local name: ${LORA_NAME}"

    # Create temporary directory using mktemp (global variable for cleanup)
    TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX)

    # Download LoRA adapter files using the detected CLI
    print_info "Downloading adapter files using ${HF_CLI_CMD}..."
    case "${HF_CLI_CMD}" in
        huggingface-cli)
            huggingface-cli download "${HF_LORA_REPO}" \
                --local-dir "${TEMP_DIR}" \
                --local-dir-use-symlinks False
            ;;
        *)
            hf download "${HF_LORA_REPO}" \
                --local-dir "${TEMP_DIR}"
            ;;
    esac

    print_success "LoRA downloaded to ${TEMP_DIR}"
    # Drop the CLI's bookkeeping directory so it is not uploaded later.
    rm -rf "${TEMP_DIR}/.cache"

    # List downloaded files
    echo "Downloaded files:"
    ls -lh "${TEMP_DIR}"
}
# Sync the downloaded adapter (TEMP_DIR) into the bucket, then list the result.
upload_lora_to_minio() {
    local destination="s3://${BUCKET_NAME}/${LORA_NAME}"

    print_info "Uploading LoRA to MinIO..."

    # Upload all files to S3
    aws --endpoint-url=${MINIO_ENDPOINT} s3 sync "${TEMP_DIR}" "${destination}" --exclude "*.git*"

    print_success "LoRA uploaded to ${destination}"

    # List uploaded files
    echo "Uploaded files:"
    aws --endpoint-url=${MINIO_ENDPOINT} s3 ls "${destination}/" --recursive
}
# Remove the temporary download directory, if one was created.
cleanup() {
    # Nothing to do unless TEMP_DIR is set and still exists on disk.
    if [ -z "${TEMP_DIR}" ] || [ ! -d "${TEMP_DIR}" ]; then
        return 0
    fi

    print_info "Cleaning up temporary files..."
    rm -rf "${TEMP_DIR}"
    print_success "Cleanup complete"
}
# Stop and remove the dynamo-minio container, then say how to restart it.
# The host data directory is not touched.
stop_minio() {
    print_info "Stopping MinIO..."

    # `docker ps` lists running containers only.
    if ! docker ps | grep -q dynamo-minio; then
        print_info "MinIO container is not running"
    else
        docker stop dynamo-minio 2>/dev/null
        print_success "MinIO container stopped"
    fi

    # `docker ps -a` also lists stopped containers.
    if docker ps -a | grep -q dynamo-minio; then
        docker rm dynamo-minio 2>/dev/null
        print_success "MinIO container removed"
    fi

    cat <<EOF

MinIO has been stopped.
Data is preserved in: ${MINIO_DATA_DIR}

To start MinIO again:
 $0 --start

EOF
}
# Start MinIO only (without downloading/uploading LoRA) and print follow-up
# instructions for uploading a LoRA manually and for stopping MinIO.
start_only() {
    local rule="========================================"

    cat <<EOF
${rule}
Starting MinIO
${rule}

EOF

    start_minio

    cat <<EOF

${rule}
MinIO Started!
${rule}

MinIO is now running.

To upload a LoRA, run the full setup:
 $0

Or manually upload using AWS CLI:
 export AWS_ACCESS_KEY_ID=${MINIO_ACCESS_KEY}
 export AWS_SECRET_ACCESS_KEY=${MINIO_SECRET_KEY}
 aws --endpoint-url=${MINIO_ENDPOINT} s3 cp your-lora/ s3://${BUCKET_NAME}/your-lora/ --recursive

To stop MinIO:
 $0 --stop

EOF
}
# Full setup (start MinIO + download/upload LoRA)
# Runs every stage in order and then prints ready-to-paste commands for
# launching Dynamo, loading the uploaded LoRA and running inference with it.
# The script runs under `set -e`, so a failing stage stops the sequence.
full_setup() {
echo "========================================"
echo "MinIO Setup & LoRA Upload Script"
echo "========================================"
echo ""
# Stage 1: make sure docker, aws and a Hugging Face CLI are available.
check_dependencies
echo ""
# Stage 2: (re)start the MinIO container and wait for it to answer.
start_minio
echo ""
# Stage 3: export credentials and create the bucket if needed.
configure_aws_cli
echo ""
# Stage 4: download the adapter into a temporary directory (TEMP_DIR).
download_lora_from_hf
echo ""
# Stage 5: sync TEMP_DIR into the bucket.
upload_lora_to_minio
echo ""
# Stage 6: delete TEMP_DIR. Only reached when all earlier stages succeeded.
cleanup
echo ""
echo "========================================"
echo "Setup Complete!"
echo "========================================"
echo ""
echo "MinIO is running and LoRA has been uploaded."
echo ""
echo "Next steps:"
echo " 1. Run the Dynamo service with LoRA support:"
echo " ${SCRIPT_DIR}/agg_lora.sh"
echo ""
# In the echo lines below, \\ prints a single backslash (a shell line
# continuation in the suggested command) and \" prints a literal double quote.
echo " 2. Load the LoRA adapter:"
echo " curl -X POST http://localhost:8081/v1/loras \\"
echo " -H \"Content-Type: application/json\" \\"
echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
echo ""
echo " 3. Run inference with the LoRA:"
echo " curl -X POST http://localhost:8000/v1/chat/completions \\"
echo " -H \"Content-Type: application/json\" \\"
echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"your prompt here\"}]}'"
echo ""
echo "To stop MinIO:"
echo " $0 --stop"
echo ""
}
# Main execution
# Dispatch on the mode chosen by the argument parser near the top of the script.
case "$MODE" in
start)
start_only
;;
stop)
stop_minio
;;
help)
# Also reached for an unknown option, which the parser maps to "help";
# note that the exit status is 0 in that case too.
show_help
exit 0
;;
full)
full_setup
;;
*)
# Defensive branch: the parser only ever sets full, start, stop or help.
echo "Error: Unknown mode '$MODE'"
show_help
exit 1
;;
esac
../../../../common/setup_minio.sh
\ No newline at end of file
# S3-compatible Storage Backend LoRA Integration Guide
This guide explains how to set up and use LoRA (Low-Rank Adaptation) adapters with Dynamo using an S3-compatible storage backend (e.g., MinIO, AWS S3, or GCS).
## Overview
This example demonstrates how to:
1. Set up MinIO as a local S3-compatible storage backend
2. Download LoRA adapters from Hugging Face Hub
3. Upload LoRA adapters to MinIO
4. Load and use LoRA adapters with Dynamo
5. Run inference with LoRA-adapted models
6. Manage (load/unload) LoRA adapters
## Prerequisites
### Required Software
- Docker (for running MinIO)
- Python 3.10+
- AWS CLI: `pip install awscli`
- Hugging Face CLI: `pip install huggingface-hub[cli]`
- jq (optional, for pretty JSON output): `sudo apt install jq`
### Python Dependencies
Make sure you have Dynamo installed with your chosen backend. See the
[Dynamo quickstart guide](https://docs.nvidia.com/dynamo/getting-started/quickstart)
for setup instructions.
## Quick Start
### Step 1: Setup MinIO and Upload LoRA
Run the setup script to start MinIO and download/upload a LoRA adapter from Hugging Face:
```bash
./setup_minio.sh
```
This script will:
- Start MinIO in a Docker container
- Download a LoRA adapter from Hugging Face Hub (default: `codelion/Qwen3-0.6B-accuracy-recovery-lora`)
- Upload the LoRA to MinIO at `s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora`
#### Script Options
The setup script supports different modes:
```bash
# Full setup (default) - start MinIO, download & upload LoRA
./setup_minio.sh
# Start MinIO only (without downloading/uploading)
./setup_minio.sh --start
# Stop MinIO
./setup_minio.sh --stop
# Show help
./setup_minio.sh --help
```
#### Customize the LoRA to Download
You can specify a different LoRA repository and name:
```bash
HF_LORA_REPO="username/lora-repo" \
LORA_NAME="my-lora" \
./setup_minio.sh
```
### Step 2: Launch Dynamo with LoRA Support
Start the Dynamo frontend and worker with LoRA support enabled:
```bash
./agg_lora.sh
```
This will:
- Set up AWS credentials for MinIO
- Start the Dynamo frontend on port 8000
- Start the Dynamo worker on port 8081 with LoRA support
Wait for the services to start (check the logs for "Application startup complete").
## Working with LoRAs
### 1. Check Available Models
List all available models (base model only at first):
```bash
curl http://localhost:8000/v1/models | jq .
```
### 2. Load a LoRA Adapter
Load a LoRA from an S3-compatible storage backend (e.g., MinIO):
```bash
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"source": {
"uri": "s3://my-loras/codelion/Qwen3-0.6B-accuracy-recovery-lora"
}
}' | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA adapter 'codelion/Qwen3-0.6B-accuracy-recovery-lora' loaded successfully",
"lora_name": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"lora_id": 1207343256
}
```
### 3. List Loaded LoRAs
Check which LoRAs are currently loaded:
```bash
curl http://localhost:8081/v1/loras | jq .
```
### 4. Verify LoRA in Models List
After loading, the LoRA should appear in the models list:
```bash
curl http://localhost:8000/v1/models | jq .
```
You should see both the base model and the LoRA adapter listed.
### 5. Run Inference with LoRA
#### Using the LoRA-adapted model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "codelion/Qwen3-0.6B-accuracy-recovery-lora",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300,
"temperature": 0.1
}' | jq .
```
#### For comparison, using the base model:
```bash
curl -X POST http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [{
"role": "user",
"content": "What is good low risk investment strategy?"
}],
"max_tokens": 300
}' | jq .
```
### 6. Unload a LoRA
When you no longer need a LoRA, unload it to free up resources:
```bash
curl -X DELETE http://localhost:8081/v1/loras/codelion/Qwen3-0.6B-accuracy-recovery-lora | jq .
```
Expected response:
```json
{
"status": "success",
"message": "LoRA unloaded successfully"
}
```
After unloading, the LoRA will be removed from both `/v1/loras` and `/v1/models` endpoints.
## Configuration
### Environment Variables
The following environment variables can be configured:
```bash
# S3-compatible storage backend Configuration
export AWS_ENDPOINT=http://localhost:9000
export AWS_ACCESS_KEY_ID=minioadmin
export AWS_SECRET_ACCESS_KEY=minioadmin
export AWS_REGION=us-east-1
# Dynamo LoRA Configuration
export DYN_LORA_ENABLED=true
export DYN_LORA_PATH=/tmp/dynamo_loras_minio
```
### MinIO Console
Access the MinIO web console at `http://localhost:9001`
- Username: `minioadmin`
- Password: `minioadmin`
## Troubleshooting
### MinIO won't start
- Check if ports 9000 and 9001 are already in use
- Ensure Docker is running
- Check Docker logs: `docker logs dynamo-minio`
- Try stopping any existing MinIO containers: `./setup_minio.sh --stop`
- Restart MinIO: `./setup_minio.sh --start`
### LoRA fails to load
- Verify the LoRA is uploaded to MinIO: `aws --endpoint-url=http://localhost:9000 s3 ls s3://my-loras/`
- Check AWS credentials are set correctly
- Ensure the LoRA files are compatible with the base model
- Check worker logs for detailed error messages
### Inference fails
- Verify the model name matches exactly (case-sensitive)
- Check if the LoRA is loaded: `curl http://localhost:8081/v1/loras`
- Ensure the base model supports the LoRA rank
- Check that max_lora_rank in the worker config is >= the LoRA rank
### Cache issues
- Check the cache directory: `ls -la /tmp/dynamo_loras_minio/`
- Clear the cache if needed: `rm -rf /tmp/dynamo_loras_minio/*`
- Ensure the cache directory is writable
## Advanced Usage
### Loading Multiple LoRAs
You can load multiple LoRA adapters simultaneously:
```bash
# Load first LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora1", "source": {"uri": "s3://my-loras/lora1"}}'
# Load second LoRA
curl -X POST http://localhost:8081/v1/loras \
-H "Content-Type: application/json" \
-d '{"lora_name": "lora2", "source": {"uri": "s3://my-loras/lora2"}}'
```
### Using Different Base Models
To use a different base model, set the `MODEL` environment variable when launching:
```bash
MODEL=meta-llama/Llama-2-7b-hf ./agg_lora.sh
```
Ensure your LoRAs are compatible with the chosen base model.
## Cleanup
### Stop Services
Press `Ctrl+C` in the terminal running `agg_lora.sh` to stop Dynamo services.
### Stop MinIO
```bash
# Using the setup script (recommended)
./setup_minio.sh --stop
# Or manually with Docker
docker stop dynamo-minio
docker rm dynamo-minio
```
### Clean Up Data
```bash
# Remove MinIO data
rm -rf ~/dynamo_minio_data
# Remove LoRA cache
rm -rf /tmp/dynamo_loras_minio
```
## API Reference
### Load LoRA
- **Endpoint**: `POST /v1/loras`
- **Body**: `{"lora_name": "string", "source": {"uri": "string"}}`
- **Response**: `{"status": "success", "message": "string", "lora_name": "string", "lora_id": int}`
### List LoRAs
- **Endpoint**: `GET /v1/loras`
- **Response**: Array of loaded LoRAs
### Unload LoRA
- **Endpoint**: `DELETE /v1/loras/{lora_name}`
- **Response**: `{"status": "success", "message": "string"}`
### List Models
- **Endpoint**: `GET /v1/models`
- **Response**: OpenAI-compatible models list
### Chat Completions
- **Endpoint**: `POST /v1/chat/completions`
- **Body**: OpenAI-compatible chat completion request
- **Response**: OpenAI-compatible chat completion response
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Shared script to set up MinIO and upload LoRA adapters from Hugging Face Hub.
# Backend-agnostic: symlink from any backend's lora/ directory.
# SCRIPT_DIR resolves to the directory of the symlink, not this file's location,
# so "Next steps" messages correctly reference the backend's launch script.
set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
NC='\033[0m'
# Configuration
MINIO_DATA_DIR="${HOME}/dynamo_minio_data"
MINIO_ENDPOINT="http://localhost:9000"
MINIO_ACCESS_KEY="minioadmin"
MINIO_SECRET_KEY="minioadmin"
BUCKET_NAME="my-loras"
# Default LoRA (override via env vars)
HF_LORA_REPO="${HF_LORA_REPO:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
LORA_NAME="${LORA_NAME:-codelion/Qwen3-0.6B-accuracy-recovery-lora}"
TEMP_DIR=""
# HF CLI: "hf" (v0.34.0+) or "huggingface-cli" (legacy)
HF_CLI_CMD=""
# Parse args
# Only the first argument is examined; "${1:-}" treats a missing argument as
# the empty string, which selects the default full setup.
MODE="full"
case "${1:-}" in
--start) MODE="start" ;;
--stop) MODE="stop" ;;
-h|--help) MODE="help" ;;
"") MODE="full" ;;
# Any other argument is reported and then handled as a request for help.
*) echo -e "${RED}Error: Unknown option '$1'${NC}"; MODE="help" ;;
esac
info() {
    # Progress message: $1 prefixed with "-> " and wrapped in YELLOW / NC.
    local text="$1"
    echo -e "${YELLOW}-> ${text}${NC}"
}
success() {
    # Success message: $1 prefixed with "ok " and wrapped in GREEN / NC.
    local text="$1"
    echo -e "${GREEN}ok ${text}${NC}"
}
# Print usage, options, environment variables and examples to stdout.
# The heredoc delimiter is unquoted, so $0, $HF_LORA_REPO and $LORA_NAME are
# expanded to their current values when the help text is printed.
show_help() {
cat <<EOF
Usage: $0 [OPTIONS]
Setup MinIO and upload LoRA adapters from Hugging Face Hub.
Options:
(none) Full setup: start MinIO, download and upload LoRA
--start Start MinIO container only
--stop Stop and remove MinIO container
-h, --help Show this help
Environment Variables:
HF_LORA_REPO HF repository (default: $HF_LORA_REPO)
LORA_NAME Name for the LoRA (default: $LORA_NAME)
Examples:
$0 # Full setup
$0 --start # Start MinIO only
$0 --stop # Stop MinIO
HF_LORA_REPO=user/repo $0 # Custom LoRA
EOF
}
# Verify that docker, the AWS CLI and a Hugging Face CLI are on PATH.
#
# Sets:   HF_CLI_CMD to "hf" when available, otherwise "huggingface-cli".
# Exits:  status 1 at the first missing tool. The diagnostic goes to stderr so
#         it is not lost or mixed into data when stdout is piped or captured.
check_dependencies() {
    info "Checking dependencies..."
    command -v docker &>/dev/null || { echo "Error: docker not installed" >&2; exit 1; }
    command -v aws &>/dev/null || { echo "Error: aws-cli not installed (pip install awscli)" >&2; exit 1; }
    # Prefer the newer `hf` entry point; fall back to the legacy name.
    if command -v hf &>/dev/null; then
        HF_CLI_CMD="hf"
    elif command -v huggingface-cli &>/dev/null; then
        HF_CLI_CMD="huggingface-cli"
    else
        echo "Error: Neither 'hf' nor 'huggingface-cli' installed (pip install huggingface-hub[cli])" >&2
        exit 1
    fi
    success "Dependencies OK (HF CLI: ${HF_CLI_CMD})"
}
# Start (or restart) the dynamo-minio Docker container and wait until it is live.
#
# Reads:  MINIO_DATA_DIR (host directory mounted at /data), MINIO_ENDPOINT.
# Exits:  status 1 if the health endpoint has not answered successfully
#         after 30 probes, one second apart.
start_minio() {
    info "Setting up MinIO..."
    mkdir -p "${MINIO_DATA_DIR}"
    # Remove any previous container; "|| true" keeps set -e from aborting
    # when there is nothing to stop or remove.
    docker stop dynamo-minio 2>/dev/null || true
    docker rm dynamo-minio 2>/dev/null || true
    # API on 9000, web console on 9001.
    docker run -d --name dynamo-minio \
        -p 9000:9000 -p 9001:9001 \
        -v "${MINIO_DATA_DIR}:/data" \
        quay.io/minio/minio server /data --console-address ":9001"
    info "Waiting for MinIO..."
    local i
    for i in {1..30}; do
        # -f makes curl return non-zero on HTTP error statuses; without it any
        # HTTP response at all (even an error page) would count as "ready".
        if curl -sf "${MINIO_ENDPOINT}/minio/health/live" >/dev/null 2>&1; then
            break
        fi
        if [ "$i" -eq 30 ]; then
            echo "Error: MinIO did not start in time"
            exit 1
        fi
        sleep 1
    done
    success "MinIO ready (API: ${MINIO_ENDPOINT}, Console: http://localhost:9001)"
}
# Export MinIO credentials for the AWS CLI and make sure the bucket exists.
configure_aws_cli() {
    export AWS_ACCESS_KEY_ID="${MINIO_ACCESS_KEY}"
    export AWS_SECRET_ACCESS_KEY="${MINIO_SECRET_KEY}"
    export AWS_ENDPOINT_URL="${MINIO_ENDPOINT}"

    # A successful listing means the bucket is already there.
    if aws --endpoint-url=${MINIO_ENDPOINT} s3 ls s3://${BUCKET_NAME} 2>/dev/null; then
        success "Bucket exists: ${BUCKET_NAME}"
        return
    fi

    aws --endpoint-url=${MINIO_ENDPOINT} s3 mb s3://${BUCKET_NAME}
    success "Bucket created: ${BUCKET_NAME}"
}
# Download the repository HF_LORA_REPO into a new temporary directory using
# the CLI recorded in HF_CLI_CMD.
#
# Sets the global TEMP_DIR, which upload_lora_to_minio and cleanup read.
# The ".cache" directory inside the download is removed afterwards.
# Exits with status 1 if the temporary directory cannot be created or the
# download fails; a partially filled directory is removed before exiting.
download_lora_from_hf() {
    info "Downloading LoRA: ${HF_LORA_REPO}..."
    # Guard against an empty TEMP_DIR: the rm -rf below would otherwise
    # operate on "/.cache".
    if ! TEMP_DIR=$(mktemp -d -t lora_download_XXXXXX) || [ -z "${TEMP_DIR}" ]; then
        echo "Error: could not create a temporary download directory" >&2
        exit 1
    fi
    local -a download_cmd
    if [ "${HF_CLI_CMD}" = "huggingface-cli" ]; then
        # The legacy CLI is asked for real files instead of symlinks.
        download_cmd=(huggingface-cli download "${HF_LORA_REPO}"
                      --local-dir "${TEMP_DIR}" --local-dir-use-symlinks False)
    else
        download_cmd=(hf download "${HF_LORA_REPO}" --local-dir "${TEMP_DIR}")
    fi
    if ! "${download_cmd[@]}"; then
        echo "Error: failed to download ${HF_LORA_REPO}" >&2
        rm -rf "${TEMP_DIR}"
        exit 1
    fi
    rm -rf "${TEMP_DIR}/.cache"
    success "Downloaded to ${TEMP_DIR}"
}
# Sync the downloaded adapter in TEMP_DIR to s3://BUCKET_NAME/LORA_NAME on
# the MinIO endpoint, skipping every path that matches "*.git*".
# Exits with status 1 if the sync fails, so a failed upload is never
# reported as complete.
upload_lora_to_minio() {
    local destination="s3://${BUCKET_NAME}/${LORA_NAME}"
    info "Uploading to ${destination}..."
    if ! aws --endpoint-url="${MINIO_ENDPOINT}" s3 sync \
            "${TEMP_DIR}" "${destination}" --exclude "*.git*"; then
        echo "Error: upload to ${destination} failed" >&2
        exit 1
    fi
    success "Upload complete"
}
# Remove the temporary download directory named by TEMP_DIR, if any.
# Always returns 0 when there is nothing to remove, so it is safe to call
# repeatedly, from an EXIT trap, and under `set -e` / `set -u`.
cleanup() {
    if [ -n "${TEMP_DIR:-}" ] && [ -d "${TEMP_DIR}" ]; then
        rm -rf "${TEMP_DIR}"
    fi
}
# Stop and remove the "dynamo-minio" container, reporting each step.
# The host data directory MINIO_DATA_DIR is left untouched.
stop_minio() {
    local container="dynamo-minio"
    info "Stopping MinIO..."
    if docker stop "${container}" 2>/dev/null; then
        success "Stopped"
    else
        info "Not running"
    fi
    # A missing container is not an error here.
    if docker rm "${container}" 2>/dev/null; then
        success "Removed" || true
    fi
    echo "Data preserved in: ${MINIO_DATA_DIR}"
}
# --- Main ---
# Dispatch on the MODE chosen during argument parsing.
case "$MODE" in
    help)
        show_help
        # MODE is also "help" after an unknown option; only an explicit
        # help request is a success.
        case "${1:-}" in
            -h|--help) exit 0 ;;
            *)         exit 1 ;;
        esac
        ;;
    stop)
        stop_minio ;;
    start)
        start_minio ;;
    full)
        # Start from an empty TEMP_DIR so the trap can never delete a
        # directory inherited from the environment, and make sure the
        # download directory is removed even if a later step aborts.
        TEMP_DIR=""
        trap 'cleanup || true' EXIT
        check_dependencies
        start_minio
        configure_aws_cli
        download_lora_from_hf
        upload_lora_to_minio
        cleanup
        echo ""
        echo "Setup complete. Next steps:"
        echo " 1. Launch: ${SCRIPT_DIR}/agg_lora.sh"
        echo " 2. Load: curl -X POST http://localhost:8081/v1/loras \\"
        echo " -H 'Content-Type: application/json' \\"
        echo " -d '{\"lora_name\": \"${LORA_NAME}\", \"source\": {\"uri\": \"s3://${BUCKET_NAME}/${LORA_NAME}\"}}'"
        echo " 3. Infer: curl http://localhost:8000/v1/chat/completions \\"
        echo " -H 'Content-Type: application/json' \\"
        echo " -d '{\"model\": \"${LORA_NAME}\", \"messages\": [{\"role\": \"user\", \"content\": \"Hello\"}]}'"
        echo " 4. Stop: $0 --stop"
        ;;
esac
......@@ -22,6 +22,7 @@ import boto3
import requests
from botocore.client import Config
from botocore.exceptions import ClientError
from huggingface_hub import snapshot_download
if TYPE_CHECKING:
from mypy_boto3_s3.client import S3Client
......@@ -237,29 +238,17 @@ class MinioService:
f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
)
# Run with HF_HUB_OFFLINE unset so the download works even when
# Temporarily unset HF_HUB_OFFLINE so the download works even when
# the predownload_models fixture has already enabled offline mode.
# This only affects the subprocess env; the parent process is unchanged.
env = os.environ.copy()
env.pop("HF_HUB_OFFLINE", None)
result = subprocess.run(
[
"huggingface-cli",
"download",
old_offline = os.environ.pop("HF_HUB_OFFLINE", None)
try:
snapshot_download(
self.config.lora_repo,
"--local-dir",
self._temp_download_dir,
"--local-dir-use-symlinks",
"False",
],
capture_output=True,
text=True,
env=env,
)
if result.returncode != 0:
raise RuntimeError(f"Failed to download LoRA: {result.stderr}")
local_dir=self._temp_download_dir,
)
finally:
if old_offline is not None:
os.environ["HF_HUB_OFFLINE"] = old_offline
# Clean up cache directory
cache_dir = os.path.join(self._temp_download_dir, ".cache")
......
......@@ -5,6 +5,7 @@ import dataclasses
import logging
import os
from dataclasses import dataclass, field
from typing import Optional
import pytest
......@@ -14,6 +15,7 @@ from tests.serve.common import (
params_with_model_mark,
run_serve_deployment,
)
from tests.serve.lora_utils import MinioLoraConfig
from tests.utils.constants import DefaultPort
from tests.utils.engine_process import EngineConfig
from tests.utils.payload_builder import (
......@@ -28,6 +30,7 @@ from tests.utils.payload_builder import (
responses_payload_default,
responses_stream_payload_default,
)
from tests.utils.payloads import LoraTestChatPayload
logger = logging.getLogger(__name__)
......@@ -498,3 +501,97 @@ def test_sglang_disagg_dp_attention(
"""Test sglang disaggregated with DP attention (requires 4 GPUs)"""
# Kept for reference; this test uses a different launch path and is skipped
# ── LoRA Tests ──────────────────────────────────────────────────────────────
# Path of the "launch/lora" subdirectory under sglang_dir.
lora_dir = os.path.join(sglang_dir, "launch/lora")
def lora_chat_payload(
    lora_name: str,
    s3_uri: str,
    system_port: int = DefaultPort.SYSTEM1.value,
    repeat_count: int = 2,
    expected_response: Optional[list] = None,
    expected_log: Optional[list] = None,
    max_tokens: int = 100,
    temperature: float = 0.0,
) -> LoraTestChatPayload:
    """Build a non-streaming chat request that targets a LoRA adapter.

    The request body uses ``lora_name`` as its ``model`` and asks a single
    fixed user question. ``lora_name``, ``s3_uri``, ``system_port`` and
    ``repeat_count`` are forwarded unchanged to ``LoraTestChatPayload``.

    Args:
        lora_name: Adapter name; also the ``model`` field of the request.
        s3_uri: URI passed through as the payload's ``s3_uri``.
        system_port: Port passed through as the payload's ``system_port``.
        repeat_count: Value passed through as the payload's ``repeat_count``.
        expected_response: Expected response keywords. ``None`` or an empty
            list selects the built-in keyword list.
        expected_log: Expected log entries. ``None`` or an empty list
            selects an empty list.
        max_tokens: ``max_tokens`` field of the request body.
        temperature: ``temperature`` field of the request body.

    Returns:
        The assembled ``LoraTestChatPayload``.
    """
    # Any falsy value (None or an empty list) falls back to the default.
    response_keywords = (
        expected_response
        if expected_response
        else ["learning", "neural", "network", "AI", "model"]
    )
    log_entries = expected_log if expected_log else []

    user_message = {
        "role": "user",
        "content": "What is deep learning? Answer in one sentence.",
    }
    request_body = {
        "model": lora_name,
        "messages": [user_message],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }

    return LoraTestChatPayload(
        body=request_body,
        lora_name=lora_name,
        s3_uri=s3_uri,
        system_port=system_port,
        repeat_count=repeat_count,
        expected_response=response_keywords,
        expected_log=log_entries,
    )
@pytest.mark.sglang
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.profiled_vram_gib(4.7)
@pytest.mark.requested_sglang_kv_tokens(2848)
@pytest.mark.timeout(158)
@pytest.mark.pre_merge
def test_sglang_lora_aggregated(
    request,
    runtime_services_dynamic_ports,
    predownload_models,
    minio_lora_service,
    dynamo_dynamic_ports,
):
    """
    Run LoRA inference against an aggregated SGLang deployment.

    Steps:
    1. Take the LoRA name, S3 URI and environment variables from the
       ``minio_lora_service`` fixture.
    2. Build a LoRA chat payload (sent twice) for that adapter.
    3. Configure the ``lora/agg_lora.sh`` launch script with the MinIO
       environment and the dynamically allocated frontend port.
    4. Hand the configuration to ``run_serve_deployment``.
    """
    minio: MinioLoraConfig = minio_lora_service

    payload = lora_chat_payload(
        minio.lora_name,
        minio.get_s3_uri(),
        system_port=DefaultPort.SYSTEM1.value,
        repeat_count=2,
    )

    base_config = SGLangConfig(
        name="test_sglang_lora_aggregated",
        directory=sglang_dir,
        script_name="lora/agg_lora.sh",
        marks=[],
        model="Qwen/Qwen3-0.6B",
        timeout=158,
        env=minio.get_env_vars(),
        request_payloads=[payload],
    )

    run_serve_deployment(
        # Same configuration, but bound to this test's frontend port.
        dataclasses.replace(
            base_config, frontend_port=dynamo_dynamic_ports.frontend_port
        ),
        request,
        ports=dynamo_dynamic_ports,
        extra_env=minio.get_env_vars(),
    )
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment